def test_concat_arrays_gpu(self):
    """Run the concat-arrays check on the current CUDA device.

    The integer fixtures are checked against ``numpy.int64`` and the
    floating-point fixtures against ``numpy.float64``.
    """
    cases = (
        (self.int_arrays, numpy.int64),
        (self.float_arrays, numpy.float64),
    )
    for arrays, expected_type in cases:
        self.check_concat_arrays(
            arrays, device=cuda.Device().id, expected_type=expected_type)
def _det_gpu(b):
    """Compute determinants on the GPU with a batched LU decomposition.

    The input is turned into a batch of matrices (a single matrix becomes a
    batch of size one) and copied, because the decomposition overwrites its
    input in place.  The determinant of each matrix is the product of the
    diagonal of its LU factors, with the sign flipped once per row exchange.

    Args:
        b: CuPy array of dtype ``float32`` or ``float64``.

    Returns:
        tuple: ``(det, info)`` where ``det`` holds one determinant per matrix
        and ``info`` is the ``int32`` status array filled by cuBLAS (it
        reports execution success or a singular matrix).

    Raises:
        AssertionError: If ``b`` has an unsupported dtype.
    """
    a = matmul._as_batch_mat(b).copy()
    n = a.shape[1]
    n_matrices = len(a)
    # Pivot array
    p = cuda.cupy.zeros((n_matrices, n), dtype=numpy.int32)
    # Status array.  cuBLAS writes one C ``int`` per matrix, so the buffer
    # must be int32; a pointer-sized dtype would pack two statuses into each
    # element on 64-bit platforms.
    info = cuda.cupy.zeros(n_matrices, dtype=numpy.int32)
    ap = matmul._mat_ptrs(a)
    _, lda = matmul._get_ld(a)
    if b.dtype == numpy.float32:
        cuda.cublas.sgetrfBatched(cuda.Device().cublas_handle, n, ap.data.ptr,
                                  lda, p.data.ptr, info.data.ptr, n_matrices)
    elif b.dtype == numpy.float64:
        cuda.cublas.dgetrfBatched(cuda.Device().cublas_handle, n, ap.data.ptr,
                                  lda, p.data.ptr, info.data.ptr, n_matrices)
    else:
        assert False
    det = cuda.cupy.prod(a.diagonal(axis1=1, axis2=2), axis=1)
    # The determinant is equal to the product of the diagonal entries
    # of `a` where the sign of `a` is flipped depending on whether
    # the pivot array is equal to its index (pivots are compared against
    # the 1-based range 1..n).
    rng = cuda.cupy.arange(1, n + 1, dtype=numpy.int32)
    parity = cuda.cupy.sum(p != rng, axis=1) % 2
    sign = 1. - 2. * parity.astype(b.dtype, copy=False)
    return det * sign, info
def test_cupy_array2(self):
    """Copy an array created on GPU 0 to the host while GPU 1 is current."""
    with cuda.Device(0):
        gpu_array = cuda.to_gpu(self.x)
        if not self.c_contiguous:
            gpu_array = cuda.cupy.asfortranarray(gpu_array)

    with cuda.Device(1):
        host_array = cuda.to_cpu(gpu_array)

    self.assertIsInstance(host_array, numpy.ndarray)
    numpy.testing.assert_array_equal(self.x, host_array)
def check_device_spec_cupy(self, device_spec, expected_device_id):
    """Check that ``device_spec`` resolves to the expected CuPy GPU device.

    Args:
        device_spec: Specifier handed to ``backend.get_device`` and
            ``backend.using_device``.
        expected_device_id: CUDA device ID the specifier must resolve to.
    """
    resolved = backend.get_device(device_spec)
    assert isinstance(resolved, backend.GpuDevice)

    cupy_device = resolved.device
    assert isinstance(cupy_device, cuda.Device)
    assert resolved.xp is cuda.cupy
    assert cupy_device.id == expected_device_id

    with backend.using_device(device_spec):
        # TODO(niboshi): Test the Chainer default device
        assert cuda.Device() == cuda.Device(expected_device_id)
def test_cupy_array_async3(self):
    """``to_gpu`` with ``stream`` copies across devices and warns.

    The source array is created on GPU 0; the transfer is requested while
    GPU 1 is current and must emit a ``DeprecationWarning``.
    """
    with cuda.Device(0):
        source = cuda.to_gpu(self.x)
        if not self.c_contiguous:
            source = cuda.cupy.asfortranarray(source)

    with cuda.Device(1), testing.assert_warns(DeprecationWarning):
        copied = cuda.to_gpu(source, stream=cuda.Stream.null)

    self.assertIsInstance(copied, cuda.ndarray)
    # A new array must have been created (do copy).
    self.assertIsNot(source, copied)
    cuda.cupy.testing.assert_array_equal(source, copied)
def to_gpu(self, device=None):
    """Send the parameter to a GPU.

    After delegating to the parent implementation, a parameter whose
    ``array`` is still ``None`` records ``'cuda'`` and the target device as
    its initial backend/device.

    Args:
        device: Target device; when ``None`` the ID of the current CUDA
            device is recorded instead.
    """
    super(Parameter, self).to_gpu(device)
    if self.array is not None:
        return

    initial_device = cuda.Device().id if device is None else device
    self._initial_backend = 'cuda'
    self._initial_device = initial_device
def run(self):
    """Worker loop: serve jobs received over ``self.pipe`` on this device.

    ``'finalize'`` synchronizes the device and leaves the loop.  ``'update'``
    computes the loss on the next batch, back-propagates, reduces the
    gathered gradients to rank 0 through ``self.comm``, then receives the
    broadcast parameters and scatters them into the model.  Any other job
    name is ignored.
    """
    device = cuda.Device(self.device)
    device.use()
    self.setup()

    while True:
        job, _ = self.pipe.recv()
        if job == 'finalize':
            device.synchronize()
            break
        if job != 'update':
            continue

        # For reducing memory
        self.model.cleargrads()

        batch = self.converter(self.iterator.next(), self.device)
        with self.reporter.scope({}):  # pass dummy observation
            loss = _calc_loss(self.model, batch)

        self.model.cleargrads()
        loss.backward()
        del loss

        # Reduce the flattened gradients in place.
        grads = gather_grads(self.model)
        null_stream = cuda.Stream.null
        self.comm.reduce(
            grads.data.ptr, grads.data.ptr, grads.size,
            _get_nccl_data_type(grads.dtype), nccl.NCCL_SUM, 0,
            null_stream.ptr)
        del grads
        self.model.cleargrads()

        # Receive the broadcast parameters and write them into the model.
        params = gather_params(self.model)
        self.comm.bcast(
            params.data.ptr, params.size,
            _get_nccl_data_type(params.dtype), 0, null_stream.ptr)
        scatter_params(self.model, params)
        del params
def __call__(self, trainer):
    """Run the generator on ``self.image`` and render the predicted ROIs.

    The generator (target of optimizer ``'opt_gen'``) is evaluated in test
    mode; its ROIs are classified by the discriminator (target of
    ``'opt_dis'``) and everything is passed to ``self.render_rois`` together
    with the back-propagation and feature-map visualizations.

    Args:
        trainer: Trainer whose updater supplies the iteration count and the
            two optimizers.
    """
    iteration = trainer.updater.iteration

    if self.devices is None:
        devices = [
            cuda.get_device_from_id(
                trainer.updater.get_optimizer('opt_gen').target._device_id)
            for _ in range(2)
        ]
    else:
        devices = self.devices

    with chainer.using_config('train', False), cuda.Device(devices[0]):
        predictor = trainer.updater.get_optimizer('opt_gen').target
        # A negative device id selects NumPy, anything else CuPy.
        self.xp = np if predictor._device_id < 0 else cuda.cupy

        image = self.xp.array(self.image)
        batch = image[self.xp.newaxis, ...]
        rois, bboxes, objectness_scores = predictor(batch)[:3]

        if len(rois.shape) > 4:
            # Fold the extra leading axes into one.
            rois = F.reshape(rois, (-1,) + rois.shape[-3:])
            bboxes = F.reshape(bboxes, (-1,) + bboxes.shape[-3:])
            objectness_scores = F.reshape(
                objectness_scores, (-1, objectness_scores.shape[-1]))

        discriminator = trainer.updater.get_optimizer('opt_dis').target
        class_predictions = discriminator(rois)

        backprop_visualizations = self.get_backprop_visualization(predictor)
        feature_visualizations = self.get_feature_maps(predictor)

        self.render_rois(
            rois,
            bboxes,
            iteration,
            self.image.copy(),
            backprop_vis=backprop_visualizations,
            feature_vis=feature_visualizations,
            objectness_scores=objectness_scores,
            class_predictions=class_predictions,
        )
def _inv_gpu(b):
    """Invert matrices on the GPU with a batched LU decomposition.

    The input is turned into a batch of matrices (a single matrix becomes a
    batch of size one) and copied, because the decomposition overwrites its
    input in place.

    Args:
        b: CuPy array of dtype ``float32`` or ``float64``.

    Returns:
        tuple: ``(c, info)`` where ``c`` is the output array written by the
        inversion routine and ``info`` is the ``int32`` status array (it
        reports execution success or a singular matrix).
    """
    a = matmul._as_batch_mat(b).copy()
    n = a.shape[1]
    n_matrices = len(a)

    # Pivot, output and status buffers.
    p = cuda.cupy.empty((n, n_matrices), dtype=numpy.int32)
    c = cuda.cupy.empty_like(a)
    info = cuda.cupy.empty(n_matrices, dtype=numpy.int32)

    ap = matmul._mat_ptrs(a)
    cp = matmul._mat_ptrs(c)
    _, lda = matmul._get_ld(a)
    _, ldc = matmul._get_ld(c)
    handle = cuda.Device().cublas_handle

    # Both precisions take the same arguments; only the routine differs.
    factorize_args = (handle, n, ap.data.ptr, lda, p.data.ptr,
                      info.data.ptr, n_matrices)
    invert_args = (handle, n, ap.data.ptr, lda, p.data.ptr, cp.data.ptr,
                   ldc, info.data.ptr, n_matrices)

    if b.dtype == numpy.float32:
        cuda.cublas.sgetrfBatched(*factorize_args)
        cuda.cublas.sgetriBatched(*invert_args)
    elif b.dtype == numpy.float64:
        cuda.cublas.dgetrfBatched(*factorize_args)
        cuda.cublas.dgetriBatched(*invert_args)
    else:
        assert False
    return c, info
def evaluate(self, snapshot_name=''):
    """Evaluate the model on ``self.data_iterator`` and save the mAP.

    Each batch contributes its first example.  Predictions are made in test
    mode on the GPU given by ``self.args.gpu``; an empty prediction is
    replaced by a single all-zero box, label and score.

    Args:
        snapshot_name: Name forwarded to ``self.save_eval_results``.
    """
    with cuda.get_device_from_id(self.args.gpu):
        gt_data = []
        pred_data = []
        num_batches = len(self.data_loader) // self.args.batchsize

        for batch in tqdm(self.data_iterator, total=num_batches):
            image, gt_bboxes, gt_labels = batch[0]
            gt_data.append((gt_bboxes, gt_labels))

            with cuda.Device(self.args.gpu), \
                    configuration.using_config('train', False):
                bboxes, labels, scores = self.model.predict(
                    image.copy()[None, ...])
                if len(bboxes[0]) == 0:
                    # Substitute a dummy detection for an empty prediction.
                    bboxes = [np.zeros((1, 4), dtype=np.float32)]
                    labels = [np.zeros((1, ), dtype=np.int32)]
                    scores = [np.zeros((1, ), dtype=np.float32)]
                pred_data.append((bboxes[0], labels[0], scores[0]))

        # TODO handle empty predictions!!
        bboxes, labels, scores = zip(*pred_data)
        gt_bboxes, gt_labels = concat_examples(gt_data)
        result = eval_detection_voc(
            bboxes, labels, scores, gt_bboxes, gt_labels, None)

        mean_ap = result['map']
        self.save_eval_results(snapshot_name, mean_ap)
def __call__(self, *inputs):
    """Report mean IoU and VOC detection metrics for one batch.

    Args:
        *inputs: Only the first two entries are used: the images and the
            ground-truth boxes (``labels``).

    Reports ``'mean_iou'``, ``'map'`` and ``'ap/sheep'`` through
    ``reporter.report``.
    """
    images, labels = inputs[:2]
    with cuda.Device(self.device):
        _, bboxes = self.link(images)
        bboxes = cuda.to_cpu(bboxes.data)
        labels = cuda.to_cpu(labels)
        xp = cuda.get_array_module(bboxes)

        bboxes = self.extract_corners(bboxes)
        bboxes = self.scale_bboxes(bboxes, Size._make(images.shape[-2:]))

        # Boolean identity mask: keeps entry (i, i) of the IoU result, i.e.
        # prediction i against label i.  The builtin ``bool`` is used because
        # the ``numpy.bool`` alias was deprecated in NumPy 1.20 and removed
        # in 1.24.
        diagonal = xp.eye(len(bboxes)).astype(bool)
        ious = bbox_iou(bboxes.data.copy(), xp.squeeze(labels))[diagonal]
        mean_iou = ious.mean()
        reporter.report({'mean_iou': mean_iou})

        # One single-box prediction per image, with score 1 and label 0.
        pred_bboxes = [
            bbox.data[xp.newaxis, ...].astype(xp.int32)
            for bbox in F.separate(bboxes, axis=0)
        ]
        pred_scores = xp.ones((len(bboxes), 1))
        pred_labels = xp.zeros_like(pred_scores)

        gt_bboxes = [bbox.data[...] for bbox in F.separate(labels, axis=0)]
        gt_labels = xp.zeros_like(pred_scores)

        result = chainercv.evaluations.eval_detection_voc(
            pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels
        )
        reporter.report({'map': result['map']})
        reporter.report({'ap/sheep': result['ap'][0]})
def update_core(self):
    """Run one update step on the master model and notify the workers.

    The workers are told to start their own update, then the master
    computes its loss and gradients on the first device.  When a
    communicator is present, gradients are reduced to rank 0 before the
    optimizer step and the updated parameters are broadcast afterwards.
    """
    self.setup_workers()
    self._send_message(('update', None))

    master_device = self._devices[0]
    with cuda.Device(master_device):
        # For reducing memory
        self._master.cleargrads()

        optimizer = self.get_optimizer('main')
        batch = self.converter(
            self.get_iterator('main').next(), master_device)

        loss = self._calc_loss(
            self._master, batch, cleargrads_func=self._master.cleargrads)

        self._master.cleargrads()
        loss.backward()

        null_stream = cuda.Stream.null
        use_nccl = self.comm is not None

        # NCCL: reduce grads
        if use_nccl:
            grads = gather_grads(self._master)
            self.comm.reduce(
                grads.data.ptr, grads.data.ptr, grads.size,
                _get_nccl_data_type(grads.dtype), nccl.NCCL_SUM, 0,
                null_stream.ptr)
            scatter_grads(self._master, grads)
            del grads

        optimizer.update()

        # NCCL: broadcast the updated parameters
        if use_nccl:
            params = gather_params(self._master)
            self.comm.bcast(
                params.data.ptr, params.size,
                _get_nccl_data_type(params.dtype), 0, null_stream.ptr)
def __call__(self, images):
    """Predict transformation grids and sample ROIs from ``images``.

    Args:
        images: Input batch; the computation runs on the device that holds
            ``images.data``.

    Returns:
        The feature-extractor output alone when ``self.train_imagenet`` is
        set; otherwise the tuple ``(rois, points)`` of sampled regions and
        the sampling grids they were taken from.
    """
    self.visual_backprop_anchors.clear()

    with cuda.Device(images.data.device):
        features = self.feature_extractor(
            self.prepare_images(images.copy() * 255))
        if self.train_imagenet:
            return features

        # Larger inputs pass through additional residual stages.
        side = images.shape[-2]
        if side > 224:
            features = self.res6(features)
        if side > 300:
            features = self.res7(features)

        self.visual_backprop_anchors.append(features)
        pooled = _global_average_pooling_2d(features)

        transform_params = self.param_predictor(pooled)
        transform_params = rotation_dropout(
            F.reshape(transform_params, (-1, 2, 3)), ratio=0.0)
        points = F.spatial_transformer_grid(transform_params, self.out_size)
        rois = F.spatial_transformer_sampler(images, points)

        if self.transform_rois_to_grayscale:
            assert rois.shape[1] == 3, \
                "rois are not in RGB, can not convert them to grayscale"
            # Channels are split in b, g, r order and combined with the
            # weights below.
            b, g, r = F.split_axis(rois, 3, axis=1)
            rois = 0.299 * r + 0.587 * g + 0.114 * b

        return rois, points
def __call__(self, **kwargs):
    """Localize and recognize words, then record the word accuracy.

    Keyword Args:
        image: Input whose shape unpacks into five values
            ``(batch_size, images_per_image, num_channels, height, width)``.
        words: Ground truth handed to ``self.calc_word_accuracy``.
        return_predictions: When true, the predictions are returned
            (default ``False``).

    Returns:
        ``None`` unless ``return_predictions`` is true, in which case the
        tuple ``(rois, bboxes, predicted_words, best_indices,
        chosen_indices, scores)`` is returned.
    """
    image = kwargs.pop('image', None)
    words = kwargs.pop('words', None)
    return_predictions = kwargs.pop('return_predictions', False)

    batch_size, images_per_image, num_channels, height, width = image.shape
    # Merge the two leading axes so every image is processed independently.
    image = self.xp.reshape(image, (-1, num_channels, height, width))

    with cuda.Device(self.device):
        rois, bboxes = self.localizer.predict(image)[:2]
        predicted_words, raw_classification_result = self.recognizer.predict(rois, return_raw_classification_result=True)

        # Restore the (batch_size, images_per_image) leading axes.
        predicted_words = F.reshape(predicted_words, (batch_size, images_per_image) + predicted_words.shape[1:])
        raw_classification_result = F.reshape(
            raw_classification_result,
            (batch_size, images_per_image) + raw_classification_result.shape[1:]
        )

        best_indices, scores = self.determine_best_prediction_indices(raw_classification_result)
        # Keep the originally chosen indices; ``best_indices`` may be
        # replaced below.
        chosen_indices = best_indices

        # Accuracy is always computed on the best prediction of each sample.
        self.calc_word_accuracy(
            self.xp.concatenate([predicted_words[i, best_indices[i]].array for i in range(batch_size)], axis=0),
            words,
            self.strip_non_alphanumeric_predictions,
        )

        if not self.only_return_best_result:
            # Select every image of every sample instead of only the best.
            best_indices = self.xp.arange(images_per_image)[None, ...]
            best_indices = self.xp.tile(best_indices, (batch_size, 1))

        predicted_words = self.xp.stack([predicted_words[i, best_indices[i]].array for i in range(batch_size)], axis=0)

        if return_predictions:
            rois = F.reshape(rois, (batch_size, images_per_image) + rois.shape[1:])
            bboxes = F.reshape(bboxes, (batch_size, images_per_image) + bboxes.shape[1:])
            rois = self.xp.stack([rois[i, best_indices[i]].array for i in range(batch_size)], axis=0)
            bboxes = self.xp.stack([bboxes[i, best_indices[i]].array for i in range(batch_size)], axis=0)
            return rois, bboxes, predicted_words, best_indices, chosen_indices, scores
def __call__(self, *inputs):
    """Report mean IoU and mAP of the predicted masks for one batch.

    Args:
        *inputs: Only the first two entries are used: the images and the
            ground-truth labels.

    When ``self.assessor`` is set, its IoU predictions serve as scores and
    ``self.perform_nms`` is applied; otherwise every mask gets a score of
    one.  Reports ``'mean_iou'`` and ``'map'`` through ``reporter.report``.
    """
    images, labels = inputs[:2]

    with cuda.Device(self.device):
        rois, bboxes = self.link.predict(images)[:2]
        self.xp = cuda.get_array_module(bboxes)
        bboxes = bboxes.data
        labels = self.ndarray_to_list(labels)

        batch_size, num_predicted_masks, pred_masks = self.bboxes_to_masks(
            bboxes, images)
        pred_masks = self.ndarray_to_list(pred_masks)

        if self.assessor is None:
            uniform_scores = numpy.ones((batch_size, num_predicted_masks))
            pred_scores = self.ndarray_to_list(uniform_scores)
        else:
            iou_prediction = self.assessor.extract_iou_prediction(
                self.assessor(rois))
            pred_scores = self.ndarray_to_list(
                iou_prediction.data.reshape(batch_size, num_predicted_masks))
            pred_masks, pred_scores = self.perform_nms(
                batch_size, bboxes, num_predicted_masks, pred_masks,
                pred_scores)

        ious = self.xp.concatenate(self.calculate_iou(pred_masks, labels))
        reporter.report({'mean_iou': float(self.xp.sum(ious) / len(ious))})

        result = self.calculate_map(pred_masks, pred_scores, labels)
        reporter.report({'map': result['map']})
def test_numpy_array_async3(self):
    """``to_gpu`` with ``stream`` places a host array on the current GPU.

    The transfer runs while GPU 1 is current and must emit a
    ``DeprecationWarning``.
    """
    with cuda.Device(1), testing.assert_warns(DeprecationWarning):
        transferred = cuda.to_gpu(self.x, stream=cuda.Stream.null)

    self.assertIsInstance(transferred, cuda.ndarray)
    cuda.cupy.testing.assert_array_equal(self.x, transferred)
    self.assertEqual(int(transferred.device), 1)
def get_random_state():
    """Return the dropout random state of the current CUDA device.

    States are cached per device ID in ``_random_states``.  A missing state
    is created from the ``CHAINER_SEED`` environment variable (``None`` when
    it is unset) and cached before being returned.
    """
    device_id = cuda.Device().id
    state = _random_states.get(device_id)
    if state is not None:
        return state

    state = DropoutRandomStates(os.getenv('CHAINER_SEED'))
    _random_states[device_id] = state
    return state
def test_linear_model_multi_gpu(self):
    """Train with the model configured for GPU 1 while GPU 0 is current."""
    config_on_gpu1 = backend.BackendConfig(
        {'use_cuda': True, 'cuda_device': 1})

    with cuda.Device(0):
        accuracy = self.model.accuracy(config_on_gpu1)

    self.assertGreater(cuda.to_cpu(accuracy.data), 0.9)
def test_from_array(self, backend_config):
    """``GpuDevice.from_array`` yields the device that owns the array."""
    device_id = backend_config.cuda_device
    with cuda.Device(device_id):
        arr = cuda.ndarray((), numpy.float32)

    # Test precondition check
    assert arr.device.id == backend_config.cuda_device

    device = backend.GpuDevice.from_array(arr)
    assert isinstance(device, backend.GpuDevice)

    expected = chainer.get_device((cuda.cupy, backend_config.cuda_device))
    assert device == expected
def test_from_array(self, backend_config):
    """``GpuDevice.from_array`` matches ``GpuDevice.from_device_id``."""
    device_id = backend_config.cuda_device
    with cuda.Device(device_id):
        arr = cuda.ndarray((), numpy.float32)

    # Test precondition check
    assert arr.device.id == backend_config.cuda_device

    device = backend.GpuDevice.from_array(arr)
    self.check_device(device, backend_config)

    expected = backend.GpuDevice.from_device_id(backend_config.cuda_device)
    assert device == expected
def test_linear_model_multi_gpu(self):
    """Train with the model configured for GPU 1 while GPU 0 is current.

    Raises:
        unittest.SkipTest: When ``self.skip_loss_scaling`` says this
            configuration must be skipped.
    """
    backend_config = backend.BackendConfig({
        'use_cuda': True,
        'cuda_device': 1
    })
    skip, msg = self.skip_loss_scaling(backend_config)
    if skip:
        # The exception has to be raised; returning it makes the runner
        # record the test as passed instead of skipped.
        raise unittest.SkipTest(msg)
    with cuda.Device(0):
        accuracy = self.model.accuracy(backend_config)
    self.assertGreater(cuda.to_cpu(accuracy.data), 0.9)
def test_model_setup_multi_gpu(self):
    """Optimizer states live on the same device as their parameters.

    The model is sent to GPU 1 while GPU 0 is current.
    """
    with cuda.Device(0):
        model = self.model.model
        optimizer = self.model.optimizer
        model.to_gpu(1)
        optimizer.setup(model)

        # Initialize the optimizer state by running an update
        for param in optimizer.target.params(False):
            param.cleargrad()
            param.update()
            for state_array in six.itervalues(param.update_rule.state):
                param_device = int(param.data.device)
                self.assertEqual(param_device, int(state_array.device))
def test_get_device_from_array(self, backend_config):
    """Both array-to-device lookups return the configured device."""
    with cuda.Device(backend_config.cuda_device):
        arr = cuda.ndarray((), numpy.float32)

    # Test precondition check
    assert arr.device.id == backend_config.cuda_device

    expected_device = backend_config.device
    assert backend.GpuDevice.from_array(arr) == expected_device
    assert backend.get_device_from_array(arr) == expected_device
def test_chainerx_cuda_to_cupy_multigpu(self):
    """Sending from ``cuda:0`` to ``@cupy:1`` yields an independent array."""
    orig = self.orig_chainerx('cuda:0')
    converted = self.send_check_equal(orig, '@cupy:1')
    assert isinstance(converted, cuda.ndarray)
    assert converted.device.id == 1

    # memory must not be shared: snapshot the result, modify the converted
    # array in place, and check the source still equals the snapshot.
    snapshot = converted.copy()
    with cuda.Device(1):
        converted[:] *= 2
        cpu = backend.CpuDevice
        numpy.testing.assert_array_equal(
            cpu().send(orig), cpu().send(snapshot))
def _guess_device_from_array_module(xp):
    """Return a plausible device for the array module ``xp``.

    CuPy maps to the current CUDA device, ChainerX to its default device,
    and anything else to the CPU device.

    .. warning::

        There can be multiple devices for a module

    """
    if xp is cuda.cupy:
        return cuda.GpuDevice(cuda.Device())
    if xp is chainerx:
        return _chainerx.ChainerxDevice(chainerx.get_default_device())
    # Cannot detect intel64, because xp of intel64 is numpy.
    return _cpu.CpuDevice()
def _get_device(device_spec): # Converts device specificer to a chainer.Device instance. # Additionally to chainer.get_device, # this function supports the following conversions: # - None: returns None # - negative integer: returns CpuDevice # - non-negative integer: returns GpuDevice if device_spec is None: return None # For backward compatibilities if isinstance(device_spec, six.integer_types): if device_spec < 0: return backend.CpuDevice() return backend.get_device(cuda.Device(device_spec)) return backend.get_device(device_spec)
def test_model_setup_multi_gpu(self):
    """Optimizer states live on the same device as their parameters.

    The model is sent to GPU 1 while GPU 0 is current, and loss scaling is
    configured on the optimizer before the first update.

    Raises:
        unittest.SkipTest: When ``self.skip_loss_scaling`` says this
            configuration must be skipped.
    """
    skip, msg = self.skip_loss_scaling()
    if skip:
        # The exception has to be raised; returning it makes the runner
        # record the test as passed instead of skipped.
        raise unittest.SkipTest(msg)
    with cuda.Device(0):
        model = self.model.model
        optimizer = self.model.optimizer
        model.to_gpu(1)
        optimizer.setup(model)
        _optimizer_loss_scaling(optimizer, self.loss_scaling)
        # Initialize the optimizer state by running an update
        for param in optimizer.target.params(False):
            param.cleargrad()
            param.update()
            for v in six.itervalues(param.update_rule.state):
                self.assertEqual(int(param.data.device), int(v.device))
def update_core(self):
    """Run one adversarial update of the localizer and the discriminator.

    The localizer (optimizer ``'opt_gen'``) is trained so that the
    discriminator's output on its ROIs matches ``self.localizer_target``,
    plus the losses of ``self.regularizers``.  The discriminator (optimizer
    ``'opt_dis'``) is trained on batches from the ``'real'`` iterator unless
    ``self.freeze_discriminator`` is set.  Both losses are reported.
    """
    localizer_optimizer = self.get_optimizer('opt_gen')
    discriminator_optimizer = self.get_optimizer('opt_dis')
    xp = self.localizer.xp

    with cuda.Device(self.device):
        # Discriminator output on the real batch.
        batch = next(self.get_iterator('real'))
        real_images, labels = self.converter(batch, self.device)[:2]
        y_real = self.discriminator(real_images)

        # Discriminator output on the ROIs produced by the localizer.
        batch = next(self.get_iterator('main'))
        fake_images = self.converter(batch, self.device)
        x_fake, bboxes = self.localizer(fake_images)
        y_fake = self.discriminator(x_fake)

        # Localizer loss: distance of the discriminator output from the
        # constant target, plus all regularizer losses.
        localization_labels = xp.full((len(y_fake), 1), self.localizer_target, dtype=xp.float32)
        loss_localizer = F.mean_squared_error(y_fake, localization_labels)
        for regularizer in self.regularizers:
            loss_localizer += regularizer.calc_loss(
                bboxes, Size._make(fake_images.shape[-2:]))

        # Update the localizer with discriminator updates disabled.
        self.discriminator.disable_update()
        self.localizer.cleargrads()
        loss_localizer.backward()
        localizer_optimizer.update()
        chainer.reporter.report({'loss_localizer': loss_localizer})
        self.discriminator.enable_update()

        # Cut the graph behind the localizer outputs before the
        # discriminator step.
        x_fake.unchain_backward()
        bboxes.unchain_backward()

        loss_dis = F.mean_squared_error(y_real, labels)
        if not self.freeze_discriminator:
            self.discriminator.cleargrads()
            self.localizer.cleargrads()
            loss_dis.backward()
            discriminator_optimizer.update()
        chainer.reporter.report({'loss_dis': loss_dis})
def __call__(self, **kwargs):
    """Report the accuracy of ``self.net`` on one batch.

    Keyword Args:
        data: Network input (required).
        label: Ground-truth labels (required).

    Reports ``"part_accuracy"`` and ``"accuracy"`` through
    ``reporter.report``.
    """
    data = kwargs.pop('data')
    labels = kwargs.pop('label')

    with cuda.Device(self.device):
        xp = self.net.xp
        data = xp.array(data)
        labels = xp.array(labels)
        prediction = self.net.predict(data)

        # part accuracy is the accuracy for each number and accuracy is the
        # accuracy for the complete vector of numbers
        part_accuracy, accuracy = self.calc_accuracy(prediction, labels)

        observation = {
            "part_accuracy": part_accuracy,
            "accuracy": accuracy,
        }
        reporter.report(observation)
def __call__(self, **kwargs):
    """Localize and recognize words, then record the word accuracy.

    Keyword Args:
        image: Input handed to the localizer.
        words: Ground truth handed to ``self.calc_word_accuracy``.
        return_predictions: When true, the predictions are returned
            (default ``False``).

    Returns:
        ``None`` unless ``return_predictions`` is true, in which case the
        tuple ``(rois, bboxes, predicted_words)`` is returned.
    """
    image = kwargs.pop('image', None)
    words = kwargs.pop('words', None)
    return_predictions = kwargs.pop('return_predictions', False)

    with cuda.Device(self.device):
        rois, bboxes = self.localizer.predict(image)[:2]
        predicted_words = self.recognizer.predict(rois).array
        self.xp = cuda.get_array_module(bboxes)

        # rois must unpack into five dimensions; the two leading ones are
        # merged by the reshapes below.
        _, _, num_channels, height, width = rois.shape
        flat_rois = self.xp.reshape(
            rois.array, (-1, num_channels, height, width))
        flat_bboxes = self.xp.reshape(bboxes.array, (-1, 2, height, width))

        self.calc_word_accuracy(predicted_words, words)

        if not return_predictions:
            return None
        return flat_rois, flat_bboxes, predicted_words