def update_core(self):
    """Run one synchronized optimization step across all NCCL workers.

    Sends an ``update`` command to the worker processes, computes the
    loss and gradients for the master model on the first device,
    all-reduces the gradients over NCCL, applies the optimizer, and
    finally broadcasts the updated parameters back to every worker.
    """
    self.setup_workers()
    # Signal each worker process to perform its own update step in
    # lock-step with the master (workers presumably mirror this routine).
    self._send_message(('update', None))
    with cuda.Device(self._devices[0]):
        # For reducing memory: drop any stale gradient arrays before the
        # forward pass allocates new activations.
        self._master.cleargrads()
        optimizer = self.get_optimizer('main')
        batch = self.get_iterator('main').next()
        batch = self.converter(batch, self._devices[0])
        # _calc_loss is given cleargrads as a callback; its exact use is
        # not visible here — presumably to free grads mid-computation.
        loss = self._calc_loss(self._master, batch,
                               cleargrads_func=self._master.cleargrads)
        # Clear again so backward() accumulates into zeroed gradients.
        self._master.cleargrads()
        loss.backward()
        # NCCL: reduce grads from all workers onto rank 0 (in-place).
        null_stream = cuda.Stream.null
        if self.comm is not None:
            # Flatten all gradients into one contiguous buffer.
            gg = gather_grads(self._master)
            nccl_data_type = _get_nccl_data_type(gg.dtype)
            # In-place sum-reduce to root rank 0 on the default stream.
            self.comm.reduce(gg.data.ptr, gg.data.ptr, gg.size,
                             nccl_data_type, nccl.NCCL_SUM, 0,
                             null_stream.ptr)
            # Unpack the reduced buffer back into the parameter grads.
            scatter_grads(self._master, gg)
            del gg  # release the large flat buffer promptly
        optimizer.update()
        # Broadcast the freshly updated parameters from rank 0 so all
        # workers stay in sync for the next iteration.
        if self.comm is not None:
            gp = gather_params(self._master)
            nccl_data_type = _get_nccl_data_type(gp.dtype)
            self.comm.bcast(gp.data.ptr, gp.size, nccl_data_type, 0,
                            null_stream.ptr)
def update_core(self):
    """Main Update routine of the custom parallel updater.

    Performs one forward/backward pass with gradient accumulation over
    ``self.accum_grad`` mini-batches: gradients are NCCL-reduced every
    pass, but the optimizer step (and the NaN guard on the gradient
    norm) only runs once every ``accum_grad`` forward passes, after
    which parameters are broadcast from rank 0.
    """
    self.setup_workers()
    # Tell all worker processes to run their update step in parallel.
    self._send_message(("update", None))
    with cuda.Device(self._devices[0]):
        # For reducing memory
        optimizer = self.get_optimizer("main")
        batch = self.get_iterator("main").next()
        x = self.converter(batch, self._devices[0])
        # Scale the loss so that accumulated gradients average out to
        # one effective large-batch gradient.
        loss = self._master(*x) / self.accum_grad
        loss.backward()
        # Free the computational graph immediately to save memory.
        loss.unchain_backward()
        # NCCL: reduce grads (in-place sum onto rank 0).
        null_stream = cuda.Stream.null
        if self.comm is not None:
            gg = gather_grads(self._master)
            # NOTE(review): dtype is hard-coded to NCCL_FLOAT here —
            # assumes the model is float32; confirm against the model.
            self.comm.reduce(
                gg.data.ptr,
                gg.data.ptr,
                gg.size,
                self.nccl.NCCL_FLOAT,
                self.nccl.NCCL_SUM,
                0,
                null_stream.ptr,
            )
            scatter_grads(self._master, gg)
            del gg  # release the flat gradient buffer
        # update parameters only after accum_grad forward passes
        self.forward_count += 1
        if self.forward_count != self.accum_grad:
            return
        self.forward_count = 0
        # check gradient value
        grad_norm = np.sqrt(
            sum_sqnorm([p.grad for p in optimizer.target.params(False)]))
        logging.info("grad norm={}".format(grad_norm))
        # update, skipping the step entirely when the norm is NaN
        if math.isnan(grad_norm):
            logging.warning("grad norm is nan. Do not update model.")
        else:
            optimizer.update()
        # Reset gradients for the next accumulation window.
        self._master.cleargrads()
        # Broadcast updated parameters from rank 0 to all workers.
        if self.comm is not None:
            gp = gather_params(self._master)
            self.comm.bcast(gp.data.ptr, gp.size, self.nccl.NCCL_FLOAT, 0,
                            null_stream.ptr)
def update_core(self):
    """Run one NCCL-synchronized update step using a Kaldi data reader.

    Converts a Kaldi batch, computes loss and gradients on the master
    device, sum-reduces gradients over NCCL, applies the optimizer
    (guarded against NaN gradient norms), broadcasts the updated
    parameters, and finally releases the batch's feature memory.
    """
    self.setup_workers()
    # Signal worker processes to run their own update in parallel.
    self._send_message(('update', None))
    with cuda.Device(self._devices[0]):
        # Imported lazily so the module loads on hosts without NCCL.
        from cupy.cuda import nccl
        # For reducing memory: clear stale gradients before forward.
        self._master.cleargrads()
        optimizer = self.get_optimizer('main')
        batch = self.get_iterator('main').next()
        x = converter_kaldi(batch[0], self.reader)
        loss = self._master(x)
        # Clear again so backward() writes into zeroed gradients.
        self._master.cleargrads()
        loss.backward()
        # Drop the computational graph to free memory early.
        loss.unchain_backward()
        # NCCL: reduce grads (in-place sum onto rank 0).
        null_stream = cuda.Stream.null
        if self.comm is not None:
            gg = gather_grads(self._master)
            # NOTE(review): NCCL_FLOAT is hard-coded — assumes a
            # float32 model; confirm if mixed precision is ever used.
            self.comm.reduce(gg.data.ptr, gg.data.ptr, gg.size,
                             nccl.NCCL_FLOAT,
                             nccl.NCCL_SUM, 0, null_stream.ptr)
            scatter_grads(self._master, gg)
            del gg  # release the flat gradient buffer
        # check gradient value
        grad_norm = np.sqrt(self._sum_sqnorm(
            [p.grad for p in optimizer.target.params(False)]))
        logging.info('grad norm={}'.format(grad_norm))
        # update, skipping the step when the norm is NaN
        if math.isnan(grad_norm):
            logging.warning('grad norm is nan. Do not update model.')
        else:
            optimizer.update()
        # Broadcast updated parameters from rank 0 to all workers.
        if self.comm is not None:
            gp = gather_params(self._master)
            self.comm.bcast(gp.data.ptr, gp.size, nccl.NCCL_FLOAT, 0,
                            null_stream.ptr)
        # Free the Kaldi feature buffers for this batch.
        delete_feat(x)
def test_gather_scatter_grads(self):
    """Gradients gathered from one model and scattered into a clone must
    match exactly, and an identical SGD step must keep both models equal.
    """
    xp = cuda.cupy
    src = SimpleNet(dtype=self.dtype)
    dst = copy.deepcopy(src)
    # to_gpu() is deprecated in this chainer version; the test asserts
    # the warning is raised for each model.
    for net in (src, dst):
        with testing.assert_warns(DeprecationWarning):
            net.to_gpu()
    opt_src = chainer.optimizers.SGD(lr=1.0)
    opt_src.setup(src)
    opt_dst = chainer.optimizers.SGD(lr=1.0)
    opt_dst.setup(dst)
    n = 8
    inputs = numpy.random.uniform(0, 1, (n, 2, 5, 5)).astype(self.dtype)
    labels = numpy.asarray([i % 2 for i in range(n)], dtype=numpy.int32)
    v_x = chainer.Variable(chainer.backends.cuda.to_gpu(inputs))
    v_t = chainer.Variable(chainer.backends.cuda.to_gpu(labels))
    loss = src(v_x, v_t)
    src.cleargrads()
    dst.cleargrads()
    loss.backward()
    # Round-trip the gradients through the flat gather/scatter buffer.
    packed = mpu.gather_grads(src)
    mpu.scatter_grads(dst, packed)
    pairs = [(src.conv, dst.conv), (src.fc, dst.fc)]
    for a, b in pairs:
        xp.testing.assert_array_equal(a.W.grad, b.W.grad)
        xp.testing.assert_array_equal(a.b.grad, b.b.grad)
    # Equal grads + equal optimizers must yield equal parameters.
    opt_src.update()
    opt_dst.update()
    for a, b in pairs:
        xp.testing.assert_array_equal(a.W.data, b.W.data)
        xp.testing.assert_array_equal(a.b.data, b.b.data)
def test_gather_scatter_grads(self):
    """gather_grads/scatter_grads round-trip: the mirror model receives
    identical gradients, so identical SGD updates keep both in sync.
    """
    cp = cuda.cupy
    reference = SimpleNet(dtype=self.dtype)
    mirror = copy.deepcopy(reference)
    reference.to_gpu()
    mirror.to_gpu()
    sgd_ref = chainer.optimizers.SGD(lr=1.0)
    sgd_ref.setup(reference)
    sgd_mir = chainer.optimizers.SGD(lr=1.0)
    sgd_mir.setup(mirror)
    batch = 8
    data = numpy.random.uniform(0, 1, (batch, 2, 5, 5)).astype(self.dtype)
    target = (numpy.arange(batch) % 2).astype(numpy.int32)
    data = chainer.Variable(chainer.backends.cuda.to_gpu(data))
    target = chainer.Variable(chainer.backends.cuda.to_gpu(target))
    loss = reference(data, target)
    reference.cleargrads()
    mirror.cleargrads()
    loss.backward()
    # Pack all gradients into one flat buffer and unpack into the mirror.
    flat = mpu.gather_grads(reference)
    mpu.scatter_grads(mirror, flat)

    def each_param_pair():
        # Yield (reference, mirror) parameter pairs in a fixed order.
        for path in ('conv.W', 'conv.b', 'fc.W', 'fc.b'):
            link_name, param_name = path.split('.')
            yield (getattr(getattr(reference, link_name), param_name),
                   getattr(getattr(mirror, link_name), param_name))

    for p_ref, p_mir in each_param_pair():
        cp.testing.assert_array_equal(p_ref.grad, p_mir.grad)
    # Equal grads + equal optimizers must yield equal parameters.
    sgd_ref.update()
    sgd_mir.update()
    for p_ref, p_mir in each_param_pair():
        cp.testing.assert_array_equal(p_ref.data, p_mir.data)
def update_core(self):
    """One GQN-style training step with NCCL gradient synchronization.

    Encodes a scene representation, estimates the ELBO for the query
    views, backpropagates ``-ELBO``, sum-reduces gradients over NCCL,
    applies the optimizer, reports metrics, and broadcasts the updated
    parameters from rank 0.
    """
    self.setup_workers()
    # Signal worker processes to run their update step in parallel.
    self._send_message(('update', None))
    with chainer.using_device(self._devices[0]):
        iterator = self.get_iterator('main')
        optimizer = self.get_optimizer('main')
        model = self.model
        batch = iterator.next()
        # TODO(review): how to split devices? (original author's open
        # question — the whole batch currently goes to devices[0])
        x = self.converter(batch, self._devices[0])
        # Assumes the converter yields a dict with 'image' and
        # 'viewpoint' keys — confirm against the converter.
        images = x['image']
        viewpoints = x['viewpoint']
        # One-time gradient reset on the very first iteration.
        if self.start:
            model.cleargrads()
            self.start = False
        xp = model.xp
        batch_size = len(batch)
        # For reducing memory: clear grads before the forward pass.
        model.cleargrads()
        # --- Scene encoder -------------------------------------------
        representation, query_images, query_viewpoints = encode_scene(
            images, viewpoints, model, self._devices[0])
        # --- Compute empirical ELBO ----------------------------------
        # Compute distribution parameters for the latent chain and the
        # predicted pixel means.
        (z_t_param_array,
         pixel_mean) = model.sample_z_and_x_params_from_posterior(
             query_images, query_viewpoints, representation)
        # kl_divergence is unpacked but not used below.
        (ELBO, bits_per_pixel, negative_log_likelihood,
         kl_divergence) = estimate_ELBO(xp, query_images, z_t_param_array,
                                        pixel_mean, self.pixel_log_sigma,
                                        batch_size)
        # --- Update parameters ---------------------------------------
        # Maximizing ELBO == minimizing its negation.
        loss = -ELBO
        loss.backward()
        # NCCL: reduce grads (in-place sum onto rank 0).
        null_stream = cuda.Stream.null
        if self.comm is not None:
            gg = gather_grads(model)
            nccl_data_type = _get_nccl_data_type(gg.dtype)
            self.comm.reduce(gg.data.ptr, gg.data.ptr, gg.size,
                             nccl_data_type, nccl.NCCL_SUM, 0,
                             null_stream.ptr)
            scatter_grads(model, gg)
            del gg  # release the flat gradient buffer promptly
        optimizer.update()
        # MSE is a monitoring metric only — no graph needed.
        with chainer.no_backprop_mode():
            mean_squared_error = cf.mean_squared_error(
                query_images, pixel_mean)
        reporter.report(
            {
                'loss': float(loss.data),
                'bits_per_pixel': float(bits_per_pixel.data),
                'NLL': float(negative_log_likelihood.data),
                'MSE': float(mean_squared_error.data)
            }, model)
        # Broadcast updated parameters from rank 0 to all workers.
        if self.comm is not None:
            gp = gather_params(model)
            nccl_data_type = _get_nccl_data_type(gp.dtype)
            self.comm.bcast(gp.data.ptr, gp.size, nccl_data_type, 0,
                            null_stream.ptr)