def backward(self, retain_grad=False):
    """Runs error backpropagation (a.k.a. backprop) from this variable.

    On backprop, :meth:`Function.backward` is called on each
    :class:`Function` object appearing in the backward graph starting from
    this variable. The backward graph is represented by backward references
    from variables to their creators, and from functions to their inputs.
    The backprop stops at all root variables. Some functions set ``None`` as
    gradients of some inputs, in which case further backprop does not take
    place at those input variables.

    This method uses :data:`grad` as the initial error array. The user can
    manually set a gradient array before calling this method. If
    :data:`data` contains only one element (i.e., it is scalar) and
    :data:`grad` is ``None``, then this method automatically complements
    1.0 as the initial error. This is useful when starting backprop from
    some scalar loss value.

    Args:
        retain_grad (bool): If ``True``, the gradient arrays of all
            intermediate variables are kept. Otherwise, :data:`grad` of the
            intermediate variables is set to ``None`` at an appropriate
            time, which may reduce the maximum memory consumption.

            In most cases of training some model, the purpose of backprop
            is to compute gradients of parameters, not of variables, so it
            is recommended to set this flag to ``False``.

    """
    if self.creator is None:
        return

    cand_funcs = []
    seen_set = set()

    # Initialize the error by 1, if this is a loss variable
    if self.data.size == 1 and self.grad is None:
        with cuda.using_device(self.data) as user:
            if user.is_active:
                self.grad = cuda.ones_like(self.data)
            else:
                self.grad = numpy.ones_like(self.data)

    def add_cand(cand):
        if cand is not None and cand not in seen_set:
            # Negate the rank since heapq is a min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator)

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        outputs = tuple(y() for y in func.outputs)  # access via weak ref

        in_data = tuple(x.data for x in func.inputs)
        out_grad = tuple(y and y.grad for y in outputs)
        func._check_data_type_backward(in_data, out_grad)
        with cuda.using_device(*(in_data + out_grad)):
            gxs = func.backward(in_data, out_grad)
        assert len(gxs) == len(in_data)

        if not retain_grad:
            for y in outputs:
                if y is not None and y is not self:
                    y.grad = None
        for x, gx in zip(func.inputs, gxs):
            x.grad = gx
            if gx is not None:  # skip if gradient does not flow
                add_cand(x.creator)
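# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): how backprop
# is typically driven through ``backward``. It assumes that
# ``chainer.functions.Linear`` and ``chainer.functions.mean_squared_error``
# are available in this version of the library; treat the snippet as a
# sketch under those assumptions rather than a definitive example.
#
#     import numpy
#     import chainer.functions as F
#     from chainer import Variable
#
#     layer = F.Linear(4, 1)
#     x = Variable(numpy.random.randn(8, 4).astype(numpy.float32))
#     t = Variable(numpy.zeros((8, 1), dtype=numpy.float32))
#
#     loss = F.mean_squared_error(layer(x), t)  # one-element Variable
#     loss.backward()        # scalar output: grad is auto-initialized to ones
#     print(x.grad)          # gradient of the loss w.r.t. the leaf variable
#
#     # For a non-scalar output, the initial error must be set explicitly:
#     y = layer(x)
#     y.grad = numpy.ones_like(y.data)
#     y.backward(retain_grad=True)  # also keep grads of intermediate variables
# ---------------------------------------------------------------------------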
    forward_preprocess_timer.start()
    x_batch = numpy.random.uniform(
        -1, 1,
        (model.batchsize, model.in_channels, model.insize, model.insize)
    ).astype(numpy.float32)
    if args.gpu >= 0:
        x_batch = cuda.to_gpu(x_batch)
    forward_preprocess_times[
        iteration] = forward_preprocess_timer.milliseconds()

    forward_timer.start()
    y = model.forward(x_batch)
    forward_times[iteration] = forward_timer.milliseconds()

    backward_preprocess_timer.start()
    if args.gpu >= 0:
        y.grad = cuda.ones_like(y.data)
    else:
        y.grad = numpy.ones_like(y.data)
    backward_preprocess_times[
        iteration] = backward_preprocess_timer.milliseconds()

    backward_timer.start()
    y.backward()
    backward_times[iteration] = backward_timer.milliseconds()

    iter_times[iteration] = iter_timer.milliseconds()

total_timer.stop()

print('Forward Preprocess:')
print('average-forward-preprocess-pass\t{}\tms'.format(
    forward_preprocess_times.mean()))
print('Forward:')
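# ---------------------------------------------------------------------------
# The timer objects used above (forward_timer, backward_timer, iter_timer,
# total_timer, ...) are not defined in this excerpt. A minimal wall-clock
# stand-in with the same start()/stop()/milliseconds() interface might look
# like the class below; the real benchmark presumably uses a more precise
# (e.g. CUDA-event based) timer, so treat this as a hypothetical sketch.
# ---------------------------------------------------------------------------
import time


class WallClockTimer(object):
    """Hypothetical timer with start()/stop()/milliseconds() methods."""

    def __init__(self):
        self._start = None
        self._elapsed = 0.0

    def start(self):
        # (Re)start measuring from now.
        self._start = time.time()

    def stop(self):
        # Freeze the elapsed time measured since the last start().
        if self._start is not None:
            self._elapsed = time.time() - self._start
            self._start = None

    def milliseconds(self):
        # Elapsed time in milliseconds, either live (still running) or frozen.
        if self._start is not None:
            return (time.time() - self._start) * 1000.0
        return self._elapsed * 1000.0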