def backward(self, target_input_indexes, grad_outputs):
    retained_inputs = self.get_retained_inputs()
    inputs = [None] * len(self.inputs)
    in_data = [None] * len(self.inputs)
    for retained, i_in in six.moves.zip(
            retained_inputs, self._input_indexes_to_retain):
        inputs[i_in] = retained
        in_data[i_in] = None if retained is None else retained.array
    in_data = tuple(in_data)

    grad_out_data = tuple([
        None if grad is None else grad.array for grad in grad_outputs])

    is_chainerx_fallback_mode = self._is_chainerx_fallback_mode
    if is_chainerx_fallback_mode:
        # Convert input and output gradients to numpy/cupy
        in_data = backend.from_chx(in_data)
        grad_out_data = backend.from_chx(grad_out_data)

    # Call Function.backward
    with chainer.using_device(
            backend.get_device_from_array(*(in_data + grad_out_data))):
        if is_chainerx_fallback_mode:
            # Enable attribute fallback
            with function_node._chainerx_attribute_fallback(
                    self._function, self.chainerx_device):
                gxs = self._function.backward(in_data, grad_out_data)
        else:
            gxs = self._function.backward(in_data, grad_out_data)

    # Check gradients
    for x, gx in six.moves.zip(self.inputs, gxs):
        if gx is not None:
            variable._check_grad_type(self, x, True, gx)

    # Convert input gradients back to ChainerX
    if is_chainerx_fallback_mode:
        gxs = backend.to_chx(gxs)

    ret = []
    for i in target_input_indexes:
        if gxs[i] is None:
            g = None
        else:
            # Intentionally not passing requires_grad=False so that
            # backprop routines can raise an error when a further backprop
            # is attempted against this gradient variable.
            g = variable.Variable(gxs[i])
            if g.xp is not chainerx:
                g.node._old_style_grad_generator = self._function.label
        ret.append(g)
    return tuple(ret)
def backward(self, target_input_indexes, grad_outputs):
    retained_inputs = self.get_retained_inputs()
    inputs = [None] * len(self.inputs)
    in_data = [None] * len(self.inputs)
    for retained, i_in in six.moves.zip(
            retained_inputs, self._input_indexes_to_retain):
        inputs[i_in] = retained
        in_data[i_in] = None if retained is None else retained.array
    in_data = tuple(in_data)

    grad_out_data = tuple([None if grad is None else grad.data
                           for grad in grad_outputs])

    is_chainerx_fallback_mode = self._is_chainerx_fallback_mode
    if is_chainerx_fallback_mode:
        # Convert input and output gradients to numpy/cupy
        in_data = backend.from_chx(in_data)
        grad_out_data = backend.from_chx(grad_out_data)

    # Call Function.backward
    with cuda.get_device_from_array(*(in_data + grad_out_data)):
        if is_chainerx_fallback_mode:
            # Enable attribute fallback
            with function_node._chainerx_attribute_fallback(
                    self._function, self.chainerx_device):
                gxs = self._function.backward(in_data, grad_out_data)
        else:
            gxs = self._function.backward(in_data, grad_out_data)

    # Check gradients
    for x, gx in six.moves.zip(self.inputs, gxs):
        if gx is not None:
            variable._check_grad_type(self, x, True, gx)

    # Convert input gradients back to ChainerX
    if is_chainerx_fallback_mode:
        gxs = backend.to_chx(gxs)

    ret = []
    for i in target_input_indexes:
        if gxs[i] is None:
            g = None
        else:
            # Intentionally not passing requires_grad=False so that
            # backprop routines can raise an error when a further backprop
            # is attempted against this gradient variable.
            g = variable.Variable(gxs[i])
            if g.xp is not chainerx:
                g.node._old_style_grad_generator = self._function.label
        ret.append(g)
    return tuple(ret)
def backward(self, target_input_indexes, grad_outputs):
    in_data = tuple([input.data for input in self.inputs])
    grad_out_data = tuple(
        [None if grad is None else grad.data for grad in grad_outputs])

    with cuda.get_device_from_array(*(in_data + grad_out_data)):
        gxs = self._function.backward(in_data, grad_out_data)

    for x, gx in six.moves.zip(self.inputs, gxs):
        variable._check_grad_type(self, x, gx)

    ret = []
    for i in target_input_indexes:
        if gxs[i] is None:
            g = None
        else:
            # Intentionally not passing requires_grad=False so that
            # backprop routines can raise an error when a further backprop
            # is attempted against this gradient variable.
            g = variable.Variable(gxs[i])
            g.node._old_style_grad_generator = self._function.label
        ret.append(g)
    return tuple(ret)
def forward_grad(self, rho=1e-3, decay=0.50, loss_scale=None):
    """Runs forward-gradient computation from this variable.

    Traverses the computational graph from this variable's creator toward
    the inputs in topological (rank) order. For every function that sets
    ``with_frad``, its ``forward_grad`` method is called with perturbation
    magnitude ``rho`` to obtain parameter gradients (``gW``, ``gb``), which
    are scaled by a factor that is multiplied by ``decay`` after each such
    function and stored as the gradients of the corresponding input
    variables.
    """
    self._node._check_old_style_gradient()
    if self.creator_node is None:
        return

    initial_device = None
    if cuda.available and isinstance(self.data, cuda.ndarray):
        try:
            initial_device = cuda.Device()
        except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
            if e.status != 38:  # cudaErrorNoDevice
                raise

    is_debug = chainer.is_debug()

    cand_funcs = []
    seen_set = set()

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator_node)
    cur_decay = 1.0

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = [
            i for i, x in enumerate(inputs) if x.requires_grad
        ]
        if not target_input_indexes:
            continue

        in_data = tuple([x.data for x in inputs])
        cuda.get_device_from_array(*in_data).use()

        if hasattr(func, 'with_frad') and func.with_frad:
            gW, gb = func.forward_grad(in_data, rho)
            gxs = [None, Variable(gW * cur_decay), Variable(gb * cur_decay)]
            cur_decay *= decay
        else:
            gxs = [None] * len(inputs)

        if is_debug:
            for gx in gxs:
                if gx is None:
                    continue
                gx_data = gx.data
                if gx_data.dtype.kind == 'f':
                    cuda.get_device_from_array(gx_data).use()
                    if cuda.get_array_module(gx_data).isnan(gx_data).any():
                        raise RuntimeError(
                            'NaN is detected on forward-grad computation of '
                            '{}'.format(func.label))

        for i, gx in enumerate(gxs):
            x = inputs[i]
            if x.creator_node is not None:
                add_cand(x.creator_node)
            if gx is None:
                continue
            if not x.requires_grad:
                continue

            _check_grad_type(func, x, gx.data)

            x_var = x.get_variable_or_none()
            if x_var is not None:
                x_var._grad_var = gx
                x_var._loss_scale = loss_scale

        del gxs  # to reduce memory usage

    if initial_device is not None:
        initial_device.use()