def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Inputs
    x0 = inputs[0].data
    dy = inputs[1].data
    # Outputs
    dx0 = outputs[0].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_dy = inputs[1].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad

    # Compute
    val = self.forward_func.info.args["val"]
    if prop_down[0] or prop_down[1]:
        cv = F.constant(val, x0.shape)
        if not nn.get_auto_forward():
            cv.forward()
        log_v = F.log(cv.data)
    if prop_down[0]:
        if accum[0]:
            g_x0 += g_dx0 * dy * F.r_pow_scalar(x0, val) * log_v ** 2.0
        else:
            g_x0.copy_from(g_dx0 * dy * F.r_pow_scalar(x0, val) * log_v ** 2.0)
    if prop_down[1]:
        if accum[1]:
            g_dy += g_dx0 * F.r_pow_scalar(x0, val) * log_v
        else:
            g_dy.copy_from(g_dx0 * F.r_pow_scalar(x0, val) * log_v)
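# For context (not part of the source above): a minimal sketch of how a
# double-backward implementation like this one is typically reached, by
# differentiating the first-order gradient graph built with nn.grad.
# The shapes and the scalar value 2.0 are arbitrary illustrative choices.
import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable.from_numpy_array(np.random.randn(2, 3)).apply(need_grad=True)
y = F.r_pow_scalar(x, 2.0)            # y = 2.0 ** x
dx = nn.grad([F.sum(y)], [x])[0]      # first-order gradient built as a graph

loss = F.sum(dx)                      # differentiating dx again (second order)
loss.forward()
loss.backward()                       # exercises a backward_impl like the above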
def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Args
    axis = self.forward_func.info.args["axis"]

    # Inputs
    x0 = inputs[0].data  # logits
    t0 = inputs[1].data  # labels
    dz = inputs[2].data  # grad_input
    # Outputs
    dx0 = outputs[0].data
    dt0 = outputs[1].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_t0 = inputs[1].grad
    g_dz = inputs[2].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad
    g_dt0 = outputs[1].grad

    # Computation
    ## w.r.t. x0
    if prop_down[0]:
        # gradient is the backward of softmax with (g_dx0 * dz) as in-coming gradient
        si = nn.Variable(x0.shape).apply(data=x0, need_grad=True)
        si.grad.fill(0.0)
        so = F.softmax(si, axis)
        if not nn.get_auto_forward():
            so.forward()
        so.backward(g_dx0 * dz, clear_buffer=False)
        g_x0_ = si.grad
        if accum[0]:
            g_x0 += g_x0_
        else:
            g_x0.copy_from(g_x0_)

    ## w.r.t. t0 is not required

    ## w.r.t. dz
    if prop_down[2]:
        # Unstable implementation since it uses `/ dz`:
        ## g_dz_ = g_dx0 * dx0 / dz
        ## g_dz_ = F.sum(g_dz_, axis)

        shape = dz.shape if dz.shape != [] else [1]
        si = nn.Variable(x0.shape).apply(data=x0, need_grad=True)
        ti = nn.Variable(t0.shape).apply(data=t0)
        o = nn.Variable(shape)
        o.grad.fill(1.0)
        self.forward_func.backward([si, ti], [o], [False, False])
        # Sum g_dx0_i * (y_hat_i - y_i) over i
        g_dz_ = F.sum(g_dx0 * si.grad, axis)
        if accum[2]:
            g_dz += g_dz_
        else:
            g_dz.copy_from(g_dz_)
def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Args
    axis = self.forward_func.info.args["axis"]
    # To deal with double_backward index error for cuda in windows
    if axis < 0:
        axis += inputs[0].ndim

    # Inputs
    x0 = inputs[0].data
    y0 = inputs[1].data
    dy = inputs[2].data
    # Outputs
    dx0 = outputs[0].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_y0 = inputs[1].grad
    g_dy = inputs[2].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad

    # w.r.t. x0
    if prop_down[0]:
        # gradient is the backward of softmax with (g_dx0 * -sum_i dy_i) as in-coming gradient
        neg_sum_dy = -F.sum(dy, axis, True)
        si = nn.Variable(x0.shape).apply(data=x0, need_grad=True)
        si.grad.fill(0.0)
        so = F.softmax(si, axis)
        if not nn.get_auto_forward():
            so.forward()
        so.backward(g_dx0 * neg_sum_dy, clear_buffer=False)
        g_x0_ = si.grad
        if accum[0]:
            g_x0 += g_x0_
        else:
            g_x0.copy_from(g_x0_)

    # w.r.t. y0 is the grad-depends

    # w.r.t. dy
    if prop_down[2]:
        # gradient is the backward of log_softmax with g_dx0 as in-coming gradient
        lsi = nn.Variable(x0.shape).apply(data=x0, grad=g_dy, need_grad=True)
        lso = nn.Variable(x0.shape).apply(data=y0, grad=g_dx0)
        self.forward_func.backward([lsi], [lso], accum=[accum[2]])
def _create_function(self, f, callback, current_scope):
    callback.verbose2('Creating function {}: {} --> {}.'.format(
        f.name, [i.name for i in f.inputs], [i.name for i in f.outputs]))
    f = callback._apply_generate_function_by_type(f)
    f = callback._apply_generate_function_by_name(f)
    inputs = self._create_inputs(f.inputs, callback, current_scope)
    function_instance = _create_function(inputs, f.proto, self.batch_size)
    outputs = function_instance(*inputs, n_outputs=len(f.outputs),
                                auto_forward=nn.get_auto_forward())
    if not isinstance(outputs, tuple):
        outputs = (outputs, )
    for o, ovar in zip(f.outputs, outputs):
        o.variable = ovar
        ovar.name = o.name
def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Args
    with_bias = True if len(inputs) == 4 else False
    base_axis = self.forward_func.info.args["base_axis"]

    # Inputs
    x0 = inputs[0].data
    w0 = inputs[1].data
    b0 = inputs[2].data if with_bias else None
    dy = inputs[3].data if with_bias else inputs[2].data
    # Outputs
    dx0 = outputs[0].data
    dw0 = outputs[1].data
    db0 = outputs[2].data if with_bias else None
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_w0 = inputs[1].grad
    g_b0 = inputs[2].grad if with_bias else None
    g_dy = inputs[3].grad if with_bias else inputs[2].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad
    g_dw0 = outputs[1].grad
    g_db0 = outputs[2].grad if with_bias else None

    # Computation
    ## w.r.t. x or w.r.t. w
    if prop_down[0] or prop_down[1]:
        # we can re-use the backward of the forward with different inputs
        inp_x = nn.Variable(x0.shape).apply(
            data=g_dx0, grad=g_x0, need_grad=prop_down[0])
        inp_w = nn.Variable(w0.shape).apply(
            data=g_dw0, grad=g_w0, need_grad=prop_down[1])
        out_y = nn.Variable(dy.shape).apply(grad=dy)
        inputs = [inp_x, inp_w]
        outputs = [out_y]
        if with_bias:
            inp_b = nn.Variable(b0.shape).apply(need_grad=False)
            inputs += [inp_b]
        self.forward_func.backward(inputs, outputs, accum)

    ## w.r.t. b
    if with_bias and prop_down[2] and not accum[2]:
        zeros = F.constant(0, b0.shape)
        if not nn.get_auto_forward():
            zeros.forward()
        g_b0.copy_from(zeros.data)

    ## w.r.t. dy
    if (not with_bias and prop_down[2]) or (with_bias and prop_down[3]):
        accum_dy = accum[3] if with_bias else accum[2]
        g_dy_ = F.affine(g_dx0, w0, None, base_axis) + \
            F.affine(x0, g_dw0, None, base_axis)
        if with_bias:
            nshape = [1] * base_axis + list(b0.shape)
            g_db0 = F.reshape(g_db0, nshape)
            g_dy_ += g_db0
        if accum_dy:
            g_dy += g_dy_
        else:
            g_dy.copy_from(g_dy_)
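# Sanity note (not from the source): the g_dy_ expression above follows from
# differentiating the first-order backward of affine, y = x W + b, where
#   dx = dy W^T,   dW = x^T dy,   db = sum_n dy_n,
# with respect to dy. The gradient of <g_dx0, dx> + <g_dw0, dW> + <g_db0, db>
# w.r.t. dy is g_dx0 W + x g_dw0 + g_db0 (broadcast over the leading axes),
# which is exactly F.affine(g_dx0, w0) + F.affine(x0, g_dw0) plus the reshaped g_db0.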
def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Args
    with_bias = True if len(inputs) == 4 else False
    base_axis = self.forward_func.info.args["base_axis"]
    pad = self.forward_func.info.args["pad"]
    stride = self.forward_func.info.args["stride"]
    dilation = self.forward_func.info.args["dilation"]
    group = self.forward_func.info.args["group"]
    channel_last = self.forward_func.info.args["channel_last"]
    output_padding = self.forward_func.info.args["output_padding"]

    # Inputs
    x0 = inputs[0].data
    w0 = inputs[1].data
    b0 = inputs[2].data if with_bias else None
    dy = inputs[3].data if with_bias else inputs[2].data
    # Outputs
    dx0 = outputs[0].data
    dw0 = outputs[1].data
    db0 = outputs[2].data if with_bias else None
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_w0 = inputs[1].grad
    g_b0 = inputs[2].grad if with_bias else None
    g_dy = inputs[3].grad if with_bias else inputs[2].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad
    g_dw0 = outputs[1].grad
    g_db0 = outputs[2].grad if with_bias else None

    # Computation
    ## w.r.t. x or w.r.t. w
    if prop_down[0] or prop_down[1]:
        # we can re-use the backward of the forward with different inputs
        inp_x = nn.Variable(x0.shape).apply(
            data=g_dx0, grad=g_x0, need_grad=prop_down[0])
        inp_w = nn.Variable(w0.shape).apply(
            data=g_dw0, grad=g_w0, need_grad=prop_down[1])
        out_y = nn.Variable(dy.shape).apply(grad=dy)
        inputs = [inp_x, inp_w]
        outputs = [out_y]
        if with_bias:
            inp_b = nn.Variable(b0.shape).apply(need_grad=False)
            inputs += [inp_b]
        self.forward_func.backward(inputs, outputs, accum)

    ## w.r.t. b
    if with_bias and prop_down[2] and not accum[2]:
        zeros = F.constant(0, b0.shape)
        if not nn.get_auto_forward():
            zeros.forward()
        g_b0.copy_from(zeros.data)

    ## w.r.t. dy
    if (not with_bias and prop_down[2]) or (with_bias and prop_down[3]):
        accum_dy = accum[3] if with_bias else accum[2]
        params = {
            'base_axis': base_axis,
            'pad': pad,
            'stride': stride,
            'dilation': dilation,
            'output_padding': output_padding,
            'group': group,
            'channel_last': channel_last
        }
        g_dy_ = (F.deconvolution(g_dx0, w0, None, **params)
                 + F.deconvolution(x0, g_dw0, None, **params))
        if with_bias:
            if not channel_last:
                g_db0 = F.reshape(g_db0, [
                    1 if i != base_axis else g_db0.shape[0]
                    for i in range(g_dy.ndim)
                ])
            else:
                g_db0 = F.reshape(g_db0, [
                    1 if i != (g_dy.ndim - 1) else g_db0.shape[0]
                    for i in range(g_dy.ndim)
                ])
            g_dy_ += g_db0
        if accum_dy:
            g_dy += g_dy_
        else:
            g_dy.copy_from(g_dy_)
def add2(ctx, x0, x1, n_outputs=-1, outputs=None):
    return Add2()(x0, x1, n_outputs=n_outputs,
                  auto_forward=nn.get_auto_forward(), outputs=outputs)
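# Hypothetical call site (not from the source), assuming the public functional
# alias F.add2 and eager execution via the auto-forward context manager; the
# shapes and values below are illustrative only.
import numpy as np
import nnabla as nn
import nnabla.functions as F

a = nn.Variable.from_numpy_array(np.ones((2, 3)))
b = nn.Variable.from_numpy_array(np.full((2, 3), 2.0))

with nn.auto_forward():
    c = F.add2(a, b)   # executes immediately because auto_forward is enabled
print(c.d)             # expected: an array filled with 3.0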