def test_grad_grad_resnet(seed, ctx, auto_forward, inplace, shared):
    nn.clear_parameters()
    # Settings
    nn.set_default_context(ctx)
    nn.set_auto_forward(auto_forward)
    b, c, h, w = 4, 3, 32, 32
    n_cls = 10
    rng = np.random.RandomState(seed)
    # Network
    x = nn.Variable.from_numpy_array(rng.randn(b, c, h, w)).apply(need_grad=True)
    y = SmallResNet(x, inplace=inplace, shared=shared)
    # Grad of grad
    dx = nn.grad([y], [x])
    ddx = nn.grad([dx[0]], [x])
    ddx[0].forward() if not auto_forward else None
    # Backward of grad
    x.grad.zero()
    dx[0].forward() if not auto_forward else None
    dx[0].backward()
    # Check between results of var.backward and nn.grad
    backend = ctx.backend[0].split(":")[0]
    if backend == 'cuda':
        pytest.skip('CUDA Convolution N-D is only supported in CUDNN extension')
    assert_allclose(x.g, ddx[0].d, atol=1e-6)
def test_double_backward_floating_variables():
    x = nn.Variable((2, 2), need_grad=True)
    y = nn.Variable((2, 3), need_grad=True)
    z = nn.Variable((2, 4), need_grad=True)
    w = F.concatenate(*[x, y, z], axis=-1)
    o = F.sin(w)
    dx = nn.grad([o], [x])[0]
    ddx = nn.grad([dx], [x])[0]  # Error must not happen
def rnn_backward(inputs, num_layers=1, nonlinearity='tanh',
                 dropout=None, bidirectional=False, training=True):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    if dropout != 0.0:
        raise ValueError("Dropout must be 0.0")

    dys = inputs[0]
    dhn = inputs[1]
    xs0 = inputs[2]
    h0 = inputs[3]
    w0 = inputs[4]
    if num_layers == 1:
        w = None
        b = inputs[5] if len(inputs) == 6 else None
    else:
        w = inputs[5]
        b = inputs[6] if len(inputs) == 7 else None

    num_directions = 2 if bidirectional else 1
    with_bias = True if b else False
    ys, hn = _create_fixed_length_rnn(xs0, h0, w0, w, b, num_layers,
                                      nonlinearity, num_directions, with_bias)

    outputs = [ys, hn]
    grad_outputs = [dys, dhn]
    if w and b:
        inputs = [xs0, h0, w0, w, b]
        dxs0, dh0, dw0, dw, db = nn.grad(outputs, inputs, grad_outputs=grad_outputs)
        return dxs0, dh0, dw0, dw, db
    if w and not b:
        inputs = [xs0, h0, w0, w]
        dxs0, dh0, dw0, dw = nn.grad(outputs, inputs, grad_outputs=grad_outputs)
        return dxs0, dh0, dw0, dw
    if not w and b:
        inputs = [xs0, h0, w0, b]
        dxs0, dh0, dw0, db = nn.grad(outputs, inputs, grad_outputs=grad_outputs)
        return dxs0, dh0, dw0, db
    if not w and not b:
        inputs = [xs0, h0, w0]
        dxs0, dh0, dw0 = nn.grad(outputs, inputs, grad_outputs=grad_outputs)
        return dxs0, dh0, dw0
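# A minimal sketch of the same pattern for a toy unary function: re-create the
# forward graph from the saved input and let nn.grad derive the input gradient
# from the incoming output gradient, following the backward-function convention
# used above (incoming output grads first, then the forward inputs). The names
# here are illustrative, not library API.
import nnabla as nn

def toy_square_backward(inputs):
    dy = inputs[0]   # incoming grad w.r.t. the output
    x0 = inputs[1]   # input of the forward function
    y = x0 ** 2.0    # re-create the forward computation
    dx0 = nn.grad([y], [x0], grad_outputs=[dy])[0]
    return dx0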
def jacobian(self, coordinates):
    new_coordinates = self.warp_coordinates(coordinates)
    new_coordinates_x = F.slice(new_coordinates, start=(0, 0, 0),
                                stop=new_coordinates.shape[:2] + (1,))
    grad_x = nn.grad([F.sum(new_coordinates_x)], [coordinates])
    new_coordinates_y = F.slice(new_coordinates, start=(0, 0, 1),
                                stop=new_coordinates.shape[:2] + (2,))
    grad_y = nn.grad([F.sum(new_coordinates_y)], [coordinates])
    gx = F.reshape(grad_x[0], grad_x[0].shape[:-1] + (1,) + grad_x[0].shape[-1:])
    gy = F.reshape(grad_y[0], grad_y[0].shape[:-1] + (1,) + grad_y[0].shape[-1:])
    jacobian = F.concatenate(gx, gy, axis=gy.ndim - 2)
    return jacobian
def test_bool_scatter_inplace(seed, ctx, func_name, gshape, mask_shape):
    from nbla_test_utils import inplace_function_test_helper

    rng = np.random.RandomState(seed)
    gdata0 = rng.randn(*gshape).astype(np.float32)
    mask = rng.randint(0, 2, size=mask_shape)
    sdata = gdata0[mask.astype(np.bool)]
    gdata1 = rng.randn(*gshape).astype(np.float32)

    v_sdata = nn.Variable.from_numpy_array(sdata).apply(need_grad=True)
    v_mask = nn.Variable.from_numpy_array(mask)
    v_gdata1 = nn.Variable.from_numpy_array(gdata1).apply(need_grad=True)
    with nn.auto_forward():
        v_gdata2 = F.bool_scatter(v_sdata, v_mask, v_gdata1)

    # inplace check
    np.testing.assert_allclose(v_gdata2.d, v_gdata1.d,
                               err_msg="F.bool_scatter(inplace) is not inplaced.")

    # ref check
    gdata2 = ref_bool_scatter_inplace(sdata, mask, gdata1)
    np.testing.assert_allclose(v_gdata2.d, gdata2,
                               err_msg="F.bool_scatter(inplace) fails.")

    # backward wrt inplaced variable (wrt sdata is checked in not-inplaced case)
    egrad = rng.randn(*gdata2.shape)
    mask = mask if mask.shape == gdata1.shape else \
        mask.reshape(mask.shape + (1, ) * (gdata1.ndim - mask.ndim))
    ref_grad = egrad * (1 - mask)
    v_gdata1.grad.fill(0)
    v_gdata2.backward(egrad)
    np.testing.assert_allclose(
        v_gdata1.g, ref_grad,
        err_msg="F.bool_scatter(inplace) backward wrt inplace data fails.")

    bgrad = rng.randn(*gdata1.shape)
    v_gdata1.g = bgrad
    v_gdata2.backward(egrad)
    np.testing.assert_allclose(
        v_gdata1.g - bgrad, ref_grad, atol=1e-6,
        err_msg="F.bool_scatter(inplace) backward (accum) wrt inplace data fails.")

    # nn.grad (wrt sdata is checked in not-inplaced case)
    with nn.auto_forward():
        d_gdata1 = nn.grad([v_gdata2], [v_gdata1], grad_outputs=[egrad])
    np.testing.assert_allclose(
        d_gdata1[0].d, ref_grad, atol=1e-6,
        err_msg="nn.grad (F.bool_scatter(inplace)) wrt inplace data fails.")
def _calc_gradient_penalty(real, fake, discriminator):
    alpha = F.rand(shape=(1, 1, 1, 1))
    interpolates = alpha * real + (1.0 - alpha) * fake
    interpolates.need_grad = True
    disc_interpolates = discriminator(x=interpolates)
    grads = nn.grad([disc_interpolates], [interpolates])
    norms = [F.sum(g ** 2.0, axis=1) ** 0.5 for g in grads]
    return sum([F.mean((norm - 1.0) ** 2.0) for norm in norms])
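# Usage sketch, not taken from the surrounding script: the penalty above is
# typically added to a WGAN critic loss with a weight. `real`, `fake`, and
# `discriminator` follow the same conventions as _calc_gradient_penalty;
# `lambda_gp` is a hypothetical coefficient.
import nnabla.functions as F

def wgan_critic_loss_with_gp(real, fake, discriminator, lambda_gp=10.0):
    loss_critic = F.mean(discriminator(x=fake)) - F.mean(discriminator(x=real))
    return loss_critic + lambda_gp * _calc_gradient_penalty(real, fake, discriminator)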
def test_nn_grad_propagate_down_check():
    register("IdentityForwardOnlyFunction", IdentityForwardOnlyFunction_backward)
    backward_func = registry["IdentityForwardOnlyFunction"]
    assert backward_func is not None

    x = nn.Variable.from_numpy_array(np.random.random((1, 1, 32, 32)))
    y = PF.convolution(x, 1, kernel=(3, 3), pad=(1, 1), with_bias=False)
    z = IdentityForwardOnlyFunction()(y)
    w = F.identity(z)

    # If IdentityForwardOnlyFunction_backward is called in nn.grad, an error will occur.
    v = nn.grad(w, [z])
    v[0].forward()
def test_shared_leaf_variable_basic_arithmetics(seed, ctx, auto_forward):
    def add(x, derivative=0):
        if derivative == 0:
            return x + x + x
        if derivative == 1:
            return 3 * np.ones_like(x)
        if derivative == 2:
            return np.zeros_like(x)

    def sub(x, derivative=0):
        if derivative == 0:
            return x - x - x
        if derivative == 1:
            return -1 * np.ones_like(x)
        if derivative == 2:
            return np.zeros_like(x)

    def mul(x, derivative=0):
        if derivative == 0:
            return x * x * x
        if derivative == 1:
            return 3 * x**2
        if derivative == 2:
            return 6 * x

    def div(x, derivative=0):
        if derivative == 0:
            return x / x / x
        if derivative == 1:
            return -x**-2
        if derivative == 2:
            return 2 * x**-3

    # Settings
    nn.set_default_context(ctx)
    nn.set_auto_forward(auto_forward)

    for math_type in [add, sub, mul, div]:
        xd = np.random.randn(2, 3) + 0.5
        x = nn.Variable.from_numpy_array(xd).apply(need_grad=True)
        x.grad.zero()
        y = math_type(x)
        # First-order gradient
        dy_dx = nn.grad([y], [x])
        if not auto_forward:
            dy_dx[0].forward()
        assert_allclose(dy_dx[0].d, math_type(xd, 1))
        # Second-order gradient
        dy_dx[0].backward()
        assert_allclose(x.g, math_type(xd, 2))
def test_compute_simple_hessian(ctx):
    nn.clear_parameters()

    # Network
    state = nn.Variable((1, 2))
    output = PF.affine(state, 1,
                       w_init=I.ConstantInitializer(value=1.),
                       b_init=I.ConstantInitializer(value=1.))
    loss = F.sum(output**2)
    # Input
    state_array = np.array([[1.0, 0.5]])
    state.d = state_array

    # Grad of network
    params = nn.get_parameters().values()
    for param in params:
        param.grad.zero()
    grads = nn.grad([loss], params)
    flat_grads = F.concatenate(*[F.reshape(grad, (-1,)) for grad in grads]) if len(grads) > 1 \
        else F.reshape(grads[0], (-1,))

    # Compute hessian
    hessian = np.zeros((flat_grads.shape[0], flat_grads.shape[0]), dtype=np.float32)
    for i in range(flat_grads.shape[0]):
        flat_grads_i = flat_grads[i]
        flat_grads_i.forward()
        for param in params:
            param.grad.zero()
        flat_grads_i.backward()
        num_index = 0
        for param in params:
            grad = param.g.flatten()  # grad of grad so this is hessian
            hessian[i, num_index:num_index + len(grad)] = grad
            num_index += len(grad)

    actual = hessian
    expected = np.array(
        [[2 * state_array[0, 0]**2,
          2 * state_array[0, 0] * state_array[0, 1],
          2 * state_array[0, 0]],
         [2 * state_array[0, 0] * state_array[0, 1],
          2 * state_array[0, 1]**2,
          2 * state_array[0, 1]],
         [2 * state_array[0, 0], 2 * state_array[0, 1], 2.]])
    assert_allclose(actual, expected)
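# A related sketch: when only Hessian-vector products are needed, the full
# Hessian above can be avoided by differentiating the inner product <v, dL/dw>
# once more with nn.grad (the same double-backward pattern used elsewhere in
# this section). `loss` is a scalar Variable, `params` the parameter list, and
# `vs` a matching list of direction Variables; all names are illustrative.
import nnabla as nn
import nnabla.functions as F

def hessian_vector_product(loss, params, vs):
    grads = nn.grad([loss], params)
    inner = sum(F.sum(v * g) for v, g in zip(vs, grads))
    return nn.grad([inner], params)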
def test_grad_outputs(seed, ctx, auto_forward, type_grad_outputs):
    from nbla_test_utils import ArrayDiffStats

    # Settings
    nn.set_default_context(ctx)
    nn.set_auto_forward(auto_forward)
    b, c, h, w = 4, 3, 32, 32
    n_cls = 10
    rng = np.random.RandomState(seed)

    x = nn.Variable.from_numpy_array(rng.randn(b, c, h, w)).apply(need_grad=True)
    y = F.sigmoid(x)

    # Grad outputs
    if type_grad_outputs == int:
        g = rng.randint(-10, 10)
    elif type_grad_outputs == float:
        g = rng.randn()
    elif type_grad_outputs == np.ndarray:
        g = rng.randn(*y.shape)
    elif type_grad_outputs == nn.NdArray:
        g = nn.NdArray.from_numpy_array(rng.randn(*y.shape))

    # Zerograd, Forward, Backward on the forward graph
    inputs = [x]
    [inp.grad.fill(0) for inp in inputs]
    if not auto_forward:
        y.forward()
    y.backward(g)

    # Grad
    inputs = [x]
    outputs = [y]
    grad_outputs = [g]
    grads = nn.grad(outputs, inputs, grad_outputs)
    if not auto_forward:
        F.sink(*grads, one_input_grad=1).forward()

    # Check between results of var.backward and nn.grad
    for inp, grad in zip(inputs, grads):
        assert np.allclose(inp.g, grad.d, atol=1e-6), str(ArrayDiffStats(inp.g, grad.d))
def test_multiple_objectives(seed, ctx, auto_forward):
    from nbla_test_utils import ArrayDiffStats

    # Settings
    nn.set_default_context(ctx)
    nn.set_auto_forward(auto_forward)
    b, c, h, w = 4, 3, 32, 32
    n_cls = 10
    rng = np.random.RandomState(seed)

    # Objective0
    x0 = nn.Variable.from_numpy_array(rng.randn(b, c, h, w)).apply(need_grad=True)
    y0 = F.sigmoid(x0)
    # Objective1
    x1 = nn.Variable.from_numpy_array(rng.randn(b, c, h, w)).apply(need_grad=True)
    y1 = F.tanh(x1)

    # Zerograd, Forward, Backward on the forward graph
    g0 = nn.NdArray.from_numpy_array(rng.randn(*x0.shape))
    g1 = nn.NdArray.from_numpy_array(rng.randn(*x1.shape))
    z = y0 * nn.Variable(g0.shape).apply(data=g0) + \
        y1 * nn.Variable(g1.shape).apply(data=g1)
    inputs = [x0, x1]
    [inp.grad.fill(0) for inp in inputs]
    if not auto_forward:
        z.forward()
    z.backward()

    # Grad
    inputs = [x0, x1]
    outputs = [y0, y1]
    grad_outputs = [g0, g1]
    grads = nn.grad(outputs, inputs, grad_outputs)
    if not auto_forward:
        F.sink(*grads, one_input_grad=1).forward()

    # Check between results of var.backward and nn.grad
    for inp, grad in zip(inputs, grads):
        assert np.allclose(inp.g, grad.d, atol=1e-6), str(ArrayDiffStats(inp.g, grad.d))
def gen_path_regularize(fake_img, latents, mean_path_length, decay=0.01, pl_weight=2.0):
    noise = F.randn(shape=fake_img.shape) / \
        np.sqrt(fake_img.shape[2] * fake_img.shape[3])
    gradient = nn.grad([F.sum(fake_img * noise)], [latents])[0]
    path_lengths = F.mean(F.sum(F.pow_scalar(gradient, 2), axis=1), axis=0)
    path_lengths = F.pow_scalar(path_lengths, 0.5)
    path_mean = mean_path_length + decay * \
        (F.mean(path_lengths) - mean_path_length)
    # Path penalty is the squared deviation from the running mean path length.
    path_penalty = F.mean(
        F.pow_scalar(path_lengths - F.reshape(path_mean, (1,), inplace=False), 2))
    return path_penalty * pl_weight, path_mean, path_lengths
def test_dropout_grad_dependency(p, seed, ctx, func_name):
    from nnabla._dropout_workaround import _get_dropout_mask
    # Test whether the memory clearance by grad_depends_on_inputs/outputs does
    # something bad during graph execution, such as clearing values which are
    # planned to be used. This test is performed by changing the inputs/outputs
    # of Dropout to intermediate variables in the same manner as
    # nbla_test_utils.py.
    atol_f = 1e-4

    with nn.context_scope(ctx):
        rng = np.random.RandomState(seed)
        init_x = rng.randn(2, 3, 4).astype(np.float32) * 2
        init_dy_for_grad = rng.randn(*init_x.shape).astype(init_x.dtype)
        init_dx = rng.randn(*init_x.shape).astype(init_x.dtype)
        init_for_dx2 = rng.randn(*init_x.shape).astype(init_x.dtype)

        # Graph construction
        x = nn.Variable.from_numpy_array(init_x).apply(need_grad=True)
        x_interm = F.identity(x)
        y_interm = F.dropout(x_interm, p, seed)
        y = F.identity(y_interm)
        dx_interm = nn.grad(y, x, grad_outputs=[init_dy_for_grad])[0]
        dx = F.identity(dx_interm)
        y_dx = y + dx  # replaceable with F.sink(y, dx, one_input_grad=False)

        # Execution
        x.g = init_dx  # Accumulation
        y_dx.forward(clear_no_need_grad=True)
        mask = _get_dropout_mask(x_interm).d  # Store mask before the clear
        y_dx.backward(init_for_dx2, clear_buffer=True)

        # Reference
        ref_dx = ref_dropout_double_backward(init_for_dx2, mask, p) + init_dx

        # Test
        assert_allclose(x.g, ref_dx, atol=atol_f,
                        err_msg="Wrong output values of double backward of "
                                "Dropout by nn.grad.")
def test_resnet_expansion(seed, ctx, auto_forward, flag_grad_outputs):
    from nbla_test_utils import ArrayDiffStats
    nn.clear_parameters()

    # Settings
    nn.set_default_context(ctx)
    nn.set_auto_forward(auto_forward)
    b, c, h, w = 4, 3, 32, 32
    n_cls = 10
    rng = np.random.RandomState(seed)

    # Network
    x = nn.Variable.from_numpy_array(rng.randn(b, c, h, w))
    y = nn.Variable.from_numpy_array(rng.randint(0, n_cls, b).reshape(b, 1))
    p = SmallResNet(x)
    loss = F.mean(F.softmax_cross_entropy(p, y))

    # Zerograd, Forward, Backward on the forward graph
    inputs = nn.get_parameters().values()
    [inp.grad.fill(0) for inp in inputs]
    grad = nn.NdArray.from_numpy_array(np.asarray(rng.randn())) if flag_grad_outputs else 1
    if not auto_forward:
        loss.forward()
    loss.backward(grad)

    # Grad
    grad_outputs = grad if flag_grad_outputs else None
    grads = nn.grad([loss], inputs, [grad_outputs])
    if not auto_forward:
        F.sink(*grads, one_input_grad=1).forward()

    # Check between results of var.backward and nn.grad
    backend = ctx.backend[0].split(":")[0]
    if backend == 'cuda':
        pytest.skip('CUDA Convolution N-D is only supported in CUDNN extension')
    for inp, grad in zip(inputs, grads):
        assert np.allclose(inp.g, grad.d, atol=1e-6), str(ArrayDiffStats(inp.g, grad.d))
def test_dropout_double_backward(p, seed, ctx, func_name):
    from nbla_test_utils import cap_ignore_region, backward_function_tester
    rng = np.random.RandomState(seed)
    inpd = cap_ignore_region(
        rng.randn(2, 3, 4).astype(np.float32) * 2,
        (-1e-3, 1e-3))  # Ensure there is no zero.
    inp = nn.Variable.from_numpy_array(inpd).apply(need_grad=True)

    # ONLY test the double backward
    with nn.context_scope(ctx):
        dout = F.dropout(inp, p, seed)
        out = F.sigmoid(dout)
        # Check gradient w.r.t. dy only since no backward w.r.t. x
        grads = nn.grad([out], [inp])
        grad = grads[0]
        grad.forward()
        grad.backward(1.0, clear_buffer=False)

        g_dy = grad.parent.inputs[1].g
        scale = 1. / (1. - p)
        mask = dout.d != 0
        assert np.allclose(g_dy, mask * scale)
def backward_function_tester(rng, func, inputs=None, func_args=[], func_kwargs={}, atol_f=1e-4, atol_b=1e-3, atol_accum=5e-2, dstep=1e-3, backward=None, backward_b=None, ctx=None, non_accum_check=False, skip_backward_check=False, insert_identity=[], auto_forward=False): """ Automatic testing of backward function and backward pass of `func` by comparing it. The backward pass of `func` is the reference; therefore, the backward pass of `func` must be tested first! Syntax of `ref_func`: inputs, parameters """ if ctx is None: ctx = nn.Context() if backward is None: backward = [True for _ in inputs] def create_variables(inputs, backward): vinputs = [] for i, b in zip(inputs, backward): if i is None: vinputs += [None] continue vinp = nn.Variable(i.shape, need_grad=b) vinp.grad.zero() # grads always not accumulation vinputs += [vinp] vinputs[-1].data.cast(i.dtype)[...] = i return vinputs vinputs = create_variables(inputs, backward) vinputs_for_clear_buffer = create_variables(inputs, backward) vinputs_for_nn_grad = create_variables(inputs, backward) vinputs_identity = [] vinputs_identity_for_clear_buffer = [] vinputs_identity_for_nn_grad = [] if not insert_identity: insert_identity = [True] * len(vinputs) for idx, i in enumerate( zip(vinputs, vinputs_for_clear_buffer, vinputs_for_nn_grad)): with nn.auto_forward(auto_forward): i0, i1, i2 = i if i0 is None: vinputs_identity += [None] vinputs_identity_for_clear_buffer += [None] vinputs_identity_for_nn_grad += [None] elif insert_identity[idx]: vinputs_identity += [F.identity(i0)] vinputs_identity_for_clear_buffer += [F.identity(i1)] vinputs_identity_for_nn_grad += [F.identity(i2)] else: vinputs_identity += [i0] vinputs_identity_for_clear_buffer += [i1] vinputs_identity_for_nn_grad += [i2] # Forward and backward of the forward function with no buffer clear with nn.context_scope(ctx), nn.auto_forward(auto_forward): outputs0 = func(*(vinputs_identity + func_args), **func_kwargs) outputs0 = force_list(outputs0) F.sink(*outputs0).forward(clear_no_need_grad=False) grad_voutputs = [] for output in outputs0: ograd = rng.randn(*output.shape) grad_voutputs.append( nn.Variable.from_numpy_array(ograd).apply(need_grad=True)) output.g = ograd F.sink(*outputs0, one_input_grad=False).backward() vinputs = list(filter(lambda x: x is not None, vinputs)) vinputs_identity = list(filter(lambda x: x is not None, vinputs_identity)) vinputs_for_clear_buffer = list( filter(lambda x: x is not None, vinputs_for_clear_buffer)) grad_inputs0 = [inp.g.copy() for inp in vinputs] # Forward and backward of the forward function with clear redundant buffer with nn.context_scope(ctx), nn.auto_forward(auto_forward): outputs_for_clear_buffer = func( *(vinputs_identity_for_clear_buffer + func_args), **func_kwargs) outputs_for_clear_buffer = force_list(outputs_for_clear_buffer) outputs_for_clear_buffer = list( map(lambda x: F.identity(x) if x is not None else None, outputs_for_clear_buffer)) F.sink(*outputs_for_clear_buffer).forward(clear_no_need_grad=True) for o, ref_o in zip(outputs_for_clear_buffer, outputs0): o.g = ref_o.g # Check backward F.sink(*outputs_for_clear_buffer, one_input_grad=False).backward(clear_buffer=True) grad_inputs_for_clear_buffer = [ inp.g.copy() for inp in vinputs_for_clear_buffer ] for grad_ref, grad_res in zip(grad_inputs0, grad_inputs_for_clear_buffer): if grad_ref is None or grad_res is None: continue assert_allclose( grad_ref, grad_res, atol=atol_f, err_msg= "backward(clear_buffer=True) and backward(clear_buffer=False) results differ." 
) # Forward of the backward function from nnabla.backward_functions import registry func_name = output.parent.info.type_name func_backward = registry[func_name] grad_vinputs = grad_voutputs + vinputs grad_vinputs_identity = grad_voutputs + vinputs_identity func_info_args = output.parent.info.args with nn.context_scope(ctx), nn.auto_forward(auto_forward): ograds0 = func_backward(grad_vinputs_identity, **func_info_args) ograds0 = force_list(ograds0) ograds0_ = list(filter(lambda o: o is not None, ograds0)) F.sink(*ograds0_).forward(clear_no_need_grad=True) outputs1 = [] for i, ograd in enumerate(ograds0): outputs1.append(ograd.d.copy()) if ograd is not None else \ outputs1.append(None) # Check num of returned elements assert_allclose( len(vinputs), len(outputs1), err_msg="Length of the outputs ({}) does not match " "the length of the inputs ({}) to the backward function".format( len(outputs1), len(vinputs))) # Check forward for i, elm in enumerate(zip(grad_inputs0, outputs1)): grad_ref, grad_res = elm if grad_ref is None or grad_res is None: continue assert_allclose( grad_ref, grad_res, atol=atol_f, err_msg= "Forward of the backward function ({}) fails at {}-th output.". format(func_backward.__name__, i)) # Check the same results between backward_function and nn.grad vinputs = [v for b, v in zip(backward, vinputs) if b] vinputs = list(filter(lambda x: x is not None, vinputs)) with nn.context_scope(ctx), nn.auto_forward(auto_forward): outputs0_for_nn_grad = func( *(vinputs_identity_for_nn_grad + func_args), **func_kwargs) outputs0_for_nn_grad = force_list(outputs0_for_nn_grad) vinputs_identity_for_nn_grad = [ v for b, v in zip(backward, vinputs_identity_for_nn_grad) if b ] vinputs_identity_for_nn_grad = list( filter(lambda x: x is not None, vinputs_identity_for_nn_grad)) ograds1 = nn.grad(outputs0_for_nn_grad, vinputs_identity_for_nn_grad, grad_outputs=[g.d.copy() for g in grad_voutputs]) F.sink(*ograds1).forward(clear_no_need_grad=True) ograds0 = list(filter(lambda o: o is not None, ograds0)) ograds1 = list(filter(lambda o: o is not None, ograds1)) for i in range(len(ograds0)): if ograds0[i].parent is None: continue assert_allclose(ograds0[i].d, ograds1[i].d, atol=atol_f, err_msg="nn.grad and backward_functon results differ.") # Check backward # needed since we sometimes do need_grad=False for optimization, e.g., mask. def set_inputs(inputs0, vinputs): begin = 0 for i in vinputs: end = begin + i.size i.d = inputs0[begin:end].reshape(i.shape) begin = end def obj_func(inputs0, voutput, vinputs): set_inputs(inputs0, vinputs) voutput.forward() y = voutput.d.copy() return y initial_grads = [] for grad_vinput in grad_vinputs: if grad_vinput is None: continue g = np.asarray(rng.randn(*grad_vinput.shape)) initial_grads.append(g) grad_inputs1 = np.concatenate( [v.d.flatten() for v in grad_vinputs if v is not None]) for i, ograd in enumerate(ograds0): # We can skip if the backward is the functions composite. # If the backward is of functions composite, # the numerical difference is really different from the analytical one for some functions. if skip_backward_check: continue if ograd is None or not backward[i]: continue for ig, v in zip(initial_grads, grad_vinputs): v.g = ig # This must be first since approx_fprime destroys the input values # analytical grad. 
rgrad = rng.randn() with nn.auto_forward(auto_forward): sum_ograd = F.sum(ograd) * rgrad sum_ograd.forward(clear_no_need_grad=True) sum_ograd.backward() analytical_grads = np.concatenate( [v.g.flatten() for v in grad_vinputs]) analytical_grads -= np.concatenate( [g.flatten() for g in initial_grads]) # numerical grad from scipy.optimize import approx_fprime numerical_grads = approx_fprime(grad_inputs1, obj_func, dstep, sum_ograd, grad_vinputs) # grad_vinputs: dy_1, ..., dy_n, x_1, ..., x_n # grad_voutputs: dy_1, ..., dy_n seps = [0] + np.cumsum([int(np.prod(v.shape)) for v in grad_vinputs]).tolist() ngrads = len(grad_voutputs) ninputs = len(grad_vinputs) backward_b = [True] * ninputs if backward_b is None else backward_b for k, sep in enumerate(zip(seps[:-1], seps[1:])): if k >= ngrads and not backward[k - ngrads] or not backward_b[k]: continue s0, s1 = sep analytical_grad = analytical_grads[s0:s1] numerical_grad = numerical_grads[s0:s1] assert_allclose( analytical_grad, numerical_grad, atol=atol_accum, err_msg= "Backward (accum) of the backward function ({}) wrt {}-th / {} input fails." .format(func_backward.__name__, k, ninputs)) # Some functions backward like AffineDataGrad and AffineFilterGrad does not check non-accum anywhere # so check those non-accum backward method here. if non_accum_check: # for any outputs, parents are the same function. parent = outputs0[0].parent inputs = parent.inputs # Accum initial_grads = np.concatenate( [inp.g.flatten() for inp, b in zip(inputs, backward) if b]) accum = [True] * len(inputs) parent.backward(inputs, outputs0, accum=accum) accum_grads = np.concatenate( [inp.g.flatten() for inp, b in zip(inputs, backward) if b]) non_accum_grads0 = accum_grads - initial_grads # Non-accum accum = [False] * len(inputs) parent.backward(inputs, outputs0, accum=accum) non_accum_grads1 = np.concatenate( [inp.g.flatten() for inp, b in zip(inputs, backward) if b]) # Check assert_allclose( non_accum_grads0, non_accum_grads1, atol=atol_b, err_msg="Backward (non-accum) of the backward function ({}) fails." .format(func_backward.__name__))
def main(args):
    from network import implicit_network

    # Setting
    # nn.set_auto_forward(True)
    ctx = get_extension_context('cudnn', device_id=args.device_id)
    nn.set_default_context(ctx)

    D = args.depth
    L = args.layers
    W = args.width
    H = args.height
    R = H * W
    z_orientation = 1

    # Camera parameters
    camera = Camera(image_width=W, image_height=H, z_orientation=z_orientation)
    camloc = np.array([0.75, 0.5, 1])
    camloc = (camloc / np.sum(camloc**2)**0.5) * 2
    to = np.array([0, 0, 0])
    Rt_inv = look_at(camloc, to, z_orientation=z_orientation)
    R_inv = Rt_inv[:3, :3]
    fov = 90
    K_inv = camera.compute_intrinsic_inv(fov)

    # Rays
    x, y = np.meshgrid(np.arange(W), np.arange(H), indexing="xy")
    xy = np.asarray([x.flatten(), y.flatten()])
    xy1 = np.concatenate([xy, np.ones(R)[np.newaxis, :]])
    raydir = R_inv.dot(K_inv.dot(xy1))
    raydir = raydir / np.sum(raydir**2, axis=0)**0.5
    raydir = raydir.transpose((1, 0))

    # Network
    camloc = nn.Variable.from_numpy_array(camloc[np.newaxis, ...])
    raydir = nn.Variable.from_numpy_array(raydir[np.newaxis, ...])
    sdf_net = partial(implicit_network, D=D, L=L,
                      initial_sphere_radius=args.initial_sphere_radius)
    sdf_net0 = sdf_net

    def sdf_net0(x):
        out = sdf_net(x)
        sdf = out[..., 0][..., np.newaxis]
        return sdf

    # Sphere trace
    t_near = args.t_near
    t_far = args.t_far
    sphere_trace_itr = args.sphere_trace_itr
    ray_march_points = args.ray_march_points
    n_chunks = args.n_chunks
    max_post_itr = args.max_post_itr
    post_method = args.post_method
    eps = args.eps
    st = time.time()
    x_hit, mask_hit, dists, _, _ = ray_trace(sdf_net0, camloc, raydir, test=True,
                                             t_near=t_near, t_far=t_far,
                                             sphere_trace_itr=sphere_trace_itr,
                                             ray_march_points=ray_march_points,
                                             n_chunks=n_chunks,
                                             max_post_itr=max_post_itr,
                                             post_method=post_method, eps=eps)
    x_hit.need_grad = False
    dists.need_grad = False
    mask_hit.need_grad = False
    x_curr = x_hit
    F.sink(*[x_curr, mask_hit]).forward(clear_buffer=False)

    # Lighting
    x_curr = x_curr.get_unlinked_variable(need_grad=True)
    sdf = sdf_net0(x_curr)
    normal = nn.grad([sdf], [x_curr])[0]
    normal = F.norm_normalization(normal, axes=normal.ndim - 1, eps=1e-24)
    dlight = DistantLight()
    cos = lambert(normal, dlight.direction.reshape([3, 1])).reshape((1, H, W))
    mask_hit = mask_hit.get_unlinked_variable(need_grad=False)
    mask_hit = F.reshape(mask_hit, (1, H, W))
    mask_hit = F.broadcast(mask_hit, (3, H, W))
    image = mask_hit * 255.0 * cos
    image.forward(clear_buffer=True)
    cv2.imwrite(
        f"sphere_{W}x{H}_sti{sphere_trace_itr:03d}_mpi{max_post_itr:03d}_{args.post_method}.png",
        image.d.transpose(1, 2, 0))
    print(f"Bidirectional sphere trace/ray march (W={W}, H={H}): {time.time() - st} [s]")
def backward_function_tester(rng, func, ref_func, inputs, func_args=[], func_kwargs={}, atol_f=1e-6, atol_b=1e-3, atol_accum=1e-3, dstep=1e-3, backward=None, ctx=None, func_name=None, ref_grad=None, disable_half_test=False, atol_half=1e-1): """Backward function tester In the forward test, it compares the results of nn.grad and `func`.backward. In the backward test, it compares the analytical gradients and numerical gradient with `grad_outputs`. """ # TODO: half from scipy.optimize import approx_fprime if ctx is None: ctx = nn.Context() if backward is None: backward = [True if i is not None else False for i in inputs] # TODO: Remove set_default_context after adding ctx to BackwardFunction. nn.set_default_context(ctx) # Create Variables def create_variables(inputs, backward): vinputs = [] for i, b in zip(inputs, backward): if i is None: vinputs += [None] continue vinputs += [nn.Variable(i.shape, need_grad=b)] vinputs[-1].data.cast(i.dtype)[...] = i return vinputs # Create grad_outputs def create_grad_outputs(outputs): grad_outputs = [] for o in outputs: if o.shape == (): go = nn.NdArray.from_numpy_array(np.array(randn(rng))) #go = nn.NdArray.from_numpy_array(np.array(1.0)) else: go = nn.NdArray.from_numpy_array(randn(rng, *o.shape)) #go = nn.NdArray.from_numpy_array(np.ones(o.shape)) grad_outputs.append(go) return grad_outputs # Fill grads def fill_grads(vinputs, grads): for vi, gd in zip(vinputs, grads): if vi is None: continue vi.g = gd # Fill grads def zero_grads(vinputs): for vi in vinputs: if vi is None: continue vi.grad.zero() return # Gradient penalty on grads def gradient_penalty2(grads): gp2 = 0.0 for g in grads: gp2 += F.sum(g**2.0) return gp2 # Product sum def prod_sum(inputs0, inputs1): out = 0.0 for inp0, inp1 in zip(inputs0, inputs1): out += inp0 * nn.Variable(inp1.shape).apply(data=inp1) return out # Set inputs for the numerical gradients def set_inputs(inputs0, vinputs): begin = 0 for i in vinputs: end = begin + i.size if i.need_grad == True: i.d = inputs0[begin:end].reshape(i.shape) begin = end # Gradient penalty on grads used for computing numerical gradients def obj_func(inputs0, gp2, vinputs): set_inputs(inputs0, vinputs) gp2.forward() return gp2.d.copy() # # Half test # if not disable_half_test: # finputs = create_variables(inputs, backward) # hinputs = create_variables(inputs, backward) # half_test(rng, func, finputs, hinputs, func_args, # func_kwargs, backward, ctx, func_name, atol=atol_half) # Create input variables vinputs = create_variables(inputs, backward) # --- Forward test --- # # Zero grads zero_grads(vinputs) # Forward/Backward on the forward graph voutputs = [ F.sigmoid(x) for x in force_list(func(*(vinputs + func_args), **func_kwargs)) ] agrad_outputs = create_grad_outputs(voutputs) o = prod_sum(voutputs, agrad_outputs) o.forward() o.backward() # clear_buffer=True) # Grads voutputs = voutputs vinputs = list(filter(lambda vi: vi is not None, vinputs)) agrad_outputs = agrad_outputs grads = nn.grad(voutputs, vinputs, agrad_outputs) grads = list(filter(lambda x: x is not None, grads)) o = F.sink(*grads) o.forward() # Check forward for vi, go in zip(vinputs, grads): if vi.need_grad is False: continue fgrads = vi.g bgrads = go.d assert_allclose(fgrads, bgrads, atol=atol_f) # TODO: 1. Pass function argument directly to backward functions. # TODO: 2. 
should be changed for the simplier form by simply testing BackwardFunction # --- Backward (accum = False) test --- # # Zero grads zero_grads(vinputs) # Compute analytical grads gp2 = gradient_penalty2(grads) gp2.forward() gp2.backward(clear_buffer=True) analytical_grads = np.concatenate( [vi.g.copy().flatten() for vi in vinputs]) analytical_grads0 = analytical_grads # Compute numerical grads inputs0 = np.concatenate( [inp.flatten() for inp in inputs if inp is not None]) numerical_grads = approx_fprime(inputs0, obj_func, dstep, gp2, vinputs) # Check backward assert_allclose(analytical_grads, numerical_grads, atol=atol_b) # --- Backward (accum = True) test --- # # Random grads rand_grads = [randn(rng, *vi.shape) for vi in vinputs] fill_grads(vinputs, rand_grads) # Compute analytical grads gp2.forward() gp2.backward(clear_buffer=True) analytical_grads = np.concatenate( [vi.g.copy().flatten() for vi in vinputs]) rand_grads = np.concatenate([ rg.flatten() if isinstance(rg, np.ndarray) else np.array(rg).reshape( (1, )) for rg in rand_grads ]) analytical_grads -= rand_grads # Check backward assert_allclose(analytical_grads, analytical_grads0, atol=atol_accum)
x_fake = generator(z, test=False)
print(x_fake)

# Prob for fake sample
print("# Prob for fake sample")
p_fake = discriminator(x_fake)
print(p_fake)

# Prob for real sample
p_real = discriminator(x_real)

# WGAN loss
print("# WGAN loss")
loss_gen = gan_loss(p_fake)
print(loss_gen)
loss_dis = gan_loss(p_fake, p_real)
print(loss_dis)

# Gradient penalty
print("# Gradient penalty")
x_rmix = eps * x_real + (1.0 - eps) * x_fake
p_rmix = discriminator(x_rmix)
grads = nn.grad([p_rmix], [x_rmix])
print(grads)
l2norms = [F.sum(g**2.0, [1, 2, 3])**0.5 for g in grads]
gp = sum([F.mean((l - 1.0)**2.0) for l in l2norms])
loss_dis += gp
gp.forward()
gp.backward()
def infl_icml(model_info_dict, file_dir_dict, use_all_params, need_evaluate, alpha): num_epochs = 2 # params lr = 0.005 seed = model_info_dict['seed'] net_func = model_info_dict['net_func'] batch_size = model_info_dict['batch_size'] test_batch_size = 1000 target_epoch = model_info_dict['num_epochs'] # files and dirs save_dir = file_dir_dict['save_dir'] infl_filename = file_dir_dict['infl_filename'] final_model_name = file_dir_dict['model_filename'] final_model_path = os.path.join(save_dir, 'epoch%02d' % (target_epoch - 1), 'weights', final_model_name) input_dir_name = os.path.dirname(file_dir_dict['train_csv']) # setup trainset, valset, image_shape, n_classes, ntr, nval = init_dataset( file_dir_dict['train_csv'], file_dir_dict['val_csv'], seed) n_channels, _h, _w = image_shape resize_size = get_image_size((_h, _w)) idx_train = get_indices(ntr, seed) idx_val = get_indices(nval, seed) nn.load_parameters(final_model_path) trained_params = nn.get_parameters(grad_only=False) test = True grad_model = functools.partial(setup_model, net_func=net_func, n_classes=n_classes, n_channels=n_channels, resize_size=resize_size, test=test, reduction='mean') solver = S.Momentum(lr=lr, momentum=0.9) solver.set_parameters(trained_params) # gradient u = compute_gradient(grad_model, solver, valset, test_batch_size, idx_val, resize_size) # Hinv * u with SGD seed_train = 0 v = dict() for key, param in nn.get_parameters(grad_only=False).items(): v[key] = nn.Variable(param.d.shape, need_grad=True) v[key].d = 0 v[key].g = 0 solver.set_parameters(v) loss_train = [] loss_fn = None for epoch in range(num_epochs): # training seed_train = 0 np.random.seed(epoch) idx = get_batch_indices(ntr, batch_size, seed=epoch) for j, i in enumerate(idx): seeds = list(range(seed_train, seed_train + i.size)) seed_train += i.size X, y = get_batch_data(trainset, idx_train, i, resize_size, test=False, seeds=seeds) _, loss_fn, input_image = adjust_batch_size( grad_model, len(X), loss_fn) input_image["image"].d = X input_image["label"].d = y loss_fn.forward() grad_params = nn.grad(loss_fn, [ param for param in nn.get_parameters(grad_only=False).values() ]) vg = 0 for vv, g in zip(v.values(), grad_params): vg += F.sum(vv * g) for parameters in trained_params.values(): parameters.grad.zero() vgrad_params = nn.grad(vg, [ param for param in nn.get_parameters(grad_only=False).values() ]) loss_i = 0 for vgp, vv, uu in zip(vgrad_params, v.values(), u.values()): loss_i += 0.5 * F.sum(vgp * vv + alpha * vv * vv) - F.sum( uu * vv) loss_i.forward() solver.zero_grad() loss_i.backward(clear_buffer=True) solver.update() loss_train.append(loss_i.d.copy()) # influence infl_dict = dict() infl = np.zeros(ntr) for i in tqdm(range(ntr), desc='calc influence (3/3 steps)'): csv_idx = idx_train[i] file_name = trainset.get_filepath_to_data(csv_idx) file_name = os.path.join(input_dir_name, file_name) file_name = os.path.normpath(file_name) X, y = get_data(trainset, idx_train[i], resize_size, True, seed=i) _, loss_fn, input_image = adjust_batch_size(grad_model, len(X), loss_fn) input_image["image"].d = X input_image["label"].d = y loss_fn.forward() for parameters in trained_params.values(): parameters.grad.zero() loss_fn.backward(clear_buffer=True) infl_i = 0 for j, param in enumerate(nn.get_parameters(grad_only=False).values()): infl_i += (param.g.copy() * list(v.values())[j].d.copy()).sum() infl[i] = -infl_i / ntr infl_dict[csv_idx] = [file_name, y, infl[i]] infl_list = [val + [key] for key, val in infl_dict.items()] infl_list = sorted(infl_list, key=lambda x: 
(x[-2])) # save header = ['x:image', 'y:label', 'influence', 'datasource_index'] data_type = 'object,int,float,int' if need_evaluate: save_infl_for_analysis(infl_list, use_all_params, save_dir, infl_filename, epoch, header, data_type) save_to_csv(filename=infl_filename, header=header, list_to_save=infl_list, data_type=data_type)
def infl_sgd(model_info_dict, file_dir_dict, use_all_params, need_evaluate): # params lr = model_info_dict['lr'] seed = model_info_dict['seed'] net_func = model_info_dict['net_func'] batch_size = model_info_dict['batch_size'] end_epoch = model_info_dict['end_epoch'] target_epoch = model_info_dict['num_epochs'] # files and dirs save_dir = file_dir_dict['save_dir'] info_filename = file_dir_dict['info_filename'] infl_filename = file_dir_dict['infl_filename'] final_model_name = file_dir_dict['model_filename'] final_model_path = os.path.join(save_dir, 'epoch%02d' % (target_epoch - 1), 'weights', final_model_name) input_dir_name = os.path.dirname(file_dir_dict['train_csv']) # setup trainset, valset, image_shape, n_classes, ntr, nval = init_dataset( file_dir_dict['train_csv'], file_dir_dict['val_csv'], seed) n_channels, _h, _w = image_shape resize_size = get_image_size((_h, _w)) idx_train = get_indices(ntr, seed) idx_val = get_indices(nval, seed) nn.load_parameters(final_model_path) trained_params = nn.get_parameters(grad_only=False) test = True grad_model = functools.partial(setup_model, net_func=net_func, n_classes=n_classes, n_channels=n_channels, resize_size=resize_size, test=test, reduction='sum') solver = S.Sgd(lr=lr) solver.set_parameters(trained_params) # gradient u = compute_gradient(grad_model, solver, valset, batch_size, idx_val, target_epoch, resize_size) test = False infl_model = functools.partial(setup_model, net_func=net_func, n_classes=n_classes, n_channels=n_channels, resize_size=resize_size, test=test) # influence infl_dict = {} info = np.load(os.path.join(save_dir, info_filename), allow_pickle=True) loss_fn = None for epoch in tqdm(range(target_epoch - 1, end_epoch - 1, -1), desc='calc influence (3/3 steps)'): for step_info in info[epoch][::-1]: idx, seeds, lr, step = step_info['idx'], step_info[ 'seeds'], step_info['lr'], step_info['step'] fn = select_modelfile_for_infl(use_all_params, final_model_path, save_dir, epoch, step) _, loss_fn, input_image = adjust_batch_size( infl_model, solver, 1, loss_fn) nn.load_parameters(fn) params = nn.get_parameters(grad_only=False) solver = S.Sgd(lr=lr) solver.set_parameters(params) X = [] y = [] for i, seed in zip(idx, seeds): i = int(i) image, label = get_data(trainset, idx_train[i], resize_size, test, seed=seed) X.append(image) y.append(label) input_image["image"].d = image input_image["label"].d = label loss_fn.forward() solver.zero_grad() loss_fn.backward(clear_buffer=True) csv_idx = idx_train[i] infl = infl_dict.get(csv_idx, [0.0])[-1] for j, (key, param) in enumerate( nn.get_parameters(grad_only=False).items()): infl += lr * (u[key].d * param.g).sum() / idx.size # store infl file_name = trainset.get_filepath_to_data(csv_idx) file_name = os.path.join(input_dir_name, file_name) file_name = os.path.normpath(file_name) infl_dict[csv_idx] = [file_name, label, infl] # update u _, loss_fn, input_image = adjust_batch_size( infl_model, solver, len(idx), loss_fn) input_image["image"].d = X input_image["label"].d = np.array(y).reshape(-1, 1) loss_fn.forward() params = nn.get_parameters(grad_only=False) grad_params = {} for key, p in zip(params.keys(), nn.grad([loss_fn], params.values())): grad_params[key] = p ug = 0 # compute H[t]u[t] for key, uu in u.items(): try: ug += F.sum(uu * grad_params[key]) except TypeError: # cannot calc grad with batch normalization runnning mean and var pass ug.forward() solver.zero_grad() ug.backward(clear_buffer=True) for j, (key, param) in enumerate( nn.get_parameters(grad_only=False).items()): u[key].d -= lr * 
param.g / idx.size # sort by influence score infl_list = [val + [key] for key, val in infl_dict.items()] infl_list = sorted(infl_list, key=lambda x: (x[-2])) # save header = ['x:image', 'y:label', 'influence', 'datasource_index'] data_type = 'object,int,float,int' if need_evaluate: save_infl_for_analysis(infl_list, use_all_params, save_dir, infl_filename, epoch, header, data_type) save_to_csv(filename=infl_filename, header=header, list_to_save=infl_list, data_type=data_type)
def sdf_feature_grad(implicit_network, x, conf):
    y = implicit_network(x, initial_sphere_radius=conf.initial_sphere_radius)
    sdf = y[..., 0:1]
    feature = y[..., 1:]
    grad = nn.grad([sdf], [x])[0]
    return sdf, feature, grad
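# Usage sketch: the returned SDF gradient is commonly normalized into a surface
# normal for shading, as done in the sphere-tracing example in this section.
# `x` must be created with need_grad=True so that nn.grad can reach it.
import nnabla.functions as F

def sdf_surface_normal(implicit_network, x, conf):
    _, _, grad = sdf_feature_grad(implicit_network, x, conf)
    return F.norm_normalization(grad, axes=grad.ndim - 1, eps=1e-24)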
def train(args):
    # Context
    ctx = get_extension_context(args.context, device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)

    # Args
    latent = args.latent
    maps = args.maps
    batch_size = args.batch_size
    image_size = args.image_size
    lambda_ = args.lambda_

    # Model
    # generator loss
    z = nn.Variable([batch_size, latent])
    x_fake = generator(z, maps=maps, up=args.up).apply(persistent=True)
    p_fake = discriminator(x_fake, maps=maps)
    loss_gen = gan_loss(p_fake).apply(persistent=True)
    # discriminator loss
    p_fake = discriminator(x_fake, maps=maps)
    x_real = nn.Variable([batch_size, 3, image_size, image_size])
    p_real = discriminator(x_real, maps=maps)
    loss_dis = gan_loss(p_fake, p_real).apply(persistent=True)
    # gradient penalty
    eps = F.rand(shape=[batch_size, 1, 1, 1])
    x_rmix = eps * x_real + (1.0 - eps) * x_fake
    p_rmix = discriminator(x_rmix, maps=maps)
    x_rmix.need_grad = True  # Enabling gradient computation for double backward
    grads = nn.grad([p_rmix], [x_rmix])
    l2norms = [F.sum(g**2.0, [1, 2, 3])**0.5 for g in grads]
    gp = sum([F.mean((l - 1.0)**2.0) for l in l2norms])
    loss_dis += lambda_ * gp
    # generator with fixed value for test
    z_test = nn.Variable.from_numpy_array(np.random.randn(batch_size, latent))
    x_test = generator(z_test, maps=maps, test=True, up=args.up).apply(persistent=True)

    # Solver
    solver_gen = S.Adam(args.lrg, args.beta1, args.beta2)
    solver_dis = S.Adam(args.lrd, args.beta1, args.beta2)
    with nn.parameter_scope("generator"):
        params_gen = nn.get_parameters()
        solver_gen.set_parameters(params_gen)
    with nn.parameter_scope("discriminator"):
        params_dis = nn.get_parameters()
        solver_dis.set_parameters(params_dis)

    # Monitor
    monitor = Monitor(args.monitor_path)
    monitor_loss_gen = MonitorSeries("Generator Loss", monitor, interval=10)
    monitor_loss_cri = MonitorSeries("Negative Critic Loss", monitor, interval=10)
    monitor_time = MonitorTimeElapsed("Training Time", monitor, interval=10)
    monitor_image_tile_train = MonitorImageTile("Image Tile Train", monitor,
                                                num_images=batch_size, interval=1,
                                                normalize_method=denormalize)
    monitor_image_tile_test = MonitorImageTile("Image Tile Test", monitor,
                                               num_images=batch_size, interval=1,
                                               normalize_method=denormalize)

    # Data Iterator
    di = data_iterator_cifar10(batch_size, True)

    # Train loop
    for i in range(args.max_iter):
        # Train discriminator
        x_fake.need_grad = False  # no need backward to generator
        for _ in range(args.n_critic):
            solver_dis.zero_grad()
            x_real.d = di.next()[0] / 127.5 - 1.0
            z.d = np.random.randn(batch_size, latent)
            loss_dis.forward(clear_no_need_grad=True)
            loss_dis.backward(clear_buffer=True)
            solver_dis.update()

        # Train generator
        x_fake.need_grad = True  # need backward to generator
        solver_gen.zero_grad()
        z.d = np.random.randn(batch_size, latent)
        loss_gen.forward(clear_no_need_grad=True)
        loss_gen.backward(clear_buffer=True)
        solver_gen.update()

        # Monitor
        monitor_loss_gen.add(i, loss_gen.d)
        monitor_loss_cri.add(i, -loss_dis.d)
        monitor_time.add(i)

        # Save
        if i % args.save_interval == 0:
            monitor_image_tile_train.add(i, x_fake)
            monitor_image_tile_test.add(i, x_test)
            nn.save_parameters(
                os.path.join(args.monitor_path, "params_{}.h5".format(i)))

    # Last
    x_test.forward(clear_buffer=True)
    nn.save_parameters(
        os.path.join(args.monitor_path, "params_{}.h5".format(i)))
    monitor_image_tile_train.add(i, x_fake)
    monitor_image_tile_test.add(i, x_test)
def test_dropout_double_backward(p, seed, ctx, func_name):
    from nnabla.backward_functions import registry
    from nnabla._dropout_workaround import _get_dropout_mask
    # dropout_backward depends on Dropout. The dependency must be kept by
    # the execution order:
    #   1. Dropout::forward (A mask of dropout is calculated.)
    #   2. The forward of dropout_backward (The mask is used.)
    #   3. The backward of dropout_backward (The mask is used.)
    #   4. Dropout::backward (The mask is used, and then cleared.)
    # This order must be kept when using nnabla.grad. In the current
    # implementation, GradEndFunction keeps this order.
    atol_f = 1e-4

    with nn.context_scope(ctx):
        rng = np.random.RandomState(seed)
        init_x = rng.randn(2, 3, 4).astype(np.float32) * 2
        init_dy = rng.randn(*init_x.shape).astype(init_x.dtype)
        init_dy_for_grad = rng.randn(*init_x.shape).astype(init_x.dtype)
        init_dx = rng.randn(*init_x.shape).astype(init_x.dtype)
        init_for_dx2 = rng.randn(*init_x.shape).astype(init_x.dtype)

        #
        # A. Test mask passing
        #
        # Skip p=0 because, in that case, dropout does not happen and the mask
        # does not change the results.
        if p != 0:
            with pytest.raises(RuntimeError):
                x = nn.Variable.from_numpy_array(init_x).apply(need_grad=True)
                dy = nn.Variable.from_numpy_array(init_dy).apply(need_grad=True)
                # y = F.dropout(x, p, seed)  # Dropout is required to compute mask.
                dx = registry['Dropout']([dy, x], p, seed)
        # Note: y.forward() is required for dx.forward(). However this test
        #       is skipped because the random results are randomly matched
        #       between dx.forward() with and without y.forward(). Therefore
        #       the test result is not reproducible.

        #
        # B. Unit test of dropout_backward
        #
        # Graph construction
        x = nn.Variable.from_numpy_array(init_x).apply(need_grad=True)
        dy = nn.Variable.from_numpy_array(init_dy).apply(need_grad=True)
        y = F.dropout(x, p, seed)  # Dropout is required to compute mask.
        dx = registry['Dropout']([dy, x], p, seed)

        # Execution
        y.forward()  # Dropout is required to compute mask.
        # (y != 0) cannot be used when x includes 0.
        mask = _get_dropout_mask(x).d
        dx.forward()
        # Note: dropout_backward is a composite function. dx.parent is just
        #       a composing function like MulScalar. Unit tests using
        #       dx.parent.forward and dx.parent.backward are meaningless.
        #       For the same reason, a test of accumulation is nonsense.

        # Reference
        ref_dx = ref_dropout_backward(init_dy, mask, p)

        # Test
        assert_allclose(dx.d, ref_dx, atol=atol_f,
                        err_msg="Wrong output values of dropout_backward.")

        #
        # C. Test the forward of dropout_backward by using nnabla.grad
        #
        # Graph construction
        x = nn.Variable.from_numpy_array(init_x).apply(need_grad=True)
        y = F.dropout(x, p, seed)
        dx = nn.grad(y, x, grad_outputs=[init_dy_for_grad])[0]
        # Note: In NNabla 1.22.0, if grad_outputs=X is used, nn.grad separates
        #       np.ndarray X into small arrays by self._force_list.
        #       For example, X = np.array([[5, 6], [7, 8]]) is separated
        #       into [np.array([5, 6]), np.array([7, 8])]. Then Mul2 inserted by
        #       nn.grad uses np.array([5, 6]) as dy, and broadcasts it to
        #       np.array([[5, 6], [5, 6]]). Finally, the forward execution
        #       finishes, but the result values are wrong.

        # Execution
        dx.forward(clear_buffer=True)

        # Reference
        mask = _get_dropout_mask(x).d
        ref_dx = ref_dropout_backward(init_dy_for_grad, mask, p)

        # Test
        assert_allclose(dx.d, ref_dx, atol=atol_f,
                        err_msg="Wrong output values of Dropout of nn.grad.")

        #
        # D. Test the backward of dropout_backward by using nnabla.grad
        #
        # The numerical grad by using scipy.approx_fprime cannot be performed
        # because Dropout has randomness and changes the results during
        # the repeated forward computation.

        # Graph construction
        x = nn.Variable.from_numpy_array(init_x).apply(need_grad=True)
        y = F.dropout(x, p, seed)
        dx = nn.grad(y, x, grad_outputs=[init_dy_for_grad])[0]
        y_dx = y + dx  # replaceable with F.sink(y, dx, one_input_grad=False)

        # Execution
        x.g = init_dx  # Accumulation
        y_dx.forward(clear_no_need_grad=True)
        mask = _get_dropout_mask(x).d  # Store mask before the clear
        y_dx.backward(init_for_dx2, clear_buffer=True)

        # Reference
        ref_dx = ref_dropout_double_backward(init_for_dx2, mask, p) + init_dx

        # Test
        assert_allclose(x.g, ref_dx, atol=atol_f,
                        err_msg="Wrong output values of double backward of "
                                "Dropout by nn.grad.")
def inner_train_test(inputa, inputb, labela, labelb, data_generator,
                     meta_training, args):
    lossesa, lossesb, accuraciesa, accuraciesb = [], [], [], []
    if meta_training:
        num_updates = args.num_updates
        update_lr = args.train_update_lr
    else:
        num_updates = args.test_num_updates
        update_lr = args.update_lr

    # Training
    for inp in data_generator.next():
        inputa.d, inputb.d, labela.d, labelb.d = inp

        # Initialize network
        with nn.parameter_scope('meta'):
            resulta = net(inputa, labela, True, args)
            resultb = net(inputb, labelb, True, args)
            fast_weights = nn.get_parameters()

        # For saving training accuracies
        resulta[0].persistent = True
        resulta[1].persistent = True
        task_lossa_var = [resulta[0], ]
        task_accuracya_var = [resulta[1], ]

        # Inner loop
        for j in range(num_updates):
            grad_list = nn.grad(resulta[0], fast_weights.values())
            for ind, key in enumerate(fast_weights.keys()):
                if grad_list[ind] is None:
                    continue
                if args.first_order or not meta_training:
                    grad_list[ind].need_grad = False
                fast_weights[key] = fast_weights[key] - \
                    update_lr * grad_list[ind]

            resulta = net(inputa, labela, True, args, fast_weights)
            resulta[0].persistent = True
            resulta[1].persistent = True
            task_lossa_var.append(resulta[0])
            task_accuracya_var.append(resulta[1])

        # Loss on queries is calculated only at the end of the inner loop
        # Following the original implementation,
        # we always use batch stats for batch normalization even in a test phase
        resultb = net(inputb, labelb, True, args, fast_weights)

        # Forward calculation
        result_all = F.sink(resulta[0], resulta[1], resultb[0], resultb[1])
        result_all.forward()

        if meta_training:
            # Backward calculation
            lossb = resultb[0] / data_generator.batch_size
            lossb.backward()  # gradients on weights are automatically accumulated

        task_lossa = []
        task_accuracya = []
        for j in range(num_updates + 1):
            task_accuracya_var[j].forward()
            task_lossa.append(task_lossa_var[j].d)
            task_accuracya.append(task_accuracya_var[j].d)

        lossesa.append(task_lossa)
        lossesb.append(resultb[0].d)
        accuraciesa.append(task_accuracya)
        accuraciesb.append(resultb[1].d)

    return lossesa, lossesb, accuraciesa, accuraciesb
def train(args): if args.c_dim != len(args.selected_attrs): print("c_dim must be the same as the num of selected attributes. Modified c_dim.") args.c_dim = len(args.selected_attrs) # Dump the config information. config = dict() print("Used config:") for k in args.__dir__(): if not k.startswith("_"): config[k] = getattr(args, k) print("'{}' : {}".format(k, getattr(args, k))) # Prepare Generator and Discriminator based on user config. generator = functools.partial( model.generator, conv_dim=args.g_conv_dim, c_dim=args.c_dim, num_downsample=args.num_downsample, num_upsample=args.num_upsample, repeat_num=args.g_repeat_num) discriminator = functools.partial(model.discriminator, image_size=args.image_size, conv_dim=args.d_conv_dim, c_dim=args.c_dim, repeat_num=args.d_repeat_num) x_real = nn.Variable( [args.batch_size, 3, args.image_size, args.image_size]) label_org = nn.Variable([args.batch_size, args.c_dim, 1, 1]) label_trg = nn.Variable([args.batch_size, args.c_dim, 1, 1]) with nn.parameter_scope("dis"): dis_real_img, dis_real_cls = discriminator(x_real) with nn.parameter_scope("gen"): x_fake = generator(x_real, label_trg) x_fake.persistent = True # to retain its value during computation. # get an unlinked_variable of x_fake x_fake_unlinked = x_fake.get_unlinked_variable() with nn.parameter_scope("dis"): dis_fake_img, dis_fake_cls = discriminator(x_fake_unlinked) # ---------------- Define Loss for Discriminator ----------------- d_loss_real = (-1) * loss.gan_loss(dis_real_img) d_loss_fake = loss.gan_loss(dis_fake_img) d_loss_cls = loss.classification_loss(dis_real_cls, label_org) d_loss_cls.persistent = True # Gradient Penalty. alpha = F.rand(shape=(args.batch_size, 1, 1, 1)) x_hat = F.mul2(alpha, x_real) + \ F.mul2(F.r_sub_scalar(alpha, 1), x_fake_unlinked) with nn.parameter_scope("dis"): dis_for_gp, _ = discriminator(x_hat) grads = nn.grad([dis_for_gp], [x_hat]) l2norm = F.sum(grads[0] ** 2.0, axis=(1, 2, 3)) ** 0.5 d_loss_gp = F.mean((l2norm - 1.0) ** 2.0) # total discriminator loss. d_loss = d_loss_real + d_loss_fake + args.lambda_cls * \ d_loss_cls + args.lambda_gp * d_loss_gp # ---------------- Define Loss for Generator ----------------- g_loss_fake = (-1) * loss.gan_loss(dis_fake_img) g_loss_cls = loss.classification_loss(dis_fake_cls, label_trg) g_loss_cls.persistent = True # Reconstruct Images. with nn.parameter_scope("gen"): x_recon = generator(x_fake_unlinked, label_org) x_recon.persistent = True g_loss_rec = loss.recon_loss(x_real, x_recon) g_loss_rec.persistent = True # total generator loss. g_loss = g_loss_fake + args.lambda_rec * \ g_loss_rec + args.lambda_cls * g_loss_cls # -------------------- Solver Setup --------------------- d_lr = args.d_lr # initial learning rate for Discriminator g_lr = args.g_lr # initial learning rate for Generator solver_dis = S.Adam(alpha=args.d_lr, beta1=args.beta1, beta2=args.beta2) solver_gen = S.Adam(alpha=args.g_lr, beta1=args.beta1, beta2=args.beta2) # register parameters to each solver. 
with nn.parameter_scope("dis"): solver_dis.set_parameters(nn.get_parameters()) with nn.parameter_scope("gen"): solver_gen.set_parameters(nn.get_parameters()) # -------------------- Create Monitors -------------------- monitor = Monitor(args.monitor_path) monitor_d_cls_loss = MonitorSeries( 'real_classification_loss', monitor, args.log_step) monitor_g_cls_loss = MonitorSeries( 'fake_classification_loss', monitor, args.log_step) monitor_loss_dis = MonitorSeries( 'discriminator_loss', monitor, args.log_step) monitor_recon_loss = MonitorSeries( 'reconstruction_loss', monitor, args.log_step) monitor_loss_gen = MonitorSeries('generator_loss', monitor, args.log_step) monitor_time = MonitorTimeElapsed("Training_time", monitor, args.log_step) # -------------------- Prepare / Split Dataset -------------------- using_attr = args.selected_attrs dataset, attr2idx, idx2attr = get_data_dict(args.attr_path, using_attr) random.seed(313) # use fixed seed. random.shuffle(dataset) # shuffle dataset. test_dataset = dataset[-2000:] # extract 2000 images for test if args.num_data: # Use training data partially. training_dataset = dataset[:min(args.num_data, len(dataset) - 2000)] else: training_dataset = dataset[:-2000] print("Use {} images for training.".format(len(training_dataset))) # create data iterators. load_func = functools.partial(stargan_load_func, dataset=training_dataset, image_dir=args.celeba_image_dir, image_size=args.image_size, crop_size=args.celeba_crop_size) data_iterator = data_iterator_simple(load_func, len( training_dataset), args.batch_size, with_file_cache=False, with_memory_cache=False) load_func_test = functools.partial(stargan_load_func, dataset=test_dataset, image_dir=args.celeba_image_dir, image_size=args.image_size, crop_size=args.celeba_crop_size) test_data_iterator = data_iterator_simple(load_func_test, len( test_dataset), args.batch_size, with_file_cache=False, with_memory_cache=False) # Keep fixed test images for intermediate translation visualization. test_real_ndarray, test_label_ndarray = test_data_iterator.next() test_label_ndarray = test_label_ndarray.reshape( test_label_ndarray.shape + (1, 1)) # -------------------- Training Loop -------------------- one_epoch = data_iterator.size // args.batch_size num_max_iter = args.max_epoch * one_epoch for i in range(num_max_iter): # Get real images and labels. real_ndarray, label_ndarray = data_iterator.next() label_ndarray = label_ndarray.reshape(label_ndarray.shape + (1, 1)) label_ndarray = label_ndarray.astype(float) x_real.d, label_org.d = real_ndarray, label_ndarray # Generate target domain labels randomly. rand_idx = np.random.permutation(label_org.shape[0]) label_trg.d = label_ndarray[rand_idx] # ---------------- Train Discriminator ----------------- # generate fake image. x_fake.forward(clear_no_need_grad=True) d_loss.forward(clear_no_need_grad=True) solver_dis.zero_grad() d_loss.backward(clear_buffer=True) solver_dis.update() monitor_loss_dis.add(i, d_loss.d.item()) monitor_d_cls_loss.add(i, d_loss_cls.d.item()) monitor_time.add(i) # -------------- Train Generator -------------- if (i + 1) % args.n_critic == 0: g_loss.forward(clear_no_need_grad=True) solver_dis.zero_grad() solver_gen.zero_grad() x_fake_unlinked.grad.zero() g_loss.backward(clear_buffer=True) x_fake.backward(grad=None) solver_gen.update() monitor_loss_gen.add(i, g_loss.d.item()) monitor_g_cls_loss.add(i, g_loss_cls.d.item()) monitor_recon_loss.add(i, g_loss_rec.d.item()) monitor_time.add(i) if (i + 1) % args.sample_step == 0: # save image. 
save_results(i, args, x_real, x_fake, label_org, label_trg, x_recon) if args.test_during_training: # translate images from test dataset. x_real.d, label_org.d = test_real_ndarray, test_label_ndarray label_trg.d = test_label_ndarray[rand_idx] x_fake.forward(clear_no_need_grad=True) save_results(i, args, x_real, x_fake, label_org, label_trg, None, is_training=False) # Learning rates get decayed if (i + 1) > int(0.5 * num_max_iter) and (i + 1) % args.lr_update_step == 0: g_lr = max(0, g_lr - (args.lr_update_step * args.g_lr / float(0.5 * num_max_iter))) d_lr = max(0, d_lr - (args.lr_update_step * args.d_lr / float(0.5 * num_max_iter))) solver_gen.set_learning_rate(g_lr) solver_dis.set_learning_rate(d_lr) print('learning rates decayed, g_lr: {}, d_lr: {}.'.format(g_lr, d_lr)) # Save parameters and training config. param_name = 'trained_params_{}.h5'.format( datetime.datetime.today().strftime("%m%d%H%M")) param_path = os.path.join(args.model_save_path, param_name) nn.save_parameters(param_path) config["pretrained_params"] = param_name with open(os.path.join(args.model_save_path, "training_conf_{}.json".format(datetime.datetime.today().strftime("%m%d%H%M"))), "w") as f: json.dump(config, f) # -------------------- Translation on test dataset -------------------- for i in range(args.num_test): real_ndarray, label_ndarray = test_data_iterator.next() label_ndarray = label_ndarray.reshape(label_ndarray.shape + (1, 1)) label_ndarray = label_ndarray.astype(float) x_real.d, label_org.d = real_ndarray, label_ndarray rand_idx = np.random.permutation(label_org.shape[0]) label_trg.d = label_ndarray[rand_idx] x_fake.forward(clear_no_need_grad=True) save_results(i, args, x_real, x_fake, label_org, label_trg, None, is_training=False)
def disc_r1_loss(real_disc_out, real_img):
    gradient = nn.grad([F.sum(real_disc_out)], [real_img])[0]
    gradient_penalty = F.pow_scalar(gradient, 2)
    gradient_penalty = F.reshape(gradient_penalty, (gradient.shape[0], -1))
    return F.mean(F.sum(gradient_penalty, axis=1))
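# Runnable sketch of the wiring this loss assumes; the toy critic and the
# weight of 10.0 are hypothetical, not taken from the original training script.
# The real image must have need_grad=True so that nn.grad inside disc_r1_loss
# can differentiate the critic output with respect to it.
import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF

def toy_critic(x):
    h = PF.convolution(x, 8, kernel=(3, 3), pad=(1, 1), name="conv")
    return F.mean(h, axis=(1, 2, 3), keepdims=True)

real_img = nn.Variable.from_numpy_array(
    np.random.randn(4, 3, 32, 32).astype(np.float32)).apply(need_grad=True)
real_disc_out = toy_critic(real_img)
d_loss = -F.mean(real_disc_out) + 10.0 * disc_r1_loss(real_disc_out, real_img)
d_loss.forward()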