def test_graph_logreg(seed):
    rng = np.random.RandomState(seed)
    x = nn.Variable([2, 3, 4], need_grad=True)
    w = nn.Variable([12, 5], need_grad=True)
    b = nn.Variable([5], need_grad=True)
    t = nn.Variable([2, 1])
    x.d = rng.randn(*x.shape)
    w.d = rng.randn(*w.shape)
    b.d = rng.randn(*b.shape)
    t.d = rng.randint(0, 5, size=t.shape)
    nn.set_default_context(nn.Context())

    # Forwardprop by definition
    with nn.auto_forward():
        z = F.affine(x, w, b, 1)
        l = F.softmax_cross_entropy(z, t, 1)
        L = F.mean(l)

    # Backprop
    # Diff should be initialized since they are always accumulated
    x.g = 0
    w.g = 0
    b.g = 0
    L.backward(clear_buffer=True)
    x.g = rng.randn(*x.shape)

    inputs = [x, w, b]
    from nbla_test_utils import \
        compute_analytical_and_numerical_grad_graph as grads
    agrad, ngrad = grads(L, inputs, 1e-3)
    assert np.allclose(ngrad, agrad, atol=1e-2)
def attention(k, q, v, div_dim=True, softmax=True):
    v_shape = v.shape
    k = F.identity(k)
    q = F.identity(q)
    k = F.reshape(k, (k.shape[0], np.prod(k.shape[1:])))
    q = F.reshape(q, (q.shape[0], np.prod(q.shape[1:])))
    v = q  # F.reshape is inplace
    cf = F.affine(q, F.transpose(k, (1, 0)))
    if div_dim:
        dim = np.prod(v_shape[1:])
        cf /= np.sqrt(dim)
    h = cf
    if softmax:
        h = F.softmax(h)
    h = F.affine(h, v)
    h = F.reshape(h, v_shape)
    return h
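# A minimal usage sketch of `attention` above (an added example, not part of
# the original source). The leading dimension of k, q, v is treated as the
# batch axis and everything after it is flattened before the dot product; the
# shapes below are illustrative only.
def _example_attention_usage():
    import numpy as np
    import nnabla as nn

    rng = np.random.RandomState(0)
    with nn.auto_forward():
        q = nn.Variable.from_numpy_array(rng.randn(4, 16).astype(np.float32))
        k = nn.Variable.from_numpy_array(rng.randn(4, 16).astype(np.float32))
        v = nn.Variable.from_numpy_array(rng.randn(4, 16).astype(np.float32))
        h = attention(k, q, v)
    # Output keeps the shape of v.
    assert h.shape == (4, 16)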
def connect(self, fname, inputs, args):
    """Reconnect ``fname`` with ``inputs`` in channel-last (NHWC) layout,
    transposing or reshaping its parameters where necessary."""
    if fname in ['Convolution', 'Deconvolution']:
        # TODO: address leading batch dimension
        args['channel_last'] = True
        x = inputs[0]
        w = inputs[1]
        b = inputs[2] if len(inputs) == 3 else None
        scope = self.get_parameter_scope(w)
        with nn.parameter_scope(scope):
            wd = w.d.copy().transpose(0, 2, 3, 1)
            w = nn.parameter.get_parameter_or_create('W_cl', wd.shape, wd)
        o = F.convolution(x, w, b, **args)
    elif fname == 'BatchNormalization':
        # TODO: address leading batch dimension
        x = inputs[0]
        beta = inputs[1]
        gamma = inputs[2]
        mean = inputs[3]
        var = inputs[4]
        args['axes'] = [len(x.shape) - 1]
        scope = self.get_parameter_scope(beta)
        with nn.parameter_scope(scope):
            beta_d = beta.d.copy().transpose(0, 2, 3, 1)
            gamma_d = gamma.d.copy().transpose(0, 2, 3, 1)
            mean_d = mean.d.copy().transpose(0, 2, 3, 1)
            var_d = var.d.copy().transpose(0, 2, 3, 1)
            beta = nn.parameter.get_parameter_or_create(
                'beta_cl', beta_d.shape, beta_d, beta.need_grad)
            gamma = nn.parameter.get_parameter_or_create(
                'gamma_cl', gamma_d.shape, gamma_d, gamma.need_grad)
            mean = nn.parameter.get_parameter_or_create(
                'mean_cl', mean_d.shape, mean_d, mean.need_grad)
            var = nn.parameter.get_parameter_or_create(
                'var_cl', var_d.shape, var_d, var.need_grad)
        o = F.batch_normalization(x, beta, gamma, mean, var, **args)
    elif fname in ['MaxPooling', 'AveragePooling', 'SumPooling']:
        args['channel_last'] = True
        o = self._call_function(fname, inputs, args)
    elif fname in ['Concatenate']:
        args['axis'] = len(inputs[0].shape) - 1
        o = self._call_function(fname, inputs, args)
    elif fname == 'Affine':
        x = inputs[0]
        _, h_s, w_s, c_s = inputs[0].shape
        _, b_s = inputs[1].shape
        wd = inputs[1].d.copy()
        wd = np.reshape(wd, (c_s, h_s, w_s, b_s))
        wd = np.transpose(wd, (1, 2, 0, 3))
        wd = np.reshape(wd, (-1, b_s))
        w = nn.parameter.get_parameter_or_create('w_cl', wd.shape, wd, False)
        b = inputs[2] if len(inputs) == 3 else None
        o = F.affine(x, w, b, **args)
    else:
        o = self._call_function(fname, inputs, args)
    return o
def spectral_normalization_for_affine(w, itr=1, eps=1e-12, input_axis=1,
                                      test=False):
    """Spectrally normalize an affine weight ``w`` by estimating its largest
    singular value with the power method."""
    W_sn = get_parameter_or_create(
        "W_sn", w.shape, ConstantInitializer(0), False)
    if test:
        return W_sn

    d0 = np.prod(w.shape[0:-1])  # In
    d1 = np.prod(w.shape[-1])    # Out
    u0 = get_parameter_or_create(
        "singular-vector", [d1], NormalInitializer(), False)
    u = F.reshape(u0, [d1, 1])
    # Power method
    for _ in range(itr):
        # v
        v = F.affine(w, u)
        v = F.div2(
            v, F.pow_scalar(
                F.sum(F.pow_scalar(v, 2.), keepdims=True) + eps, 0.5))
        v = F.reshape(v, [1, d0])
        # u
        u = F.affine(v, w)
        u = F.div2(
            u, F.pow_scalar(
                F.sum(F.pow_scalar(u, 2.), keepdims=True) + eps, 0.5))
        u = F.reshape(u, [d1, 1])
    # Iterate
    u = F.identity(u, outputs=[u0.data])
    u.persistent = True
    # No grad
    u.need_grad = False
    v.need_grad = False
    # Spectral normalization
    wv = F.affine(v, w)
    sigma = F.affine(wv, u)
    sigma = F.broadcast(
        F.reshape(sigma, [1 for _ in range(len(w.shape))]), w.shape)
    w_sn = F.div2(w, sigma, outputs=[W_sn.data])
    w_sn.persistent = True
    return w_sn
def spectral_normalization_for_conv(w, itr=1, eps=1e-12, test=False):
    """Spectrally normalize a convolution weight ``w`` by reshaping it to a
    2-D matrix and estimating its largest singular value with the power
    method."""
    w_shape = w.shape
    W_sn = get_parameter_or_create(
        "W_sn", w_shape, ConstantInitializer(0), False)
    if test:
        return W_sn

    d0 = w.shape[0]            # Out
    d1 = np.prod(w.shape[1:])  # In
    w = F.reshape(w, [d0, d1], inplace=False)
    u0 = get_parameter_or_create(
        "singular-vector", [d0], NormalInitializer(), False)
    u = F.reshape(u0, [1, d0])
    # Power method
    for _ in range(itr):
        # v
        v = F.affine(u, w)
        v = F.div2(
            v, F.pow_scalar(
                F.sum(F.pow_scalar(v, 2.), keepdims=True) + eps, 0.5))
        v = F.reshape(v, [d1, 1])
        # u
        u = F.affine(w, v)
        u = F.div2(
            u, F.pow_scalar(
                F.sum(F.pow_scalar(u, 2.), keepdims=True) + eps, 0.5))
        u = F.reshape(u, [1, d0])
    # Iterate
    u = F.identity(u, outputs=[u0.data])
    u.persistent = True
    # No grad
    u.need_grad = False
    v.need_grad = False
    # Spectral normalization
    wv = F.affine(w, v)
    sigma = F.affine(u, wv)
    w_sn = F.div2(w, sigma)
    w_sn = F.reshape(w_sn, w_shape)
    w_sn = F.identity(w_sn, outputs=[W_sn.data])
    w_sn.persistent = True
    return w_sn
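# A minimal usage sketch of `spectral_normalization_for_conv` (an added
# example, not part of the original source). The scope name "conv1" and the
# weight shape are illustrative only; the function itself relies on the
# module-level imports (F, np, get_parameter_or_create, initializers).
def _example_spectral_norm_conv_usage():
    import nnabla as nn
    import nnabla.functions as F
    from nnabla.parameter import get_parameter_or_create
    from nnabla.initializer import NormalInitializer

    x = nn.Variable([4, 3, 32, 32])
    with nn.parameter_scope("conv1"):
        # Raw convolution weight: 64 output maps, 3x3 kernel on 3 channels.
        w = get_parameter_or_create(
            "W", (64, 3, 3, 3), NormalInitializer(), True)
        # Normalized weight; "W_sn" and "singular-vector" are created
        # inside the same parameter scope.
        w_sn = spectral_normalization_for_conv(w, itr=1)
        y = F.convolution(x, w_sn, pad=(1, 1))
    assert y.shape == (4, 64, 32, 32)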
def mapping_network(z, outmaps=512, num_layers=8, net_scope='G_mapping/Dense'):
    """Mapping network: a stack of ``num_layers`` affine layers with leaky
    ReLU, using equalized learning-rate scaling (``lrmul``/``runtime_coef``)."""
    lrmul = 0.01
    runtime_coef = 0.00044194172
    out = z
    for i in range(num_layers):
        with nn.parameter_scope(f'{net_scope}{i}'):
            W, bias = weight_init_fn(
                shape=(out.shape[1], outmaps), lrmul=lrmul)
            out = F.affine(out, W * runtime_coef, bias * lrmul)
            out = F.mul_scalar(
                F.leaky_relu(out, alpha=0.2, inplace=False),
                np.sqrt(2), inplace=False)
    return out
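# A minimal usage sketch of `mapping_network` (an added example, not part of
# the original source). It assumes the surrounding module defines
# `weight_init_fn` as used above, and that the latent input is a
# [batch, 512] Variable; both are assumptions, not confirmed by this excerpt.
def _example_mapping_network_usage():
    import numpy as np
    import nnabla as nn

    z = nn.Variable.from_numpy_array(
        np.random.randn(4, 512).astype(np.float32))
    w = mapping_network(z, outmaps=512, num_layers=8)
    # Each layer maps 512 -> 512, so the output keeps the input shape.
    assert w.shape == (4, 512)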
def affine(inp, n_outmaps,
           base_axis=1,
           w_init=None, b_init=None,
           fix_parameters=False, rng=None, with_bias=True):
    """
    The affine layer, also known as the fully connected layer. Computes

    .. math::
        {\\mathbf y} = {\\mathbf A} {\\mathbf x} + {\\mathbf b}.

    where :math:`{\\mathbf x}, {\\mathbf y}` are the inputs and outputs
    respectively, and :math:`{\\mathbf A}, {\\mathbf b}` are constants.

    Args:
        inp (~nnabla.Variable): Input N-D array with shape
            (:math:`M_0 \\times \\ldots \\times M_{B-1} \\times D_B \\times \\ldots \\times D_N`).
            Dimensions before and after base_axis are flattened as if it is a matrix.
        n_outmaps (:obj:`int` or :obj:`tuple` of :obj:`int`): Number of output neurons per data.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        w_init (~nnabla.initializer.BaseInitializer): Initializer for weight.
        b_init (~nnabla.initializer.BaseInitializer): Initializer for bias.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`: :math:`(B + 1)`-D array.
        (:math:`M_0 \\times \\ldots \\times M_{B-1} \\times L`)
    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))
    if w_init is None:
        inmaps = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, not fix_parameters)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, not fix_parameters)
    return F.affine(inp, w, b, base_axis)
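# A minimal usage sketch of `affine` above (an added example, not part of the
# original source). With base_axis=1, the trailing dimensions (3*32*32 = 3072)
# are flattened and mapped to 100 output units; the scope name "fc1" is
# illustrative only.
def _example_affine_usage():
    import nnabla as nn

    x = nn.Variable([8, 3, 32, 32])
    with nn.parameter_scope("fc1"):
        y = affine(x, 100)
    assert y.shape == (8, 100)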
def __call__(self, x, return_encoding_indices=False):
    """Vector-quantize ``x`` against the embedding table and return the VQ
    loss, the straight-through quantized output, perplexity, and one-hot
    encodings (or just the indices and quantized output if
    ``return_encoding_indices`` is True)."""
    x = F.transpose(x, (0, 2, 3, 1))
    x_flat = x.reshape((-1, self.embedding_dim))

    # Squared Euclidean distances between inputs and embeddings.
    x_flat_squared = F.broadcast(
        F.sum(x_flat**2, axis=1, keepdims=True),
        (x_flat.shape[0], self.num_embedding))
    emb_wt_squared = F.transpose(
        F.sum(self.embedding_weight**2, axis=1, keepdims=True), (1, 0))
    distances = x_flat_squared + emb_wt_squared - 2 * \
        F.affine(x_flat, F.transpose(self.embedding_weight, (1, 0)))

    encoding_indices = F.min(
        distances, only_index=True, axis=1, keepdims=True)
    encoding_indices.need_grad = False

    quantized = F.embed(
        encoding_indices.reshape(encoding_indices.shape[:-1]),
        self.embedding_weight).reshape(x.shape)

    if return_encoding_indices:
        return encoding_indices, F.transpose(quantized, (0, 3, 1, 2))

    encodings = F.one_hot(encoding_indices, (self.num_embedding, ))

    e_latent_loss = F.mean(F.squared_error(
        quantized.get_unlinked_variable(need_grad=False), x))
    q_latent_loss = F.mean(F.squared_error(
        quantized, x.get_unlinked_variable(need_grad=False)))
    loss = q_latent_loss + self.commitment_cost * e_latent_loss

    # Straight-through estimator: gradients bypass the quantization step.
    quantized = x + (quantized - x).get_unlinked_variable(need_grad=False)

    avg_probs = F.mean(encodings, axis=0)
    perplexity = F.exp(-F.sum(avg_probs * F.log(avg_probs + 1.0e-10)))

    return loss, F.transpose(quantized, (0, 3, 1, 2)), perplexity, encodings