def norm_backward(inputs, p=None, axes=None, keep_dims=False):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients w.r.t. the inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]

    if p is None:
        p = 2.0
    axes = list(range(x0.ndim)) if axes is None else force_list(axes)

    x_abs = F.abs(x0)
    x_pow = F.pow_scalar(x_abs, p)
    x_sum = F.sum(x_pow, axes, keepdims=True)

    # Add axis for mul2
    if not keep_dims:
        shape = list(x0.shape)
        for a in axes:
            shape[a] = 1
        dy = dy.reshape(shape)

    x_sign = no_grad(F.sign(x0))
    dx = dy * x_sum**(1. / p - 1.) * x_abs**(p - 1.) * x_sign

    return dx
def sample_noise(inpt_size, out_size):
    _f = lambda x: F.sign(x) * F.pow_scalar(F.abs(x), 0.5)
    noise = _f(F.randn(shape=(inpt_size + out_size, )))
    eps_w = F.batch_matmul(F.reshape(noise[:inpt_size], (1, -1)),
                           F.reshape(noise[inpt_size:], (1, -1)),
                           True)
    eps_b = noise[inpt_size:]
    return eps_w, eps_b
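# A minimal shape check for `sample_noise` above (not from the original source),
# using a hypothetical noisy affine layer with 4 inputs and 2 outputs.
eps_w, eps_b = sample_noise(inpt_size=4, out_size=2)
print(eps_w.shape, eps_b.shape)  # (4, 2) (2,)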
def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Inputs
    x0 = inputs[0].data
    x1 = inputs[1].data
    dy = inputs[2].data
    # Outputs
    dx0 = outputs[0].data
    dx1 = outputs[1].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_x1 = inputs[1].grad
    g_dy = inputs[2].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad
    g_dx1 = outputs[1].grad

    # Computation
    if prop_down[2]:
        sign = F.sign(x0 - x1, 1.0)
        if accum[2]:
            g_dy += sign * (g_dx0 - g_dx1)
        else:
            g_dy.copy_from(sign * (g_dx0 - g_dx1))
def build_model():
    x = nn.Variable((batch_size, sentence_length_source))
    mask = get_mask(x)
    y = nn.Variable((batch_size, sentence_length_target))

    enc_input = time_distributed(PF.embed)(
        x, vocab_size_source, embedding_size, name='enc_embeddings') * mask
    # -> (batch_size, sentence_length_source, embedding_size)

    dec_input = F.concatenate(
        F.constant(w2i_target['<bos>'], shape=(batch_size, 1)),
        y[:, :sentence_length_target - 1],
        axis=1)
    dec_input = time_distributed(PF.embed)(
        dec_input, vocab_size_target, embedding_size, name='dec_embeddings')
    # -> (batch_size, sentence_length_target, embedding_size)

    # encoder
    with nn.parameter_scope('encoder'):
        enc_output, c, h = lstm(enc_input, hidden, mask=mask,
                                return_sequences=True, return_state=True)
        # -> (batch_size, sentence_length_source, hidden), (batch_size, hidden), (batch_size, hidden)

    # decoder
    with nn.parameter_scope('decoder'):
        dec_output = lstm(dec_input, hidden, initial_state=(c, h),
                          return_sequences=True)
        # -> (batch_size, sentence_length_target, hidden)

        attention_output = global_attention(dec_output, enc_output,
                                            mask=mask, score='dot')
        # -> (batch_size, sentence_length_target, hidden)

    output = F.concatenate(dec_output, attention_output, axis=2)

    output = time_distributed(PF.affine)(output, vocab_size_target,
                                         name='output')
    # -> (batch_size, sentence_length_target, vocab_size_target)

    t = F.reshape(y, (batch_size, sentence_length_target, 1))
    entropy = time_distributed_softmax_cross_entropy(output, t)

    mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
    count = F.sum(mask, axis=1)
    entropy *= mask
    loss = F.mean(F.sum(entropy, axis=1) / count)
    return x, y, loss
def norm_normalization_backward(inputs, p=None, axes=None, eps=1e-12):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients w.r.t. the inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]

    if p is None:
        p = 2.0
    axes = list(range(x0.ndim)) if axes is None else force_list(axes)

    x_abs = F.abs(x0)
    x_pow = F.pow_scalar(x_abs, p)
    x_sum = F.sum(x_pow, axes, keepdims=True)
    # x_norm = x_sum ** (1./p)

    # Div2 backward
    dx = dy * x_sum**(-1. / p)
    dx_norm = -dy * x0 * x_sum**(-2. / p)
    dx_norm = sum_for_arithmetics(dx_norm, x_sum)

    # Norm backward
    x_sign = no_grad(F.sign(x0))
    dx += dx_norm * x_sum**(1. / p - 1.) * x_abs**(p - 1.) * x_sign

    return dx
def build_model():
    x = nn.Variable((batch_size, sentence_length_source))
    input_mask = F.sign(
        F.reshape(F.slice(x), (batch_size, sentence_length_source, 1)))
    y = nn.Variable((batch_size, sentence_length_target))

    enc_input = time_distributed(PF.embed)(
        x, vocab_size_source, embedding_size,
        name='enc_embeddings')  # * input_mask
    # -> (batch_size, sentence_length_source, embedding_size)

    dec_input = time_distributed(PF.embed)(
        y, vocab_size_target, embedding_size, name='dec_embeddings')
    # -> (batch_size, sentence_length_target, embedding_size)

    # encoder
    with nn.parameter_scope('encoder'):
        output, c, h = LSTMEncoder(enc_input, hidden, return_sequences=True,
                                   return_state=True)
        # -> (batch_size, sentence_length_source, hidden), (batch_size, hidden), (batch_size, hidden)

    # decoder
    output = LSTMAttentionDecoder(dec_input, output, initial_state=(c, h),
                                  return_sequences=True, name='decoder')
    # -> (batch_size, sentence_length_target, hidden)

    output = time_distributed(PF.affine)(output, vocab_size_target,
                                         name='output')
    # -> (batch_size, sentence_length_target, vocab_size_target)

    t = F.reshape(F.slice(y), (batch_size, sentence_length_target, 1))
    entropy = time_distributed_softmax_cross_entropy(output, t)

    mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
    count = F.sum(mask, axis=1)
    entropy *= mask
    loss = F.mean(F.sum(entropy, axis=1) / count)
    return x, y, loss
def combination(sample_num, choise_num):
    x = F.rand(shape=(sample_num, ))
    x_indices = nn.Variable.from_numpy_array(np.arange(sample_num, ) + 1)
    y_top_k = F.top_k_data(x, k=choise_num, reduce=False, base_axis=0)
    y_top_k_sign = F.sign(y_top_k, alpha=0)
    y_top_k_indices = F.top_k_data(y_top_k_sign * x_indices,
                                   k=choise_num, base_axis=0)
    return y_top_k_indices
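# A minimal usage sketch for `combination` above (not from the original source):
# draw 3 distinct 1-based indices out of 10. The concrete indices are random;
# only the output shape (choise_num,) is fixed.
idx = combination(sample_num=10, choise_num=3)
idx.forward()
print(idx.shape, idx.d)  # (3,) and e.g. [9. 5. 2.]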
def build_model(train=True, get_embeddings=False):
    x = nn.Variable((batch_size, sentence_length, ptb_dataset.word_length))
    mask = expand_dims(F.sign(x), axis=-1)
    t = nn.Variable((batch_size, sentence_length))

    with nn.parameter_scope('char_embedding'):
        h = PF.embed(x, char_vocab_size, char_embedding_dim) * mask
    h = F.transpose(h, (0, 3, 1, 2))

    output = []
    for f, f_size in zip(filters, filster_sizes):
        _h = PF.convolution(h, f, kernel=(1, f_size), pad=(0, f_size//2),
                            name='conv_{}'.format(f_size))
        _h = F.max_pooling(_h, kernel=(1, ptb_dataset.word_length))
        output.append(_h)
    h = F.concatenate(*output, axis=1)
    h = F.transpose(h, (0, 2, 1, 3))

    mask = get_mask(F.sum(x, axis=2))
    embeddings = F.reshape(h, (batch_size, sentence_length, sum(filters))) * mask

    if get_embeddings:
        return x, embeddings

    with nn.parameter_scope('highway1'):
        h = time_distributed(highway)(embeddings)
    with nn.parameter_scope('highway2'):
        h = time_distributed(highway)(h)
    with nn.parameter_scope('lstm1'):
        h = lstm(h, lstm_size, mask=mask, return_sequences=True)
    with nn.parameter_scope('lstm2'):
        h = lstm(h, lstm_size, mask=mask, return_sequences=True)
    with nn.parameter_scope('hidden'):
        h = F.relu(time_distributed(PF.affine)(h, lstm_size))
    if train:
        h = F.dropout(h, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = time_distributed(PF.affine)(h, word_vocab_size)

    mask = F.sign(t)  # do not predict 'pad'.
    entropy = time_distributed_softmax_cross_entropy(
        y, expand_dims(t, axis=-1)) * mask
    count = F.sum(mask, axis=1)
    loss = F.mean(F.div2(F.sum(entropy, axis=1), count))
    return x, t, loss
def my_less_scalar(_x, _scalar):
    # input
    #   _x      : type=nn.Variable
    #   _scalar : type=float
    # output
    #   flags   : type=nn.Variable, same shape with _x
    temp = F.r_sub_scalar(_x, _scalar)
    temp = F.sign(temp, alpha=0)
    flags = F.relu(temp)
    return flags
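# A minimal usage sketch for `my_less_scalar` above (not from the original source;
# the standard nnabla imports are assumed).
import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable.from_numpy_array(np.array([-2.0, 0.0, 3.0]))
flags = my_less_scalar(x, 1.0)
flags.forward()
print(flags.d)  # [1. 1. 0.] -> 1.0 where x < 1.0, else 0.0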
def get_mask(x: nn.Variable) -> nn.Variable:
    assert len(x.shape) == 2
    batch_size, max_len = x.shape
    mask = expand_dims(F.sign(x), axis=-1)
    return mask
                                       shuffle=True, with_file_cache=False)
valid_data_iter = data_iterator_simple(load_valid_func, len(x_valid), batch_size,
                                       shuffle=True, with_file_cache=False)

x = nn.Variable((batch_size, sentence_length))
t = nn.Variable((batch_size, sentence_length, 1))
h = PF.embed(x, vocab_size, embedding_size)
h = LSTM(h, hidden, return_sequences=True)
h = TimeDistributed(PF.affine)(h, hidden, name='hidden')
y = TimeDistributed(PF.affine)(h, vocab_size, name='output')

mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
entropy = TimeDistributedSoftmaxCrossEntropy(y, t) * mask
count = F.sum(mask, axis=1)
loss = F.mean(F.div2(F.sum(entropy, axis=1), count))

# Create solver.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())

# Create monitor.
from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
monitor = Monitor('./tmp-lstmlm')
monitor_perplexity = MonitorSeries('perplexity', monitor, interval=1)
monitor_perplexity_valid = MonitorSeries('perplexity_valid', monitor, interval=1)
def one_hot_combination(sample_num, choise_num):
    x = F.rand(shape=(sample_num, ))
    y_top_k = F.top_k_data(x, k=choise_num, reduce=False, base_axis=0)
    y_top_k_sign = F.sign(y_top_k, alpha=0)
    return y_top_k_sign
def parametric_pow2_quantize_xmin_xmax(x, sign=True, with_zero=True,
                                       xmin_init=2**-7, xmin_min=2**-15, xmin_max=256,
                                       xmax_init=2**0, xmax_min=2**-8, xmax_max=256,
                                       fix_parameters=False):
    """Parametric version of `pow2_quantize` where the min value `xmin` and
    max value `xmax` are learnable parameters.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1, ) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2.**F.round(F.log(F.abs(v)) / np.log(2.))

    xmin = get_parameter_or_create("xmin", (),
                                   ConstantInitializer(xmin_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)
    xmax = get_parameter_or_create("xmax", (),
                                   ConstantInitializer(xmax_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)

    # ensure that minimum dynamic range is in specified range and a power-of-two
    xmin = quantize_pow2(clip_scalar(xmin, xmin_min, xmin_max))

    # ensure that maximum dynamic range is in specified range and a power-of-two
    xmax = quantize_pow2(clip_scalar(xmax, xmax_min, xmax_max))

    # broadcast variables to correct size
    xmin = broadcast_scalar(xmin, shape=x.shape)
    xmax = broadcast_scalar(xmax, shape=x.shape)

    # if unsigned, then quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # compute absolute value/sign of input
    ax = F.abs(x)
    sx = F.sign(x)

    if with_zero:
        # prune smallest elements (in magnitude) to zero if they are smaller
        # than `x_min / \sqrt(2)`
        x_threshold = xmin / np.sqrt(2)

        idx1 = F.greater_equal(ax, x_threshold) * F.less(ax, xmin)
        idx2 = F.greater_equal(ax, xmin) * F.less(ax, xmax)
        idx3 = F.greater_equal(ax, xmax)
    else:
        idx1 = F.less(ax, xmin)
        idx2 = F.greater_equal(ax, xmin) * F.less(ax, xmax)
        idx3 = F.greater_equal(ax, xmax)

    # do not backpropagate gradient through indices
    idx1.need_grad = False
    idx2.need_grad = False
    idx3.need_grad = False

    # do not backpropagate gradient through sign
    sx.need_grad = False

    # take care of values outside of dynamic range
    return sx * (xmin * idx1 + quantize_pow2(ax) * idx2 + xmax * idx3)
def parametric_pow2_quantize(x, sign=True, with_zero=True,
                             n_init=8, n_min=1, n_max=16,
                             m_init=1, m_min=-8, m_max=8,
                             fix_parameters=False):
    """Parametric version of `pow2_quantize` where the bitwidth `n` and
    dynamic range `m` are learnable parameters.

    Args:
        x (~nnabla.Variable): N-D array as input.
        sign (bool): keep sign information during quantization.
        with_zero (bool): quantize small weights to zero.
        n_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bitwidth parameter.
        n_min (int): lower bound for bitwidth.
        n_max (int): upper bound for bitwidth.
        m_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for dynamic range.
        m_min (float): lower bound for dynamic range.
        m_max (float): upper bound for dynamic range.
        fix_parameters (bool): When set to `True`, the quantization parameters `n` and `m` will not be updated.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1, ) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2**F.round(F.log(F.abs(v)) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    m = get_parameter_or_create("m", (),
                                ConstantInitializer(m_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n_q = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n_q = n_q - 1
    if with_zero:
        n_q = n_q - 1

    # ensure that dynamic range is in specified range and an integer
    m_q = F.round(clip_scalar(m, m_min, m_max))

    # compute min/max value that we can represent
    x_max = 2**m_q
    x_min = 2**(m_q - (2**n_q) + 1)

    # broadcast variables to correct size
    x_min = broadcast_scalar(x_min, shape=x.shape)
    x_max = broadcast_scalar(x_max, shape=x.shape)

    # if unsigned, then quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # compute absolute value/sign of input
    ax = F.abs(x)
    sx = F.sign(x)

    if with_zero:
        # prune smallest elements (in magnitude) to zero if they are smaller
        # than `x_min / \sqrt(2)`
        x_threshold = x_min / np.sqrt(2)

        idx1 = F.greater_equal(ax, x_threshold) * F.less(ax, x_min)
        idx2 = F.greater_equal(ax, x_min) * F.less(ax, x_max)
        idx3 = F.greater_equal(ax, x_max)
    else:
        idx1 = F.less(ax, x_min)
        idx2 = F.greater_equal(ax, x_min) * F.less(ax, x_max)
        idx3 = F.greater_equal(ax, x_max)

    # do not backpropagate gradient through indices
    idx1.need_grad = False
    idx2.need_grad = False
    idx3.need_grad = False

    # do not backpropagate gradient through sign
    sx.need_grad = False

    # take care of values outside of dynamic range
    return sx * (x_min * idx1 + quantize_pow2(ax) * idx2 + x_max * idx3)
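# A small numeric check (not from the original source) of the rounding rule used by
# `quantize_pow2` above: values are snapped to the nearest power of two in log space.
import numpy as np
for v in (0.3, 0.7, 1.5):
    q = 2.0 ** np.round(np.log(abs(v)) / np.log(2.0))
    print(v, '->', q)  # 0.3 -> 0.25, 0.7 -> 0.5, 1.5 -> 2.0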
def get_mask(x: nn.Variable) -> nn.Variable:
    assert len(x.shape) == 2
    batch_size, max_len = x.shape
    mask = F.reshape(F.sign(x), shape=(batch_size, max_len, 1))
    return mask