Example #1
File: norm.py  Project: sony/nnabla
def norm_backward(inputs, p=None, axes=None, keep_dims=False):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]

    if p is None:
        p = 2.0
    axes = list(range(x0.ndim)) if axes is None else force_list(axes)

    x_abs = F.abs(x0)
    x_pow = F.pow_scalar(x_abs, p)
    x_sum = F.sum(x_pow, axes, keepdims=True)

    # Add axis for mul2
    if not keep_dims:
        shape = list(x0.shape)
        for a in axes:
            shape[a] = 1
        dy = dy.reshape(shape)

    x_sign = no_grad(F.sign(x0))
    dx = dy * x_sum**(1. / p - 1.) * x_abs**(p - 1.) * x_sign

    return dx
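
The last line is the chain rule for the p-norm: the gradient of (sum_j |x_j|^p)^(1/p) with respect to x_i is (sum_j |x_j|^p)^(1/p - 1) * |x_i|^(p - 1) * sign(x_i), which the code multiplies by the upstream dy. A minimal NumPy sanity check of that identity against central finite differences (numeric_norm_grad is an illustrative helper, not part of the example):

import numpy as np

def numeric_norm_grad(x, p, h=1e-6):
    # central finite differences of f(x) = (sum |x|^p)^(1/p)
    g = np.zeros_like(x)
    for i in range(x.size):
        e = np.zeros_like(x)
        e.flat[i] = h
        g.flat[i] = (np.linalg.norm((x + e).ravel(), p) -
                     np.linalg.norm((x - e).ravel(), p)) / (2 * h)
    return g

p = 3.0
x = np.random.randn(3, 4)
x = np.sign(x) * (np.abs(x) + 0.5)      # keep entries away from the kink at 0
S = (np.abs(x) ** p).sum()
analytic = S ** (1. / p - 1.) * np.abs(x) ** (p - 1) * np.sign(x)
assert np.allclose(analytic, numeric_norm_grad(x, p), atol=1e-4)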
Example #2
def sample_noise(inpt_size, out_size):
    _f = lambda x: F.sign(x) * F.pow_scalar(F.abs(x), 0.5)
    noise = _f(F.randn(shape=(inpt_size + out_size, )))
    eps_w = F.batch_matmul(F.reshape(noise[:inpt_size], (1, -1)),
                           F.reshape(noise[inpt_size:], (1, -1)), True)
    eps_b = noise[inpt_size:]
    return eps_w, eps_b
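
This looks like the factorised Gaussian noise used in NoisyNet-style exploration layers: f(x) = sign(x) * sqrt(|x|) is applied to a single noise vector, whose two halves are combined into a rank-one weight perturbation and a bias perturbation. A hedged usage sketch (the sizes 4 and 2 are arbitrary):

import nnabla as nn
import nnabla.functions as F

eps_w, eps_b = sample_noise(4, 2)   # input size 4, output size 2 (arbitrary)
print(eps_w.shape, eps_b.shape)     # weight-shaped and bias-shaped noise terms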
Example #3
    def backward_impl(self, inputs, outputs, prop_down, accum):
        # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
        # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

        # Inputs
        x0 = inputs[0].data
        x1 = inputs[1].data
        dy = inputs[2].data
        # Outputs
        dx0 = outputs[0].data
        dx1 = outputs[1].data
        # Grads of inputs
        g_x0 = inputs[0].grad
        g_x1 = inputs[1].grad
        g_dy = inputs[2].grad
        # Grads of outputs
        g_dx0 = outputs[0].grad
        g_dx1 = outputs[1].grad

        # Computation
        if prop_down[2]:
            sign = F.sign(x0 - x1, 1.0)
            if accum[2]:
                g_dy += sign * (g_dx0 - g_dx1)
            else:
                g_dy.copy_from(sign * (g_dx0 - g_dx1))
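
This looks like the double-backward of an absolute-error-style function: assuming the first backward pass computed dx0 = sign(x0 - x1) * dy and dx1 = -sign(x0 - x1) * dy, the chain rule sends sign(x0 - x1) * (g_dx0 - g_dx1) back to dy, which is exactly what is accumulated or copied above. A small NumPy check of that arithmetic:

import numpy as np

x0, x1 = np.random.randn(5), np.random.randn(5)
g_dx0, g_dx1 = np.random.randn(5), np.random.randn(5)

sign = np.sign(x0 - x1)
g_dy_via_chain = sign * g_dx0 + (-sign) * g_dx1   # contributions through dx0 and dx1
assert np.allclose(g_dy_via_chain, sign * (g_dx0 - g_dx1))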
Example #4
def build_model():
    x = nn.Variable((batch_size, sentence_length_source))
    mask = get_mask(x)
    y = nn.Variable((batch_size, sentence_length_target))

    enc_input = time_distributed(PF.embed)(
        x, vocab_size_source, embedding_size, name='enc_embeddings') * mask
    # -> (batch_size, sentence_length_source, embedding_size)

    dec_input = F.concatenate(F.constant(w2i_target['<bos>'],
                                         shape=(batch_size, 1)),
                              y[:, :sentence_length_target - 1],
                              axis=1)

    dec_input = time_distributed(PF.embed)(dec_input,
                                           vocab_size_target,
                                           embedding_size,
                                           name='dec_embeddings')
    # -> (batch_size, sentence_length_target, embedding_size)

    # encoder
    with nn.parameter_scope('encoder'):
        enc_output, c, h = lstm(enc_input,
                                hidden,
                                mask=mask,
                                return_sequences=True,
                                return_state=True)
        # -> (batch_size, sentence_length_source, hidden), (batch_size, hidden), (batch_size, hidden)

    # decoder
    with nn.parameter_scope('decoder'):
        dec_output = lstm(dec_input,
                          hidden,
                          initial_state=(c, h),
                          return_sequences=True)
        # -> (batch_size, sentence_length_target, hidden)

        attention_output = global_attention(dec_output,
                                            enc_output,
                                            mask=mask,
                                            score='dot')
        # -> (batch_size, sentence_length_target, hidden)

    output = F.concatenate(dec_output, attention_output, axis=2)

    output = time_distributed(PF.affine)(output,
                                         vocab_size_target,
                                         name='output')
    # -> (batch_size, sentence_length_target, vocab_size_target)

    t = F.reshape(y, (batch_size, sentence_length_target, 1))

    entropy = time_distributed_softmax_cross_entropy(output, t)

    mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
    count = F.sum(mask, axis=1)

    entropy *= mask
    loss = F.mean(F.sum(entropy, axis=1) / count)
    return x, y, loss
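
The loss at the end relies on the pad id being 0: F.sign(t) is 1 for real target tokens and 0 for '<pad>', so each sentence's cross-entropy is averaged over its real tokens only. The same arithmetic in plain NumPy (the token ids and per-token losses below are made up):

import numpy as np

t = np.array([[3, 7, 2, 0, 0]])                    # target ids, 0 = '<pad>' (assumption)
entropy = np.array([[0.5, 1.0, 0.2, 9.9, 9.9]])    # per-token losses, pad positions included
mask = np.sign(t)
count = mask.sum(axis=1)
loss = ((entropy * mask).sum(axis=1) / count).mean()
print(loss)                                         # 0.5666... -> pads contribute nothing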
Example #5
def norm_normalization_backward(inputs, p=None, axes=None, eps=1e-12):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]

    if p is None:
        p = 2.0
    axes = list(range(x0.ndim)) if axes is None else force_list(axes)

    x_abs = F.abs(x0)
    x_pow = F.pow_scalar(x_abs, p)
    x_sum = F.sum(x_pow, axes, keepdims=True)
    # x_norm = x_sum ** (1./p)

    # Div2 backward
    dx = dy * x_sum**(-1. / p)
    dx_norm = -dy * x0 * x_sum**(-2. / p)
    dx_norm = sum_for_arithmetics(dx_norm, x_sum)

    # Norm backward
    x_sign = no_grad(F.sign(x0))
    dx += dx_norm * x_sum**(1. / p - 1.) * x_abs**(p - 1.) * x_sign

    return dx
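
Here the function being differentiated is y = x / ||x||_p (the eps term is dropped for clarity), and dx is its vector-Jacobian product: dx_k = dy_k * S^(-1/p) - (sum_i dy_i x_i) * S^(-1/p - 1) * |x_k|^(p - 1) * sign(x_k), with S = sum_j |x_j|^p. A NumPy finite-difference check of that VJP over a single flat axis:

import numpy as np

def vjp(dy, x, p):
    S = (np.abs(x) ** p).sum()
    return dy * S ** (-1. / p) \
        - (dy * x).sum() * S ** (-1. / p - 1.) * np.abs(x) ** (p - 1) * np.sign(x)

p, h, n = 3.0, 1e-6, 6
x = np.sign(np.random.randn(n)) * (np.abs(np.random.randn(n)) + 0.5)
dy = np.random.randn(n)
f = lambda v: (dy * (v / np.linalg.norm(v, p))).sum()   # <dy, x / ||x||_p>
numeric = np.array([(f(x + h * e) - f(x - h * e)) / (2 * h) for e in np.eye(n)])
assert np.allclose(vjp(dy, x, p), numeric, atol=1e-4)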
Example #6
def build_model():
    x = nn.Variable((batch_size, sentence_length_source))
    input_mask = F.sign(
        F.reshape(F.slice(x), (batch_size, sentence_length_source, 1)))
    y = nn.Variable((batch_size, sentence_length_target))

    enc_input = time_distributed(PF.embed)(x,
                                           vocab_size_source,
                                           embedding_size,
                                           name='enc_embeddings')  #*input_mask
    # -> (batch_size, sentence_length_source, embedding_size)
    dec_input = time_distributed(PF.embed)(y,
                                           vocab_size_target,
                                           embedding_size,
                                           name='dec_embeddings')
    # -> (batch_size, sentence_length_target, embedding_size)

    # encoder
    with nn.parameter_scope('encoder'):
        output, c, h = LSTMEncoder(enc_input,
                                   hidden,
                                   return_sequences=True,
                                   return_state=True)
        # -> (batch_size, sentence_length_source, hidden), (batch_size, hidden), (batch_size, hidden)

    # decoder
    output = LSTMAttentionDecoder(dec_input,
                                  output,
                                  initial_state=(c, h),
                                  return_sequences=True,
                                  name='decoder')
    # -> (batch_size, sentence_length_target, hidden)
    output = time_distributed(PF.affine)(output,
                                         vocab_size_target,
                                         name='output')
    # -> (batch_size, sentence_length_target, vocab_size_target)

    t = F.reshape(F.slice(y), (batch_size, sentence_length_target, 1))

    entropy = time_distributed_softmax_cross_entropy(output, t)

    mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
    count = F.sum(mask, axis=1)

    entropy *= mask
    loss = F.mean(F.sum(entropy, axis=1) / count)
    return x, y, loss
Example #7
def combination(sample_num, choise_num):
    x = F.rand(shape=(sample_num, ))
    x_indices = nn.Variable.from_numpy_array(np.arange(sample_num, ) + 1)
    y_top_k = F.top_k_data(x, k=choise_num, reduce=False, base_axis=0)
    y_top_k_sign = F.sign(y_top_k, alpha=0)
    y_top_k_indices = F.top_k_data(y_top_k_sign * x_indices,
                                   k=choise_num,
                                   base_axis=0)
    return y_top_k_indices
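
A hedged usage sketch: the composition above draws `choise_num` distinct indices out of `sample_num` uniformly at random, i.e. a combination without replacement. The indices are 1-based so that a selected index cannot be confused with the entries zeroed out by the sign mask.

import numpy as np
import nnabla as nn
import nnabla.functions as F

idx = combination(10, 3)
idx.forward()
print(idx.d)   # e.g. [9. 5. 2.] -- three distinct values from 1..10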
Example #8
def build_model(train=True, get_embeddings=False):
    x = nn.Variable((batch_size, sentence_length, ptb_dataset.word_length))
    mask = expand_dims(F.sign(x), axis=-1)
    t = nn.Variable((batch_size, sentence_length))

    with nn.parameter_scope('char_embedding'):
        h = PF.embed(x, char_vocab_size, char_embedding_dim) * mask
    h = F.transpose(h, (0, 3, 1, 2))
    output = []
    for f, f_size in zip(filters, filster_sizes):
        _h = PF.convolution(h, f, kernel=(1, f_size), pad=(0, f_size//2), name='conv_{}'.format(f_size))
        _h = F.max_pooling(_h, kernel=(1, ptb_dataset.word_length))
        output.append(_h)
    h = F.concatenate(*output, axis=1)
    h = F.transpose(h, (0, 2, 1, 3))

    mask = get_mask(F.sum(x, axis=2))
    embeddings = F.reshape(h, (batch_size, sentence_length, sum(filters))) * mask

    if get_embeddings:
        return x, embeddings

    with nn.parameter_scope('highway1'):
        h = time_distributed(highway)(embeddings)
    with nn.parameter_scope('highway2'):
        h = time_distributed(highway)(h)
    with nn.parameter_scope('lstm1'):
        h = lstm(h, lstm_size, mask=mask, return_sequences=True)
    with nn.parameter_scope('lstm2'):
        h = lstm(h, lstm_size, mask=mask, return_sequences=True)
    with nn.parameter_scope('hidden'):
        h = F.relu(time_distributed(PF.affine)(h, lstm_size))
    if train:
        h = F.dropout(h, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = time_distributed(PF.affine)(h, word_vocab_size)

    mask = F.sign(t) # do not predict 'pad'.
    entropy = time_distributed_softmax_cross_entropy(y, expand_dims(t, axis=-1)) * mask
    count = F.sum(mask, axis=1)
    loss = F.mean(F.div2(F.sum(entropy, axis=1), count))
    return x, t, loss
Example #9
def my_less_scalar(_x, _scalar):
    # input
    # _x : type=nn.Variable
    # _scalar : type=float
    # output
    # flags : type=nn.Variable, same shape with _x

    temp = F.r_sub_scalar(_x, _scalar)
    temp = F.sign(temp, alpha=0)
    flags = F.relu(temp)
    return flags
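
A quick check of the trick: F.r_sub_scalar(_x, _scalar) computes `_scalar - _x`, F.sign(..., alpha=0) maps that to {-1, 0, +1}, and F.relu keeps only the +1 entries, so `flags` ends up as a 0/1 indicator of `_x < _scalar`:

import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable.from_numpy_array(np.array([-1.0, 0.5, 2.0], dtype=np.float32))
flags = my_less_scalar(x, 0.5)
flags.forward()
print(flags.d)   # [1. 0. 0.] -- only -1.0 is strictly less than 0.5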
Example #10
def get_mask(x: nn.Variable) -> nn.Variable:
    assert len(x.shape) == 2
    batch_size, max_len = x.shape
    mask = expand_dims(F.sign(x), axis=-1)
    return mask
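
A hedged usage sketch, assuming the pad id is 0 and that `expand_dims` is the project's helper that appends a trailing singleton axis: the mask is 1 for real tokens and 0 for pads, shaped so it broadcasts against an embedding dimension.

import numpy as np
import nnabla as nn

ids = nn.Variable.from_numpy_array(np.array([[4., 2., 0., 0.]], dtype=np.float32))
m = get_mask(ids)
m.forward()
print(m.shape)      # (1, 4, 1), assuming expand_dims appends a trailing axis
print(m.d[..., 0])  # [[1. 1. 0. 0.]]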
Example #11
train_data_iter = data_iterator_simple(load_train_func,
                                       len(x_train),
                                       batch_size,
                                       shuffle=True,
                                       with_file_cache=False)
valid_data_iter = data_iterator_simple(load_valid_func,
                                       len(x_valid),
                                       batch_size,
                                       shuffle=True,
                                       with_file_cache=False)

x = nn.Variable((batch_size, sentence_length))
t = nn.Variable((batch_size, sentence_length, 1))
h = PF.embed(x, vocab_size, embedding_size)
h = LSTM(h, hidden, return_sequences=True)
h = TimeDistributed(PF.affine)(h, hidden, name='hidden')
y = TimeDistributed(PF.affine)(h, vocab_size, name='output')

mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
entropy = TimeDistributedSoftmaxCrossEntropy(y, t) * mask
count = F.sum(mask, axis=1)
loss = F.mean(F.div2(F.sum(entropy, axis=1), count))

# Create solver.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())

# Create monitor.
from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
monitor = Monitor('./tmp-lstmlm')
monitor_perplexity = MonitorSeries('perplexity', monitor, interval=1)
monitor_perplexity_valid = MonitorSeries('perplexity_valid',
                                         monitor,
                                         interval=1)
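
The snippet stops before the training loop, but the monitors indicate the metric of interest: perplexity, i.e. the exponential of the average per-token cross-entropy (the `loss` built above). A purely illustrative calculation:

import numpy as np

mean_token_loss = 4.2            # hypothetical epoch average of `loss`
print(np.exp(mean_token_loss))   # ~66.7 -- the kind of value the perplexity monitors would record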
Example #12
def one_hot_combination(sample_num, choise_num):
    x = F.rand(shape=(sample_num, ))
    y_top_k = F.top_k_data(x, k=choise_num, reduce=False, base_axis=0)
    y_top_k_sign = F.sign(y_top_k, alpha=0)
    return y_top_k_sign
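
A hedged usage sketch: the result is a length-`sample_num` 0/1 vector with exactly `choise_num` ones at uniformly random positions (a subset mask rather than a list of indices).

import nnabla as nn
import nnabla.functions as F

mask = one_hot_combination(10, 3)
mask.forward()
print(mask.d)          # e.g. [0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
print(mask.d.sum())    # 3.0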
Example #13
def parametric_pow2_quantize_xmin_xmax(x,
                                       sign=True,
                                       with_zero=True,
                                       xmin_init=2**-7,
                                       xmin_min=2**-15,
                                       xmin_max=256,
                                       xmax_init=2**0,
                                       xmax_min=2**-8,
                                       xmax_max=256,
                                       fix_parameters=False):
    """Parametric version of `pow2_quantize` where the
    min value `xmin` and max value `xmax` are learnable parameters.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1, ) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2.**F.round(F.log(F.abs(v)) / np.log(2.))

    xmin = get_parameter_or_create("xmin", (),
                                   ConstantInitializer(xmin_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)
    xmax = get_parameter_or_create("xmax", (),
                                   ConstantInitializer(xmax_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)

    # ensure that minimum dynamic range is in specified range and a power-of-two
    xmin = quantize_pow2(clip_scalar(xmin, xmin_min, xmin_max))

    # ensure that maximum dynamic range is in specified range and a power-of-two
    xmax = quantize_pow2(clip_scalar(xmax, xmax_min, xmax_max))

    # broadcast variables to correct size
    xmin = broadcast_scalar(xmin, shape=x.shape)
    xmax = broadcast_scalar(xmax, shape=x.shape)

    # if unsigned, then quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # compute absolute value/sign of input
    ax = F.abs(x)
    sx = F.sign(x)

    if with_zero:
        # prune smallest elements (in magnitude) to zero if they are smaller
        # than `x_min / \sqrt(2)`
        x_threshold = xmin / np.sqrt(2)

        idx1 = F.greater_equal(ax, x_threshold) * F.less(ax, xmin)
        idx2 = F.greater_equal(ax, xmin) * F.less(ax, xmax)
        idx3 = F.greater_equal(ax, xmax)
    else:
        idx1 = F.less(ax, xmin)
        idx2 = F.greater_equal(ax, xmin) * F.less(ax, xmax)
        idx3 = F.greater_equal(ax, xmax)

    # do not backpropagate gradient through indices
    idx1.need_grad = False
    idx2.need_grad = False
    idx3.need_grad = False

    # do not backpropagate gradient through sign
    sx.need_grad = False

    # take care of values outside of dynamic range
    return sx * (xmin * idx1 + quantize_pow2(ax) * idx2 + xmax * idx3)
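
A hedged usage sketch, assuming the imports used by the function above (F, np, get_parameter_or_create, ConstantInitializer) are in scope. Applied to a random tensor, the output contains only signed powers of two clipped to [xmin, xmax], plus exact zeros for magnitudes pruned below xmin / sqrt(2) (with_zero defaults to True). The scope name below is arbitrary.

import numpy as np
import nnabla as nn

x = nn.Variable.from_numpy_array(np.random.randn(4, 4).astype(np.float32))
with nn.parameter_scope("pow2_q"):
    y = parametric_pow2_quantize_xmin_xmax(x)   # creates learnable xmin/xmax in this scope
y.forward()
print(np.unique(np.abs(y.d)))                   # only powers of two (and possibly 0) appear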
Example #14
def parametric_pow2_quantize(x,
                             sign=True,
                             with_zero=True,
                             n_init=8,
                             n_min=1,
                             n_max=16,
                             m_init=1,
                             m_min=-8,
                             m_max=8,
                             fix_parameters=False):
    """Parametric version of `pow2_quantize` where the
    bitwidth `n` and dynamic range `m` are learnable parameters.

    Args:
        x(~nnabla.Variable): N-D array as input
        sign (bool): keep sign information during quantization.
        with_zero (bool): quantize small weights to zero.
        n_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bitwidth parameter.
        n_min (int): lower bound for bitwidth.
        n_max (int): upper bound for bitwidth.
        m_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for dynamic range.
        m_min (float): lower bound for dynamic range.
        m_max (float): upper bound for dynamic range.
        fix_parameters (bool): When set to `True`, the parameters `n` and `m`
            will not be updated.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1, ) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2**F.round(F.log(F.abs(v)) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    m = get_parameter_or_create("m", (),
                                ConstantInitializer(m_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n_q = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n_q = n_q - 1
    if with_zero:
        n_q = n_q - 1

    # ensure that dynamic range is in specified range and an integer
    m_q = F.round(clip_scalar(m, m_min, m_max))

    # compute min/max value that we can represent
    x_max = 2**m_q
    x_min = 2**(m_q - (2**n_q) + 1)

    # broadcast variables to correct size
    x_min = broadcast_scalar(x_min, shape=x.shape)
    x_max = broadcast_scalar(x_max, shape=x.shape)

    # if unsigned, then quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # compute absolute value/sign of input
    ax = F.abs(x)
    sx = F.sign(x)

    if with_zero:
        # prune smallest elements (in magnitude) to zero if they are smaller
        # than `x_min / \sqrt(2)`
        x_threshold = x_min / np.sqrt(2)

        idx1 = F.greater_equal(ax, x_threshold) * F.less(ax, x_min)
        idx2 = F.greater_equal(ax, x_min) * F.less(ax, x_max)
        idx3 = F.greater_equal(ax, x_max)
    else:
        idx1 = F.less(ax, x_min)
        idx2 = F.greater_equal(ax, x_min) * F.less(ax, x_max)
        idx3 = F.greater_equal(ax, x_max)

    # do not backpropagate gradient through indices
    idx1.need_grad = False
    idx2.need_grad = False
    idx3.need_grad = False

    # do not backpropagate gradient through sign
    sx.need_grad = False

    # take care of values outside of dynamic range
    return sx * (x_min * idx1 + quantize_pow2(ax) * idx2 + x_max * idx3)
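
The representable range follows from the (reduced) bitwidth and the dynamic range: with the defaults sign=True and with_zero=True, one bit is spent on the sign and one code on zero, so n_q = n - 2, x_max = 2^m and x_min = 2^(m - 2^n_q + 1). A quick arithmetic check with the defaults n_init=8, m_init=1:

n_q = 8 - 1 - 1          # one bit for the sign, one value reserved for zero
m_q = 1
x_max = 2.0 ** m_q
x_min = 2.0 ** (m_q - 2 ** n_q + 1)
print(x_max, x_min)      # 2.0 and 2**-62 (~2.2e-19)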
Example #15
def get_mask(x: nn.Variable) -> nn.Variable:
    assert len(x.shape) == 2
    batch_size, max_len = x.shape
    mask = F.reshape(F.sign(x), shape=(batch_size, max_len, 1))
    return mask