def parametric_fixed_point_quantize_b_xmax(x,
                                           sign=True,
                                           n_init=8,
                                           n_min=2,
                                           n_max=16,
                                           xmax_init=1,
                                           xmax_min=0.001,
                                           xmax_max=10,
                                           fix_parameters=False):
    """Parametric version of `fixed_point_quantize` where the
    bitwidth `b` and dynamic range `xmax` are learnable parameters.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1, ) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2**F.round(F.log(v) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    xmax = get_parameter_or_create("xmax", (),
                                   ConstantInitializer(xmax_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n = n - 1

    # ensure that dynamic range is in specified range
    xmax = clip_scalar(xmax, xmax_min, xmax_max)

    # compute step size from dynamic range and make sure that it is a pow2
    d = quantize_pow2(xmax / (2**n - 1))

    # compute min/max value that we can represent
    if sign:
        xmin = -xmax
    else:
        xmin = nn.Variable((1, ), need_grad=False)
        xmin.d = 0.

    # broadcast variables to correct size
    d = broadcast_scalar(d, shape=x.shape)
    xmin = broadcast_scalar(xmin, shape=x.shape)
    xmax = broadcast_scalar(xmax, shape=x.shape)

    # apply fixed-point quantization
    return d * F.round(F.clip_by_value(x, xmin, xmax) / d)
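
# A minimal usage sketch (an assumption, not part of the original snippet):
# the function is meant to be called while building the graph, typically inside
# a parameter scope so that the learnable "n" and "xmax" get unique names.
# The names `w`, `w_q`, and the scope "affine1/Wquant" are illustrative only,
# and the NNabla imports used above (F, get_parameter_or_create, ...) are assumed.
import numpy as np
import nnabla as nn

w = nn.Variable((64, 32), need_grad=True)
w.d = np.random.randn(*w.shape).astype(np.float32)
with nn.parameter_scope("affine1/Wquant"):
    w_q = parametric_fixed_point_quantize_b_xmax(w, sign=True, n_init=8)
w_q.forward()  # w_q.d now holds the fixed-point quantized values of w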
Example #2
def transformer(train=True, dropout_ratio=0.1):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    with nn.parameter_scope('embedding_layer'):
        # h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
        h = token_embedding(x, vocab_size, embedding_size)
    h = position_encoding(h)

    if train:
        h = F.dropout(h, p=dropout_ratio)

    for i in range(hopping_num):
        with nn.parameter_scope(f'encoder_hopping_{i}'):
            h = residual_normalization_wrapper(multihead_self_attention)(
                h,
                head_num,
                mask=mask,
                train=train,
                dropout_ratio=dropout_ratio)
            h = residual_normalization_wrapper(positionwise_feed_forward)(
                h, train=train, dropout_ratio=dropout_ratio)

    with nn.parameter_scope('output_layer'):
        y = F.sigmoid(PF.affine(h[:, 0, :], 1))

    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(y, t))

    return x, y, t, accuracy, loss
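
# A minimal training sketch (an assumption, not part of the original snippet):
# it mirrors the Trainer-based loop shown in a later example and assumes the
# solver module `S`, the `Trainer` class, and the data iterators
# `train_data_iter` / `dev_data_iter` are defined elsewhere.
x, y, t, accuracy, loss = transformer(train=True, dropout_ratio=0.1)

solver = S.Adam()
solver.set_parameters(nn.get_parameters())

trainer = Trainer(inputs=[x, t],
                  loss=loss,
                  metrics={'cross entropy': loss, 'accuracy': accuracy},
                  solver=solver)
trainer.run(train_data_iter, dev_data_iter, epochs=5, verbose=1)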
Example #3
def build_self_attention_model(train=True):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    attention_mask = (F.constant(1, shape=mask.shape) - mask) * F.constant(
        np.finfo(np.float32).min, shape=mask.shape)
    with nn.parameter_scope('embedding'):
        h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
    with nn.parameter_scope('forward'):
        h_f = lstm(h,
                   hidden_size,
                   mask=mask,
                   return_sequences=True,
                   return_state=False)
    with nn.parameter_scope('backward'):
        h_b = lstm(h[:, ::-1, ],
                   hidden_size,
                   mask=mask,
                   return_sequences=True,
                   return_state=False)[:, ::-1, ]
    h = F.concatenate(h_f, h_b, axis=2)
    if train:
        h = F.dropout(h, p=dropout_ratio)
    with nn.parameter_scope('da'):
        a = F.tanh(time_distributed(PF.affine)(h, da))
        if train:
            a = F.dropout(a, p=dropout_ratio)
    with nn.parameter_scope('r'):
        a = time_distributed(PF.affine)(a, r)
        if train:
            a = F.dropout(a, p=dropout_ratio)
        a = F.softmax(a + attention_mask, axis=1)
    m = F.batch_matmul(a, h, transpose_a=True)
    with nn.parameter_scope('output_mlp'):
        output = F.relu(PF.affine(m, output_mlp_size))
        if train:
            output = F.dropout(output, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = F.sigmoid(PF.affine(output, 1))

    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(
        y, t)) + attention_penalty_coef * frobenius(
            F.batch_matmul(a, a, transpose_a=True) - batch_eye(batch_size, r))
    return x, t, accuracy, loss
Example #4
def quantize_pow2(v):
    return 2**F.round(F.log(v) / np.log(2.))
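
# A quick numeric check (an assumption, plain NumPy): power-of-two quantization
# snaps each positive value to the nearest power of two in the log domain,
# e.g. 0.3 -> 0.25 and 0.7 -> 0.5. The sketch below reproduces the formula above.
import numpy as np

def quantize_pow2_np(v):
    return 2.0 ** np.round(np.log(v) / np.log(2.0))

print(quantize_pow2_np(np.array([0.3, 0.7, 1.5, 3.0])))  # -> [0.25, 0.5, 2.0, 4.0]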
Example #5
def network_size_activations():
    """
    Returns total number of activations
    and size in KBytes (NNabla variable using `max` or `sum` operator)
    """
    kbytes = []
    num_activations = 0

    # get all parameters
    ps = nn.get_parameters(grad_only=False)
    for p in ps:
        if "Asize" in p:
            print(f"{p}\t{ps[p].d}")

            num_activations += ps[p].d

            if cfg.a_quantize is not None:
                if cfg.a_quantize in ['fp_relu', 'pow2_relu']:
                    # fixed quantization
                    n = nn.Variable((), need_grad=False)
                    n.d = cfg.a_bitwidth
                elif cfg.a_quantize in [
                        'parametric_fp_relu', 'parametric_fp_b_xmax_relu',
                        'parametric_fp_d_b_relu',
                        'parametric_pow2_b_xmax_relu',
                        'parametric_pow2_b_xmin_relu'
                ]:
                    # parametric quantization
                    s = p.replace(
                        "/Asize", "/Aquant/" +
                        cfg.a_quantize.replace("_relu", "") + "/n")
                    n = F.round(
                        clip_scalar(ps[s], cfg.a_bitwidth_min,
                                    cfg.a_bitwidth_max))
                elif cfg.a_quantize in ['parametric_fp_d_xmax_relu']:
                    # these quantization methods do not have n, so we need to compute it!
                    # parametric quantization
                    d = ps[p.replace(
                        "/Asize", "/Aquant/" +
                        cfg.a_quantize.replace("_relu", "") + "/d")]
                    xmax = ps[p.replace(
                        "/Asize", "/Aquant/" +
                        cfg.a_quantize.replace("_relu", "") + "/xmax")]

                    # ensure that stepsize is in specified range and a power of two
                    d_q = quantize_pow2(
                        clip_scalar(d, cfg.a_stepsize_min, cfg.a_stepsize_max))

                    # ensure that dynamic range is in specified range
                    xmax = clip_scalar(xmax, cfg.a_xmax_min, cfg.a_xmax_max)

                    # compute real `xmax`
                    xmax = F.round(xmax / d_q) * d_q

                    n = F.maximum_scalar(F.ceil(log2(xmax / d_q + 1.0)),
                                         cfg.a_bitwidth_min)
                elif cfg.a_quantize in ['parametric_pow2_xmin_xmax_relu']:
                    # these quantization methods do not have n, so we need to compute it!
                    # parametric quantization
                    xmin = ps[p.replace(
                        "/Asize", "/Aquant/" +
                        cfg.a_quantize.replace("_relu", "") + "/xmin")]
                    xmax = ps[p.replace(
                        "/Asize", "/Aquant/" +
                        cfg.a_quantize.replace("_relu", "") + "/xmax")]

                    # ensure that dynamic ranges are in specified range and a power-of-two
                    xmin = quantize_pow2(
                        clip_scalar(xmin, cfg.a_xmin_min, cfg.a_xmin_max))
                    xmax = quantize_pow2(
                        clip_scalar(xmax, cfg.a_xmax_min, cfg.a_xmax_max))

                    # use ceil rounding
                    n = F.maximum_scalar(
                        F.ceil(log2(log2(xmax / xmin) + 1.) + 1.),
                        cfg.a_bitwidth_min)
                else:
                    raise ValueError("Unknown quantization method {}".format(
                        cfg.a_quantize))
            else:
                # float precision
                n = nn.Variable((), need_grad=False)
                n.d = 32.

            kbytes.append(
                F.reshape(n * ps[p].d / 8. / 1024., (1, ), inplace=False))

    if cfg.target_activation_type == 'max':
        _kbytes = F.max(F.concatenate(*kbytes))
    elif cfg.target_activation_type == 'sum':
        _kbytes = F.sum(F.concatenate(*kbytes))
    else:
        raise ValueError(
            f"Unknown target activation type {cfg.target_activation_type}")
    return num_activations, _kbytes
Example #6
def network_size_weights():
    """
    Return total number of weights and network size (for weights) in KBytes
    """
    kbytes = None
    num_params = None

    # get all parameters
    ps = nn.get_parameters()
    for p in ps:
        if ((p.endswith("quantized_conv/W") or p.endswith("quantized_conv/b")
             or p.endswith("quantized_affine/W")
             or p.endswith("quantized_affine/b"))):
            _num_params = np.prod(ps[p].shape)
            print(f"{p}\t{ps[p].shape}\t{_num_params}")

            if cfg.w_quantize is not None:
                if cfg.w_quantize in [
                        'parametric_fp_b_xmax', 'parametric_fp_d_b',
                        'parametric_pow2_b_xmax', 'parametric_pow2_b_xmin'
                ]:
                    # parametric quantization
                    n_p = p + "quant/" + cfg.w_quantize + "/n"
                    n = F.round(
                        clip_scalar(ps[n_p], cfg.w_bitwidth_min,
                                    cfg.w_bitwidth_max))
                elif cfg.w_quantize == 'parametric_fp_d_xmax':
                    # this quantization method does not have n, so we need to compute it
                    d = ps[p + "quant/" + cfg.w_quantize + "/d"]
                    xmax = ps[p + "quant/" + cfg.w_quantize + "/xmax"]

                    # ensure that stepsize is in specified range and a power of two
                    d_q = quantize_pow2(
                        clip_scalar(d, cfg.w_stepsize_min, cfg.w_stepsize_max))

                    # ensure that dynamic range is in specified range
                    xmax = clip_scalar(xmax, cfg.w_xmax_min, cfg.w_xmax_max)

                    # compute real `xmax`
                    xmax = F.round(xmax / d_q) * d_q

                    # we do not clip to `cfg.w_bitwidth_max` as xmax/d_q could correspond to more than 8 bit
                    n = F.maximum_scalar(F.ceil(log2(xmax / d_q + 1.0) + 1.0),
                                         cfg.w_bitwidth_min)
                elif cfg.w_quantize == 'parametric_pow2_xmin_xmax':
                    # this quantization method does not have n, so we need to compute it
                    xmin = ps[p + "quant/" + cfg.w_quantize + "/xmin"]
                    xmax = ps[p + "quant/" + cfg.w_quantize + "/xmax"]

                    # ensure that minimum dynamic range is in specified range and a power-of-two
                    xmin = quantize_pow2(
                        clip_scalar(xmin, cfg.w_xmin_min, cfg.w_xmin_max))

                    # ensure that maximum dynamic range is in specified range and a power-of-two
                    xmax = quantize_pow2(
                        clip_scalar(xmax, cfg.w_xmax_min, cfg.w_xmax_max))

                    # use ceil to determine bitwidth
                    n = F.maximum_scalar(
                        F.ceil(log2(log2(xmax / xmin) + 1.0) + 1.),
                        cfg.w_bitwidth_min)
                elif cfg.w_quantize == 'fp' or cfg.w_quantize == 'pow2':
                    # fixed quantization
                    n = nn.Variable((), need_grad=False)
                    n.d = cfg.w_bitwidth
                else:
                    raise ValueError(
                        f'Unknown quantization method {cfg.w_quantize}')
            else:
                # float precision
                n = nn.Variable((), need_grad=False)
                n.d = 32.

            if kbytes is None:
                kbytes = n * _num_params / 8. / 1024.
                num_params = _num_params
            else:
                kbytes += n * _num_params / 8. / 1024.
                num_params += _num_params
    return num_params, kbytes
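
# A usage sketch (an assumption, not part of the original snippet): both helpers
# return NNabla variables, so they must be evaluated with forward() before their
# values can be read. Being graph variables, they could also be added to the
# training loss as soft size penalties, which is not shown here.
num_w, w_kbytes = network_size_weights()
num_a, a_kbytes = network_size_activations()
w_kbytes.forward()
a_kbytes.forward()
print(f"weights: {num_w} parameters, {w_kbytes.d} KB")
print(f"activations: {num_a} values, {a_kbytes.d} KB")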
Example #7
def global_average_pooling_1d(x, mask):
    count = F.sum(mask, axis=1)
    global_average_pooled = F.sum(x, axis=1) / count
    return global_average_pooled


x = nn.Variable((batch_size, max_len))
t = nn.Variable((batch_size, 1))
mask = get_mask(x)
with nn.parameter_scope('embedding'):
    h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
h = global_average_pooling_1d(h, mask)
with nn.parameter_scope('output'):
    y = F.sigmoid(PF.affine(h, 1))

accuracy = F.mean(F.equal(F.round(y), t))
loss = F.mean(F.binary_cross_entropy(y, t))

# Create solver.
solver = S.Adam()
solver.set_parameters(nn.get_parameters())

trainer = Trainer(inputs=[x, t],
                  loss=loss,
                  metrics={
                      'cross entropy': loss,
                      'accuracy': accuracy
                  },
                  solver=solver)
trainer.run(train_data_iter, dev_data_iter, epochs=5, verbose=1)
def quantize_pow2(v):
    return 2.**F.round(F.log(F.abs(v)) / np.log(2.))
def parametric_pow2_quantize_b_xmin(x,
                                    sign=True,
                                    with_zero=True,
                                    n_init=8,
                                    n_min=1,
                                    n_max=8,
                                    xmin_init=2**-7,
                                    xmin_min=2**-15,
                                    xmin_max=256,
                                    fix_parameters=False):
    """Parametric version of `pow2_quantize` where the
    bitwidth `n` and the smallest value `xmin` are learnable parameters.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1, ) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2**F.round(F.log(F.abs(v)) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    xmin = get_parameter_or_create("xmin", (),
                                   ConstantInitializer(xmin_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n = n - 1
    if with_zero:
        n = n - 1

    # ensure that minimum dynamic range is in specified range and a power-of-two
    xmin = quantize_pow2(clip_scalar(xmin, xmin_min, xmin_max))

    # compute min/max value that we can represent
    xmax = xmin * (2**((2**n) - 1))

    # broadcast variables to correct size
    xmin = broadcast_scalar(xmin, shape=x.shape)
    xmax = broadcast_scalar(xmax, shape=x.shape)

    # if unsigned, then quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # compute absolute value/sign of input
    ax = F.abs(x)
    sx = F.sign(x)

    if with_zero:
        # prune smallest elements (in magnitude) to zero if they are smaller
        # than `x_min / \sqrt(2)`
        x_threshold = xmin / np.sqrt(2)

        idx1 = F.greater_equal(ax, x_threshold) * F.less(ax, xmin)
        idx2 = F.greater_equal(ax, xmin) * F.less(ax, xmax)
        idx3 = F.greater_equal(ax, xmax)
    else:
        idx1 = F.less(ax, xmin)
        idx2 = F.greater_equal(ax, xmin) * F.less(ax, xmax)
        idx3 = F.greater_equal(ax, xmax)

    # do not backpropagate gradient through indices
    idx1.need_grad = False
    idx2.need_grad = False
    idx3.need_grad = False

    # do not backpropagate gradient through sign
    sx.need_grad = False

    # take care of values outside of dynamic range
    return sx * (xmin * idx1 + quantize_pow2(ax) * idx2 + xmax * idx3)
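
# A quick numeric illustration (an assumption, plain NumPy) of the clipping and
# pruning behaviour above: with `with_zero=True`, magnitudes below
# xmin / sqrt(2) become zero, magnitudes between that threshold and xmin are
# raised to xmin, magnitudes above xmax are clipped to xmax, and everything in
# between is rounded to the nearest power of two. xmin/xmax here are illustrative.
import numpy as np

xmin, xmax = 2.0**-3, 2.0**3
x = np.array([0.05, 0.1, 0.3, 5.0, 20.0])
ax, sx = np.abs(x), np.sign(x)
q = 2.0 ** np.round(np.log2(ax))
out = np.where(ax < xmin / np.sqrt(2), 0.0,
      np.where(ax < xmin, xmin,
      np.where(ax > xmax, xmax, q)))
print(sx * out)  # -> [0.0, 0.125, 0.25, 4.0, 8.0]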
def parametric_pow2_quantize(x,
                             sign=True,
                             with_zero=True,
                             n_init=8,
                             n_min=1,
                             n_max=16,
                             m_init=1,
                             m_min=-8,
                             m_max=8,
                             fix_parameters=False):
    """Parametric version of `pow2_quantize` where the
    bitwidth `n` and dynamic range `m` are learnable parameters.

    Args:
        x(~nnabla.Variable): N-D array as input
        sign (bool): keep sign information during quantization.
        with_zero (bool): quantize small weights to zero.
        n_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bitwidth parameter.
        n_min (int): lower bound for bitwidth.
        n_max (int): upper bound for bitwidth.
        m_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for dynamic range.
        m_min (float): lower bound for dynamic range.
        m_max (float): upper bound for dynamic range.
        fix_parameters (bool): When set to `True`, the learnable bitwidth and
            dynamic range parameters will not be updated.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1, ) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2**F.round(F.log(F.abs(v)) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    m = get_parameter_or_create("m", (),
                                ConstantInitializer(m_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n_q = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n_q = n_q - 1
    if with_zero:
        n_q = n_q - 1

    # ensure that dynamic range is in specified range and an integer
    m_q = F.round(clip_scalar(m, m_min, m_max))

    # compute min/max value that we can represent
    x_max = 2**m_q
    x_min = 2**(m_q - (2**n_q) + 1)

    # broadcast variables to correct size
    x_min = broadcast_scalar(x_min, shape=x.shape)
    x_max = broadcast_scalar(x_max, shape=x.shape)

    # if unsigned, then quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # compute absolute value/sign of input
    ax = F.abs(x)
    sx = F.sign(x)

    if with_zero:
        # prune smallest elements (in magnitude) to zero if they are smaller
        # than `x_min / \sqrt(2)`
        x_threshold = x_min / np.sqrt(2)

        idx1 = F.greater_equal(ax, x_threshold) * F.less(ax, x_min)
        idx2 = F.greater_equal(ax, x_min) * F.less(ax, x_max)
        idx3 = F.greater_equal(ax, x_max)
    else:
        idx1 = F.less(ax, x_min)
        idx2 = F.greater_equal(ax, x_min) * F.less(ax, x_max)
        idx3 = F.greater_equal(ax, x_max)

    # do not backpropagate gradient through indices
    idx1.need_grad = False
    idx2.need_grad = False
    idx3.need_grad = False

    # do not backpropagate gradient through sign
    sx.need_grad = False

    # take care of values outside of dynamic range
    return sx * (x_min * idx1 + quantize_pow2(ax) * idx2 + x_max * idx3)
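
# A worked example of the representable range (an assumption, default settings):
# with sign=True and with_zero=True, n_init=8 leaves n_q = 8 - 1 - 1 = 6 bits
# for the exponent, and m_init=1 gives m_q = 1, so the largest and smallest
# representable magnitudes are 2**1 = 2 and 2**(1 - 2**6 + 1) = 2**-62.
n_q = 8 - 1 - 1          # sign and with_zero each consume one bit
m_q = 1
x_max = 2.0 ** m_q                     # -> 2.0
x_min = 2.0 ** (m_q - 2 ** n_q + 1)    # -> 2**-62
print(x_max, x_min)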
def parametric_fixed_point_quantize(x,
                                    sign=True,
                                    n_init=8,
                                    n_min=2,
                                    n_max=16,
                                    m_init=1,
                                    m_min=-8,
                                    m_max=8,
                                    fix_parameters=False):
    """Parametric version of `fixed_point_quantize` where the
    bitwidth `n` and dynamic range `m` are learnable parameters.

    Args:
        x(~nnabla.Variable): N-D array as input
        sign (bool): keep sign information during quantization.
        n_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bitwidth parameter.
        n_min (int): lower bound for bitwidth.
        n_max (int): upper bound for bitwidth.
        m_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for dynamic range.
        m_min (float): lower bound for dynamic range.
        m_max (float): upper bound for dynamic range.
        fix_parameters (bool): When set to `True`, the learnable bitwidth and
            dynamic range parameters will not be updated.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1, ) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2**F.round(F.log(v) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    m = get_parameter_or_create("m", (),
                                ConstantInitializer(m_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n_q = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n_q = n_q - 1

    # ensure that dynamic range is in specified range
    m_q = clip_scalar(m, m_min, m_max)

    # compute step size from dynamic range and make sure that it is a pow2
    d_q = quantize_pow2((2**m_q) / (2**n_q - 1))

    # compute min/max value that we can represent
    x_max = d_q * (2**n_q - 1)
    if sign:
        x_min = -x_max
    else:
        x_min = nn.Variable((1, ), need_grad=False)
        x_min.d = 0.

    # broadcast variables to correct size
    d_q = broadcast_scalar(d_q, shape=x.shape)
    x_min = broadcast_scalar(x_min, shape=x.shape)
    x_max = broadcast_scalar(x_max, shape=x.shape)

    # apply fixed-point quantization
    return d_q * F.round(F.clip_by_value(x, x_min, x_max) / d_q)
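
# A worked example of the step size (an assumption, default settings): with
# sign=True and n_init=8, n_q = 7, and with m_init=1 the raw step is
# 2**1 / (2**7 - 1) = 2/127 ~ 0.01575, which quantize_pow2 snaps to the nearest
# power of two, 2**-6 = 0.015625; the largest representable value is then
# d_q * (2**7 - 1) = 1.984375.
import numpy as np

n_q, m_q = 8 - 1, 1
d_raw = 2.0**m_q / (2.0**n_q - 1)        # ~ 0.01575
d_q = 2.0 ** np.round(np.log2(d_raw))    # -> 0.015625
x_max = d_q * (2.0**n_q - 1)             # -> 1.984375
print(d_raw, d_q, x_max)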