示例#1
0
def drop_path(x):
    """Randomly zero entries of `x` along its first axis and rescale.

    The original note on this function states that it is meant to be the
    same implementation as the PyTorch versions.

    The drop probability is not an argument: it is read from the parameter
    named "drop_rate" (shape (1, 1, 1, 1), need_grad=False) through
    `nn.parameter.get_parameter_or_create`. One random value per entry of
    the first axis is drawn with `F.rand`; where that value is less than
    the drop probability the mask is 0, so the corresponding slice of `x`
    becomes 0. `x` is also divided by `1 - drop_prob`.

    Args:
        x: Input whose first-axis length sizes the mask. The mask has shape
            (x.shape[0], 1, 1, 1), so `x` is presumably 4-D -- TODO confirm
            against callers.

    Returns:
        `x` divided by `1 - drop_prob` and multiplied by the mask.
    """
    mask_shape = (x.shape[0], 1, 1, 1)
    drop_prob = nn.parameter.get_parameter_or_create(
        "drop_rate", shape=(1, 1, 1, 1), need_grad=False)

    # Mask entry is 1 where the random draw is >= the drop probability.
    keep_mask = F.greater_equal(F.rand(shape=mask_shape), drop_prob)

    # Rescale first, then apply the mask.
    rescaled = F.div2(x, 1 - drop_prob)
    return F.mul2(rescaled, keep_mask)
示例#2
0
def absolute_error_backward(inputs):
    """Build the gradients of the absolute error wrt its two inputs.

    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward
        function; read as ``inputs[0] = dy``, ``inputs[1] = x0`` and
        ``inputs[2] = x1``.

    Return:
      tuple of Variable: ``(dx0, dx1)``, the gradients wrt ``x0`` and
      ``x1``, where ``dx1`` is the negation of ``dx0``.
    """
    dy, x0, x1 = inputs[0], inputs[1], inputs[2]

    # Complementary masks: `ge_mask` marks x0 >= x1, `lt_mask` the rest.
    ge_mask = F.greater_equal(x0, x1)
    lt_mask = 1 - ge_mask

    # Both masks are wrapped with `no_grad` before they are used.
    ge_mask = no_grad(ge_mask)
    lt_mask = no_grad(lt_mask)

    # (ge_mask - lt_mask) is +1 where x0 >= x1 and -1 elsewhere.
    dx0 = dy * (ge_mask - lt_mask)
    return dx0, -dx0
def parametric_pow2_quantize_xmin_xmax(x,
                                       sign=True,
                                       with_zero=True,
                                       xmin_init=2**-7,
                                       xmin_min=2**-15,
                                       xmin_max=256,
                                       xmax_init=2**0,
                                       xmax_min=2**-8,
                                       xmax_max=256,
                                       fix_parameters=False):
    """Parametric version of `pow2_quantize` where the
    min value `xmin` and max value `xmax` are learnable parameters.

    Magnitudes of `x` inside `[xmin, xmax)` are replaced by
    `2 ** round(log2(|x|))`; magnitudes at or above `xmax` become `xmax`.
    The handling of magnitudes below `xmin` depends on `with_zero`.
    The result is multiplied by `F.sign(x)`.

    Args:
        x (~nnabla.Variable): N-D array as input.
        sign (bool): When `False`, `x` is passed through `F.relu` before
            quantization.
        with_zero (bool): When `True`, magnitudes in
            `[xmin / sqrt(2), xmin)` become `xmin` and smaller magnitudes
            match none of the range masks, so they come out as zero. When
            `False`, every magnitude below `xmin` becomes `xmin`.
        xmin_init: Constant used to initialize the parameter "xmin".
        xmin_min: Lower clipping bound applied to "xmin".
        xmin_max: Upper clipping bound applied to "xmin".
        xmax_init: Constant used to initialize the parameter "xmax".
        xmax_min: Lower clipping bound applied to "xmax".
        xmax_max: Upper clipping bound applied to "xmax".
        fix_parameters (bool): Forwarded as
            `as_need_grad=not fix_parameters` when the "xmin" and "xmax"
            parameters are fetched.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def _clamp(value, lower, upper):
        # Restrict `value` to the interval [lower, upper].
        raised = F.maximum_scalar(value, lower)
        return F.minimum_scalar(raised, upper)

    def _to_pow2(value):
        # 2 ** round(log2(|value|)), with log2 written as log(.) / log(2).
        log2_abs = F.log(F.abs(value)) / np.log(2.)
        return 2.**F.round(log2_abs)

    def _expand(scalar, shape):
        # Reshape the scalar to all-ones dims, then broadcast to `shape`.
        unit_dims = (1, ) * len(shape)
        reshaped = F.reshape(scalar, unit_dims, inplace=False)
        return F.broadcast(reshaped, shape=shape)

    learnable = not fix_parameters
    xmin = get_parameter_or_create("xmin", (),
                                   ConstantInitializer(xmin_init),
                                   need_grad=True,
                                   as_need_grad=learnable)
    xmax = get_parameter_or_create("xmax", (),
                                   ConstantInitializer(xmax_init),
                                   need_grad=True,
                                   as_need_grad=learnable)

    # ensure that minimum dynamic range is in specified range and a power-of-two
    xmin = _to_pow2(_clamp(xmin, xmin_min, xmin_max))

    # ensure that maximum dynamic range is in specified range and a power-of-two
    xmax = _to_pow2(_clamp(xmax, xmax_min, xmax_max))

    # broadcast both bounds to the shape of the input
    xmin = _expand(xmin, x.shape)
    xmax = _expand(xmax, x.shape)

    # if unsigned, then quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # magnitude and sign of the input
    ax = F.abs(x)
    sx = F.sign(x)

    # mask of the magnitudes that are mapped to `xmin`
    if with_zero:
        # only magnitudes of at least `xmin / sqrt(2)` are mapped to `xmin`;
        # anything smaller matches no mask and is pruned to zero
        x_threshold = xmin / np.sqrt(2)
        below = F.greater_equal(ax, x_threshold) * F.less(ax, xmin)
    else:
        below = F.less(ax, xmin)

    # masks of the in-range magnitudes and of the ones clipped to `xmax`
    inside = F.greater_equal(ax, xmin) * F.less(ax, xmax)
    above = F.greater_equal(ax, xmax)

    # do not backpropagate gradient through the masks or through the sign
    for blocked in (below, inside, above, sx):
        blocked.need_grad = False

    # take care of values outside of dynamic range
    magnitude = xmin * below + _to_pow2(ax) * inside + xmax * above
    return sx * magnitude
def parametric_pow2_quantize(x,
                             sign=True,
                             with_zero=True,
                             n_init=8,
                             n_min=1,
                             n_max=16,
                             m_init=1,
                             m_min=-8,
                             m_max=8,
                             fix_parameters=False):
    """Parametric version of `pow2_quantize` where the
    bitwidth `n` and dynamic range `m` are learnable parameters.

    The representable range is derived from the clipped and rounded
    parameters: `x_max = 2 ** m_q` and
    `x_min = 2 ** (m_q - 2 ** n_q + 1)`, where `n_q` is reduced by one for
    each of `sign` and `with_zero` that is set. Magnitudes of `x` inside
    `[x_min, x_max)` are replaced by `2 ** round(log2(|x|))`; magnitudes at
    or above `x_max` become `x_max`. The result is multiplied by
    `F.sign(x)`.

    Args:
        x(~nnabla.Variable): N-D array as input
        sign (bool): keep sign information during quantization. When
            `False`, `x` is passed through `F.relu` first.
        with_zero (bool): quantize small weights to zero. When `True`,
            magnitudes in `[x_min / sqrt(2), x_min)` become `x_min` and
            smaller ones match no range mask, so they come out as zero.
            When `False`, every magnitude below `x_min` becomes `x_min`.
        n_init: Constant used to initialize the bitwidth parameter "n"
            (it is wrapped in `ConstantInitializer`).
        n_min (int): lower bound for bitwidth.
        n_max (int): upper bound for bitwidth.
        m_init: Constant used to initialize the dynamic range parameter
            "m" (it is wrapped in `ConstantInitializer`).
        m_min (float): lower bound for dynamic range.
        m_max (float): upper bound for dynamic range.
        fix_parameters (bool): Forwarded as
            `as_need_grad=not fix_parameters` when the "n" and "m"
            parameters are fetched.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def _clamp(value, lower, upper):
        # Restrict `value` to the interval [lower, upper].
        raised = F.maximum_scalar(value, lower)
        return F.minimum_scalar(raised, upper)

    def _to_pow2(value):
        # 2 ** round(log2(|value|)), with log2 written as log(.) / log(2).
        log2_abs = F.log(F.abs(value)) / np.log(2.)
        return 2**F.round(log2_abs)

    def _expand(scalar, shape):
        # Reshape the scalar to all-ones dims, then broadcast to `shape`.
        unit_dims = (1, ) * len(shape)
        reshaped = F.reshape(scalar, unit_dims, inplace=False)
        return F.broadcast(reshaped, shape=shape)

    learnable = not fix_parameters
    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=learnable)
    m = get_parameter_or_create("m", (),
                                ConstantInitializer(m_init),
                                need_grad=True,
                                as_need_grad=learnable)

    # ensure that bitwidth is in specified range and an integer;
    # one unit is subtracted for each of `sign` and `with_zero` that is set
    n_q = F.round(_clamp(n, n_min, n_max))
    for reserves_one in (sign, with_zero):
        if reserves_one:
            n_q = n_q - 1

    # ensure that dynamic range is in specified range and an integer
    m_q = F.round(_clamp(m, m_min, m_max))

    # compute min/max value that we can represent
    x_max = 2**m_q
    x_min = 2**(m_q - (2**n_q) + 1)

    # broadcast both bounds to the shape of the input
    x_min = _expand(x_min, x.shape)
    x_max = _expand(x_max, x.shape)

    # if unsigned, then quantize all negative values to zero
    if not sign:
        x = F.relu(x)

    # magnitude and sign of the input
    ax = F.abs(x)
    sx = F.sign(x)

    # mask of the magnitudes that are mapped to `x_min`
    if with_zero:
        # only magnitudes of at least `x_min / sqrt(2)` are mapped to
        # `x_min`; anything smaller matches no mask and is pruned to zero
        x_threshold = x_min / np.sqrt(2)
        below = F.greater_equal(ax, x_threshold) * F.less(ax, x_min)
    else:
        below = F.less(ax, x_min)

    # masks of the in-range magnitudes and of the ones clipped to `x_max`
    inside = F.greater_equal(ax, x_min) * F.less(ax, x_max)
    above = F.greater_equal(ax, x_max)

    # do not backpropagate gradient through the masks or through the sign
    for blocked in (below, inside, above, sx):
        blocked.need_grad = False

    # take care of values outside of dynamic range
    magnitude = x_min * below + _to_pow2(ax) * inside + x_max * above
    return sx * magnitude