Example #1
def test_clip_by_value_forward(seed, shape, dtype):
    def convert(value):
        converter = dtype if dtype in (float,
                                       np.array) else dtype.from_numpy_array
        return converter(value)

    rng = np.random.RandomState(seed)
    x_data = rng.randn(*shape)
    x = nn.Variable.from_numpy_array(x_data)
    if dtype is float:
        min_data = rng.randn()
        max_data = rng.randn()
    else:
        min_data = rng.randn(*shape)
        max_data = rng.randn(*shape)
    min_ = convert(min_data)
    max_ = convert(max_data)

    if dtype is not np.array:
        with nn.auto_forward(True):
            y = F.clip_by_value(x, min_, max_)
        y_ref = ref_clip_by_value(x_data, min_data, max_data)

        if dtype in (nn.Variable, float):
            assert_allclose(y.d, y_ref)
        elif dtype is nn.NdArray:
            assert_allclose(y.data, y_ref)
    else:
        with pytest.raises(TypeError):
            y = F.clip_by_value(x, min_data, max_data)
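The test above relies on a `ref_clip_by_value` helper defined elsewhere in the test module. A minimal NumPy sketch of what such a reference implementation would look like (an assumption, not the original helper):

import numpy as np

def ref_clip_by_value(x, min_, max_):
    # element-wise min(max(x, min_), max_); min_/max_ may be scalars or arrays
    return np.minimum(np.maximum(x, min_), max_)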
Example #2
def parametric_fixed_point_quantize_b_xmax(x,
                                           sign=True,
                                           n_init=8,
                                           n_min=2,
                                           n_max=16,
                                           xmax_init=1,
                                           xmax_min=0.001,
                                           xmax_max=10,
                                           fix_parameters=False):
    """Parametric version of `fixed_point_quantize` where the
    bitwidth `b` and dynamic range `xmax` are learnable parameters.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1, ) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2**F.round(F.log(v) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    xmax = get_parameter_or_create("xmax", (),
                                   ConstantInitializer(xmax_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n = n - 1

    # ensure that dynamic range is in specified range
    xmax = clip_scalar(xmax, xmax_min, xmax_max)

    # compute step size from dynamic range and make sure that it is a pow2
    d = quantize_pow2(xmax / (2**n - 1))

    # compute min/max value that we can represent
    if sign:
        xmin = -xmax
    else:
        xmin = nn.Variable((1, ), need_grad=False)
        xmin.d = 0.

    # broadcast variables to correct size
    d = broadcast_scalar(d, shape=x.shape)
    xmin = broadcast_scalar(xmin, shape=x.shape)
    xmax = broadcast_scalar(xmax, shape=x.shape)

    # apply fixed-point quantization
    return d * F.round(F.clip_by_value(x, xmin, xmax) / d)
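A small plain-NumPy sketch of the final quantization step, using illustrative numbers only (2 effective bits after removing the sign bit, xmax = 1):

import numpy as np

xmax = 1.0
n = 2                                             # effective bits after removing the sign bit
d = 2.0 ** np.round(np.log2(xmax / (2**n - 1)))   # pow2 step size -> 0.25
x = np.array([-1.4, -0.3, 0.1, 0.7, 2.0])
y = d * np.round(np.clip(x, -xmax, xmax) / d)
print(d)   # 0.25
print(y)   # [-1., -0.25, 0., 0.75, 1.]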
Example #3
def policy_network(obs, action_size, name):
    with nn.parameter_scope(name):
        out = PF.affine(obs, 256, name='fc1')
        out = F.relu(out)
        out = PF.affine(out, 256, name='fc2')
        out = F.relu(out)
        mean = PF.affine(out, action_size, name='mean')
        logstd = PF.affine(out, action_size, name='logstd')
        clipped_logstd = F.clip_by_value(logstd, -20, 2)
    return Normal(mean, F.exp(clipped_logstd))
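Clipping `logstd` to [-20, 2] before the exp bounds the standard deviation to roughly [2e-9, 7.4], a common choice in SAC-style policies; a quick check in plain NumPy:

import numpy as np
print(np.exp(-20.0), np.exp(2.0))   # ~2.06e-09, ~7.39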
Example #4
def mask_weight(a, b):
    # differs substantially from the definition in the paper
    merged_mask = F.concatenate(a, b, axis=1)
    summed_mask = F.sum((merged_mask + 1) / 2, axis=1, keepdims=True)
    clipped = F.clip_by_value(summed_mask,
                              F.constant(0, shape=summed_mask.shape),
                              F.constant(1, shape=summed_mask.shape))
    z = clipped * 2 - 1
    mask = (1 - z) / 2
    return mask
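A plain-NumPy sketch of what `mask_weight` computes, assuming the inputs are {-1, +1} masks (the encoding is an assumption inferred from the arithmetic above):

import numpy as np

a = np.array([-1., -1., +1., +1.])   # -1 = off, +1 = on
b = np.array([-1., +1., -1., +1.])
summed = ((a + 1) / 2) + ((b + 1) / 2)   # number of masks that are "on"
z = np.clip(summed, 0, 1) * 2 - 1
mask = (1 - z) / 2
print(mask)   # [1. 0. 0. 0.] -> 1 only where neither mask is active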
Example #5
def test_clip_by_value_forward(seed, shape):
    rng = np.random.RandomState(seed)
    x_data = rng.randn(*shape)
    min_data = rng.randn(*shape)
    max_data = rng.randn(*shape)
    x = nn.Variable.from_numpy_array(x_data)
    min_ = nn.Variable.from_numpy_array(min_data)
    max_ = nn.Variable.from_numpy_array(max_data)
    with nn.auto_forward(True):
        y = F.clip_by_value(x, min_, max_)
    y_ref = ref_clip_by_value(x_data, min_data, max_data)
    assert_allclose(y.d, y_ref)
Example #6
def compute_mel(self, wave):
    hp = self.hparams
    reals, imags = F.stft(wave,
                          window_size=hp.win_length,
                          stride=hp.hop_length,
                          fft_size=hp.n_fft)
    linear = F.pow_scalar(
        F.add2(F.pow_scalar(reals, 2), F.pow_scalar(imags, 2)), 0.5)
    mels = F.batch_matmul(self.basis, linear)
    mels = F.log(F.clip_by_value(mels, 1e-5,
                                 np.inf)).apply(need_grad=False)
    return mels
Example #7
    def build_train_graph(self,
                          x,
                          t=None,
                          dropout=0,
                          noise=None,
                          loss_scaling=None):
        B, C, H, W = x.shape
        if self.randflip:
            x = F.random_flip(x)
            assert x.shape == (B, C, H, W)

        if t is None:
            t = F.randint(low=0,
                          high=self.diffusion.num_timesteps,
                          shape=(B, ))
            # F.randint could return high with very low prob. Workaround to avoid this.
            t = F.clip_by_value(t,
                                min=0,
                                max=self.diffusion.num_timesteps - 0.5)

        loss_dict = self.diffusion.train_loss(model=partial(self._denoise,
                                                            dropout=dropout),
                                              x_start=x,
                                              t=t,
                                              noise=noise)
        assert isinstance(loss_dict, AttrDict)

        # setup training loss
        loss_dict.batched_loss = loss_dict.mse
        if is_learn_sigma(self.model_var_type):
            assert "vlb" in loss_dict
            loss_dict.batched_loss += loss_dict.vlb * 1e-3
            # todo: implement loss aware sampler

        if loss_scaling is not None and loss_scaling > 1:
            loss_dict.batched_loss *= loss_scaling

        # setup flat training loss
        loss_dict.loss = F.mean(loss_dict.batched_loss)
        assert loss_dict.batched_loss.shape == t.shape == (B, )

        # Keep interval values to compute loss for each quantile
        t.persistent = True
        for v in loss_dict.values():
            v.persistent = True

        return loss_dict, t
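The clip after F.randint caps the (very unlikely) out-of-range draw at the last valid timestep; a plain-NumPy illustration of why max = num_timesteps - 0.5 works for integer samples:

import numpy as np

num_timesteps = 1000
t = np.array([0, 500, 999, 1000])      # 1000 would be the rare out-of-range draw
t = np.clip(t, 0, num_timesteps - 0.5)
print(t.astype(np.int32))              # [  0 500 999 999]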
Example #8
def clip_quant_grads():
    ps = nn.get_parameters(grad_only=False)
    for p in ps:
        if ((p.endswith("quantized_conv/W") or p.endswith("quantized_conv/b")
             or p.endswith("quantized_affine/W")
             or p.endswith("quantized_affine/b"))):

            if cfg.w_quantize == 'parametric_fp_d_xmax':
                d = ps[p + "quant/" + cfg.w_quantize + "/d"]
                xmax = ps[p + "quant/" + cfg.w_quantize + "/xmax"]

                d.grad = F.clip_by_value(d.grad, -d.data, d.data)
                xmax.grad = F.clip_by_value(xmax.grad, -d.data, d.data)

            elif cfg.w_quantize == 'parametric_pow2_xmin_xmax':
                xmin = ps[p + "quant/" + cfg.w_quantize + "/xmin"]
                xmax = ps[p + "quant/" + cfg.w_quantize + "/xmax"]

                xmin.grad = F.clip_by_value(xmin.grad, -xmin.data, xmin.data)
                xmax.grad = F.clip_by_value(xmax.grad, -xmin.data, xmin.data)

        if 'Asize' in p.split('/'):
            if cfg.a_quantize == 'parametric_fp_d_xmax_relu':
                d = ps[p.replace(
                    "/Asize",
                    "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/d")]
                xmax = ps[p.replace(
                    "/Asize", "/Aquant/" +
                    cfg.a_quantize.replace("_relu", "") + "/xmax")]

                d.grad = F.clip_by_value(d.grad, -d.data, d.data)
                xmax.grad = F.clip_by_value(xmax.grad, -d.data, d.data)

            elif cfg.a_quantize == 'parametric_pow2_xmin_xmax_relu':
                xmin = ps[p.replace(
                    "/Asize", "/Aquant/" +
                    cfg.a_quantize.replace("_relu", "") + "/xmin")]
                xmax = ps[p.replace(
                    "/Asize", "/Aquant/" +
                    cfg.a_quantize.replace("_relu", "") + "/xmax")]

                xmin.grad = F.clip_by_value(xmin.grad, -xmin.data, xmin.data)
                xmax.grad = F.clip_by_value(xmax.grad, -xmin.data, xmin.data)
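A hedged sketch of where a helper like this typically sits in the training step; `loss` and `solver` below are placeholders for the real training graph and optimizer, not names from the original code:

loss.forward()               # forward pass of the (placeholder) training loss
solver.zero_grad()
loss.backward()              # gradients for all parameters, including d/xmax/xmin
clip_quant_grads()           # clamp the quantization-parameter gradients in place
solver.update()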
Example #9
def compute_mel(wave, basis, hp):
    r"""Compute the mel-spectrogram from the waveform.

    Args:
        wave (nn.Variable): Waveform variable of shape (B, 1, L).
        basis (nn.Variable): Basis for mel-spectrogram computation.
        hp (HParams): Hyper-parameters.

    Returns:
        nn.Variable: Output variable.
    """
    reals, imags = stft(wave,
                        window_size=hp.win_length,
                        stride=hp.hop_length,
                        fft_size=hp.n_fft)
    linear = (reals**2 + imags**2)**0.5
    mels = F.batch_matmul(basis, linear)
    mels = F.log(F.clip_by_value(mels, 1e-5, np.inf))

    return mels
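The clip to [1e-5, inf) keeps the log finite on silent frames; a quick plain-NumPy check of the same pattern:

import numpy as np
x = np.array([0.0, 1e-7, 0.5])
print(np.log(np.clip(x, 1e-5, None)))   # no -inf, thanks to the lower bound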
Example #10
    def __call__(self,
                 batch_size,
                 style_noises,
                 truncation_psi=1.0,
                 return_latent=False,
                 mixing_layer_index=None,
                 dlatent_avg_beta=0.995):

        with nn.parameter_scope(self.global_scope):
            # normalize noise inputs
            for i in range(len(style_noises)):
                style_noises[i] = F.div2(
                    style_noises[i],
                    F.pow_scalar(F.add_scalar(F.mean(style_noises[i]**2.,
                                                     axis=1,
                                                     keepdims=True),
                                              1e-8,
                                              inplace=False),
                                 0.5,
                                 inplace=False))

            # get latent code
            w = [
                mapping_network(style_noises[0],
                                outmaps=self.mapping_network_dim,
                                num_layers=self.mapping_network_num_layers)
            ]
            w += [
                mapping_network(style_noises[1],
                                outmaps=self.mapping_network_dim,
                                num_layers=self.mapping_network_num_layers)
            ]

            dlatent_avg = nn.parameter.get_parameter_or_create(
                name="dlatent_avg", shape=(1, 512))

            # Moving average update of dlatent_avg
            batch_avg = F.mean((w[0] + w[1]) * 0.5, axis=0, keepdims=True)
            update_op = F.assign(
                dlatent_avg, lerp(batch_avg, dlatent_avg, dlatent_avg_beta))
            update_op.name = 'dlatent_avg_update'
            dlatent_avg = F.identity(dlatent_avg) + 0 * update_op

            # truncation trick
            w = [lerp(dlatent_avg, _, truncation_psi) for _ in w]

            # generate output from generator
            constant_bc = nn.parameter.get_parameter_or_create(
                name="G_synthesis/4x4/Const/const",
                shape=(1, 512, 4, 4),
                initializer=np.random.randn(1, 512, 4, 4).astype(np.float32))
            constant_bc = F.broadcast(constant_bc,
                                      (batch_size, ) + constant_bc.shape[1:])

            if mixing_layer_index is None:
                mixing_layer_index_var = F.randint(1,
                                                   len(self.resolutions) * 2,
                                                   (1, ))
            else:
                mixing_layer_index_var = F.constant(val=mixing_layer_index,
                                                    shape=(1, ))
            mixing_switch_var = F.clip_by_value(
                F.arange(0,
                         len(self.resolutions) * 2) - mixing_layer_index_var,
                0, 1)
            mixing_switch_var_re = F.reshape(
                mixing_switch_var, (1, mixing_switch_var.shape[0], 1),
                inplace=False)
            w0 = F.reshape(w[0], (batch_size, 1, w[0].shape[1]), inplace=False)
            w1 = F.reshape(w[1], (batch_size, 1, w[0].shape[1]), inplace=False)
            w_mixed = w0 * mixing_switch_var_re + \
                w1 * (1 - mixing_switch_var_re)

            rgb_output = self.synthesis(w_mixed, constant_bc)

            if return_latent:
                return rgb_output, w_mixed
            else:
                return rgb_output
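A plain-NumPy sketch of how the clip turns the random mixing index into a per-layer 0/1 switch (illustrative sizes only):

import numpy as np

num_layers = 8                  # stands in for len(self.resolutions) * 2
mixing_layer_index = 3
switch = np.clip(np.arange(num_layers) - mixing_layer_index, 0, 1)
print(switch)   # [0 0 0 0 1 1 1 1]
# per the w_mixed line above, layers with switch 0 take w[1] and layers with switch 1 take w[0]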
Example #11
    def p_mean_var(self, model, x_t, t, clip_denoised=True):
        """
        Compute mean and var of p(x_{t-1}|x_t) from model.

        Args:
            model (Callable): A callable that takes x_t and t and predicts noise (and more).
            x_t (nn.Variable): The (B, C, ...) tensor at timestep t (x_t).
            t (nn.Variable): A 1-D tensor of timesteps. The first axis represents the batch size.
            clip_denoised (bool): If True, clip the denoised signal into [-1, 1].

        Returns:
            An AttrDict containing the following items:
                "mean": the mean predicted by model.
                "var": the variance predicted by model (or pre-defined variance).
                "log_var": the log of "var".
                "xstart": the x_0 predicted from x_t and t by model.
        """
        B, C, H, W = x_t.shape
        assert t.shape == (B, )
        pred = model(x_t, t)

        if self.model_var_type == ModelVarType.LEARNED_RANGE:
            assert pred.shape == (B, 2 * C, H, W)
            pred_noise, pred_var_coeff = chunk(pred, num_chunk=2, axis=1)

            min_log = self._extract(
                self.posterior_log_var_clipped, t, x_t.shape)
            max_log = F.log(self._extract(self.betas, t, x_t.shape))

            # pred_var_coeff should be [0, 1]
            v = F.sigmoid(pred_var_coeff)
            model_log_var = v * max_log + (1 - v) * min_log
            model_var = F.exp(model_log_var)
        else:
            # Model only predicts noise
            pred_noise = pred

            model_log_var, model_var = {
                ModelVarType.FIXED_LARGE: lambda: (
                    self._extract(self.log_betas_clipped, t, x_t.shape),
                    self._extract(self.betas_clipped, t, x_t.shape)
                ),
                ModelVarType.FIXED_SMALL: lambda: (
                    self._extract(
                        self.posterior_log_var_clipped, t, x_t.shape),
                    self._extract(self.posterior_var, t, x_t.shape)
                )
            }[self.model_var_type]()

        x_recon = self.predict_xstart_from_noise(
            x_t=x_t, t=t, noise=pred_noise)

        if clip_denoised:
            x_recon = F.clip_by_value(x_recon, -1, 1)

        model_mean, _, _ = self.q_posterior(x_start=x_recon, x_t=x_t, t=t)

        assert model_mean.shape == x_recon.shape == x_t.shape

        assert model_mean.shape == model_var.shape == model_log_var.shape or \
            (model_mean.shape[0] == model_var.shape[0] == model_log_var.shape[0] and model_var.shape[1:] == (
                1, 1, 1) and model_log_var.shape[1:] == (1, 1, 1))

        # returns
        ret = AttrDict()
        ret.mean = model_mean
        ret.var = model_var
        ret.log_var = model_log_var
        ret.xstart = x_recon

        return ret
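In the LEARNED_RANGE branch, the sigmoid output interpolates the log-variance between the clipped posterior variance and beta_t; a small numeric sketch of that interpolation (illustrative values, not taken from a real schedule):

import numpy as np

min_log = np.log(1e-4)        # stands in for posterior_log_var_clipped at t
max_log = np.log(2e-2)        # stands in for log(beta_t)
v = 1.0 / (1.0 + np.exp(-0.3))            # sigmoid of the raw model output
model_log_var = v * max_log + (1 - v) * min_log
print(np.exp(model_log_var))  # a variance between 1e-4 and 2e-2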
Example #12
with nn.parameter_scope('central_bias'):
    central_bias = PF.embed(x_central, vocab_size, 1)
with nn.parameter_scope('context_bias'):
    context_bias = PF.embed(x_context, vocab_size, 1)

dot_product = F.reshape(F.batch_matmul(
    F.reshape(central_embedding, shape=(batch_size, 1, embedding_size)),
    F.reshape(context_embedding, shape=(batch_size, embedding_size, 1))),
                        shape=(batch_size, 1))

prediction = dot_product + central_bias + context_bias

t = nn.Variable((batch_size, 1))
zero = F.constant(0, shape=(batch_size, 1))
one = F.constant(1, shape=(batch_size, 1))
weight = F.clip_by_value(t / 100, zero, one)**0.75
loss = F.sum(weight * ((prediction - F.log(t))**2))

# Create solver.
solver = S.Adam()
solver.set_parameters(nn.get_parameters())

# Create monitor
monitor = M.Monitor('./log')
monitor_loss = M.MonitorSeries("Training loss", monitor, interval=1000)
monitor_valid_loss = M.MonitorSeries("Validation loss", monitor, interval=1)
monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=1000)


# Create updater
def train_data_feeder():
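Earlier in this snippet, the `weight` line is the familiar GloVe-style weighting f(t) = min(t/100, 1)^0.75 applied to the co-occurrence counts; a quick numeric check in plain NumPy:

import numpy as np

t = np.array([1., 10., 100., 500.])      # co-occurrence counts
weight = np.clip(t / 100, 0, 1) ** 0.75
print(weight)   # approximately [0.032, 0.178, 1.0, 1.0]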
Example #13
def parametric_fixed_point_quantize(x,
                                    sign=True,
                                    n_init=8,
                                    n_min=2,
                                    n_max=16,
                                    m_init=1,
                                    m_min=-8,
                                    m_max=8,
                                    fix_parameters=False):
    """Parametric version of `fixed_point_quantize` where the
    bitwidth `n` and dynamic range `m` are learnable parameters.

    Args:
        x(~nnabla.Variable): N-D array as input
        sign (bool): keep sign information during quantization.
        n_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bitwidth parameter.
        n_min (int): lower bound for bitwidth.
        n_max (int): upper bound for bitwidth.
        m_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for dynamic range.
        m_min (float): lower bound for dynamic range.
        m_max (float): upper bound for dynamic range.
        fix_parameters (bool): When set to `True`, the quantization parameters
            `n` and `m` will not be updated.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1, ) * len(shape), inplace=False),
                           shape=shape)

    def quantize_pow2(v):
        return 2**F.round(F.log(v) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    m = get_parameter_or_create("m", (),
                                ConstantInitializer(m_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n_q = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n_q = n_q - 1

    # ensure that dynamic range is in specified range
    m_q = clip_scalar(m, m_min, m_max)

    # compute step size from dynamic range and make sure that it is a pow2
    d_q = quantize_pow2((2**m_q) / (2**n_q - 1))

    # compute min/max value that we can represent
    x_max = d_q * (2**n_q - 1)
    if sign:
        x_min = -x_max
    else:
        x_min = nn.Variable((1, ), need_grad=False)
        x_min.d = 0.

    # broadcast variables to correct size
    d_q = broadcast_scalar(d_q, shape=x.shape)
    x_min = broadcast_scalar(x_min, shape=x.shape)
    x_max = broadcast_scalar(x_max, shape=x.shape)

    # apply fixed-point quantization
    return d_q * F.round(F.clip_by_value(x, x_min, x_max) / d_q)
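A hedged usage sketch, calling the function above under a parameter scope so that `n` and `m` are registered per layer (the scope and variable names are illustrative only):

import nnabla as nn

x = nn.Variable((8, 64))
with nn.parameter_scope("conv1/W_quant"):
    x_q = parametric_fixed_point_quantize(x, sign=True, n_init=8, m_init=1)
# "n" and "m" now live under conv1/W_quant and are trained with the rest of
# the network unless fix_parameters=True.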
Example #14
def clamp(val):
    # only the lower bound matters here; 1e8 is effectively no upper clipping
    return F.clip_by_value(val, min=1e-12, max=1e8)