Example #1
    def lstm(self, lstm_in, nb_samples, scope_name, test):
        """
        Apply 3-layered LSTM
        """

        if self.unidirectional:
            lstm_hidden_size = self.hidden_size
        else:
            lstm_hidden_size = self.hidden_size // 2
        h = nn.Variable((self.nb_layers, self.nb_of_directions, nb_samples,
                         lstm_hidden_size),
                        need_grad=False)
        c = nn.Variable((self.nb_layers, self.nb_of_directions, nb_samples,
                         lstm_hidden_size),
                        need_grad=False)
        h.data.zero()
        c.data.zero()
        lstm_out, _, _ = PF.lstm(lstm_in,
                                 h,
                                 c,
                                 num_layers=self.nb_layers,
                                 bidirectional=not self.unidirectional,
                                 training=not test,
                                 dropout=0.4,
                                 name=scope_name)
        return lstm_out
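
All of the examples on this page follow the same calling convention: the input to PF.lstm is laid out as (T, B, C), the initial states h and c have shape (num_layers, num_directions, B, hidden_size), and the call returns (output, h_n, c_n). Below is a minimal, self-contained sketch of that convention; the sizes and the scope name 'lstm_sketch' are hypothetical, and executing the graph may require the CUDA/cuDNN extension (the CPU path is skipped in the test of Example #5).

import nnabla as nn
import nnabla.parametric_functions as PF

# Hypothetical sizes: sequence length 10, batch 4, 8 input features,
# one unidirectional layer with hidden size 16.
seq_len, batch, in_feats, hidden = 10, 4, 8, 16
num_layers, num_directions = 1, 1

x = nn.Variable((seq_len, batch, in_feats))
h = nn.Variable((num_layers, num_directions, batch, hidden), need_grad=False)
c = nn.Variable((num_layers, num_directions, batch, hidden), need_grad=False)
h.data.zero()
c.data.zero()

# Build the graph; y has shape (seq_len, batch, num_directions * hidden).
y, h_n, c_n = PF.lstm(x, h, c, num_layers=num_layers, training=False,
                      name='lstm_sketch')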
Example #2
    def call(self, inputs):
        r"""Encoder layer.
        Args:
            inputs (nn.Variable): An input variable of shape (B, T) containing indices
                of character embeddings.

        Returns:
            nn.Variable: Output variable of shape (T, B, C).
        """
        hp = self._hparams
        with nn.parameter_scope('embeddings'):
            val = np.sqrt(6.0 / (len(hp.vocab) + hp.symbols_embedding_dim))
            inputs = PF.embed(
                inputs,
                n_inputs=len(hp.vocab),
                n_features=hp.symbols_embedding_dim,
                initializer=UniformInitializer(lim=(-val,
                                                    val)))  # (B, T, C=512)

        with nn.parameter_scope('ngrams'):
            out = inputs
            for i in range(hp.encoder_n_convolutions):
                with nn.parameter_scope(f'filter_{i}'):
                    out = conv_norm(out,
                                    out_channels=hp.encoder_embedding_dim,
                                    kernel_size=hp.encoder_kernel_size,
                                    padding=(hp.encoder_kernel_size - 1) // 2,
                                    bias=False,
                                    stride=1,
                                    dilation=1,
                                    w_init_gain='relu',
                                    scope='conv_norm',
                                    channel_last=True)  # (B, T, C=512)
                    out = PF.batch_normalization(out,
                                                 batch_stat=self.training,
                                                 axes=[2])
                    out = F.relu(out)
                    if self.training:
                        # dropout only at training time
                        out = F.dropout(out, 0.5)

        with nn.parameter_scope('lstm_encoder'):
            out = F.transpose(out, (1, 0, 2))  # (B, T, C) --> (T, B, C)
            h = F.constant(shape=(2, 2, hp.batch_size,
                                  hp.encoder_embedding_dim // 2))
            c = F.constant(shape=(2, 2, hp.batch_size,
                                  hp.encoder_embedding_dim // 2))
            out, _, _ = PF.lstm(out,
                                h,
                                c,
                                training=self.training,
                                bidirectional=True)

        return out  # (T, B, C=512)
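
The embedding initializer bound used above, val = sqrt(6 / (vocab_size + embedding_dim)), is the Glorot (Xavier) uniform limit for a (vocab_size, embedding_dim) weight matrix. A small sketch of the same computation, with hypothetical sizes standing in for len(hp.vocab) and hp.symbols_embedding_dim:

import numpy as np
from nnabla.initializer import UniformInitializer

vocab_size, embedding_dim = 148, 512               # hypothetical sizes
val = np.sqrt(6.0 / (vocab_size + embedding_dim))  # Glorot/Xavier uniform bound
init = UniformInitializer(lim=(-val, val))         # embedding weights ~ U(-val, val)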
Example #3
    def lstm(self, lstm_in, nb_samples, scope_name):
        '''
        Apply 3-layered LSTM
        '''
        h = F.constant(shape=(self.nb_layers, self.nb_of_directions,
                              nb_samples, self.hidden_size // 2))
        c = F.constant(shape=(self.nb_layers, self.nb_of_directions,
                              nb_samples, self.hidden_size // 2))
        lstm_out, _, _ = PF.lstm(lstm_in,
                                 h,
                                 c,
                                 num_layers=self.nb_layers,
                                 bidirectional=True,
                                 training=not self.test,
                                 dropout=0.4,
                                 name=scope_name)
        return lstm_out
Example #4
def stack_lstm(x, prev_h, prev_c, state_size):
    """
        Stacked LSTMs consisting of 2 layers.
    """
    lstm_size = prev_h[0].shape[1]
    next_h = [
        nn.Variable([1, lstm_size], need_grad=True) for _ in range(len(prev_h))
    ]
    next_c = [
        nn.Variable([1, lstm_size], need_grad=True) for _ in range(len(prev_c))
    ]
    for layer_id, (_h, _c) in enumerate(zip(prev_h, prev_c)):
        inputs = x if layer_id == 0 else next_h[layer_id - 1]
        with nn.parameter_scope(str(layer_id)):
            curr_h, curr_c = PF.lstm(inputs, _h, _c, state_size)
        next_h[layer_id] = curr_h
        next_c[layer_id] = curr_c
    return next_h, next_c
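
A possible way to drive stack_lstm, assuming the PF.lstm call inside it acts as a single-step cell returning an (h, c) pair; lstm_size and the layer count are hypothetical, and the batch size of 1 matches the shapes hard-coded above:

import nnabla as nn

lstm_size, num_layers = 32, 2   # hypothetical sizes
x = nn.Variable([1, lstm_size])
prev_h = [nn.Variable([1, lstm_size]) for _ in range(num_layers)]
prev_c = [nn.Variable([1, lstm_size]) for _ in range(num_layers)]
next_h, next_c = stack_lstm(x, prev_h, prev_c, state_size=lstm_size)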
Example #5
def test_pf_lstm_execution(g_rng, inshape, w0_init, w_init, b_init, num_layers,
                           dropout, bidirectional, with_bias, hidden_size,
                           training, fix_parameters, rng, ctx, func_name):

    with nn.context_scope(ctx):
        if func_name == "LSTM":
            pytest.skip("Not implemented in CPU.")

        num_directions = 2 if bidirectional else 1
        w0_shape = (num_directions, 4, hidden_size, inshape[2] + hidden_size)
        w_shape = (max(1, num_layers - 1), num_directions, 4, hidden_size,
                   num_directions * hidden_size + hidden_size)
        b_shape = (num_layers, num_directions, 4, hidden_size)

        w0_init = process_param_init(w0_init, w0_shape, g_rng)
        w_init = process_param_init(w_init, w_shape, g_rng)
        b_init = process_param_init(b_init, b_shape, g_rng)
        rng = process_rng(rng)

        kw = {}
        insert_if_not_none(kw, 'w0_init', w0_init)
        insert_if_not_none(kw, 'w_init', w_init)
        insert_if_not_none(kw, 'b_init', b_init)
        insert_if_not_default(kw, 'num_layers', num_layers, 1)
        insert_if_not_default(kw, 'dropout', dropout, 0.0)
        insert_if_not_default(kw, 'bidirectional', bidirectional, False)
        insert_if_not_default(kw, 'training', training, True)
        insert_if_not_none(kw, 'rng', rng)
        insert_if_not_default(kw, 'with_bias', with_bias, True)
        insert_if_not_default(kw, 'fix_parameters', fix_parameters, False)

        x = nn.Variable.from_numpy_array(g_rng.randn(*inshape))
        h = nn.Variable.from_numpy_array(
            g_rng.randn(*(num_layers, num_directions, inshape[1],
                          hidden_size)))
        c = nn.Variable.from_numpy_array(
            g_rng.randn(*(num_layers, num_directions, inshape[1],
                          hidden_size)))

        # Check execution
        y, hn, cn = PF.lstm(x, h, c, **kw)
        y.forward()
        if training:
            y.backward()

        # Check values
        # TODO

        # Check args
        assert y.parent.info.type_name == 'LSTM'
        args = y.parent.info.args

        # Check created parameters
        assert y.parent.inputs[0] == x
        assert y.parent.inputs[1] == h
        assert y.parent.inputs[2] == c
        w0 = nn.get_parameters()['lstm/weight_l0']
        assert w0.shape == w0_shape
        assert w0.need_grad
        assert y.parent.inputs[3].need_grad == (not fix_parameters)
        if isinstance(w0_init, np.ndarray):
            assert np.allclose(w0_init, w0.d)
        if num_layers > 1:
            w = nn.get_parameters()['lstm/weight']
            assert w.shape == w_shape
            assert w.need_grad
            assert y.parent.inputs[4].need_grad == (not fix_parameters)
            if isinstance(w_init, np.ndarray):
                assert np.allclose(w_init, w.d)
        if with_bias:
            b = nn.get_parameters()['lstm/bias']
            assert b.shape == b_shape
            assert b.need_grad
            if num_layers > 1:
                assert y.parent.inputs[5].need_grad == (not fix_parameters)
            else:
                assert y.parent.inputs[4].need_grad == (not fix_parameters)
            if isinstance(b_init, np.ndarray):
                assert np.allclose(b_init, b.d)
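
The parameter shapes asserted in this test follow from stacking the four LSTM gates per direction and per layer: the first layer maps (input_size + hidden_size) features to hidden_size for each gate, while deeper layers consume the previous layer's num_directions * hidden_size output concatenated with the hidden state. A quick check of that arithmetic with hypothetical sizes:

# Hypothetical config: 2 bidirectional layers, 28 input features, hidden size 16.
input_size, hidden_size, num_layers, num_directions = 28, 16, 2, 2

w0_shape = (num_directions, 4, hidden_size, input_size + hidden_size)
w_shape = (max(1, num_layers - 1), num_directions, 4, hidden_size,
           num_directions * hidden_size + hidden_size)
b_shape = (num_layers, num_directions, 4, hidden_size)
print(w0_shape)  # (2, 4, 16, 44)
print(w_shape)   # (1, 2, 4, 16, 48)
print(b_shape)   # (2, 2, 4, 16)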
Example #6
    def call(self, memory, decoder_inputs=None):
        r"""Return mel-spectrograms, gate outputs and an attention matrix.

        Args:
            memory (nn.Variable): A 3D tensor of shape (B, T, C).
            decoder_inputs (nn.Variable, optional): A 3D tensor of shape (B, T/r, r*n_mels)
                holding the shifted log mel-spectrogram of the sound files. Defaults to None.

        Returns:
            nn.Variable: The synthetic mel-spectrograms of shape (B, Ty/r, r*n_mels).
            nn.Variable: The gate outputs of shape (B, Ty).
            nn.Variable: The attention matrix of shape (B, Tx, Ty).
        """
        hp = self._hparams
        mel_shape = hp.n_mels * hp.r

        # initialize decoder states
        decoder_input = F.constant(shape=(hp.batch_size, 1, mel_shape))
        decoder_hidden = F.constant(shape=(1, 1, hp.batch_size,
                                           hp.decoder_rnn_dim))
        decoder_cell = F.constant(shape=(1, 1, hp.batch_size,
                                         hp.decoder_rnn_dim))

        # initialize attention states
        attention_weights = F.constant(shape=(hp.batch_size, 1, hp.text_len))
        attention_weights_cum = F.constant(shape=(hp.batch_size, 1,
                                                  hp.text_len))
        attention_context = F.constant(shape=(hp.batch_size, 1,
                                              hp.encoder_embedding_dim))
        attention_hidden = F.constant(shape=(1, 1, hp.batch_size,
                                             hp.attention_rnn_dim))
        attention_cell = F.constant(shape=(1, 1, hp.batch_size,
                                           hp.attention_rnn_dim))

        # store outputs
        mel_outputs, gate_outputs, alignments = [], [], []

        for i in range(hp.mel_len):
            if i > 0:
                decoder_input = (mel_outputs[-1] if decoder_inputs is None else
                                 decoder_inputs[:, i - 1:i, :])
                if decoder_inputs is None:
                    decoder_input = decoder_input[None, ...]
            # decoder of shape (B, 1, prenet_channels=256)
            decoder_input = prenet(decoder_input,
                                   hp.prenet_channels,
                                   is_training=self.training,
                                   scope='prenet')

            with nn.parameter_scope('attention_rnn'):
                # cell_input of shape (B, 1, prenet_channels[-1] + C=768)
                cell_input = F.concatenate(decoder_input,
                                           attention_context,
                                           axis=2)
                _, attention_hidden, attention_cell = PF.lstm(
                    F.transpose(cell_input, (1, 0, 2)),
                    attention_hidden,
                    attention_cell,
                    training=self.training,
                    name='lstm_attention'
                )  # (1, 1, B, attention_hidden), (1, 1, B, attention_hidden)
                if self.training:
                    attention_hidden = F.dropout(attention_hidden,
                                                 hp.p_attention_dropout)

            with nn.parameter_scope('location_attention'):
                attention_weights_cat = F.concatenate(attention_weights,
                                                      attention_weights_cum,
                                                      axis=1)
                attention_context, attention_weights = location_sensitive_attention(
                    F.transpose(attention_hidden[0], (1, 0, 2)),
                    memory,
                    attention_weights_cat,
                    attention_location_kernel_size=hp.attention_location_kernel_size,
                    attention_n_filters=hp.attention_location_n_filters,
                    attention_dim=hp.attention_dim,
                    is_training=self.training,
                    scope='ls_attention')
                attention_weights_cum += attention_weights
                alignments.append(attention_weights)

            with nn.parameter_scope('decoder_rnn'):
                # (1, B, attention_rnn_dim + encoder_embedding_dim)
                inp_decoder = F.concatenate(attention_hidden[0],
                                            F.transpose(
                                                attention_context, (1, 0, 2)),
                                            axis=2)
                _, decoder_hidden, decoder_cell = PF.lstm(
                    inp_decoder,
                    decoder_hidden,
                    decoder_cell,
                    training=self.training,
                    name='lstm_decoder')
                if self.training:
                    decoder_hidden = F.dropout(decoder_hidden,
                                               hp.p_decoder_dropout)

            with nn.parameter_scope('projection'):
                proj_input = F.concatenate(
                    decoder_hidden[0, 0],
                    F.reshape(attention_context, (hp.batch_size, -1),
                              inplace=False),
                    axis=1)  # (B, decoder_rnn_dim + encoder_embedding_dim)
                decoder_output = affine_norm(proj_input,
                                             mel_shape,
                                             base_axis=1,
                                             with_bias=True,
                                             w_init_gain='affine',
                                             scope='affine')
                mel_outputs.append(decoder_output)

            with nn.parameter_scope('gate_prediction'):
                gate_prediction = affine_norm(proj_input,
                                              1,
                                              base_axis=1,
                                              with_bias=True,
                                              w_init_gain='sigmoid',
                                              scope='affine')
                gate_outputs.append(gate_prediction)

        # (B, T2, n_mels*r)
        mel_outputs = F.stack(*mel_outputs, axis=1)
        gate_outputs = F.concatenate(*gate_outputs, axis=1)  # (B, T2)
        alignments = F.concatenate(*alignments, axis=1)  # (B, T1, T2)

        return mel_outputs, gate_outputs, alignments
Example #7
    def __call__(self, x, test=False):

        # x = PF.mean_subtraction(x, base_axis=0)
        if not self.input_is_spectrogram:
            x = Spectrogram(*STFT(x, n_fft=self.n_fft, n_hop=self.n_hop),
                            power=self.power,
                            mono=(self.nb_channels == 1))

        nb_frames, nb_samples, nb_channels, nb_bins = x.shape

        mix = x

        x = x[..., :self.nb_bins]
        x += F.reshape(self.input_mean,
                       shape=(1, 1, 1, self.nb_bins),
                       inplace=False)
        x *= F.reshape(self.input_scale,
                       shape=(1, 1, 1, self.nb_bins),
                       inplace=False)

        with nn.parameter_scope("fc1"):
            x = PF.affine(x, self.hidden_size, base_axis=2)
            x = PF.batch_normalization(x, batch_stat=not test)
            x = F.tanh(x)

        with nn.parameter_scope("lstm"):
            if self.unidirectional:
                lstm_hidden_size = self.hidden_size
            else:
                lstm_hidden_size = self.hidden_size // 2

            h = nn.Variable((self.nb_layers, self.nb_of_directions, nb_samples,
                             lstm_hidden_size),
                            need_grad=False)
            h.d = np.zeros(h.shape)
            c = nn.Variable((self.nb_layers, self.nb_of_directions, nb_samples,
                             lstm_hidden_size),
                            need_grad=False)
            c.d = np.zeros(c.shape)
            lstm_out, _, _ = PF.lstm(x,
                                     h,
                                     c,
                                     num_layers=self.nb_layers,
                                     bidirectional=not self.unidirectional,
                                     training=not test)

        x = F.concatenate(x, lstm_out)  # concatenate along last axis

        with nn.parameter_scope("fc2"):
            x = PF.affine(
                x,
                (self.hidden_size),
                base_axis=2,
            )
            x = PF.batch_normalization(x, batch_stat=not test)
            x = F.relu(x)

        with nn.parameter_scope("fc3"):
            x = PF.affine(
                x,
                (nb_channels, nb_bins),
                base_axis=2,
            )
            x = PF.batch_normalization(x, batch_stat=not test)

        x = x.reshape(
            (nb_frames, nb_samples, nb_channels, self.nb_output_bins))

        # apply output scaling
        x *= F.reshape(self.output_scale,
                       shape=(1, 1, 1, self.nb_output_bins),
                       inplace=False)
        x += F.reshape(self.output_mean,
                       shape=(1, 1, 1, self.nb_output_bins),
                       inplace=False)
        x = F.relu(x) * mix

        return x