Example #1
# TF 1.x cell wrappers; the exact import path used in the original file is assumed.
from tensorflow.contrib.rnn import DropoutWrapper, ResidualWrapper


def _create_single_cell(cell_fn, num_units, is_residual=False, is_dropout=False, keep_prob=None):
    """Create a single RNN cell from `cell_fn`, optionally adding dropout and a residual connection."""
    cell = cell_fn(num_units)
    if is_dropout:
        # Apply dropout to the cell inputs with the given keep probability.
        cell = DropoutWrapper(cell, input_keep_prob=keep_prob)
    if is_residual:
        # Add a residual (skip) connection around the cell.
        cell = ResidualWrapper(cell)
    return cell
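For context, a minimal usage sketch of this helper. The call site is hypothetical: the choice of `tf.contrib.rnn.GRUCell`, the layer count, and the hyperparameter values are assumptions, not part of the original snippet.

import tensorflow as tf

# Hypothetical usage: build a 2-layer GRU stack with input dropout on every layer
# and a residual connection around the upper layer.
cells = [
    _create_single_cell(tf.contrib.rnn.GRUCell, num_units=256,
                        is_residual=(layer > 0), is_dropout=True, keep_prob=0.8)
    for layer in range(2)
]
stacked_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)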
Example #2
import tensorflow as tf
# TF 1.x imports; the exact module paths used in the original file are assumed.
from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, OutputProjectionWrapper, ResidualWrapper
from tensorflow.contrib.seq2seq import AttentionWrapper, BahdanauAttention, BasicDecoder, dynamic_decode
# ConcatOutputAndAttentionWrapper and TacotronInferenceHelper are project-specific
# Tacotron wrappers/helpers defined elsewhere in the same repository.


def attention_decoder(inputs,
                      memory,
                      num_units=None,
                      batch_size=1,
                      inputs_length=None,
                      n_mels=80,
                      reduction=1,
                      default_max_iters=200,
                      is_training=True,
                      scope='attention_decoder',
                      reuse=None):
    """
    Applies a GRU-based attention decoder to `inputs` while attending to `memory`.
    :param inputs: A 3d tensor with shape of [N, T', C']. Decoder inputs.
    :param memory: A 3d tensor with shape of [N, T, C]. Outputs of the encoder network.
    :param num_units: An int. Attention size. Defaults to the last dimension of `inputs`.
    :param batch_size: An int. Batch size.
    :param inputs_length: A 1d int tensor. Lengths of the decoder input sequences (used by the training helper).
    :param n_mels: An int. Number of Mel banks to generate.
    :param reduction: An int. Reduction factor. Paper => 2, 3, 5.
    :param default_max_iters: An int. Maximum number of decoding iterations at inference time.
    :param is_training: A boolean. Whether the graph is built for training or inference.
    :param scope: Optional scope for `variable_scope`.
    :param reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
    :return: A tuple of the decoder outputs, a 3d tensor with shape of [N, T_out/r, n_mels*r],
        and the final decoder state.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Parameter setup: during training the TrainingHelper stops after the full
        # target sequence, so no iteration cap is needed; at inference time decoding
        # is capped at `default_max_iters` steps.
        if is_training:
            max_iters = None
        else:
            max_iters = default_max_iters

        if num_units is None:
            num_units = inputs.get_shape().as_list()[-1]

        # Decoder cell
        decoder_cell = tf.nn.rnn_cell.GRUCell(num_units)

        # Attention
        # [N, T_in, attention_depth]
        attention_cell = AttentionWrapper(decoder_cell,
                                          BahdanauAttention(num_units, memory),
                                          alignment_history=True)

        # Concatenate the attention context vector and the RNN cell output into a
        # single vector of size 2*attention_depth.
        # [N, T_in, 2*attention_depth]
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        # [N, T_in, decoder_depth]
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, num_units),
            ResidualWrapper(GRUCell(num_units)),
            ResidualWrapper(GRUCell(num_units))
        ],
                                    state_is_tuple=True)

        # Project onto r mel spectrogram (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, n_mels * reduction)

        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        if is_training:
            # helper = TacotronTrainingHelper(batch_size, n_mels, reduction, inputs)
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=inputs, sequence_length=inputs_length, time_major=False)
        else:
            helper = TacotronInferenceHelper(batch_size, n_mels, reduction)

        decoder = BasicDecoder(output_cell, helper, decoder_init_state)
        # [N, T_out/r, M*r]
        (decoder_outputs, _), final_decoder_state, _ = dynamic_decode(
            decoder, maximum_iterations=max_iters)

    return decoder_outputs, final_decoder_state
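A minimal sketch of how this decoder might be wired up at graph-construction time. The placeholder names (`decoder_inputs`, `encoder_outputs`, `decoder_lengths`) and all hyperparameter values are assumptions for illustration, not taken from the original code.

import tensorflow as tf

batch_size, n_mels, reduction = 32, 80, 5

# Teacher-forced decoder inputs (mel frames grouped by the reduction factor),
# encoder memory, and per-example decoder input lengths.
decoder_inputs = tf.placeholder(tf.float32, [batch_size, None, n_mels * reduction])  # [N, T'/r, M*r]
encoder_outputs = tf.placeholder(tf.float32, [batch_size, None, 256])                # [N, T, C]
decoder_lengths = tf.placeholder(tf.int32, [batch_size])                             # [N]

mel_outputs, final_state = attention_decoder(decoder_inputs,
                                             encoder_outputs,
                                             num_units=256,
                                             batch_size=batch_size,
                                             inputs_length=decoder_lengths,
                                             n_mels=n_mels,
                                             reduction=reduction,
                                             is_training=True)
# mel_outputs: [N, T_out/r, n_mels*r]; at inference, pass is_training=False so the
# TacotronInferenceHelper is used and decoding stops after default_max_iters steps.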