Example #1
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), 256],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            # Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens', [hp.num_gst, 256],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            # Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            if is_training:
                # Reference encoder
                reference_embedding = reference_encoder(
                    mel_targets,
                    filters=[32, 32, 64, 64, 128, 128],
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    is_training=is_training)

                # Style token layer
                style_embedding = multi_head_attention(
                    num_heads=hp.num_heads,
                    queries=tf.expand_dims(reference_embedding,
                                           axis=1),  # [N, 1, 128]
                    memory=tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                   [batch_size, 1, 1]),  # [N, hp.num_gst, 256]
                    num_units=128)
            else:
                # TODO Add support for reference mode and more effective style control during inference.
                # Randomly select style embedding from gst_tokens for simplicity.
                random_index = tf.random_uniform([batch_size],
                                                 maxval=hp.num_gst,
                                                 dtype=tf.int32)
                style_embedding = tf.nn.embedding_lookup(
                    gst_tokens, random_index)

            # Add style embedding to every text encoder state, applying tanh to
            # compress both encoder state and style embedding to the same scale.
            encoder_outputs += tf.nn.tanh(style_embedding)

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256,
                                  encoder_outputs,
                                  memory_sequence_length=input_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(
                    ZoneoutWrapper(LSTMBlockCell(256),
                                   (0.1, 0.1), is_training)),
                ResidualWrapper(
                    ZoneoutWrapper(LSTMBlockCell(256),
                                   (0.1, 0.1), is_training)),
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            if is_training:
                (decoder_outputs, _
                 ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                     BasicDecoder(output_cell, helper, decoder_init_state))
            else:
                (decoder_outputs, _
                 ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                     BasicDecoder(output_cell, helper, decoder_init_state),
                     maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            tf.logging.info('Initialized Tacotron model. Dimensions: ')
            tf.logging.info('  embedding:               %d' %
                            embedded_inputs.shape[-1])
            tf.logging.info('  prenet out:              %d' %
                            prenet_outputs.shape[-1])
            tf.logging.info('  encoder out:             %d' %
                            encoder_outputs.shape[-1])
            tf.logging.info('  attention out:           %d' %
                            attention_cell.output_size)
            tf.logging.info('  concat attn & out:       %d' %
                            concat_cell.output_size)
            tf.logging.info('  decoder cell out:        %d' %
                            decoder_cell.output_size)
            tf.logging.info('  decoder out (%d frames):  %d' %
                            (hp.outputs_per_step, decoder_outputs.shape[-1]))
            tf.logging.info('  decoder out (1 frame):   %d' %
                            mel_outputs.shape[-1])
            tf.logging.info('  postnet out:             %d' %
                            post_outputs.shape[-1])
            tf.logging.info('  linear out:              %d' %
                            linear_outputs.shape[-1])
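
At inference time, the branch above simply draws one random style token per utterance and adds its tanh-compressed value to every encoder state. Below is a minimal, self-contained sketch of that conditioning step; the shapes and random tensors are illustrative placeholders, not code from this repository.

import tensorflow as tf

N, T_in, D, num_gst = 2, 5, 256, 10
encoder_outputs = tf.random_normal([N, T_in, D])   # stand-in for the encoder CBHG output
gst_tokens = tf.random_normal([num_gst, D])        # stand-in for the learned style tokens

# Pick one token index per utterance and look up its embedding.
random_index = tf.random_uniform([N], maxval=num_gst, dtype=tf.int32)
style_embedding = tf.nn.embedding_lookup(gst_tokens, random_index)  # [N, D]
style_embedding = tf.expand_dims(style_embedding, axis=1)           # [N, 1, D]

# tanh keeps both terms on a comparable scale; broadcasting adds the same
# style vector to every encoder time step.
conditioned_outputs = encoder_outputs + tf.nn.tanh(style_embedding)  # [N, T_in, D]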
Example #2
def embedding_attention_seq2seq(encoder_inputs,
                                decoder_inputs,
                                cell,
                                num_encoder_symbols,
                                num_decoder_symbols,
                                embedding_size,
                                num_heads=1,
                                output_projection=None,
                                feed_previous=False,
                                dtype=dtypes.float32,
                                scope=None):
    """Embedding sequence-to-sequence model with attention.

  This model first embeds encoder_inputs by a newly created embedding (of shape
  [num_encoder_symbols x embedding_size]). Then it runs an RNN to encode
  embedded encoder_inputs into a state vector. It keeps the outputs of this
  RNN at every step to use for attention later. Next, it embeds decoder_inputs
  by another newly created embedding (of shape [num_decoder_symbols x
  embedding_size]). Then it runs an attention decoder, initialized with the last
  encoder state, on embedded decoder_inputs and attending to encoder outputs.

  Args:
    encoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    decoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    cell: RNNCell defining the cell function and size.
    num_encoder_symbols: integer; number of symbols on the encoder side.
    num_decoder_symbols: integer; number of symbols on the decoder side.
    embedding_size: integer; length of the embedding vector for each symbol.
    num_heads: number of attention heads that read from attention_states.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [cell.output_size x num_decoder_symbols] and B has
      shape [num_decoder_symbols]; if provided and feed_previous=True, each
      fed previous output will first be multiplied by W and added B.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
      of decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in embedding_rnn_decoder).
      If False, decoder_inputs are used as given (the standard decoder case).
    dtype: The dtype of the initial RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x num_decoder_symbols] containing the generated outputs.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].
  """
    with vs.variable_scope(scope or "embedding_attention_seq2seq"):
        # Encoder.
        encoder_cell = EmbeddingWrapper(cell, num_encoder_symbols,
                                        embedding_size)
        encoder_outputs, encoder_states = rnn(encoder_cell,
                                              encoder_inputs,
                                              dtype=dtype)

        # First calculate a concatenation of encoder outputs to put attention on.
        top_states = [
            array_ops.reshape(e, [-1, 1, cell.output_size])
            for e in encoder_outputs
        ]
        attention_states = array_ops.concat(top_states, 1)

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        if isinstance(feed_previous, bool):
            return embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, feed_previous)
        else:  # If feed_previous is a Tensor, we construct 2 graphs and use cond.
            outputs1, states1 = embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, True)
            vs.get_variable_scope().reuse_variables()
            outputs2, states2 = embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, False)

            outputs = control_flow_ops.cond(feed_previous, lambda: outputs1,
                                            lambda: outputs2)
            states = control_flow_ops.cond(feed_previous, lambda: states1,
                                           lambda: states2)
            return outputs, states
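
For context, here is a hypothetical call into embedding_attention_seq2seq; the vocabulary sizes, sequence lengths, and GRU cell below are assumptions for illustration, not values from the original project.

import tensorflow as tf

encoder_length, decoder_length = 10, 12
encoder_inputs = [tf.placeholder(tf.int32, shape=[None], name='enc_%d' % i)
                  for i in range(encoder_length)]
decoder_inputs = [tf.placeholder(tf.int32, shape=[None], name='dec_%d' % i)
                  for i in range(decoder_length)]

cell = tf.nn.rnn_cell.GRUCell(256)
outputs, states = embedding_attention_seq2seq(
    encoder_inputs, decoder_inputs, cell,
    num_encoder_symbols=10000, num_decoder_symbols=10000,
    embedding_size=128, num_heads=1,
    feed_previous=False)  # set True (or a bool Tensor) to feed back previous outputs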
Example #3
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   reference_mel=None,
                   reference_weight=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
      reference_mel: float32 Tensor with shape [N, T_ref, M] holding a reference mel
        spectrogram that the reference encoder turns into a style embedding. Defaults to
        mel_targets during training; optional at inference.
      reference_weight: float32 Tensor of weights over the global style tokens, used to
        form the style embedding directly when no reference_mel is given.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            is_teacher_force_generating = mel_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'text_embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            if hp.use_gst:
                # Global style tokens (GST)
                gst_tokens = tf.get_variable(
                    'style_tokens',
                    [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                self.gst_tokens = gst_tokens

            # Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            if is_training:
                reference_mel = mel_targets

            if reference_mel is not None:
                # Reference encoder
                refnet_outputs = reference_encoder(
                    reference_mel,
                    filters=hp.reference_filters,
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    encoder_cell=GRUCell(hp.reference_depth),
                    is_training=is_training)  # [N, 128]
                self.refnet_outputs = refnet_outputs

                if hp.use_gst:
                    # Style attention
                    style_attention = MultiheadAttention(
                        tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                        tf.tanh(
                            tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                    [batch_size, 1, 1])),  # [N, hp.num_gst, 256/hp.num_heads]
                        num_heads=hp.num_heads,
                        num_units=hp.style_att_dim,
                        attention_type=hp.style_att_type)

                    style_weights, style_embeddings = style_attention.multi_head_attention()  # [N, 1, 256]
                else:
                    style_embeddings = tf.expand_dims(refnet_outputs,
                                                      axis=1)  # [N, 1, 128]
            elif reference_weight is not None:
                print("Use specific weight for GST.")
                specific_weights = tf.expand_dims(reference_weight, axis=0)
                specific_weights = tf.tile(specific_weights, [hp.num_heads, 1],
                                           name="specific_weights")
                # specific_weights = tf.tile(specific_weights, [hp.num_heads, 1])
                # specific_weights = tf.nn.softmax(specific_weights, axis=-1, name="specific_weights")
                style_embeddings = tf.matmul(specific_weights,
                                             tf.nn.tanh(gst_tokens))
                style_embeddings = tf.expand_dims(style_embeddings, axis=0)
                style_embeddings = tf.tile(style_embeddings,
                                           [batch_size, 1, 1])
                style_embeddings = tf.reshape(
                    style_embeddings,
                    shape=[batch_size, 1, hp.style_embed_depth])
                style_weights = tf.expand_dims(specific_weights, axis=0)
            else:
                print("Use random weight for GST.")
                random_weights = tf.random_uniform([hp.num_heads, hp.num_gst],
                                                   maxval=1.0,
                                                   dtype=tf.float32)
                random_weights = tf.nn.softmax(random_weights,
                                               axis=-1,
                                               name="random_weights")
                style_embeddings = tf.matmul(random_weights,
                                             tf.nn.tanh(gst_tokens))
                style_embeddings = tf.expand_dims(style_embeddings, axis=0)
                style_embeddings = tf.tile(style_embeddings,
                                           [batch_size, 1, 1])
                style_embeddings = tf.reshape(
                    style_embeddings,
                    shape=[batch_size, 1, hp.style_embed_depth])
                style_weights = tf.expand_dims(random_weights, axis=0)

            # Add style embedding to every text encoder state
            style_embeddings = tf.tile(
                style_embeddings,
                [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
            # encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)
            encoder_outputs = encoder_outputs + style_embeddings

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth,
                                  encoder_outputs,
                                  memory_sequence_length=input_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, hp.rnn_depth),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training or is_teacher_force_generating:
                helper = TacoTrainingHelper(inputs, mel_targets, hp)
            else:
                helper = TacoTestHelper(batch_size, hp)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.encoder_outputs = encoder_outputs
            self.style_weights = style_weights
            self.style_embeddings = style_embeddings
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.reference_mel = reference_mel
            self.reference_weight = reference_weight
            log('Initialized Tacotron model. Dimensions: ')
            log('  text embedding:          %d' % embedded_inputs.shape[-1])
            log('  style embedding:         %d' % style_embeddings.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
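
The reference_weight and random-weight branches above both build the style embedding as a weighted sum of tanh-compressed tokens, one weight row per attention head. A minimal sketch of that combination follows, with illustrative sizes rather than the project's hparams.

import tensorflow as tf

num_heads, num_gst, token_depth = 4, 10, 64   # token_depth = style_embed_depth // num_heads
gst_tokens = tf.random_normal([num_gst, token_depth])

# One softmax-normalized weight row over the tokens per head.
weights = tf.nn.softmax(tf.random_uniform([num_heads, num_gst]), axis=-1)
style_embedding = tf.matmul(weights, tf.nn.tanh(gst_tokens))            # [num_heads, token_depth]
style_embedding = tf.reshape(style_embedding, [1, 1, num_heads * token_depth])
# The [1, 1, style_embed_depth] result is then tiled over the batch and time
# axes and added to the encoder outputs, as in the code above.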
Example #4
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), 256],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            # Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, 256),
                    #ResidualWrapper(GRUCell(256)),
                    #ResidualWrapper(GRUCell(256))
                    ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
                    ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1))
                ],
                state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
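
The decoder above predicts r = outputs_per_step frames per RNN step, and the reshape unpacks them into individual mel frames. A tiny sketch of just that reshape, using synthetic shapes:

import numpy as np

N, decoder_steps, num_mels, r = 2, 3, 80, 5        # r corresponds to hp.outputs_per_step
decoder_outputs = np.zeros([N, decoder_steps, num_mels * r], dtype=np.float32)  # [N, T_out/r, M*r]
mel_outputs = decoder_outputs.reshape([N, -1, num_mels])                        # [N, T_out, M]
print(mel_outputs.shape)  # (2, 15, 80): decoder_steps * r frames per utterance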
Example #5
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):
        is_training = linear_targets is not None  # False when linear_targets is left at its default (None)
        self.is_randomly_initialized = is_randomly_initialized  # defaults to False

        with tf.variable_scope('inference') as scope:  # group everything under the 'inference' scope
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]  # the first dimension is the sample count (batch size); the second is the input feature count

            # Embeddings
            # Shared variable named 'embedding' with shape [number of Hangul symbols, embedding size].
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,  # dtype of the returned tensor
                initializer=tf.truncated_normal_initializer(stddev=0.5))  # initial weights
            # [N, T_in, embedding_size]
            char_embedded_inputs = \
                    tf.nn.embedding_lookup(char_embed_table, inputs)  # returns rows of char_embed_table indexed by inputs

            self.num_speakers = num_speakers
            if self.num_speakers > 1:  # multi-speaker case
                if hp.speaker_embedding_size != 1:  # speaker_embedding_size in hparams is not 1
                    # Shared variable named 'speaker_embedding' with shape
                    # [num_speakers, speaker_embedding_size].
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))  # initial weights
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id
                    )  # rows of speaker_embed_table indexed by speaker_id (a Tensor)
##############################################################  (further explanation needed)
                if hp.model_type == 'deepvoice':  # DeepVoice-style model
                    if hp.speaker_embedding_size == 1:  # speaker_embedding_size in hparams is 1
                        # get_embed(inputs, num_inputs, embed_size, name) returns an
                        # embedding-table row selected by speaker_id.
                        before_highway = get_embed(
                            speaker_id, self.num_speakers,
                            hp.enc_prenet_sizes[-1], "before_highway")
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
##############################################################
                    else:  # speaker_embedding_size in hparams is not 1
                        # Dense layer: input x, dim units, softsign activation.
                        # tf.layers.dense(inputs, units, activation): inputs is the previous
                        # layer, units is the size of this layer, and activation is an
                        # activation function such as sigmoid or ReLU; dense builds a fully
                        # connected hidden layer.  https://bcho.tistory.com/1196
                        # (lambda example: (lambda x, y: x + y)(10, 20) => 30)
                        deep_dense = lambda x, dim: \
                                tf.layers.dense(x, dim, activation=tf.nn.softsign)

                        before_highway = deep_dense(
                            speaker_embed, hp.enc_prenet_sizes[-1])  # units: hp.enc_prenet_sizes[-1] (default 128)
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)  # units: hp.enc_rnn_size * 2 (default 128 * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)  # units: hp.attention_state_size (default 256)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]  # list of hp.dec_layer_num layers (default 2)

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':  # model type is 'simple' rather than 'deepvoice'
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None  # no speaker-conditioned layers
                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type))  # error for an unrecognized multi-speaker model type
            else:  # single speaker
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None  # no speaker-conditioned layers
            ##############
            # Encoder (turns special characters and Hangul jamo text into numbers)
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet')

            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention (important!)
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            dec_prenet_outputs = DecoderPrenetWrapper(
                GRUCell(hp.attention_state_size), speaker_embed, is_training,
                hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(hp.attention_size,
                                                        encoder_outputs,
                                                        normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs,
                                                     scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(hp.attention_size,
                                                    encoder_outputs,
                                                    shift_width=shift_width)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            attention_cell = AttentionWrapper(
                dec_prenet_outputs,
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell, embed_to_concat=speaker_embed)

            # Decoder (layers specified bottom to top):
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
            for _ in range(hp.dec_layer_num):
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(" [!] Shape {} and {} should be equal". \
                                format(shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.reduction_factor,
                                            rnn_decoder_test_mode)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(
                            BasicDecoder(output_cell, helper, decoder_init_state),
                            maximum_iterations=hp.max_iters)

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = \
                        tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
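
When hp.speaker_embedding_size is larger than 1, the deepvoice branch above derives every initial RNN state from the speaker embedding through softsign dense layers. Here is a self-contained sketch of that conditioning; the sizes stand in for the hparams and are assumptions, not the project's defaults.

import tensorflow as tf

num_speakers, speaker_embedding_size = 4, 16
enc_rnn_size, attention_state_size = 128, 256
dec_rnn_size, dec_layer_num = 256, 2

speaker_id = tf.placeholder(tf.int32, [None])
speaker_embed_table = tf.get_variable(
    'speaker_embedding_demo', [num_speakers, speaker_embedding_size],
    initializer=tf.truncated_normal_initializer(stddev=0.5))
speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

deep_dense = lambda x, dim: tf.layers.dense(x, dim, activation=tf.nn.softsign)
before_highway = deep_dense(speaker_embed, 128)                      # enc_prenet_sizes[-1]
encoder_rnn_init_state = deep_dense(speaker_embed, enc_rnn_size * 2)
attention_rnn_init_state = deep_dense(speaker_embed, attention_state_size)
decoder_rnn_init_states = [deep_dense(speaker_embed, dec_rnn_size)
                           for _ in range(dec_layer_num)]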
Example #6
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):

        is_training2 = linear_targets is not None  # this also becomes True at test time -- is that intended???
        is_training = not rnn_decoder_test_mode

        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic borrowed from a transformer implementation
                # <PAD> (index 0) keeps a fixed all-zero embedding that training never updates;
                # i.e. the first row (<PAD>) of the variable created above is effectively unused.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(
                            speaker_id, self.num_speakers,
                            hp.enc_prenet_sizes[-1], "before_highway"
                        )  # 'enc_prenet_sizes': [f(256), f(128)]
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:
                        deep_dense = lambda x, dim: tf.layers.dense(
                            x, dim, activation=tf.nn.softsign
                        )  # softsign: x / (abs(x) + 1)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    # The simple model feeds speaker_embed into DecoderPrenetWrapper and
                    # ConcatOutputAndAttentionWrapper and concatenates it there.
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type))
            else:
                # case self.num_speakers == 1
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None  # init state of the bidirectional GRU
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet'
            )  # 'enc_prenet_sizes': [f(256), f(128)],  dropout_prob = 0.5
            # ==> (N, T_in, 128)

            # enc_rnn_size = 128
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            # single: attention_size = 128
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_mon_norm_hccho':
                attention_mechanism = BahdanauMonotonicAttention_hccho(
                    hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unknown attention type: {}".format(
                    hp.attention_type))

            # Combine the DecoderPrenetWrapper and attention_mechanism into an AttentionWrapper.
            # carpedm20 took the TensorFlow source and re-implemented AttentionWrapper, whereas Keith Ito simply used TensorFlow's AttentionWrapper.
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_state_size),
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False
            )  # Note output_attention=False; attention_layer_size is not set, so attention equals the context vector.

            # attention_state_size = 256
            dec_prenet_outputs = DecoderPrenetWrapper(
                attention_cell, speaker_embed, is_training,
                hp.dec_prenet_sizes,
                hp.dropout_prob)  # dec_prenet_sizes =  [f(256), f(128)]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]

            # From the AttentionWrapperState that dec_prenet_outputs passes on (attention,
            # cell_state, ...), concatenate attention with the output and emit that as the new
            # output. Since the output equals the cell_state, this is concat [ output(=cell_state) | attention ].
            concat_cell = ConcatOutputAndAttentionWrapper(
                dec_prenet_outputs, embed_to_concat=speaker_embed
            )  # builds the new output as concat(output, attention, speaker_embed)

            # Decoder (layers specified bottom to top):  dec_rnn_size = 256
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                     ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
            for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor
            )  # could be modified to also emit a stop token, e.g. (hp.num_mels + 1) * hp.reduction_factor???
            decoder_init_state = output_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # zero_state here already includes the initial_cell_state passed to the AttentionWrapper above

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied: it was passed as the AttentionWrapper's initial_cell_state)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training2:
                # rnn_decoder_test_mode = True if test mode,  train mode에서는 False
                helper = TacoTrainingHelper(
                    inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                    rnn_decoder_test_mode)  # inputs is only used to compute batch_size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # max_iters=200

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            # Because the MultiRNNCell has 3 layers, final_decoder_state is a tuple of length 3; the attention state is final_decoder_state[0].
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
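
The comments above hinge on output_attention=False: the AttentionWrapper then returns the RNN cell output, while the context vector is kept in the wrapper state. A minimal sketch of the concatenation idea (single-speaker variant; the class name is illustrative, and the repository's ConcatOutputAndAttentionWrapper additionally appends speaker_embed):

import tensorflow as tf

class ConcatOutputAndAttentionSketch(tf.nn.rnn_cell.RNNCell):
    """Returns concat([cell_output, state.attention]) at every decoder step."""

    def __init__(self, cell):
        super(ConcatOutputAndAttentionSketch, self).__init__()
        self._cell = cell  # an AttentionWrapper built with output_attention=False

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        # cell output plus the attention (context) vector kept in the state
        return self._cell.output_size + self._cell.state_size.attention

    def zero_state(self, batch_size, dtype):
        return self._cell.zero_state(batch_size, dtype)

    def __call__(self, inputs, state, scope=None):
        output, new_state = self._cell(inputs, state)
        return tf.concat([output, new_state.attention], axis=-1), new_state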
Exemplo n.º 7
0
    def initialize(self,
                   inputs,
                   inputs_jp,
                   input_lengths,
                   input_jp_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      inputs_jp: Tensor with shape [N, T_in_jp] holding the Japanese-side inputs, analogous to inputs.
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      input_jp_lengths: int32 Tensor with shape [N] where values are the lengths of each sequence in inputs_jp.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings

            # [N, T_in, embed_depth=256]

            # Encoder
            #prenet_outputs = prenet(inputs, is_training, hp.prenet_depths)    # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                inputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)
            # print(inputs_jp.eval)
            # print(inputs.eval)
            # print(input_jp_lengths.eval)
            # print(input_lengths.eval)
            encoder_outputs_jp = encoder_cbhg_jp(
                inputs_jp,
                input_jp_lengths,  # use the Japanese sequence lengths for the Japanese encoder
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)
            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(hp.attention_depth), is_training,
                                     hp.prenet_depths),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Attention JP
            attention_cell_jp = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(hp.attention_depth), is_training,
                                     hp.prenet_depths),
                BahdanauAttention(hp.attention_depth, encoder_outputs_jp),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell_jp = ConcatOutputAndAttentionWrapper(
                attention_cell_jp)  # [N, T_in, 2*attention_depth=512]
            # The block above is duplicated and adapted for the Japanese feature input; the new concat_cell is named
            # concat_cell_jp, and one extra line below joins the two outputs.

            # Concatenate the outputs of the two branches (Chinese + Japanese).
            print(type(concat_cell))
            print(concat_cell_jp.output_size)
            # NOTE: concat_cell and concat_cell_jp are RNNCell objects, not tensors, so this tf.concat
            # will not run as written; see the wrapper sketch after this example for one way to combine
            # the two attention branches.
            encoder_out = tf.concat([concat_cell, concat_cell_jp], axis=-1)

            #connect chinese_outputs and japanese_outputs

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(encoder_out, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                mel_outputs,
                hp.num_mels,
                is_training,  # [N, T_out, postnet_depth=256]
                hp.postnet_depth)
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.inputs_jp = inputs_jp
            self.input_lengths = input_lengths
            self.input_jp_lengths = input_jp_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')

            #log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  encoder out jp:          %d' % encoder_outputs_jp.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  attention out jp:        %d' % attention_cell_jp.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  concat attn & out jp:    %d' % concat_cell_jp.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
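
As noted above, tf.concat cannot join the two attention branches directly because they are RNNCell objects rather than tensors. One way to express the intended Chinese+Japanese fusion is a small wrapper cell that runs both branches on the same decoder input and concatenates their per-step outputs. This is only a hedged sketch: the class name and state handling are assumptions, and the alignment-history bookkeeping downstream would need to be adapted.

import tensorflow as tf

class DualAttentionConcatSketch(tf.nn.rnn_cell.RNNCell):
    """Runs two attention branches on the same input and concatenates their outputs."""

    def __init__(self, cell_cn, cell_jp):
        super(DualAttentionConcatSketch, self).__init__()
        self._cell_cn = cell_cn   # e.g. concat_cell
        self._cell_jp = cell_jp   # e.g. concat_cell_jp

    @property
    def state_size(self):
        return (self._cell_cn.state_size, self._cell_jp.state_size)

    @property
    def output_size(self):
        return self._cell_cn.output_size + self._cell_jp.output_size

    def zero_state(self, batch_size, dtype):
        return (self._cell_cn.zero_state(batch_size, dtype),
                self._cell_jp.zero_state(batch_size, dtype))

    def __call__(self, inputs, state, scope=None):
        out_cn, state_cn = self._cell_cn(inputs, state[0])
        out_jp, state_jp = self._cell_jp(inputs, state[1])
        return tf.concat([out_cn, out_jp], axis=-1), (state_cn, state_jp)

Under this assumption, something like OutputProjectionWrapper(DualAttentionConcatSketch(concat_cell, concat_cell_jp), hp.decoder_depth) would stand in for the tf.concat line above.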
Exemplo n.º 8
0
def embedding_attention_seq2seq_context(encoder_inputs,
                                        decoder_inputs,
                                        cell,
                                        num_encoder_symbols,
                                        num_decoder_symbols,
                                        embedding_size,
                                        fc_context_length,
                                        num_heads=1,
                                        output_projection=None,
                                        feed_previous=False,
                                        dtype=dtypes.float32,
                                        scope=None):
    """A seq2seq architecture with two encoders, one for context, one for input DA. The decoder
    uses twice the cell size. Code adapted from TensorFlow examples."""

    with vs.variable_scope(scope or "embedding_attention_seq2seq_context"):

        # split context and real inputs into separate vectors

        # Context input is not a sequence anymore.

        context_inputs = encoder_inputs[0:fc_context_length]
        encoder_inputs = encoder_inputs[fc_context_length:]

        # build separate encoders
        encoder_cell = EmbeddingWrapper(cell, num_encoder_symbols,
                                        embedding_size)
        with vs.variable_scope("context_rnn") as scope:
            temp = tf.reshape(context_inputs, [-1, fc_context_length])  # previously hard-coded to 30
            context_states = tf.cast(tf.layers.dense(temp,
                                                     units=cell.output_size),
                                     dtype=dtype)
            context_outputs = tf.nn.relu(context_states, name="context_output")
#            context_outputs, context_states = tf06s2s.rnn(
#                encoder_cell, context_inputs, dtype=dtype, scope=scope)
        with vs.variable_scope("input_rnn") as scope:
            encoder_outputs, encoder_states = tf06s2s.rnn(encoder_cell,
                                                          encoder_inputs,
                                                          dtype=dtype,
                                                          scope=scope)

        # concatenate outputs & states
        # adding positional arguments and concatenating output, cell and hidden states
#        encoder_outputs = [array_ops.concat([co, eo], axis=1, name="context-and-encoder-output")
#                           for co, eo in zip(context_outputs, encoder_outputs)]
#        encoder_states=[(array_ops.concat([c1, c2], axis=1), array_ops.concat([h1, h2], axis=1))
#         for (c1, h1), (c2, h2) in zip(context_states, encoder_states)]

        # Add activations of the FC context encoder as a new output
        encoder_outputs.insert(0, context_outputs)
        encoder_states.insert(0, (context_states, context_states))

        # calculate a concatenation of encoder outputs to put attention on.
        top_states = [
            array_ops.reshape(e, [-1, 1, cell.output_size])
            for e in encoder_outputs
        ]  # cell.output_size * 2
        # use keyword arguments: with positional arguments the axis was being interpreted as the values
        attention_states = array_ops.concat(axis=1, values=top_states)

        # change the decoder cell to accommodate wider input
        # TODO this will work for BasicLSTMCell and GRUCell, but not for others
        #input_size is not a field in TF 1.0.1
        cell = type(cell)(num_units=(cell.output_size))  #cell.output_size * 2

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        if isinstance(feed_previous, bool):
            return tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, feed_previous)
        else:  # If feed_previous is a Tensor, we construct 2 graphs and use cond.
            outputs1, states1 = tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, True)
            vs.get_variable_scope().reuse_variables()
            outputs2, states2 = tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, False)

            outputs = control_flow_ops.cond(feed_previous, lambda: outputs1,
                                            lambda: outputs2)
            states = control_flow_ops.cond(feed_previous, lambda: states1,
                                           lambda: states2)
            return outputs, states
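
For reference, the top_states / attention_states lines above simply stack the per-step encoder outputs into a single [batch, steps, size] attention memory. A toy shape check under assumed sizes:

import tensorflow as tf

# hypothetical sizes: 4 sequences, 10 encoder steps, 128-dim outputs
step_outputs = [tf.zeros([4, 128]) for _ in range(10)]
top_states = [tf.reshape(o, [-1, 1, 128]) for o in step_outputs]
attention_states = tf.concat(axis=1, values=top_states)
print(attention_states.shape)  # (4, 10, 128)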
Exemplo n.º 9
0
def full_decoding(inputs,
                  encoder_outputs,
                  is_training,
                  mel_targets,
                  batch_size=32):
    # prenet
    # attention
    # concat attention
    # gru gru
    # output

    # Inside cell_outputs, the state contains the attentions.
    # We make DecoderPrenetWrapper and ConcatAttentionOutputWrapper RNNCells so that
    # later on a helper function can do everything for us right away.
    cell_outputs = tf.contrib.seq2seq.AttentionWrapper(
        DecoderPrenetWrapper(GRUCell(256), is_training=is_training),
        tf.contrib.seq2seq.BahdanauAttention(256,
                                             encoder_outputs,
                                             normalize=True,
                                             probability_fn=tf.nn.softmax),
        alignment_history=True,
        output_attention=False)
    """
        I learn that in AttentionWrapper, if we set alignment_history to true, we will get all the previous alignments
        If we set the output_attention to false, we will get an output for the cell_output instead of the attention, but 
        the attention will be stored in the state.
        
        We do this so we can combine the output the attention.
    """

    output_attention_cell = ConcatAttentionOutputWrapper(cell_outputs)

    decoding_cells = MultiRNNCell([
        OutputProjectionWrapper(output_attention_cell, 256),
        ResidualWrapper(GRUCell(256)),
        ResidualWrapper(GRUCell(256))
    ])

    decoder_outputs = OutputProjectionWrapper(decoding_cells,
                                              hp.num_mels * hp.r_frames)
    decoder_initial_states = decoder_outputs.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

    if is_training:
        helper = TrainingHelper(inputs, mel_targets, hp.num_mels, hp.r_frames)
    else:
        helper = TestingHelper(batch_size=batch_size,
                               output_dim=hp.num_mels,
                               r=hp.r_frames)

    (final_decoder_outputs, sample_ids), decoder_states, _ = dynamic_decode(
        BasicDecoder(decoder_outputs, helper, decoder_initial_states))

    mel_outputs = tf.reshape(final_decoder_outputs,
                             shape=[batch_size, -1, hp.num_mels])
    post_outputs = cbhg(mel_outputs,
                        16,
                        projections=[256, hp.num_mels],
                        scope='post_cbhg')
    linear_outputs = tf.layers.dense(post_outputs, hp.num_freq // 2 + 1)  # integer number of frequency bins

    return mel_outputs, linear_outputs
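
The reshape from decoder output [N, T_out/r, num_mels*r] back to per-frame mels [N, T_out, num_mels] relies on the r frames of each step being laid out contiguously along the last axis. A small numpy check of that assumption (toy sizes only):

import numpy as np

N, T, M, r = 2, 6, 4, 3                          # toy sizes (r plays the role of hp.r_frames)
frames = np.arange(N * T * M).reshape(N, T, M)   # per-frame "ground truth"
decoder_out = frames.reshape(N, T // r, M * r)   # what the decoder emits per step
recovered = decoder_out.reshape(N, -1, M)        # the mel_outputs reshape above
assert np.array_equal(recovered, frames)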
Exemplo n.º 10
0
	def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):
		"""
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
		with tf.variable_scope('inference') as scope:
			is_training = mel_targets is not None and not gta
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams

			# Embeddings
			embedding_table = tf.get_variable(
				'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
				initializer=tf.contrib.layers.xavier_initializer())
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

			#Encoder
			enc_conv_outputs = enc_conv_layers(embedded_inputs, is_training,
				kernel_size=hp.enc_conv_kernel_size, channels=hp.enc_conv_channels)    
			#Paper doesn't specify what to do with final encoder state
			#So we will simply drop it
			encoder_outputs, encoder_states = bidirectional_LSTM(enc_conv_outputs, input_lengths,
				'encoder_LSTM', is_training=is_training, size=hp.encoder_lstm_units,
				zoneout=hp.zoneout_rate)     

			#Attention
			attention_cell = AttentionWrapper(
				DecoderPrenetWrapper(ZoneoutLSTMCell(hp.attention_dim, is_training, #Separate LSTM for attention mechanism
					zoneout_factor_cell=hp.zoneout_rate,							#based on original tacotron architecture
					zoneout_factor_output=hp.zoneout_rate), is_training),
				LocationSensitiveAttention(hp.attention_dim, encoder_outputs),
				alignment_history=True,
				output_attention=False,
				name='attention_cell')

			#Concat Prenet output with context vector
			concat_cell = ConcatPrenetAndAttentionWrapper(attention_cell)

			#Decoder layers (attention pre-net + 2 unidirectional LSTM Cells)
			decoder_cell = unidirectional_LSTM(concat_cell, is_training,
				layers=hp.decoder_layers, size=hp.decoder_lstm_units,
				zoneout=hp.zoneout_rate)

			#Concat LSTM output with context vector
			concat_decoder_cell = ConcatLSTMOutputAndAttentionWrapper(decoder_cell)

			#Projection to mel-spectrogram dimension (times number of outputs per step) (linear transformation)
			output_cell = OutputProjectionWrapper(concat_decoder_cell, hp.num_mels * hp.outputs_per_step)

			#Define the helper for our decoder
			if (is_training or gta) == True:
				self.helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
			else:
				self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

			#We'll only limit decoder time steps during inference (consult hparams.py to modify the value)
			max_iterations = None if is_training else hp.max_iters

			#initial decoder state
			decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

			#Decode
			(decoder_output, _), final_decoder_state, self.stop_token_loss = dynamic_decode(
				CustomDecoder(output_cell, self.helper, decoder_init_state),
				impute_finished=True, #Cut out padded parts (enabled)
				maximum_iterations=max_iterations)

			# Reshape outputs to be one output per entry 
			decoder_output = tf.reshape(decoder_output, [batch_size, -1, hp.num_mels])

			#Compute residual using post-net
			residual = postnet(decoder_output, is_training,
				kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels)

			#Project residual to same dimension as mel spectrogram
			projected_residual = projection(residual,
				shape=hp.num_mels,
				scope='residual_projection')

			#Compute the mel spectrogram
			mel_outputs = decoder_output + projected_residual

			#Grab alignments from the final decoder state
			alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_output = decoder_output
			self.alignments = alignments
			self.mel_outputs = mel_outputs
			self.mel_targets = mel_targets
			log('Initialized Tacotron model. Dimensions: ')
			log('  embedding:                {}'.format(embedded_inputs.shape))
			log('  enc conv out:             {}'.format(enc_conv_outputs.shape))
			log('  encoder out:              {}'.format(encoder_outputs.shape))
			log('  decoder out:              {}'.format(decoder_output.shape))
			log('  residual out:             {}'.format(residual.shape))
			log('  projected residual out:   {}'.format(projected_residual.shape))
			log('  mel out:                  {}'.format(mel_outputs.shape))
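
ZoneoutLSTMCell above regularizes the recurrent state with zoneout. A minimal sketch of that idea, assuming the standard formulation from Krueger et al. (the repository's cell may differ in details):

import tensorflow as tf

def zoneout_sketch(prev_state, new_state, rate, is_training):
    """With probability `rate`, each state unit keeps its previous value."""
    if is_training:
        keep_prev = tf.cast(
            tf.random_uniform(tf.shape(new_state)) < rate, new_state.dtype)
        return keep_prev * prev_state + (1.0 - keep_prev) * new_state
    # at inference time use the expected value (a fixed interpolation)
    return rate * prev_state + (1.0 - rate) * new_state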
Exemplo n.º 11
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   target_lengths,
                   prefixes=None,
                   speaker_ids=None,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # inputs                                                                    # [N, T_in, D_input]
            speaker_embedding_table = tf.get_variable(
                'speaker_embedding_table',
                [hp.num_speakers, hp.speaker_embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            speaker_embedding = tf.nn.embedding_lookup(
                speaker_embedding_table,
                speaker_ids)  # [N, T_in, hp.speaker_embedding_size]
            deep_dense = lambda x, dim: tf.layers.dense(
                x, dim, activation=tf.nn.softsign)
            before_highway = deep_dense(speaker_embedding, 128)
            encoder_rnn_init_state = deep_dense(speaker_embedding, 128 * 2)
            attention_rnn_init_state = deep_dense(speaker_embedding, 256)
            decoder_rnn_init_states = [
                deep_dense(speaker_embedding, 256) for _ in range(2)
            ]
            # Encoder
            prenet_outputs = prenet(inputs, is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state
            )  # [N, T_in, 256]

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256, encoder_outputs),
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(GRUCell(256)),
                ResidualWrapper(GRUCell(256))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)
            # initially, decoder_init_state is a tuple, so we firstly convert it into a list,
            # decoder_init_state[0] is the projection wrapper, its initial state should be zero state
            # finally, convert list state into tuple
            decoder_init_state = list(decoder_init_state)
            for idx, cell in enumerate(decoder_rnn_init_states):
                decoder_init_state[idx + 1] = cell
            decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.speaker_ids = speaker_ids
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.target_lengths = target_lengths
            self.prefixes = prefixes
            log('Initialized Tacotron model. Dimensions: ')
            log('  inputs:                  %d' % inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames): %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
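
The decoder_init_state manipulation above (tuple -> list -> tuple) simply swaps the zero states of the upper decoder layers for speaker-conditioned tensors while leaving the first layer's state alone. A standalone toy version with assumed sizes:

import tensorflow as tf

batch_size, units = 8, 256
cells = tf.nn.rnn_cell.MultiRNNCell(
    [tf.nn.rnn_cell.GRUCell(units) for _ in range(3)], state_is_tuple=True)
init_state = list(cells.zero_state(batch_size, tf.float32))
# stand-ins for the deep_dense(speaker_embedding, 256) tensors
decoder_rnn_init_states = [tf.zeros([batch_size, units]) for _ in range(2)]
for idx, state in enumerate(decoder_rnn_init_states):
    init_state[idx + 1] = state   # layer 0 (projection/attention cell) keeps its zero state
init_state = tuple(init_state)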
Exemplo n.º 12
0
    def initialize(self, text_inputs, input_lengths, speaker_ids, mel_targets=None, linear_targets=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          text_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          speaker_ids: int32 Tensor containing ids of specific speakers
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference'):
            is_training = linear_targets is not None
            batch_size = tf.shape(text_inputs)[0]
            hp = self._hparams
            vocab_size = len(symbols)
            embedded_inputs = embedding(text_inputs, vocab_size, hp.embedding_dim)  # [N, T_in, embd_size]

            # extract speaker embedding if multi-speaker
            with tf.variable_scope('speaker'):
                if hp.num_speakers > 1:
                    speaker_embedding = tf.get_variable('speaker_embed',
                                                        shape=(hp.num_speakers, hp.speaker_embed_dim),
                                                        dtype=tf.float32)
                    # TODO: what about special initializer=tf.truncated_normal_initializer(stddev=0.5)?
                    speaker_embd = tf.nn.embedding_lookup(speaker_embedding, speaker_ids)
                else:
                    speaker_embd = None

            # Encoder
            encoder_outputs = conv_and_lstm(
                    embedded_inputs,
                    input_lengths,
                    conv_layers=hp.encoder_conv_layers,
                    conv_width=hp.encoder_conv_width,
                    conv_channels=hp.encoder_conv_channels,
                    lstm_units=hp.encoder_lstm_units,
                    is_training=is_training,
                    scope='encoder')  # [N, T_in, 512]

            # Attention Mechanism
            attention_cell = attention_decoder(encoder_outputs, hp.attention_dim, input_lengths, is_training,
                                               speaker_embd=speaker_embd, attention_type="location_sensitive")

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                attention_cell,
                LSTMBlockCell(hp.decoder_lstm_units),
                LSTMBlockCell(hp.decoder_lstm_units)], state_is_tuple=True)  # [N, T_in, 1024]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(text_inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(output_cell, helper, decoder_init_state),
                    maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            decoder_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Postnet: predicts a residual
            postnet_outputs = postnet(
                    decoder_outputs,
                    layers=hp.postnet_conv_layers,
                    conv_width=hp.postnet_conv_width,
                    channels=hp.postnet_conv_channels,
                    is_training=is_training)
            mel_outputs = decoder_outputs + postnet_outputs

            # Convert to linear using a similar architecture as the encoder:
            expand_outputs = conv_and_lstm(
                    mel_outputs,
                    None,
                    conv_layers=hp.expand_conv_layers,
                    conv_width=hp.expand_conv_width,
                    conv_channels=hp.expand_conv_channels,
                    lstm_units=hp.expand_lstm_units,
                    is_training=is_training,
                    scope='expand')  # [N, T_in, 512]
            linear_outputs = tf.layers.dense(expand_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            # TODO: seems not to work?!?
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = text_inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  concat attn & out:       %d' % attention_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % postnet_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
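
attention_decoder above is built with attention_type="location_sensitive". A hedged sketch of the score such an attention typically computes: an additive energy that also looks at the previous alignments through a 1-D convolution. Filter count and kernel size are assumptions here, and the repository's implementation may differ.

import tensorflow as tf

def location_sensitive_score_sketch(query, memory, prev_alignments,
                                    num_units, filters=32, kernel_size=31):
    # query: [N, Dq] decoder state, memory: [N, T_in, Dm], prev_alignments: [N, T_in]
    keys = tf.layers.dense(memory, num_units, use_bias=False)                  # [N, T_in, U]
    processed_query = tf.expand_dims(
        tf.layers.dense(query, num_units, use_bias=False), axis=1)             # [N, 1, U]
    location = tf.layers.conv1d(tf.expand_dims(prev_alignments, axis=2),
                                filters, kernel_size, padding='same')          # [N, T_in, F]
    processed_location = tf.layers.dense(location, num_units, use_bias=False)  # [N, T_in, U]
    v = tf.get_variable('attention_v', [num_units])
    score = tf.reduce_sum(
        v * tf.tanh(keys + processed_query + processed_location), axis=2)      # [N, T_in]
    return tf.nn.softmax(score)                                                # alignments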
Exemplo n.º 13
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder, prenet_size=[256, 128]
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_size)  # [N, T_in, prenet_size[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_output_size=256]
                output_size=hp.encoder_output_size)

            # Attention RNN: compute attention between the target and the encoder output
            attention_cell = AttentionWrapper(
                # input_size = 128, output_size = 256
                cell=GRUCell(num_units=hp.attention_depth),  # output size = attention_depth = 256
                # input_size = output_size = 256
                attention_mechanism=BahdanauAttention(
                    num_units=hp.attention_depth, memory=encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Add a prenet in front of the attention RNN; prenet_size=[256, 128], prenet_output_size=128
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                                  hp.prenet_size)

            # Concatenate the attention context vector and the RNN cell output
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # The decoder RNN is a 2-layer residual RNN (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(
                        cell=concat_cell,
                        output_size=hp.decoder_depth),  # 512 -> 256
                    ResidualWrapper(cell=GRUCell(hp.decoder_depth)),
                    ResidualWrapper(cell=GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict outputs_per_step frames per step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            # The helper determines the initial input and the input at each subsequent step
            if is_training:
                helper = TacoTrainingHelper(inputs=inputs,
                                            targets=mel_targets,
                                            output_dim=hp.num_mels,
                                            r=hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size=batch_size,
                                        output_dim=hp.num_mels,
                                        r=hp.outputs_per_step)
            # Decode: predict non-overlapping frame groups, e.g. r -> (r+1, 2r), 2r -> (2r+1, 3r), ...
            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper,
                              decoder_init_state),  # packaged into a decoder
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, num_mels*r ]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M=80]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                mel_outputs,
                hp.num_mels,
                is_training,  # [N, T_out, postnet_output_size=256]
                output_size=hp.postnet_output_size)
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F=1025]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
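
For reference, BahdanauAttention(num_units, memory) used above scores each encoder step with an additive (tanh) energy. A simplified sketch of that computation, without the masking and optional normalization of the tf.contrib.seq2seq version, and without the location term of the location-sensitive sketch shown earlier:

import tensorflow as tf

def bahdanau_attention_sketch(query, memory, num_units):
    # query: [N, Dq] decoder state, memory: [N, T_in, Dm] encoder outputs
    keys = tf.layers.dense(memory, num_units, use_bias=False)              # [N, T_in, U]
    processed_query = tf.expand_dims(
        tf.layers.dense(query, num_units, use_bias=False), axis=1)         # [N, 1, U]
    v = tf.get_variable('attention_v', [num_units])
    score = tf.reduce_sum(v * tf.tanh(keys + processed_query), axis=2)     # [N, T_in]
    alignments = tf.nn.softmax(score)                                      # [N, T_in]
    context = tf.reduce_sum(
        tf.expand_dims(alignments, axis=2) * memory, axis=1)               # [N, Dm]
    return context, alignments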
Exemplo n.º 14
0
def embedding_attention_seq2seq_context(encoder_inputs,
                                        decoder_inputs,
                                        cell,
                                        num_encoder_symbols,
                                        num_decoder_symbols,
                                        embedding_size,
                                        num_heads=1,
                                        output_projection=None,
                                        feed_previous=False,
                                        dtype=dtypes.float32,
                                        scope=None):
    """A seq2seq architecture with two encoders, one for context, one for input DA. The decoder
    uses twice the cell size. Code adapted from TensorFlow examples."""

    with vs.variable_scope(scope or "embedding_attention_seq2seq_context"):

        # split context and real inputs into separate vectors
        context_inputs = encoder_inputs[0:old_div(len(encoder_inputs), 2)]
        encoder_inputs = encoder_inputs[old_div(len(encoder_inputs), 2):]

        # build separate encoders
        encoder_cell = EmbeddingWrapper(cell, num_encoder_symbols,
                                        embedding_size)
        with vs.variable_scope("context_rnn") as scope:
            context_outputs, context_states = tf06s2s.rnn(encoder_cell,
                                                          context_inputs,
                                                          dtype=dtype,
                                                          scope=scope)
        with vs.variable_scope("input_rnn") as scope:
            encoder_outputs, encoder_states = tf06s2s.rnn(encoder_cell,
                                                          encoder_inputs,
                                                          dtype=dtype,
                                                          scope=scope)

        # concatenate outputs & states
        # adding positional arguments and concatenating output, cell and hidden states
        encoder_outputs = [
            array_ops.concat([co, eo],
                             axis=1,
                             name="context-and-encoder-output")
            for co, eo in zip(context_outputs, encoder_outputs)
        ]
        encoder_states = [
            (array_ops.concat([c1, c2],
                              axis=1), array_ops.concat([h1, h2], axis=1))
            for (c1, h1), (c2, h2) in zip(context_states, encoder_states)
        ]

        # calculate a concatenation of encoder outputs to put attention on.
        top_states = [
            array_ops.reshape(e, [-1, 1, cell.output_size * 2])
            for e in encoder_outputs
        ]
        # use keyword arguments, since the positional order of axis/values changed across TF versions
        attention_states = array_ops.concat(axis=1, values=top_states)

        # change the decoder cell to accommodate wider input
        # TODO this will work for BasicLSTMCell and GRUCell, but not for others
        cell = type(cell)(num_units=(cell.output_size * 2))

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        if isinstance(feed_previous, bool):
            return tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, feed_previous)
        else:  # If feed_previous is a Tensor, we construct 2 graphs and use cond.
            outputs1, states1 = tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, True)
            vs.get_variable_scope().reuse_variables()
            outputs2, states2 = tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, False)

            outputs = control_flow_ops.cond(feed_previous, lambda: outputs1,
                                            lambda: outputs2)
            states = control_flow_ops.cond(feed_previous, lambda: states1,
                                           lambda: states2)
            return outputs, states
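
The feed_previous branch above builds the decoder twice (free-running and teacher-forced) under a shared variable scope and selects between the two graphs at run time. A minimal sketch of that pattern, with a toy dense layer standing in for the decoder:

import tensorflow as tf

feed_previous = tf.placeholder(tf.bool, shape=[], name='feed_previous')
with tf.variable_scope('decoder_sketch') as vs_scope:
    out_free = tf.layers.dense(tf.ones([1, 4]), 4, name='proj')      # free-running branch
    vs_scope.reuse_variables()                                       # share weights between branches
    out_teacher = tf.layers.dense(tf.zeros([1, 4]), 4, name='proj')  # teacher-forced branch
outputs = tf.cond(feed_previous, lambda: out_free, lambda: out_teacher)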
Exemplo n.º 15
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False,
                   eal=False,
                   locked_alignments=None,
                   logs_enabled=True,
                   flag_trainAlign=False,
                   flag_trainJoint=False,
                   alignScale=1.0,
                   flag_online_eal_eval=False):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        '''
        # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including
        # batch dimension
        locked_alignments_ = locked_alignments
        self.flag_trainAlign = flag_trainAlign
        self.flag_trainJoint = flag_trainJoint
        self.alignScale = alignScale
        self.flag_online_eal = (
            eal and (locked_alignments is None)) or flag_online_eal_eval

        if locked_alignments_ is not None:
            if is_training and eal:
                pass
            elif np.ndim(locked_alignments_) < 3:
                locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper',
                flag_trainAlign=self.flag_trainAlign,
                flag_trainJoint=self.flag_trainJoint
            )  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            prenet_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                               hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                prenet_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            elif eal:
                if self.flag_online_eal:
                    helper_gta = TacoTrainingHelper(inputs, pml_targets,
                                                    hp.pml_dimension,
                                                    hp.outputs_per_step)
                    helper_eal = TacoTrainingHelper_EAL(
                        inputs, pml_targets, hp.pml_dimension,
                        hp.outputs_per_step)
                else:
                    helper = TacoTrainingHelper_EAL(inputs, pml_targets,
                                                    hp.pml_dimension,
                                                    hp.outputs_per_step)
            elif hp.scheduled_sampling:
                helper = TacoScheduledOutputTrainingHelper(
                    inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step,
                    hp.scheduled_sampling_probability)
            else:
                if is_training:
                    log('For training, one of these should be true: gta, eal, hp.scheduled_sampling'
                        )
                else:
                    helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                            hp.outputs_per_step)
                    if flag_online_eal_eval:
                        helper_gta = helper
                        helper_eal = helper

            if not self.flag_online_eal:
                (decoder_outputs, _
                 ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                     BasicDecoder(output_cell, helper, decoder_init_state),
                     maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

                # Reshape outputs to be one output per entry
                pml_intermediates = tf.reshape(
                    decoder_outputs,
                    [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

                # Add post-processing CBHG:
                post_outputs = post_cbhg(
                    pml_intermediates,
                    hp.pml_dimension,
                    is_training,  # [N, T_out, postnet_depth=256]
                    hp.postnet_depth)
                pml_outputs = tf.layers.dense(
                    post_outputs, hp.pml_dimension)  # [N, T_out, P]

                # Grab alignments from the final decoder state:
                alignments = tf.transpose(
                    final_decoder_state[0].alignment_history.stack(),
                    [1, 2, 0])

            else:
                (decoder_outputs, _
                 ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                     BasicDecoder(output_cell, helper_gta, decoder_init_state),
                     maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

                # Reshape outputs to be one output per entry
                pml_intermediates = tf.reshape(
                    decoder_outputs,
                    [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

                # Add post-processing CBHG:
                post_outputs = post_cbhg(
                    pml_intermediates,
                    hp.pml_dimension,
                    is_training,  # [N, T_out, postnet_depth=256]
                    hp.postnet_depth)
                pml_outputs = tf.layers.dense(
                    post_outputs, hp.pml_dimension)  # [N, T_out, P]

                # Grab alignments from the final decoder state:
                locked_alignments_ = tf.transpose(
                    final_decoder_state[0].alignment_history.stack(),
                    [1, 2, 0])

        with tf.variable_scope('inference_eal') as scope:
            if self.flag_online_eal:
                # Embeddings
                embedding_table_eal = tf.get_variable(
                    'embedding', [len(symbols), hp.embed_depth],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                embedded_inputs_eal = tf.nn.embedding_lookup(
                    embedding_table_eal, inputs)  # [N, T_in, embed_depth=256]

                # Encoder
                prenet_outputs_eal = prenet(
                    embedded_inputs_eal, is_training,
                    hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
                encoder_outputs_eal = encoder_cbhg(
                    prenet_outputs_eal,
                    input_lengths,
                    is_training,  # [N, T_in, encoder_depth=256]
                    hp.encoder_depth)

                # Attention
                if flag_online_eal_eval:
                    locked_alignments_ = None

                attention_cell_eal = LockableAttentionWrapper(
                    GRUCell(hp.attention_depth),
                    LocationSensitiveAttention(hp.attention_depth,
                                               encoder_outputs_eal),
                    alignment_history=True,
                    locked_alignments=locked_alignments_,
                    output_attention=False,
                    name='attention_wrapper',
                    flag_trainAlign=self.flag_trainAlign,
                    flag_trainJoint=self.flag_trainJoint
                )  # [N, T_in, attention_depth=256]

                # Apply prenet before concatenation in AttentionWrapper.
                prenet_cell_eal = DecoderPrenetWrapper(attention_cell_eal,
                                                       is_training,
                                                       hp.prenet_depths)

                # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
                concat_cell_eal = ConcatOutputAndAttentionWrapper(
                    prenet_cell_eal)  # [N, T_in, 2*attention_depth=512]

                # Decoder (layers specified bottom to top):
                decoder_cell_eal = MultiRNNCell(
                    [
                        OutputProjectionWrapper(concat_cell_eal,
                                                hp.decoder_depth),
                        ResidualWrapper(GRUCell(hp.decoder_depth)),
                        ResidualWrapper(GRUCell(hp.decoder_depth))
                    ],
                    state_is_tuple=True)  # [N, T_in, decoder_depth=256]

                # Project onto r PML feature vectors (predict r outputs at each RNN step):
                output_cell_eal = OutputProjectionWrapper(
                    decoder_cell_eal, hp.pml_dimension * hp.outputs_per_step)
                decoder_init_state_eal = output_cell_eal.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                (
                    decoder_outputs_eal, _
                ), final_decoder_state_eal, _ = tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(output_cell_eal, helper_eal,
                                 decoder_init_state_eal),
                    maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

                # Reshape outputs to be one output per entry
                pml_intermediates_eal = tf.reshape(
                    decoder_outputs_eal,
                    [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

                # Add post-processing CBHG:
                post_outputs_eal = post_cbhg(
                    pml_intermediates_eal,
                    hp.pml_dimension,
                    is_training,  # [N, T_out, postnet_depth=256]
                    hp.postnet_depth)
                pml_outputs_eal = tf.layers.dense(
                    post_outputs_eal, hp.pml_dimension)  # [N, T_out, P]

                # Grab alignments from the final decoder state:
                alignments = tf.transpose(
                    final_decoder_state_eal[0].alignment_history.stack(),
                    [1, 2, 0])

                self.pml_intermediates_eal = pml_intermediates_eal
                self.pml_outputs_eal = pml_outputs_eal

        with tf.variable_scope('inference') as scope:
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            self.attention_cell = attention_cell
            self.locked_alignments = locked_alignments_

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  Train mode:              {}'.format(is_training))
                log('  GTA mode:                {}'.format(gta))
                log('  EAL mode:                {}'.format(eal))
                log('  Embedding:               {}'.format(
                    embedded_inputs.shape[-1]))
                log('  Prenet out:              {}'.format(
                    prenet_outputs.shape[-1]))
                log('  Encoder out:             {}'.format(
                    encoder_outputs.shape[-1]))
                log('  Attention out:           {}'.format(
                    attention_cell.output_size))
                log('  Concat attn & out:       {}'.format(
                    concat_cell.output_size))
                log('  Decoder cell out:        {}'.format(
                    decoder_cell.output_size))
                log('  Decoder out ({} frames):  {}'.format(
                    hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  Decoder out (1 frame):   {}'.format(
                    pml_intermediates.shape[-1]))
                log('  Postnet out:             {}'.format(
                    post_outputs.shape[-1]))
                log('  PML out:                 {}'.format(
                    pml_outputs.shape[-1]))
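
# Note added for illustration (not part of the original model): in both branches
# above, alignment_history.stack() returns one [N, T_in] alignment per decoder
# step, i.e. a [T_out, N, T_in] tensor, and the transpose with perm [1, 2, 0]
# turns it into the [N, T_in, T_out] matrix used for attention plots. A minimal
# sketch with placeholder shapes:
def _alignment_transpose_sketch():
    stacked = tf.zeros([50, 8, 120])               # [T_out=50, N=8, T_in=120]
    alignments = tf.transpose(stacked, [1, 2, 0])  # [N=8, T_in=120, T_out=50]
    return alignments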
Exemplo n.º 16
0
    def __init__(self, sess, config, api, log_dir, forward, scope=None):
        self.vocab = api.vocab
        self.rev_vocab = api.rev_vocab
        self.vocab_size = len(self.vocab)
        self.topic_vocab = api.topic_vocab
        self.topic_vocab_size = len(self.topic_vocab)
        self.da_vocab = api.dialog_act_vocab
        self.da_vocab_size = len(self.da_vocab)
        self.sess = sess
        self.scope = scope
        self.max_utt_len = config.max_utt_len
        self.go_id = self.rev_vocab["<s>"]
        self.eos_id = self.rev_vocab["</s>"]
        self.context_cell_size = config.cxt_cell_size
        self.sent_cell_size = config.sent_cell_size
        self.dec_cell_size = config.dec_cell_size

        with tf.name_scope("io"):
            # all dialog context and known attributes
            self.input_contexts = tf.placeholder(dtype=tf.int32,
                                                 shape=(None, None,
                                                        self.max_utt_len),
                                                 name="dialog_context")
            self.floors = tf.placeholder(dtype=tf.int32,
                                         shape=(None, None),
                                         name="floor")
            self.context_lens = tf.placeholder(dtype=tf.int32,
                                               shape=(None, ),
                                               name="context_lens")
            self.topics = tf.placeholder(dtype=tf.int32,
                                         shape=(None, ),
                                         name="topics")
            self.my_profile = tf.placeholder(dtype=tf.float32,
                                             shape=(None, 4),
                                             name="my_profile")
            self.ot_profile = tf.placeholder(dtype=tf.float32,
                                             shape=(None, 4),
                                             name="ot_profile")

            # target response given the dialog context
            self.output_tokens = tf.placeholder(dtype=tf.int32,
                                                shape=(None, None),
                                                name="output_token")
            self.output_lens = tf.placeholder(dtype=tf.int32,
                                              shape=(None, ),
                                              name="output_lens")
            self.output_das = tf.placeholder(dtype=tf.int32,
                                             shape=(None, ),
                                             name="output_dialog_acts")

            # optimization related variables
            self.learning_rate = tf.Variable(float(config.init_lr),
                                             trainable=False,
                                             name="learning_rate")
            self.learning_rate_decay_op = self.learning_rate.assign(
                tf.multiply(self.learning_rate, config.lr_decay))
            self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")
            self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

        max_dialog_len = array_ops.shape(self.input_contexts)[1]
        max_out_len = array_ops.shape(self.output_tokens)[1]
        batch_size = array_ops.shape(self.input_contexts)[0]

        with variable_scope.variable_scope("topicEmbedding"):
            t_embedding = tf.get_variable(
                "embedding", [self.topic_vocab_size, config.topic_embed_size],
                dtype=tf.float32)
            topic_embedding = embedding_ops.embedding_lookup(
                t_embedding, self.topics)

        if config.use_hcf:
            with variable_scope.variable_scope("dialogActEmbedding"):
                d_embedding = tf.get_variable(
                    "embedding", [self.da_vocab_size, config.da_embed_size],
                    dtype=tf.float32)
                da_embedding = embedding_ops.embedding_lookup(
                    d_embedding, self.output_das)

        with variable_scope.variable_scope("wordEmbedding"):
            self.embedding = tf.get_variable(
                "embedding", [self.vocab_size, config.embed_size],
                dtype=tf.float32)
            embedding_mask = tf.constant(
                [0 if i == 0 else 1 for i in range(self.vocab_size)],
                dtype=tf.float32,
                shape=[self.vocab_size, 1])
            embedding = self.embedding * embedding_mask

            input_embedding = embedding_ops.embedding_lookup(
                embedding, tf.reshape(self.input_contexts, [-1]))
            input_embedding = tf.reshape(
                input_embedding, [-1, self.max_utt_len, config.embed_size])
            output_embedding = embedding_ops.embedding_lookup(
                embedding, self.output_tokens)

            if config.sent_type == "bow":
                input_embedding, sent_size = get_bow(input_embedding)
                output_embedding, _ = get_bow(output_embedding)

            elif config.sent_type == "rnn":
                sent_cell = self.get_rnncell("gru", self.sent_cell_size,
                                             config.keep_prob, 1)
                input_embedding, sent_size = get_rnn_encode(input_embedding,
                                                            sent_cell,
                                                            scope="sent_rnn")
                output_embedding, _ = get_rnn_encode(output_embedding,
                                                     sent_cell,
                                                     self.output_lens,
                                                     scope="sent_rnn",
                                                     reuse=True)
            elif config.sent_type == "bi_rnn":
                fwd_sent_cell = self.get_rnncell("gru",
                                                 self.sent_cell_size,
                                                 keep_prob=1.0,
                                                 num_layer=1)
                bwd_sent_cell = self.get_rnncell("gru",
                                                 self.sent_cell_size,
                                                 keep_prob=1.0,
                                                 num_layer=1)
                input_embedding, sent_size = get_bi_rnn_encode(
                    input_embedding,
                    fwd_sent_cell,
                    bwd_sent_cell,
                    scope="sent_bi_rnn")
                output_embedding, _ = get_bi_rnn_encode(output_embedding,
                                                        fwd_sent_cell,
                                                        bwd_sent_cell,
                                                        self.output_lens,
                                                        scope="sent_bi_rnn",
                                                        reuse=True)
            else:
                raise ValueError(
                    "Unknown sent_type. Must be one of [bow, rnn, bi_rnn]")

            # reshape input into dialogs
            input_embedding = tf.reshape(input_embedding,
                                         [-1, max_dialog_len, sent_size])
            if config.keep_prob < 1.0:
                input_embedding = tf.nn.dropout(input_embedding,
                                                config.keep_prob)

            # convert floor indicators into one-hot vectors
            floor_one_hot = tf.one_hot(tf.reshape(self.floors, [-1]),
                                       depth=2,
                                       dtype=tf.float32)
            floor_one_hot = tf.reshape(floor_one_hot, [-1, max_dialog_len, 2])

            joint_embedding = tf.concat([input_embedding, floor_one_hot], 2,
                                        "joint_embedding")

        with variable_scope.variable_scope("contextRNN"):
            enc_cell = self.get_rnncell(config.cell_type,
                                        self.context_cell_size,
                                        keep_prob=1.0,
                                        num_layer=config.num_layer)
            # Because sequence_length is provided, enc_last_state is the true last state
            _, enc_last_state = tf.nn.dynamic_rnn(
                enc_cell,
                joint_embedding,
                dtype=tf.float32,
                sequence_length=self.context_lens)

            if config.num_layer > 1:
                enc_last_state = tf.concat(enc_last_state, 1)

        # combine with other attributes
        if config.use_hcf:
            attribute_embedding = da_embedding
            attribute_fc1 = layers.fully_connected(attribute_embedding,
                                                   30,
                                                   activation_fn=tf.tanh,
                                                   scope="attribute_fc1")

        cond_list = [
            topic_embedding, self.my_profile, self.ot_profile, enc_last_state
        ]
        cond_embedding = tf.concat(cond_list, 1)

        with variable_scope.variable_scope("recognitionNetwork"):
            if config.use_hcf:
                recog_input = tf.concat(
                    [cond_embedding, output_embedding, attribute_fc1], 1)
            else:
                recog_input = tf.concat([cond_embedding, output_embedding], 1)
            self.recog_mulogvar = recog_mulogvar = layers.fully_connected(
                recog_input,
                config.latent_size * 2,
                activation_fn=None,
                scope="muvar")
            recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=1)

        with variable_scope.variable_scope("priorNetwork"):
            # P(X, Y, Z) = P(X) P(Z|X) P(Y|X, Z)
            prior_fc1 = layers.fully_connected(cond_embedding,
                                               np.maximum(
                                                   config.latent_size * 2,
                                                   100),
                                               activation_fn=tf.tanh,
                                               scope="fc1")
            prior_mulogvar = layers.fully_connected(prior_fc1,
                                                    config.latent_size * 2,
                                                    activation_fn=None,
                                                    scope="muvar")
            prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=1)

            # use sampled Z or posterior Z
            latent_sample = tf.cond(
                self.use_prior,
                lambda: sample_gaussian(prior_mu, prior_logvar),
                lambda: sample_gaussian(recog_mu, recog_logvar))

        with variable_scope.variable_scope("generationNetwork"):
            gen_inputs = tf.concat([cond_embedding, latent_sample], 1)

            # BOW loss
            bow_fc1 = layers.fully_connected(gen_inputs,
                                             400,
                                             activation_fn=tf.tanh,
                                             scope="bow_fc1")
            if config.keep_prob < 1.0:
                bow_fc1 = tf.nn.dropout(bow_fc1, config.keep_prob)
            self.bow_logits = layers.fully_connected(bow_fc1,
                                                     self.vocab_size,
                                                     activation_fn=None,
                                                     scope="bow_project")

            # Y loss
            if config.use_hcf:
                meta_fc1 = layers.fully_connected(gen_inputs,
                                                  400,
                                                  activation_fn=tf.tanh,
                                                  scope="meta_fc1")
                if config.keep_prob < 1.0:
                    meta_fc1 = tf.nn.dropout(meta_fc1, config.keep_prob)
                self.da_logits = layers.fully_connected(meta_fc1,
                                                        self.da_vocab_size,
                                                        scope="da_project")
                da_prob = tf.nn.softmax(self.da_logits)
                pred_attribute_embedding = tf.matmul(da_prob, d_embedding)
                if forward:
                    selected_attribute_embedding = pred_attribute_embedding
                else:
                    selected_attribute_embedding = attribute_embedding
                dec_inputs = tf.concat(
                    [gen_inputs, selected_attribute_embedding], 1)
            else:
                self.da_logits = tf.zeros((batch_size, self.da_vocab_size))
                dec_inputs = gen_inputs

            # Decoder
            if config.num_layer > 1:
                dec_init_state = [
                    layers.fully_connected(dec_inputs,
                                           self.dec_cell_size,
                                           activation_fn=None,
                                           scope="init_state-%d" % i)
                    for i in range(config.num_layer)
                ]
                dec_init_state = tuple(dec_init_state)
            else:
                dec_init_state = layers.fully_connected(dec_inputs,
                                                        self.dec_cell_size,
                                                        activation_fn=None,
                                                        scope="init_state")

        with variable_scope.variable_scope("decoder"):
            dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size,
                                        config.keep_prob, config.num_layer)
            dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size)

            if forward:
                loop_func = decoder_fn_lib.context_decoder_fn_inference(
                    None,
                    dec_init_state,
                    embedding,
                    start_of_sequence_id=self.go_id,
                    end_of_sequence_id=self.eos_id,
                    maximum_length=self.max_utt_len,
                    num_decoder_symbols=self.vocab_size,
                    context_vector=selected_attribute_embedding)
                dec_input_embedding = None
                dec_seq_lens = None
            else:
                loop_func = decoder_fn_lib.context_decoder_fn_train(
                    dec_init_state, selected_attribute_embedding)
                dec_input_embedding = embedding_ops.embedding_lookup(
                    embedding, self.output_tokens)
                dec_input_embedding = dec_input_embedding[:, 0:-1, :]
                dec_seq_lens = self.output_lens - 1

                if config.keep_prob < 1.0:
                    dec_input_embedding = tf.nn.dropout(
                        dec_input_embedding, config.keep_prob)

                # Apply word dropout: set dropped words to 0
                if config.dec_keep_prob < 1.0:
                    keep_mask = tf.less_equal(
                        tf.random_uniform((batch_size, max_out_len - 1),
                                          minval=0.0,
                                          maxval=1.0), config.dec_keep_prob)
                    keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2)
                    dec_input_embedding = dec_input_embedding * keep_mask
                    dec_input_embedding = tf.reshape(
                        dec_input_embedding,
                        [-1, max_out_len - 1, config.embed_size])

            dec_outs, _, final_context_state = dynamic_rnn_decoder(
                dec_cell,
                loop_func,
                inputs=dec_input_embedding,
                sequence_length=dec_seq_lens)
            if final_context_state is not None:
                final_context_state = final_context_state[
                    :, 0:array_ops.shape(dec_outs)[1]]
                mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2)))
                self.dec_out_words = tf.multiply(
                    tf.reverse(final_context_state, axis=[1]), mask)
            else:
                self.dec_out_words = tf.argmax(dec_outs, 2)

        if not forward:
            with variable_scope.variable_scope("loss"):
                labels = self.output_tokens[:, 1:]
                label_mask = tf.to_float(tf.sign(labels))

                rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=dec_outs, labels=labels)
                rc_loss = tf.reduce_sum(rc_loss * label_mask,
                                        reduction_indices=1)
                self.avg_rc_loss = tf.reduce_mean(rc_loss)
                # Used only for perplexity calculation; not used for optimization
                self.rc_ppl = tf.exp(
                    tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask))
                """ as n-trial multimodal distribution. """
                tile_bow_logits = tf.tile(tf.expand_dims(self.bow_logits, 1),
                                          [1, max_out_len - 1, 1])
                bow_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=tile_bow_logits, labels=labels) * label_mask
                bow_loss = tf.reduce_sum(bow_loss, reduction_indices=1)
                self.avg_bow_loss = tf.reduce_mean(bow_loss)

                # reconstruct the meta info about X
                if config.use_hcf:
                    da_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.da_logits, labels=self.output_das)
                    self.avg_da_loss = tf.reduce_mean(da_loss)
                else:
                    self.avg_da_loss = 0.0

                kld = gaussian_kld(recog_mu, recog_logvar, prior_mu,
                                   prior_logvar)
                self.avg_kld = tf.reduce_mean(kld)
                if log_dir is not None:
                    kl_weights = tf.minimum(
                        tf.to_float(self.global_t) / config.full_kl_step, 1.0)
                else:
                    kl_weights = tf.constant(1.0)

                self.kl_w = kl_weights
                self.elbo = self.avg_rc_loss + kl_weights * self.avg_kld
                aug_elbo = self.avg_bow_loss + self.avg_da_loss + self.elbo

                tf.summary.scalar("da_loss", self.avg_da_loss)
                tf.summary.scalar("rc_loss", self.avg_rc_loss)
                tf.summary.scalar("elbo", self.elbo)
                tf.summary.scalar("kld", self.avg_kld)
                tf.summary.scalar("bow_loss", self.avg_bow_loss)

                self.summary_op = tf.summary.merge_all()

                self.log_p_z = norm_log_liklihood(latent_sample, prior_mu,
                                                  prior_logvar)
                self.log_q_z_xy = norm_log_liklihood(latent_sample, recog_mu,
                                                     recog_logvar)
                self.est_marginal = tf.reduce_mean(rc_loss + bow_loss -
                                                   self.log_p_z +
                                                   self.log_q_z_xy)

            self.optimize(sess, config, aug_elbo, log_dir)

        self.saver = tf.train.Saver(tf.global_variables(),
                                    write_version=tf.train.SaverDef.V2)
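
# The helpers sample_gaussian, gaussian_kld and norm_log_liklihood used above are
# not shown in this example. The sketch below reflects their conventional
# definitions for diagonal Gaussians (an assumption for illustration, not the
# original implementation); it assumes `import tensorflow as tf` and
# `import numpy as np` as in the surrounding code.
def sample_gaussian(mu, logvar):
    # Reparameterization trick: z = mu + sigma * eps, with eps ~ N(0, I)
    epsilon = tf.random_normal(tf.shape(logvar), name="epsilon")
    return mu + tf.exp(0.5 * logvar) * epsilon


def gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar):
    # KL(q || p) between two diagonal Gaussians, summed over the latent dimensions
    kld = -0.5 * tf.reduce_sum(
        1 + (recog_logvar - prior_logvar)
        - tf.div(tf.pow(prior_mu - recog_mu, 2), tf.exp(prior_logvar))
        - tf.div(tf.exp(recog_logvar), tf.exp(prior_logvar)), axis=1)
    return kld


def norm_log_liklihood(x, mu, logvar):
    # Log-density of x under a diagonal Gaussian with mean mu and log-variance logvar
    return -0.5 * tf.reduce_sum(
        tf.log(2 * np.pi) + logvar + tf.div(tf.pow(x - mu, 2), tf.exp(logvar)),
        axis=1)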
Exemplo n.º 17
0
def embedding_attention_seq2seq(encoder_inputs,
                                decoder_inputs,
                                cell,
                                num_encoder_symbols,
                                num_decoder_symbols,
                                embedding_size,
                                num_heads=1,
                                output_projection=None,
                                feed_previous=False,
                                dtype=None,
                                scope=None,
                                initial_state_attention=False,
                                copy=False,
                                attn_type="linear"):
    """Embedding sequence-to-sequence model with attention.

    This model first embeds encoder_inputs by a newly created embedding (of shape
    [num_encoder_symbols x input_size]). Then it runs an RNN to encode
    embedded encoder_inputs into a state vector. It keeps the outputs of this
    RNN at every step to use for attention later. Next, it embeds decoder_inputs
    by another newly created embedding (of shape [num_decoder_symbols x
    input_size]). Then it runs attention decoder, initialized with the last
    encoder state, on embedded decoder_inputs and attending to encoder outputs.

    Warning: when output_projection is None, the size of the attention vectors
    and variables will be made proportional to num_decoder_symbols, which can be large.

    Args:
      encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      num_encoder_symbols: Integer; number of symbols on the encoder side.
      num_decoder_symbols: Integer; number of symbols on the decoder side.
      embedding_size: Integer, the length of the embedding vector for each symbol.
      num_heads: Number of attention heads that read from attention_states.
      output_projection: None or a pair (W, B) of output projection weights and
        biases; W has shape [output_size x num_decoder_symbols] and B has
        shape [num_decoder_symbols]; if provided and feed_previous=True, each
        fed previous output will first be multiplied by W and have B added.
      feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
        of decoder_inputs will be used (the "GO" symbol), and all other decoder
        inputs will be taken from previous outputs (as in embedding_rnn_decoder).
        If False, decoder_inputs are used as given (the standard decoder case).
      dtype: The dtype of the initial RNN state (default: tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_seq2seq".
      initial_state_attention: If False (default), initial attentions are zero.
        If True, initialize the attentions from the initial state and attention
        states.
      copy: If True, use a copy mechanism in decoding to copy from the encoder inputs.
      attn_type: Type of attention to use.
    Returns:
      A tuple of the form (outputs, state), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors with
          shape [batch_size x num_decoder_symbols] containing the generated
          outputs.
        state: The state of each decoder cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
    """
    with variable_scope.variable_scope(scope or "embedding_attention_seq2seq",
                                       dtype=dtype) as scope:
        dtype = scope.dtype
        # Encoder.
        encoder_cell = EmbeddingWrapper(cell,
                                        embedding_classes=num_encoder_symbols,
                                        embedding_size=embedding_size)
        encoder_outputs, encoder_state = rnn.static_rnn(encoder_cell,
                                                        encoder_inputs,
                                                        dtype=dtype)

        # First calculate a concatenation of encoder outputs to put attention on.
        top_states = [
            array_ops.reshape(e, [-1, 1, cell.output_size])
            for e in encoder_outputs
        ]
        attention_states = array_ops.concat(top_states, 1)

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        # Modify num_decoder_symbols to include the length of the source
        if isinstance(feed_previous, bool):
            return embedding_attention_decoder(
                decoder_inputs,
                encoder_inputs,
                encoder_state,
                attention_states,
                cell,
                num_decoder_symbols,
                embedding_size,
                num_heads=num_heads,
                output_size=output_size,
                output_projection=output_projection,
                feed_previous=feed_previous,
                initial_state_attention=initial_state_attention,
                copy=copy,
                attn_type=attn_type)

        def decoder(feed_previous_bool):
            reuse = None if feed_previous_bool else True
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(), reuse=reuse) as scope:
                outputs, state = embedding_attention_decoder(
                    decoder_inputs,
                    encoder_state,
                    attention_states,
                    cell,
                    num_decoder_symbols,
                    embedding_size,
                    num_heads=num_heads,
                    output_size=output_size,
                    output_projection=output_projection,
                    feed_previous=feed_previous_bool,
                    update_embedding_for_previous=False,
                    initial_state_attention=initial_state_attention)
                state_list = [state]
                if nest.is_sequence(state):
                    state_list = nest.flatten(state)
                return outputs + state_list

        outputs_and_state = control_flow_ops.cond(feed_previous,
                                                  lambda: decoder(True),
                                                  lambda: decoder(False))
        outputs_len = len(
            decoder_inputs)  # Outputs length same as decoder inputs.
        state_list = outputs_and_state[outputs_len:]
        state = state_list[0]
        if nest.is_sequence(encoder_state):
            state = nest.pack_sequence_as(structure=encoder_state,
                                          flat_sequence=state_list)
        return outputs_and_state[:outputs_len], state
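
# A hypothetical usage sketch of the function above (TF 1.x). The vocabulary
# sizes, cell size and sequence lengths are placeholders chosen only for
# illustration; GRUCell is assumed to be available from tf.contrib.rnn.
def build_toy_attention_seq2seq(src_len=10, tgt_len=12, vocab_size=1000):
    from tensorflow.contrib.rnn import GRUCell

    # The legacy seq2seq interface takes lists of per-timestep [batch_size] int32 tensors.
    encoder_inputs = [tf.placeholder(tf.int32, [None], name="enc_%d" % t)
                      for t in range(src_len)]
    decoder_inputs = [tf.placeholder(tf.int32, [None], name="dec_%d" % t)
                      for t in range(tgt_len)]

    outputs, state = embedding_attention_seq2seq(
        encoder_inputs,
        decoder_inputs,
        cell=GRUCell(256),
        num_encoder_symbols=vocab_size,
        num_decoder_symbols=vocab_size,
        embedding_size=128,
        feed_previous=False)

    # outputs is a list of tgt_len tensors shaped [batch_size, vocab_size]
    return outputs, state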
Exemplo n.º 18
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   gta=False,
                   locked_alignments=None,
                   logs_enabled=True):
        '''Initializes the model for inference.

        Sets "pml_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        '''
        # Fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if it does not
        # already include the batch dimension
        locked_alignments_ = locked_alignments

        if locked_alignments_ is not None:
            if np.ndim(locked_alignments_) < 3:
                locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference') as scope:
            is_training = pml_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_mechanism = BahdanauAttention(hp.attention_depth,
                                                    encoder_outputs)

            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                attention_mechanism,
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper')  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            prenet_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                               hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                prenet_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training or gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            self.attention_cell = attention_cell

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  embedding:               %d' %
                    embedded_inputs.shape[-1])
                log('  prenet out:              %d' % prenet_outputs.shape[-1])
                log('  encoder out:             %d' %
                    encoder_outputs.shape[-1])
                log('  attention out:           %d' %
                    attention_cell.output_size)
                log('  concat attn & out:       %d' % concat_cell.output_size)
                log('  decoder cell out:        %d' % decoder_cell.output_size)
                log('  decoder out (%d frames):  %d' %
                    (hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  decoder out (1 frame):   %d' % pml_outputs.shape[-1])
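
# A hypothetical invocation sketch for the initialize() method above. The class
# passed as model_cls, its constructor signature and the placeholder shapes are
# assumptions made only for illustration.
def build_gta_graph(model_cls, hparams):
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')
    pml_targets = tf.placeholder(tf.float32,
                                 [None, None, hparams.pml_dimension],
                                 name='pml_targets')

    model = model_cls(hparams)
    # Ground-truth-aligned synthesis: feed the targets through the training helper.
    model.initialize(inputs, input_lengths, pml_targets=pml_targets, gta=True)
    return model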
Exemplo n.º 19
0
    def initialize(
            self, inputs, input_lengths, num_speakers, speaker_id,
            mel_targets=None, linear_targets=None, loss_coeff=None,
            rnn_decoder_test_mode=False, is_randomly_initialized=False,
        ):

        is_training = linear_targets is not None
        self.is_randomly_initialized = is_randomly_initialized

        # When get_variable() is used, variables are fetched from within the 'inference' scope
        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings
            char_embed_table = tf.get_variable(
                    'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
            # [N, T_in, embedding_size]
            char_embedded_inputs = \
                    tf.nn.embedding_lookup(char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:
                    speaker_embed_table = tf.get_variable(
                            'speaker_embedding',
                            [self.num_speakers, hp.speaker_embedding_size], dtype=tf.float32,
                            initializer=tf.truncated_normal_initializer(stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(
                                speaker_id, self.num_speakers, 
                                hp.enc_prenet_sizes[-1], "before_highway")
                        encoder_rnn_init_state = get_embed(
                                speaker_id, self.num_speakers, 
                                hp.enc_rnn_size * 2, "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                                speaker_id, self.num_speakers, 
                                hp.attention_state_size, "attention_rnn_init_state")
                        decoder_rnn_init_states = [get_embed(
                                speaker_id, self.num_speakers, 
                                hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) \
                                        for idx in range(hp.dec_layer_num)]
                    else:
                        deep_dense = lambda x, dim: \
                                tf.layers.dense(x, dim, activation=tf.nn.softsign)

                        before_highway = deep_dense(
                                speaker_embed, hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                                speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                                speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [deep_dense(
                                speaker_embed, hp.dec_rnn_size) for _ in range(hp.dec_layer_num)]

                    speaker_embed = None # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(" [!] Unkown multi-speaker model type: {}".format(hp.model_type))
            else:
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(char_embedded_inputs, is_training,
                    hp.enc_prenet_sizes, hp.dropout_prob,
                    scope='prenet')

            encoder_outputs = cbhg(
                    prenet_outputs, input_lengths, is_training,
                    hp.enc_bank_size, hp.enc_bank_channel_size,
                    hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
                    hp.enc_proj_sizes, hp.enc_proj_width,
                    scope="encoder_cbhg",
                    before_highway=before_highway,
                    encoder_rnn_init_state=encoder_rnn_init_state)


            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                    tf.bool, shape=(), name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                    tf.float32, shape=[None, None, None], name="manual_alignments",
            )

            dec_prenet_outputs = DecoderPrenetWrapper(
                    GRUCell(hp.attention_state_size),
                    speaker_embed,
                    is_training, hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                        hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                        hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                        hp.attention_size, encoder_outputs, scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                        hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                        hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(
                        hp.attention_size, encoder_outputs, shift_width=shift_width)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(hp.attention_type))

            attention_cell = AttentionWrapper(
                    dec_prenet_outputs,
                    attention_mechanism,
                    self.is_manual_attention,
                    self.manual_alignments,
                    initial_cell_state=attention_rnn_init_state,
                    alignment_history=True,
                    output_attention=False
            )

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(
                    attention_cell, embed_to_concat=speaker_embed)
                        
            # Decoder (layers specified bottom to top):
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
            for _ in range(hp.dec_layer_num):
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                    decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(" [!] Shape {} and {} should be equal". \
                                format(shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(
                        inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                        rnn_decoder_test_mode)
            else:
                helper = TacoTestHelper(
                        batch_size, hp.num_mels, hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(
                            BasicDecoder(output_cell, helper, decoder_init_state),
                            maximum_iterations=hp.max_iters)

            # [N, T_out, M]
            mel_outputs = tf.reshape(
                    decoder_outputs, [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(
                    mel_outputs, None, is_training,
                    hp.post_bank_size, hp.post_bank_channel_size,
                    hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
                    hp.post_proj_sizes, hp.post_proj_width,
                    scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                        expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = \
                        tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)    # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                    final_decoder_state[0].alignment_history.stack(), [1, 2, 0])


            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('='*40)
            log(' model_type: %s' % hp.model_type)
            log('='*40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' % char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' % speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' % attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
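
# get_embed() is referenced above but not defined in this example. A minimal
# sketch of what such a helper presumably does: look up a per-speaker embedding
# of the requested size under the given variable name (an assumption for
# illustration, not the original implementation).
def get_embed(inputs, num_inputs, embed_size, name):
    embed_table = tf.get_variable(
        name, [num_inputs, embed_size], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.1))
    return tf.nn.embedding_lookup(embed_table, inputs)  # [N, embed_size]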
Exemplo n.º 20
0
    def __init__(self, sess, config, api, log_dir, forward, scope=None):
        # self.self_label = tf.placeholder(dtype=tf.bool,shape=(None), name="self_label")
        self.self_label = False
        self.vocab = api.vocab
        self.rev_vocab = api.rev_vocab
        self.vocab_size = len(self.vocab)
        self.sess = sess
        self.scope = scope
        self.max_utt_len = config.max_utt_len
        self.go_id = self.rev_vocab["<s>"]
        self.eos_id = self.rev_vocab["</s>"]
        self.context_cell_size = config.cxt_cell_size
        self.sent_cell_size = config.sent_cell_size
        self.dec_cell_size = config.dec_cell_size

        with tf.name_scope("io"):
            self.input_contexts = tf.placeholder(dtype=tf.int32,
                                                 shape=(None, None),
                                                 name="dialog_context")
            self.context_lens = tf.placeholder(dtype=tf.int32,
                                               shape=(None, ),
                                               name="context_lens")

            self.output_tokens = tf.placeholder(dtype=tf.int32,
                                                shape=(None, None),
                                                name="output_token")
            self.output_lens = tf.placeholder(dtype=tf.int32,
                                              shape=(None, ),
                                              name="output_lens")

            # optimization related variables
            self.learning_rate = tf.Variable(float(config.init_lr),
                                             trainable=False,
                                             name="learning_rate")
            self.learning_rate_decay_op = self.learning_rate.assign(
                tf.multiply(self.learning_rate, config.lr_decay))
            self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")
            self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

        max_input_len = array_ops.shape(self.input_contexts)[1]
        max_out_len = array_ops.shape(self.output_tokens)[1]
        batch_size = array_ops.shape(self.input_contexts)[0]

        with variable_scope.variable_scope("wordEmbedding"):
            self.embedding = tf.get_variable(
                "embedding", [self.vocab_size, config.embed_size],
                dtype=tf.float32)
            embedding_mask = tf.constant(
                [0 if i == 0 else 1 for i in range(self.vocab_size)],
                dtype=tf.float32,
                shape=[self.vocab_size, 1])
            embedding = self.embedding * embedding_mask

            input_embedding = embedding_ops.embedding_lookup(
                embedding, self.input_contexts)

            output_embedding = embedding_ops.embedding_lookup(
                embedding, self.output_tokens)

            if config.sent_type == "rnn":
                sent_cell = self.get_rnncell("gru", self.sent_cell_size,
                                             config.keep_prob, 1)
                input_embedding, sent_size = get_rnn_encode(input_embedding,
                                                            sent_cell,
                                                            scope="sent_rnn")
                output_embedding, _ = get_rnn_encode(output_embedding,
                                                     sent_cell,
                                                     self.output_lens,
                                                     scope="sent_rnn",
                                                     reuse=True)
            elif config.sent_type == "bi_rnn":
                fwd_sent_cell = self.get_rnncell("gru",
                                                 self.sent_cell_size,
                                                 keep_prob=1.0,
                                                 num_layer=1)
                bwd_sent_cell = self.get_rnncell("gru",
                                                 self.sent_cell_size,
                                                 keep_prob=1.0,
                                                 num_layer=1)
                input_embedding, sent_size = get_bi_rnn_encode(
                    input_embedding,
                    fwd_sent_cell,
                    bwd_sent_cell,
                    self.context_lens,
                    scope="sent_bi_rnn")
                output_embedding, _ = get_bi_rnn_encode(output_embedding,
                                                        fwd_sent_cell,
                                                        bwd_sent_cell,
                                                        self.output_lens,
                                                        scope="sent_bi_rnn",
                                                        reuse=True)
            else:
                raise ValueError(
                    "Unknown sent_type. Must be one of [rnn, bi_rnn]")

            # apply dropout to the encoded inputs
            if config.keep_prob < 1.0:
                input_embedding = tf.nn.dropout(input_embedding,
                                                config.keep_prob)

        with variable_scope.variable_scope("contextRNN"):
            enc_cell = self.get_rnncell(config.cell_type,
                                        self.context_cell_size,
                                        keep_prob=1.0,
                                        num_layer=config.num_layer)
            # sequence_length is given, so enc_last_state is the true last state
            input_embedding = tf.expand_dims(input_embedding, axis=2)
            _, enc_last_state = tf.nn.dynamic_rnn(
                enc_cell,
                input_embedding,
                dtype=tf.float32,
                sequence_length=self.context_lens)

            if config.num_layer > 1:
                if config.cell_type == 'lstm':
                    enc_last_state = [temp.h for temp in enc_last_state]

                enc_last_state = tf.concat(enc_last_state, 1)
            else:
                if config.cell_type == 'lstm':
                    enc_last_state = enc_last_state.h

        # recognition network: infer z from [enc_last_state, output_embedding], i.e. q(z | c, x)
        with variable_scope.variable_scope("recognitionNetwork"):
            recog_input = tf.concat([enc_last_state, output_embedding], 1)
            self.recog_mulogvar = recog_mulogvar = layers.fully_connected(
                recog_input,
                config.latent_size * 2,
                activation_fn=None,
                scope="muvar")
            recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=1)

        with variable_scope.variable_scope("priorNetwork"):
            # P(XYZ)=P(Z|X)P(X)P(Y|X,Z)
            prior_fc1 = layers.fully_connected(enc_last_state,
                                               np.maximum(
                                                   config.latent_size * 2,
                                                   100),
                                               activation_fn=tf.tanh,
                                               scope="fc1")
            prior_mulogvar = layers.fully_connected(prior_fc1,
                                                    config.latent_size * 2,
                                                    activation_fn=None,
                                                    scope="muvar")
            prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=1)

            # draw z from the prior or from the recognition (posterior) network, depending on use_prior
            # (see the sample_gaussian sketch after this example)
            latent_sample = tf.cond(
                self.use_prior,
                lambda: sample_gaussian(prior_mu, prior_logvar),
                lambda: sample_gaussian(recog_mu, recog_logvar))

        with variable_scope.variable_scope("label_encoder"):
            le_embedding_mask = tf.constant(
                [0 if i == 0 else 1 for i in range(self.vocab_size)],
                dtype=tf.float32,
                shape=[self.vocab_size, 1])
            le_embedding = self.embedding * le_embedding_mask

            le_input_embedding = embedding_ops.embedding_lookup(
                le_embedding, self.input_contexts)

            le_output_embedding = embedding_ops.embedding_lookup(
                le_embedding, self.output_tokens)

            if config.sent_type == "rnn":
                le_sent_cell = self.get_rnncell("gru", self.sent_cell_size,
                                                config.keep_prob, 1)
                le_input_embedding, le_sent_size = get_rnn_encode(
                    le_input_embedding, le_sent_cell, scope="sent_rnn")
                le_output_embedding, _ = get_rnn_encode(le_output_embedding,
                                                        le_sent_cell,
                                                        self.output_lens,
                                                        scope="sent_rnn",
                                                        reuse=True)
            elif config.sent_type == "bi_rnn":
                le_fwd_sent_cell = self.get_rnncell("gru",
                                                    self.sent_cell_size,
                                                    keep_prob=1.0,
                                                    num_layer=1)
                le_bwd_sent_cell = self.get_rnncell("gru",
                                                    self.sent_cell_size,
                                                    keep_prob=1.0,
                                                    num_layer=1)
                le_input_embedding, le_sent_size = get_bi_rnn_encode(
                    le_input_embedding,
                    le_fwd_sent_cell,
                    le_bwd_sent_cell,
                    self.context_lens,
                    scope="sent_bi_rnn")
                le_output_embedding, _ = get_bi_rnn_encode(le_output_embedding,
                                                           le_fwd_sent_cell,
                                                           le_bwd_sent_cell,
                                                           self.output_lens,
                                                           scope="sent_bi_rnn",
                                                           reuse=True)
            else:
                raise ValueError(
                    "Unknown sent_type. Must be one of [rnn, bi_rnn]")

            # apply dropout to the encoded inputs
            if config.keep_prob < 1.0:
                le_input_embedding = tf.nn.dropout(le_input_embedding,
                                                   config.keep_prob)

        # [le_enc_last_state, le_output_embedding]
        with variable_scope.variable_scope("lecontextRNN"):
            enc_cell = self.get_rnncell(config.cell_type,
                                        self.context_cell_size,
                                        keep_prob=1.0,
                                        num_layer=config.num_layer)
            # sequence_length is given, so le_enc_last_state is the true last state
            le_input_embedding = tf.expand_dims(le_input_embedding, axis=2)
            _, le_enc_last_state = tf.nn.dynamic_rnn(
                enc_cell,
                le_input_embedding,
                dtype=tf.float32,
                sequence_length=self.context_lens)

            if config.num_layer > 1:
                if config.cell_type == 'lstm':
                    le_enc_last_state = [temp.h for temp in le_enc_last_state]

                le_enc_last_state = tf.concat(le_enc_last_state, 1)
            else:
                if config.cell_type == 'lstm':
                    le_enc_last_state = le_enc_last_state.h
            best_en = tf.concat([le_enc_last_state, le_output_embedding], 1)

        with variable_scope.variable_scope("ggammaNet"):
            enc_cell = self.get_rnncell(config.cell_type,
                                        200,
                                        keep_prob=1.0,
                                        num_layer=config.num_layer)
            # sequence_length is given, so zlabel is the true last state
            input_embedding = tf.expand_dims(best_en, axis=2)
            _, zlabel = tf.nn.dynamic_rnn(enc_cell,
                                          input_embedding,
                                          dtype=tf.float32,
                                          sequence_length=self.context_lens)

            if config.num_layer > 1:
                if config.cell_type == 'lstm':
                    zlabel = [temp.h for temp in zlabel]

                zlabel = tf.concat(zlabel, 1)
            else:
                if config.cell_type == 'lstm':
                    zlabel = zlabel.h

        with variable_scope.variable_scope("generationNetwork"):
            gen_inputs = tf.concat([enc_last_state, latent_sample], 1)

            dec_inputs = gen_inputs
            selected_attribute_embedding = None

            # Decoder_init_state
            if config.num_layer > 1:
                dec_init_state = []
                for i in range(config.num_layer):
                    temp_init = layers.fully_connected(dec_inputs,
                                                       self.dec_cell_size,
                                                       activation_fn=None,
                                                       scope="init_state-%d" %
                                                       i)
                    if config.cell_type == 'lstm':
                        temp_init = rnn_cell.LSTMStateTuple(
                            temp_init, temp_init)

                    dec_init_state.append(temp_init)

                dec_init_state = tuple(dec_init_state)
            else:
                dec_init_state = layers.fully_connected(dec_inputs,
                                                        self.dec_cell_size,
                                                        activation_fn=None,
                                                        scope="init_state")
                if config.cell_type == 'lstm':
                    dec_init_state = rnn_cell.LSTMStateTuple(
                        dec_init_state, dec_init_state)

        with variable_scope.variable_scope("generationNetwork1"):
            gen_inputs_sl = tf.concat([le_enc_last_state, zlabel], 1)

            dec_inputs_sl = gen_inputs_sl
            selected_attribute_embedding = None

            # Decoder_init_state
            if config.num_layer > 1:
                dec_init_state_sl = []
                for i in range(config.num_layer):
                    temp_init = layers.fully_connected(dec_inputs_sl,
                                                       self.dec_cell_size,
                                                       activation_fn=None,
                                                       scope="init_state-%d" %
                                                       i)
                    if config.cell_type == 'lstm':
                        temp_init = rnn_cell.LSTMStateTuple(
                            temp_init, temp_init)

                    dec_init_state_sl.append(temp_init)

                dec_init_state_sl = tuple(dec_init_state_sl)
            else:
                dec_init_state_sl = layers.fully_connected(dec_inputs_sl,
                                                           self.dec_cell_size,
                                                           activation_fn=None,
                                                           scope="init_state")
                if config.cell_type == 'lstm':
                    dec_init_state_sl = rnn_cell.LSTMStateTuple(
                        dec_init_state_sl, dec_init_state_sl)

        with variable_scope.variable_scope("decoder"):
            dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size,
                                        config.keep_prob, config.num_layer)
            dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size)

            if forward:
                loop_func = decoder_fn_lib.context_decoder_fn_inference(
                    None,
                    dec_init_state,
                    embedding,
                    start_of_sequence_id=self.go_id,
                    end_of_sequence_id=self.eos_id,
                    maximum_length=self.max_utt_len,
                    num_decoder_symbols=self.vocab_size,
                    context_vector=selected_attribute_embedding)
                loop_func_sl = decoder_fn_lib.context_decoder_fn_inference(
                    None,
                    dec_init_state_sl,
                    le_embedding,
                    start_of_sequence_id=self.go_id,
                    end_of_sequence_id=self.eos_id,
                    maximum_length=self.max_utt_len,
                    num_decoder_symbols=self.vocab_size,
                    context_vector=selected_attribute_embedding)

                dec_input_embedding = None
                dec_input_embedding_sl = None
                dec_seq_lens = None
            else:
                loop_func = decoder_fn_lib.context_decoder_fn_train(
                    dec_init_state, selected_attribute_embedding)
                loop_func_sl = decoder_fn_lib.context_decoder_fn_train(
                    dec_init_state_sl, selected_attribute_embedding)

                dec_input_embedding = embedding_ops.embedding_lookup(
                    embedding, self.output_tokens)
                dec_input_embedding_sl = embedding_ops.embedding_lookup(
                    le_embedding, self.output_tokens)

                dec_input_embedding = dec_input_embedding[:, 0:-1, :]
                dec_input_embedding_sl = dec_input_embedding_sl[:, 0:-1, :]

                dec_seq_lens = self.output_lens - 1

                if config.keep_prob < 1.0:
                    dec_input_embedding = tf.nn.dropout(
                        dec_input_embedding, config.keep_prob)
                    dec_input_embedding_sl = tf.nn.dropout(
                        dec_input_embedding_sl, config.keep_prob)

                # apply word dropout: set dropped words to 0
                if config.dec_keep_prob < 1.0:
                    keep_mask = tf.less_equal(
                        tf.random_uniform((batch_size, max_out_len - 1),
                                          minval=0.0,
                                          maxval=1.0), config.dec_keep_prob)
                    keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2)
                    dec_input_embedding = dec_input_embedding * keep_mask
                    dec_input_embedding_sl = dec_input_embedding_sl * keep_mask
                    dec_input_embedding = tf.reshape(
                        dec_input_embedding,
                        [-1, max_out_len - 1, config.embed_size])
                    dec_input_embedding_sl = tf.reshape(
                        dec_input_embedding_sl,
                        [-1, max_out_len - 1, config.embed_size])

            dec_outs, _, final_context_state = dynamic_rnn_decoder(
                dec_cell,
                loop_func,
                inputs=dec_input_embedding,
                sequence_length=dec_seq_lens)

            dec_outs_sl, _, final_context_state_sl = dynamic_rnn_decoder(
                dec_cell,
                loop_func_sl,
                inputs=dec_input_embedding_sl,
                sequence_length=dec_seq_lens)

            if final_context_state is not None:
                final_context_state = final_context_state[:, 0:array_ops.
                                                          shape(dec_outs)[1]]
                mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2)))
                self.dec_out_words = tf.multiply(
                    tf.reverse(final_context_state, axis=[1]), mask)
            else:
                self.dec_out_words = tf.argmax(dec_outs, 2)

            if final_context_state_sl is not None:
                final_context_state_sl = final_context_state_sl[:, 0:array_ops.
                                                                shape(
                                                                    dec_outs_sl
                                                                )[1]]
                mask_sl = tf.to_int32(
                    tf.sign(tf.reduce_max(dec_outs_sl, axis=2)))
                self.dec_out_words_sl = tf.multiply(
                    tf.reverse(final_context_state_sl, axis=[1]), mask_sl)
            else:
                self.dec_out_words_sl = tf.argmax(dec_outs_sl, 2)

        if not forward:
            with variable_scope.variable_scope("loss"):

                labels = self.output_tokens[:, 1:]
                label_mask = tf.to_float(tf.sign(labels))

                rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=dec_outs, labels=labels)

                rc_loss = tf.reduce_sum(rc_loss * label_mask,
                                        reduction_indices=1)
                self.avg_rc_loss = tf.reduce_mean(rc_loss)

                sl_rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=dec_outs_sl, labels=labels)
                sl_rc_loss = tf.reduce_sum(sl_rc_loss * label_mask,
                                           reduction_indices=1)
                self.sl_rc_loss = tf.reduce_mean(sl_rc_loss)
                # used only for perplexity calculation; not used for optimization
                self.rc_ppl = tf.exp(
                    tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask))
                """ as n-trial multimodal distribution. """

                kld = gaussian_kld(recog_mu, recog_logvar, prior_mu,
                                   prior_logvar)
                self.avg_kld = tf.reduce_mean(kld)
                if log_dir is not None:
                    kl_weights = tf.minimum(
                        tf.to_float(self.global_t) / config.full_kl_step, 1.0)
                else:
                    kl_weights = tf.constant(1.0)

                self.label_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        labels=latent_sample, logits=zlabel))

                self.kl_w = kl_weights
                self.elbo = self.avg_rc_loss + kl_weights * self.avg_kld

                self.cvae_loss = self.elbo + 0.1 * self.label_loss
                self.sl_loss = self.sl_rc_loss

                tf.summary.scalar("rc_loss", self.avg_rc_loss)
                tf.summary.scalar("elbo", self.elbo)
                tf.summary.scalar("kld", self.avg_kld)

                self.summary_op = tf.summary.merge_all()

                self.log_p_z = norm_log_liklihood(latent_sample, prior_mu,
                                                  prior_logvar)
                self.log_q_z_xy = norm_log_liklihood(latent_sample, recog_mu,
                                                     recog_logvar)
                self.est_marginal = tf.reduce_mean(rc_loss - self.log_p_z +
                                                   self.log_q_z_xy)

            self.train_sl_ops = self.optimize(sess,
                                              config,
                                              self.sl_loss,
                                              log_dir,
                                              scope="SL")
            self.train_ops = self.optimize(sess,
                                           config,
                                           self.cvae_loss,
                                           log_dir,
                                           scope="CVAE")

        self.saver = tf.train.Saver(tf.global_variables(),
                                    write_version=tf.train.SaverDef.V2)
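
The model above relies on helpers defined elsewhere in the repository: sample_gaussian (the reparameterized draw used for latent_sample), gaussian_kld (the KL term of the ELBO), and norm_log_liklihood (not reproduced here). As a rough guide to what such helpers conventionally compute, the following is a minimal numpy sketch; the _np-suffixed names and the toy shapes are illustrative assumptions, not the repository's implementation.

import numpy as np

def sample_gaussian_np(mu, logvar):
    """Reparameterization trick: z = mu + sigma * eps, with eps ~ N(0, I)."""
    eps = np.random.randn(*mu.shape)
    return mu + np.exp(0.5 * logvar) * eps

def gaussian_kld_np(recog_mu, recog_logvar, prior_mu, prior_logvar):
    """KL( N(recog_mu, exp(recog_logvar)) || N(prior_mu, exp(prior_logvar)) ), summed over the latent dim."""
    return -0.5 * np.sum(
        1.0 + (recog_logvar - prior_logvar)
        - np.square(prior_mu - recog_mu) / np.exp(prior_logvar)
        - np.exp(recog_logvar) / np.exp(prior_logvar),
        axis=1)

# toy check: when the posterior equals the prior, the KL term vanishes
mu = np.zeros((2, 4))
logvar = np.zeros((2, 4))
assert np.allclose(gaussian_kld_np(mu, logvar, mu, logvar), 0.0)
z = sample_gaussian_np(mu, logvar)   # [2, 4] draws from N(0, I)
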
Exemplo n.º 21
0
    def initialize(self, c_inputs, p_inputs, c_input_lengths, p_input_lengths, mel_targets=None, linear_targets=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          c_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          p_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are phoneme IDs
          c_input_lengths and p_input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(c_inputs)[0]
            input_lengths = c_input_lengths + p_input_lengths  # lengths for the concatenated character + phoneme sequence
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            #
            c_embedded_inputs = tf.nn.embedding_lookup(embedding_table, c_inputs)  # [N, c_T_in, embed_depth=256]
            p_embedded_inputs = tf.nn.embedding_lookup(embedding_table, p_inputs)  # [N, p_T_in, embed_depth=256]
        with tf.variable_scope('Encoder') as scope:

            c_x = c_embedded_inputs
            p_x = p_embedded_inputs

            #3 Conv Layers
            for i in range(3):
                c_x = tf.layers.conv1d(c_x,filters=512,kernel_size=5,padding='same',activation=tf.nn.relu,name='c_Encoder_{}'.format(i))
                c_x = tf.layers.batch_normalization(c_x, training=is_training)
                c_x = tf.layers.dropout(c_x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
            c_encoder_conv_output = c_x

            for i in range(3):
                p_x = tf.layers.conv1d(p_x,filters=512,kernel_size=5,padding='same',activation=tf.nn.relu,name='p_Encoder_{}'.format(i))
                p_x = tf.layers.batch_normalization(p_x, training=is_training)
                p_x = tf.layers.dropout(p_x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
            p_encoder_conv_output = p_x
            
            #bi-directional LSTM
            cell_fw= ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='encoder_fw_LSTM')
            cell_bw= ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='encoder_bw_LSTM')
           
            c_outputs, c_states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, c_encoder_conv_output, sequence_length=c_input_lengths, dtype=tf.float32)
            p_outputs, p_states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, p_encoder_conv_output, sequence_length=p_input_lengths, dtype=tf.float32)

            # c_encoder_outputs = [N, c_T, 2*encoder_lstm_units] = [N, c_T, 512]
            c_encoder_outputs = tf.concat(c_outputs, axis=2) # Concat and return forward + backward outputs
            # p_encoder_outputs = [N, p_T, 2*encoder_lstm_units] = [N, p_T, 512]
            p_encoder_outputs = tf.concat(p_outputs, axis=2)
            # Concat and return character + phoneme = [N, c_T+p_T, 512]
            encoder_outputs = tf.concat([c_encoder_outputs, p_encoder_outputs], axis=1)
            # encoder_outputs = tf.cast(encoder_outputs, tf.float32)
        with tf.variable_scope('Decoder') as scope:
            
            if hp.attention_type == 'loc_sen': # Location Sensitivity Attention
                attention_mechanism = LocationSensitiveAttention(128, encoder_outputs,hparams=hp, is_training=is_training,
                                    mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True)
            elif hp.attention_type == 'gmm': # GMM Attention
                attention_mechanism = GmmAttention(128, memory=encoder_outputs, memory_sequence_length = input_lengths) 
            elif hp.attention_type == 'step_bah':
                attention_mechanism = BahdanauStepwiseMonotonicAttention(128, encoder_outputs, memory_sequence_length = input_lengths, mode="parallel")
            elif hp.attention_type == 'mon_bah':
                attention_mechanism = BahdanauMonotonicAttention(128, encoder_outputs, memory_sequence_length = input_lengths, normalize=True)
            elif hp.attention_type == 'loung':
                attention_mechanism = LuongAttention(128, encoder_outputs, memory_sequence_length = input_lengths) 

            # attention_mechanism = LocationSensitiveAttention(128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True)
            #mask_encoder: whether to mask encoder padding while computing location sensitive attention. Set to True for better prosody but slower convergence.
            #cumulate_weights: Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True)
            
            decoder_lstm = [ZoneoutLSTMCell(1024, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='decoder_LSTM_{}'.format(i+1)) for i in range(2)]
            
            decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True)
            # decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)  # not available in TensorFlow 1
            
            attention_cell = AttentionWrapper(decoder_lstm, attention_mechanism, alignment_history=True, output_attention=False)

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)
            dec_outputs_cell = OutputProjectionWrapper(dec_outputs,(hp.num_mels) * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(c_inputs, p_inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)
                
            decoder_init_state = dec_outputs_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry (see the numpy sketch after this example)
            decoder_mel_outputs = tf.reshape(decoder_outputs[:,:,:hp.num_mels * hp.outputs_per_step], [batch_size, -1, hp.num_mels])  # [N, T_out, M]
            #stop_token_outputs = tf.reshape(decoder_outputs[:,:,hp.num_mels * hp.outputs_per_step:], [batch_size, -1]) # [N,iters]
            
            # Postnet
            x = decoder_mel_outputs
            for i in range(5):
                activation = tf.nn.tanh if i != (4) else None
                x = tf.layers.conv1d(x,filters=512, kernel_size=5, padding='same', activation=activation, name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x, rate=0.5, training=is_training, name='Postnet_dropout_{}'.format(i))
 
            residual = tf.layers.dense(x, hp.num_mels, name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, hp.postnet_depth)
            linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)    # [N, T_out, F(1025)]
 
            # Grab alignments from the final decoder state:
            alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])  # batch_size, text length(encoder), target length(decoder)
 
			
            self.c_inputs = c_inputs
            self.p_inputs = p_inputs
            self.c_input_lengths = c_input_lengths
            self.p_input_lengths = p_input_lengths
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            #self.stop_token_targets = stop_token_targets
            #self.stop_token_outputs = stop_token_outputs
            self.all_vars = tf.trainable_variables()
            log('Initialized Tacotron model. Dimensions: ')
            log('  c_embedding:               %d' % c_embedded_inputs.shape[-1])
            log('  p_embedding:               %d' % p_embedded_inputs.shape[-1])
            # log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            #log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % dec_outputs_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
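
The decoder in this example predicts hp.outputs_per_step (r) mel frames per step, and the code above regroups [N, T_out/r, M*r] into [N, T_out, M] with a single reshape. Below is a small numpy sketch of that regrouping, with made-up sizes standing in for the project's hparams.

import numpy as np

N, M, r, steps = 2, 80, 3, 4                       # batch, num_mels, outputs_per_step, decoder steps
decoder_outputs = np.random.rand(N, steps, M * r)  # [N, T_out/r, M*r]

# each decoder step packs r consecutive frames side by side,
# so a row-major reshape recovers one mel frame per time step
decoder_mel_outputs = decoder_outputs.reshape(N, steps * r, M)   # [N, T_out, M]

# frame (r + 1) overall is the second frame emitted at decoder step 1
assert np.allclose(decoder_mel_outputs[0, r + 1], decoder_outputs[0, 1, M:2 * M])
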
Exemplo n.º 22
0
def embedding_tied_rnn_seq2seq(encoder_inputs,
                               decoder_inputs,
                               cell,
                               num_symbols,
                               output_projection=None,
                               feed_previous=False,
                               dtype=dtypes.float32,
                               scope=None):
    """Embedding RNN sequence-to-sequence model with tied (shared) parameters.

  This model first embeds encoder_inputs by a newly created embedding (of shape
  [num_symbols x cell.input_size]). Then it runs an RNN to encode embedded
  encoder_inputs into a state vector. Next, it embeds decoder_inputs using
  the same embedding. Then it runs RNN decoder, initialized with the last
  encoder state, on embedded decoder_inputs.

  Args:
    encoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    decoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    cell: RNNCell defining the cell function and size.
    num_symbols: integer; number of symbols for both encoder and decoder.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [cell.output_size x num_symbols] and B has
      shape [num_symbols]; if provided and feed_previous=True, each
      fed previous output will first be multiplied by W and added B.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
      of decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in embedding_rnn_decoder).
      If False, decoder_inputs are used as given (the standard decoder case).
    dtype: The dtype to use for the initial RNN states (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_tied_rnn_seq2seq".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x num_decoder_symbols] containing the generated outputs.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when output_projection has the wrong shape.
  """
    if output_projection is not None:
        proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype)
        proj_weights.get_shape().assert_is_compatible_with(
            [cell.output_size, num_symbols])
        proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with([num_symbols])

    with vs.variable_scope(scope or "embedding_tied_rnn_seq2seq"):
        with ops.device("/cpu:0"):
            embedding = vs.get_variable("embedding",
                                        [num_symbols, cell.input_size])

        emb_encoder_inputs = [
            embedding_ops.embedding_lookup(embedding, x)
            for x in encoder_inputs
        ]
        emb_decoder_inputs = [
            embedding_ops.embedding_lookup(embedding, x)
            for x in decoder_inputs
        ]

        def extract_argmax_and_embed(prev, _):
            """Loop_function that extracts the symbol from prev and embeds it."""
            if output_projection is not None:
                prev = nn_ops.xw_plus_b(prev, output_projection[0],
                                        output_projection[1])
            prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1))
            return embedding_ops.embedding_lookup(embedding, prev_symbol)

        if output_projection is None:
            cell = OutputProjectionWrapper(cell, num_symbols)

        if isinstance(feed_previous, bool):
            loop_function = extract_argmax_and_embed if feed_previous else None
            return tied_rnn_seq2seq(emb_encoder_inputs,
                                    emb_decoder_inputs,
                                    cell,
                                    loop_function=loop_function,
                                    dtype=dtype)
        else:  # If feed_previous is a Tensor, we construct 2 graphs and use cond.
            outputs1, states1 = tied_rnn_seq2seq(
                emb_encoder_inputs,
                emb_decoder_inputs,
                cell,
                loop_function=extract_argmax_and_embed,
                dtype=dtype)
            vs.get_variable_scope().reuse_variables()
            outputs2, states2 = tied_rnn_seq2seq(emb_encoder_inputs,
                                                 emb_decoder_inputs,
                                                 cell,
                                                 dtype=dtype)

            outputs = control_flow_ops.cond(feed_previous, lambda: outputs1,
                                            lambda: outputs2)
            states = control_flow_ops.cond(feed_previous, lambda: states1,
                                           lambda: states2)
            return outputs, states
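
When feed_previous is set, extract_argmax_and_embed above turns the previous step's output into the next step's input: optionally project with (W, B), take the argmax symbol, and look up its embedding. The following numpy sketch shows that single step with toy shapes chosen purely for illustration.

import numpy as np

num_symbols, embed_size, cell_output_size, batch_size = 10, 4, 8, 2
embedding = np.random.rand(num_symbols, embed_size)    # shared encoder/decoder embedding
W = np.random.rand(cell_output_size, num_symbols)      # output projection weights
B = np.zeros(num_symbols)                              # output projection biases

def extract_argmax_and_embed_np(prev, output_projection=None):
    """Map the previous decoder output to the embedding of its argmax symbol."""
    if output_projection is not None:
        W, B = output_projection
        prev = prev @ W + B                 # project cell output to vocabulary logits
    prev_symbol = np.argmax(prev, axis=1)   # greedy choice; no gradient flows through it
    return embedding[prev_symbol]           # [batch_size, embed_size]

prev_cell_output = np.random.rand(batch_size, cell_output_size)
next_input = extract_argmax_and_embed_np(prev_cell_output, (W, B))
assert next_input.shape == (batch_size, embed_size)
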
Exemplo n.º 23
0
    def __init__(self, sess, config, api, log_dir, forward, scope=None):

        self.vocab = api.vocab
        self.rev_vocab = api.rev_vocab
        self.vocab_size = len(self.vocab)

        self.sess = sess
        self.scope = scope
        self.max_utt_len = config.max_utt_len
        self.go_id = self.rev_vocab["<s>"]
        self.eos_id = self.rev_vocab["</s>"]

        self.sent_cell_size = config.sent_cell_size
        self.dec_cell_size = config.dec_cell_size
        self.attention_dim = config.attention_dim

        with tf.name_scope("io"):

            self.input_contexts = tf.placeholder(dtype=tf.int32,
                                                 shape=(None, None,
                                                        self.max_utt_len),
                                                 name="dialog_context")
            self.floors = tf.placeholder(dtype=tf.int32,
                                         shape=(None, None),
                                         name="floor")
            self.context_lens = tf.placeholder(dtype=tf.int32,
                                               shape=(None, ),
                                               name="context_lens")

            self.output_tokens = tf.placeholder(dtype=tf.int32,
                                                shape=(None, None),
                                                name="output_token")
            self.output_lens = tf.placeholder(dtype=tf.int32,
                                              shape=(None, ),
                                              name="output_lens")

            self.learning_rate = tf.Variable(float(config.init_lr),
                                             trainable=False,
                                             name="learning_rate")
            self.learning_rate_decay_op = self.learning_rate.assign(
                tf.multiply(self.learning_rate, config.lr_decay))
            self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")
            self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

        max_dialog_len = array_ops.shape(self.input_contexts)[1]
        max_out_len = array_ops.shape(self.output_tokens)[1]
        batch_size = array_ops.shape(self.input_contexts)[0]
        self.max_dialog_len = max_dialog_len

        with variable_scope.variable_scope("wordEmbedding"):
            self.embedding = tf.get_variable(
                "embedding", [self.vocab_size, config.embed_size],
                dtype=tf.float32)
            embedding_mask = tf.constant(
                [0 if i == 0 else 1 for i in range(self.vocab_size)],
                dtype=tf.float32,
                shape=[self.vocab_size, 1])
            embedding = self.embedding * embedding_mask

            input_embedding = embedding_ops.embedding_lookup(
                embedding, tf.reshape(self.input_contexts, [-1]))
            input_embedding = tf.reshape(
                input_embedding, [-1, self.max_utt_len, config.embed_size])
            output_embedding = embedding_ops.embedding_lookup(
                embedding, self.output_tokens)

            if config.sent_type == "bow":
                input_embedding, sent_size = get_bow(input_embedding)
                output_embedding, _ = get_bow(output_embedding)

            elif config.sent_type == "rnn":
                sent_cell = self.get_rnncell("gru", self.sent_cell_size,
                                             config.keep_prob, 1)
                input_embedding, sent_size = get_rnn_encode(input_embedding,
                                                            sent_cell,
                                                            scope="sent_rnn")
                output_embedding, _ = get_rnn_encode(output_embedding,
                                                     sent_cell,
                                                     self.output_lens,
                                                     scope="sent_rnn",
                                                     reuse=True)
            elif config.sent_type == "bi_rnn":
                fwd_sent_cell = self.get_rnncell("gru",
                                                 self.sent_cell_size,
                                                 keep_prob=1.0,
                                                 num_layer=1)
                bwd_sent_cell = self.get_rnncell("gru",
                                                 self.sent_cell_size,
                                                 keep_prob=1.0,
                                                 num_layer=1)
                input_embedding, sent_size = get_bi_rnn_encode(
                    input_embedding,
                    fwd_sent_cell,
                    bwd_sent_cell,
                    scope="sent_bi_rnn")
                output_embedding, _ = get_bi_rnn_encode(output_embedding,
                                                        fwd_sent_cell,
                                                        bwd_sent_cell,
                                                        self.output_lens,
                                                        scope="sent_bi_rnn",
                                                        reuse=True)
            else:
                raise ValueError(
                    "Unknown sent_type. Must be one of [bow, rnn, bi_rnn]")

            input_embedding = tf.reshape(input_embedding,
                                         [-1, max_dialog_len, sent_size])
            if config.keep_prob < 1.0:
                input_embedding = tf.nn.dropout(input_embedding,
                                                config.keep_prob)

            # convert floors into 1 hot
            floor_one_hot = tf.one_hot(tf.reshape(self.floors, [-1]),
                                       depth=2,
                                       dtype=tf.float32)
            floor_one_hot = tf.reshape(floor_one_hot, [-1, max_dialog_len, 2])

            joint_embedding = tf.concat([input_embedding, floor_one_hot], 2,
                                        "joint_embedding")
            # [batch_size, seq_len, dim]

            # outputs, alphas = self.attention(joint_embedding, sent_size + 2, self.attention_dim)
            # [batch_size, dim]

            attention_embedding = tf.reduce_mean(joint_embedding, 1)

        with variable_scope.variable_scope("generationNetwork"):

            context_embedding = attention_embedding
            context_vector = attention_embedding

            if config.num_layer > 1:
                dec_init_state = []
                for i in range(config.num_layer):
                    temp_init = layers.fully_connected(context_embedding,
                                                       self.dec_cell_size,
                                                       activation_fn=None,
                                                       scope="init_state-%d" %
                                                       i)
                    if config.cell_type == 'lstm':
                        temp_init = rnn_cell.LSTMStateTuple(
                            temp_init, temp_init)

                    dec_init_state.append(temp_init)

                dec_init_state = tuple(dec_init_state)
            else:
                dec_init_state = layers.fully_connected(context_embedding,
                                                        self.dec_cell_size,
                                                        activation_fn=None,
                                                        scope="init_state")
                if config.cell_type == 'lstm':
                    dec_init_state = rnn_cell.LSTMStateTuple(
                        dec_init_state, dec_init_state)

        with variable_scope.variable_scope("decoder"):
            dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size,
                                        config.keep_prob, config.num_layer)
            dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size)

            if forward:
                loop_func = decoder_fn_lib.context_decoder_fn_inference(
                    None,
                    dec_init_state,
                    embedding,
                    start_of_sequence_id=self.go_id,
                    end_of_sequence_id=self.eos_id,
                    maximum_length=self.max_utt_len,
                    num_decoder_symbols=self.vocab_size,
                    context_vector=context_vector)

                dec_input_embedding = None
                dec_seq_lens = None
            else:

                loop_func = decoder_fn_lib.context_decoder_fn_train(
                    dec_init_state, context_vector)
                dec_input_embedding = embedding_ops.embedding_lookup(
                    embedding, self.output_tokens)
                dec_input_embedding = dec_input_embedding[:, 0:-1, :]
                dec_seq_lens = self.output_lens - 1

                if config.keep_prob < 1.0:
                    dec_input_embedding = tf.nn.dropout(
                        dec_input_embedding, config.keep_prob)

                # apply word dropout: set dropped words to 0 (see the numpy sketch after this example)
                if config.dec_keep_prob < 1.0:
                    keep_mask = tf.less_equal(
                        tf.random_uniform((batch_size, max_out_len - 1),
                                          minval=0.0,
                                          maxval=1.0), config.dec_keep_prob)
                    keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2)
                    dec_input_embedding = dec_input_embedding * keep_mask
                    dec_input_embedding = tf.reshape(
                        dec_input_embedding,
                        [-1, max_out_len - 1, config.embed_size])

            dec_outs, _, final_context_state = dynamic_rnn_decoder(
                dec_cell,
                loop_func,
                inputs=dec_input_embedding,
                sequence_length=dec_seq_lens)
            if final_context_state is not None:
                final_context_state = final_context_state[:, 0:array_ops.
                                                          shape(dec_outs)[1]]
                mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2)))
                self.dec_out_words = tf.multiply(
                    tf.reverse(final_context_state, axis=[1]), mask)
            else:
                self.dec_out_words = tf.argmax(dec_outs, 2)

        if not forward:
            with variable_scope.variable_scope("loss"):
                labels = self.output_tokens[:, 1:]
                label_mask = tf.to_float(tf.sign(labels))

                rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=dec_outs, labels=labels)
                rc_loss = tf.reduce_sum(rc_loss * label_mask,
                                        reduction_indices=1)
                self.avg_rc_loss = tf.reduce_mean(rc_loss)

                self.rc_ppl = tf.exp(
                    tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask))

                aug_elbo = self.avg_rc_loss

                tf.summary.scalar("rc_loss", self.avg_rc_loss)
                self.summary_op = tf.summary.merge_all()

            self.optimize(sess, config, aug_elbo, log_dir)

        self.saver = tf.train.Saver(tf.global_variables(),
                                    write_version=tf.train.SaverDef.V2)
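
Both dialog models in this document guard the decoder with word dropout when config.dec_keep_prob < 1.0: a Bernoulli keep mask is drawn per token and broadcast over the embedding dimension, so dropped words become all-zero vectors. A numpy sketch of that masking, with assumed toy sizes:

import numpy as np

batch_size, max_out_len, embed_size = 2, 6, 4
dec_keep_prob = 0.7
dec_input_embedding = np.random.rand(batch_size, max_out_len - 1, embed_size)

# keep a token wherever a uniform draw falls at or below dec_keep_prob
keep_mask = (np.random.uniform(0.0, 1.0, size=(batch_size, max_out_len - 1))
             <= dec_keep_prob).astype(np.float32)
keep_mask = keep_mask[:, :, None]                       # broadcast over embed_size
dec_input_embedding = dec_input_embedding * keep_mask   # dropped tokens become zero vectors

dropped = np.where(keep_mask[:, :, 0] == 0.0)
assert np.all(dec_input_embedding[dropped] == 0.0)
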
Exemplo n.º 24
0
    def initialize(self, txt_targets, txt_lengths, mel_targets, image_targets):
        with tf.variable_scope('inference') as scope:
            is_training = mel_targets is not None
            is_teacher_force_generating = mel_targets is not None
            batch_size = tf.shape(txt_targets)[0]
            hp = self._hparams

            # Embeddings for text
            embedding_table = tf.get_variable(
                'text_embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_txt_inputs = tf.nn.embedding_lookup(
                embedding_table, txt_targets)  # [N, T_in, 256]

            # Text Encoder
            prenet_outputs = prenet(embedded_txt_inputs,
                                    is_training)  # [N, T_in, 128]
            txt_encoder_outputs = encoder_cbhg(prenet_outputs, txt_lengths,
                                               is_training)  # [N, T_in, 256]
            self.z_txt = txt_encoder_outputs

            # Speech Encoder
            speech_outputs = reference_encoder(
                mel_targets,
                filters=hp.reference_filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(hp.reference_depth),
                is_training=is_training)  # [N, 256]
            self.z_speech = speech_outputs

            # Image Encoder
            img_outputs = image_encoder('E',
                                        is_training=is_training,
                                        norm='batch',
                                        image_size=128)
            self.z_img = img_outputs

            def global_body(self, input):
                # Global computing body (share weights)
                # information fusion encoder
                self.z_fuse = info_encoder(input)  # [N, 1, 256]
                # Global  tokens (GST)
                gst_tokens = tf.get_variable(
                    'global_tokens',
                    [hp.num_gst, hp.embed_depth // hp.num_heads],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                self.gst_tokens = gst_tokens

                # Attention of the fused query over the global tokens (see the numpy sketch after this example)
                attention = MultiheadAttention(
                    tf.expand_dims(self.z_fuse, axis=1),  # [N, 1, 256]
                    tf.tanh(
                        tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1
                                 ])),  # [N, hp.num_gst, 256/hp.num_heads]   
                    num_heads=hp.num_heads,
                    num_units=hp.style_att_dim,
                    attention_type=hp.style_att_type)

                output = attention.multi_head_attention()  # [N, 1, 256]
                self.uni_embedding = output
                return self.uni_embedding

            # Domain classification network
            domain_logit_txt = domain_classifier('D',
                                                 info_encoder(self.z_txt),
                                                 is_training=is_training,
                                                 norm='batch')

            domain_logit_img = domain_classifier('D',
                                                 info_encoder(self.z_img),
                                                 is_training=is_training,
                                                 norm='batch')

            domain_logit_speech = domain_classifier('D',
                                                    info_encoder(self.z_speech),
                                                    is_training=is_training,
                                                    norm='batch')

        # out of inference scope
        # Add style embedding to every text encoder state

        # Text Decoder scope
        with tf.variable_scope('text_decoder') as scope:

            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth,
                                  self.uni_embedding,
                                  memory_sequence_length=txt_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, hp.rnn_depth),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            output_cell = OutputProjectionWrapper(decoder_cell,
                                                  hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            # teacher-forced text decoding over the embedded text inputs
            decoder_outputs, _ = tf.nn.dynamic_rnn(
                cell=output_cell,
                inputs=embedded_txt_inputs,
                initial_state=decoder_init_state)  # [N, T_in, outputs_per_step]
            with tf.variable_scope('text_logits') as scope:
                txt_logit = tf.contrib.layers.fully_connected(
                    inputs=decoder_outputs,
                    num_outputs=self.config.vocab_size,
                    activation_fn=None,
                    weights_initializer=self.initializer,
                    scope=scope)

        # Image Decoder scope
        with tf.variable_scope('image_decoder') as scope:
            G = Generator('G',
                          is_train=is_training,
                          norm='batch',
                          image_size=128)
            fake_img = G(self.uni_embedding)

        # Speech Decoder scope
        with tf.variable_scope('speech_decoder') as scope:
            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth,
                                  self.uni_embedding,
                                  memory_sequence_length=txt_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, hp.rnn_depth),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(txt_targets, mel_targets, hp)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            fake_mel = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        self.txt_targets = txt_targets
        self.txt_lengths = txt_lengths
        self.mel_targets = mel_targets
        self.image_targets = image_targets
        self.txt_logit = txt_logit
        self.fake_mel = fake_mel
        self.fake_img = fake_img
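
The fusion step in this last example attends from a single fused query vector over a bank of global tokens, in the same spirit as the GST attention earlier in this document. The repository's MultiheadAttention wrapper is not reproduced here; the following is a minimal single-head scaled dot-product sketch in numpy, with illustrative sizes only.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

N, num_gst, d = 2, 10, 256
query = np.random.rand(N, 1, d)               # fused embedding, one query per example
tokens = np.tanh(np.random.rand(num_gst, d))  # global token bank, shared across the batch
memory = np.broadcast_to(tokens, (N, num_gst, d))

scores = query @ memory.transpose(0, 2, 1) / np.sqrt(d)   # [N, 1, num_gst]
weights = softmax(scores, axis=-1)                         # attention weights over the tokens
style_embedding = weights @ memory                         # [N, 1, d] weighted sum of tokens

assert style_embedding.shape == (N, 1, d)
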