Example #1
def decoder(inputs, encoder_outputs, is_training, batch_size, mel_targets):
    """Decoder

    Prenet -> Attention RNN
    Postprocessing CBHG

    @param    inputs              int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
                                  steps in the input time series, and values are character IDs
    @param    encoder_outputs     outputs from the encoder with shape [N, T_in, prenet_depth=256]
    @param    is_training         flag for training or eval
    @param    batch_size          number of samples per batch
    @param    mel_targets         float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
                                  of steps in the output time series, M is num_mels, and values are entries in the mel
                                  spectrogram; only needed for training

    @return                       linear_outputs, mel_outputs and alignments
    """

    if is_training:
        helper = TacoTrainingHelper(inputs, mel_targets, hparams.num_mels,
                                    hparams.outputs_per_step)
    else:
        helper = TacoTestHelper(batch_size, hparams.num_mels,
                                hparams.outputs_per_step)

    # Attention
    attention_cell = AttentionWrapper(
        GRUCell(hparams.attention_depth),
        BahdanauAttention(hparams.attention_depth, encoder_outputs),
        alignment_history=True,
        output_attention=False)  # [N, T_in, attention_depth=256]

    # Apply prenet before concatenation in AttentionWrapper.
    attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                          hparams.prenet_depths)

    # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
    concat_cell = ConcatOutputAndAttentionWrapper(
        attention_cell)  # [N, T_in, 2*attention_depth=512]

    # Decoder (layers specified bottom to top):
    decoder_cell = MultiRNNCell(
        [
            OutputProjectionWrapper(concat_cell, hparams.decoder_depth),
            ResidualWrapper(GRUCell(hparams.decoder_depth)),
            ResidualWrapper(GRUCell(hparams.decoder_depth))
        ],
        state_is_tuple=True)  # [N, T_in, decoder_depth=256]

    # Project onto r mel spectrograms (predict r outputs at each RNN step):
    output_cell = OutputProjectionWrapper(
        decoder_cell, hparams.num_mels * hparams.outputs_per_step)

    decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                dtype=tf.float32)

    (decoder_outputs,
     _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
         BasicDecoder(output_cell, helper, decoder_init_state),
         maximum_iterations=hparams.max_iters)  # [N, T_out/r, M*r]

    # Reshape outputs to be one output per entry
    mel_outputs = tf.reshape(
        decoder_outputs, [batch_size, -1, hparams.num_mels])  # [N, T_out, M]

    # Add post-processing CBHG:
    post_outputs = post_cbhg(
        mel_outputs,
        hparams.num_mels,
        is_training,  # [N, T_out, postnet_depth=256]
        hparams.postnet_depth)
    linear_outputs = tf.layers.dense(post_outputs,
                                     hparams.num_freq)  # [N, T_out, F]

    # Grab alignments from the final decoder state:
    alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(),
                              [1, 2, 0])

    log('Decoder Network ...')
    log('  attention out:             %d' % attention_cell.output_size)
    log('  concat attn & out:         %d' % concat_cell.output_size)
    log('  decoder cell out:          %d' % decoder_cell.output_size)
    log('  decoder out (%d frames):   %d' %
        (hparams.outputs_per_step, decoder_outputs.shape[-1]))
    log('  decoder out (1 frame):     %d' % mel_outputs.shape[-1])
    log('  postnet out:               %d' % post_outputs.shape[-1])
    log('  linear out:                %d' % linear_outputs.shape[-1])

    return linear_outputs, mel_outputs, alignments
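
A quick numpy sketch (toy sizes, not from the repo above) of the reshape used in this decoder: with a reduction factor r, each decoder step emits r mel frames packed along the feature axis, and reshaping [N, T_out/r, M*r] to [N, T_out, M] recovers one frame per row.

import numpy as np

N, steps, r, M = 2, 3, 2, 4                  # toy batch, decoder steps, reduction factor, num_mels
decoder_outputs = np.arange(N * steps * r * M).reshape(N, steps, M * r)  # [N, T_out/r, M*r]
mel_outputs = decoder_outputs.reshape(N, -1, M)                          # [N, T_out, M]

assert mel_outputs.shape == (N, steps * r, M)
# Frames produced in the same decoder step stay adjacent after the reshape:
assert np.array_equal(mel_outputs[0, 0], decoder_outputs[0, 0, :M])
assert np.array_equal(mel_outputs[0, 1], decoder_outputs[0, 0, M:])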
Example #2
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False,
                   locked_alignments=None,
                   logs_enabled=True):
        """
        Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        """
        # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including
        # batch dimension
        locked_alignments_ = locked_alignments

        if locked_alignments_ is not None:
            if np.ndim(locked_alignments_) < 3:
                locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            encoder_outputs = conv_and_gru(  # [N, T_in, 2*encoder_gru_units=512]
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                gru_units_unidirectional=hp.encoder_gru_units,
                is_training=is_training,
                scope='encoder',
            )

            # Attention
            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper')  # [N, T_in, attention_depth=256]

            # Concatenate attention context vector and RNN cell output into a
            # 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            cells = [
                GRUCell(hp.decoder_gru_units)
                for _ in range(hp.decoder_gru_layers)
            ]
            decoder_cell = MultiRNNCell(
                [concat_cell] + cells,
                state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)

            if is_training or gta:
                if hp.scheduled_sampling:
                    helper = TacoScheduledOutputTrainingHelper(
                        inputs, pml_targets, hp.pml_dimension,
                        hp.outputs_per_step, hp.scheduled_sampling_probability)
                else:
                    helper = TacoTrainingHelper(inputs, pml_targets,
                                                hp.pml_dimension,
                                                hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add Post-Processing Conv and GRU layer:
            expand_outputs = conv_and_gru(  # [N, T_out, 2*expand_gru_units=512]
                pml_intermediates,
                None,
                conv_layers=hp.expand_conv_layers,
                conv_width=hp.expand_conv_width,
                conv_channels=hp.expand_conv_channels,
                gru_units_unidirectional=hp.expand_gru_units,
                is_training=is_training,
                scope='expand',
            )

            pml_outputs = tf.layers.dense(expand_outputs,
                                          hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  Train mode:              {}'.format(is_training))
                log('  GTA mode:                {}'.format(gta))
                log('  Embedding:               {}'.format(
                    embedded_inputs.shape[-1]))
                log('  Encoder out:             {}'.format(
                    encoder_outputs.shape[-1]))
                log('  Attention out:           {}'.format(
                    attention_cell.output_size))
                log('  Concat attn & out:       {}'.format(
                    concat_cell.output_size))
                log('  Decoder cell out:        {}'.format(
                    decoder_cell.output_size))
                log('  Decoder out ({} frames):  {}'.format(
                    hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  Decoder out (1 frame):   {}'.format(
                    pml_intermediates.shape[-1]))
                log('  Expand out:              {}'.format(
                    expand_outputs.shape[-1]))
                log('  PML out:                 {}'.format(
                    pml_outputs.shape[-1]))
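
A small numpy sketch (illustrative shapes only) of the alignment bookkeeping used in these models: alignment_history.stack() yields a tensor of shape [decoder_steps, N, encoder_steps], and the [1, 2, 0] transpose rearranges it to [N, encoder_steps, decoder_steps], the layout typically used for attention plots.

import numpy as np

T_dec, N, T_enc = 5, 2, 7
stacked = np.random.rand(T_dec, N, T_enc)      # stand-in for alignment_history.stack()
alignments = np.transpose(stacked, [1, 2, 0])  # [N, T_enc, T_dec]

assert alignments.shape == (N, T_enc, T_dec)
# Row i of alignments[n] traces how strongly encoder step i was attended over decoder time.
assert np.allclose(alignments[0, 3], stacked[:, 0, 3])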
Example #3
  def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, stop_token_targets=None, global_step=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
      is_training = linear_targets is not None
      batch_size = tf.shape(inputs)[0]
      hp = self._hparams

      # Embeddings
      embedding_table = tf.get_variable(
        'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))
      embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)                            # [N, T_in, embed_depth=256]

      # Encoder
      prenet_outputs = prenet(embedded_inputs, is_training, hp.prenet_depths)                      # [N, T_in, prenet_depths[-1]=128]
      encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training, hp.encoder_depth) # [N, T_in, encoder_depth=256]

      # Location sensitive attention
      attention_mechanism = LocationSensitiveAttention(hp.attention_depth, encoder_outputs)        # [N, T_in, attention_depth=256]

      # Decoder (layers specified bottom to top):
      multi_rnn_cell = MultiRNNCell([
          ResidualWrapper(GRUCell(hp.decoder_depth)),
          ResidualWrapper(GRUCell(hp.decoder_depth))
        ], state_is_tuple=True)                                                                    # [N, T_in, decoder_depth=256]

      # Frames Projection layer
      frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step)                        # [N, T_out/r, M*r]

      # <stop_token> projection layer
      stop_projection = StopProjection(is_training, shape=hp.outputs_per_step)                     # [N, T_out/r, r]

      # Project onto r mel spectrograms (predict r outputs at each RNN step):
      decoder_cell = TacotronDecoderWrapper(is_training, attention_mechanism, multi_rnn_cell,
                                            frame_projection, stop_projection)

      if is_training:
        helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step, global_step)
      else:
        helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

      decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

      (decoder_outputs, stop_token_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
         CustomDecoder(decoder_cell, helper, decoder_init_state),
         maximum_iterations=hp.max_frame_num)                                                          # [N, T_out/r, M*r]

      # Reshape outputs to be one output per entry
      mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])                     # [N, T_out, M]
      stop_token_outputs = tf.reshape(stop_token_outputs, [batch_size, -1])                        # [N, T_out]

      # Add post-processing CBHG:
      post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, hp.postnet_depth)            # [N, T_out, postnet_depth=256]
      linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)                                  # [N, T_out, F]

      # Grab alignments from the final decoder state:
      alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

      self.inputs = inputs
      self.input_lengths = input_lengths
      self.mel_outputs = mel_outputs
      self.linear_outputs = linear_outputs
      self.stop_token_outputs = stop_token_outputs
      self.alignments = alignments
      self.mel_targets = mel_targets
      self.linear_targets = linear_targets
      self.stop_token_targets = stop_token_targets
      log('Initialized Tacotron model. Dimensions: ')
      log('  embedding:               {}'.format(embedded_inputs.shape))
      log('  prenet out:              {}'.format(prenet_outputs.shape))
      log('  encoder out:             {}'.format(encoder_outputs.shape))
      log('  decoder out (r frames):  {}'.format(decoder_outputs.shape))
      log('  decoder out (1 frame):   {}'.format(mel_outputs.shape))
      log('  postnet out:             {}'.format(post_outputs.shape))
      log('  linear out:              {}'.format(linear_outputs.shape))
      log('  stop token:              {}'.format(stop_token_outputs.shape))
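
The snippet stores stop_token_outputs and stop_token_targets but does not show how they are trained. A minimal sketch of one common choice, assuming StopProjection emits logits of shape [N, T_out] during training (an assumption, not stated above):

import tensorflow as tf

# stand-ins for the tensors produced by initialize()
stop_token_outputs = tf.placeholder(tf.float32, [None, None])  # logits, [N, T_out]
stop_token_targets = tf.placeholder(tf.float32, [None, None])  # 1.0 at/after the last real frame, else 0.0

stop_token_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=stop_token_targets,
                                            logits=stop_token_outputs))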
Example #4
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   reference_mel=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            is_teacher_force_generating = mel_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'text_embedding', [len(symbols), 256],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            if hp.use_gst:
                #Global style tokens (GST)
                gst_tokens = tf.get_variable(
                    'style_tokens', [hp.num_gst, 256 // hp.num_heads],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                self.gst_tokens = gst_tokens

            # Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            if is_training:
                reference_mel = mel_targets

            if reference_mel is not None:
                # Reference encoder
                refnet_outputs = reference_encoder(
                    reference_mel,
                    filters=[32, 32, 64, 64, 128, 128],
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    encoder_cell=GRUCell(128),
                    is_training=is_training)  # [N, 128]
                self.refnet_outputs = refnet_outputs

                if hp.use_gst:
                    # Style attention
                    style_attention = MultiheadAttention(
                        tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                        tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1]),  # [N, hp.num_gst, 256/hp.num_heads]
                        num_heads=hp.num_heads,
                        num_units=128,
                        attention_type=hp.style_att_type)

                    # Apply tanh to compress both encoder state and style embedding to the same scale.
                    style_embeddings = tf.nn.tanh(
                        style_attention.multi_head_attention())  # [N, 1, 256]
                else:
                    style_embeddings = tf.expand_dims(refnet_outputs,
                                                      axis=1)  # [N, 1, 128]
            else:
                #raise ValueError("TODO: add weight when there is no reference during inference")
                print("Use random weight for GST.")
                random_weights = tf.random_uniform([hp.num_heads, hp.num_gst],
                                                   maxval=1.0,
                                                   dtype=tf.float32)
                random_weights = tf.nn.softmax(random_weights,
                                               name="random_weights")
                style_embeddings = tf.nn.tanh(
                    tf.matmul(random_weights, gst_tokens))
                style_embeddings = tf.reshape(
                    style_embeddings, [1, 1] +
                    [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

            # Add style embedding to every text encoder state
            style_embeddings = tf.tile(
                style_embeddings,
                [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
            encoder_outputs = tf.concat([encoder_outputs, style_embeddings],
                                        axis=-1)

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256,
                                  encoder_outputs,
                                  memory_sequence_length=input_lengths),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training or is_teacher_force_generating:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.encoder_outputs = encoder_outputs
            self.style_embeddings = style_embeddings
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.reference_mel = reference_mel
            log('Initialized Tacotron model. Dimensions: ')
            log('  text embedding:          %d' % embedded_inputs.shape[-1])
            log('  style embedding:         %d' % style_embeddings.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
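
A toy numpy sketch (sizes illustrative, not the hparams above) of the inference-time fallback in the use_gst branch: per-head softmax weights over the style tokens are mixed with the token table, squashed with tanh, and flattened into a single [1, 1, D] style embedding.

import numpy as np

num_heads, num_gst = 4, 10
token_dim = 256 // num_heads
gst_tokens = np.random.randn(num_gst, token_dim)

logits = np.random.rand(num_heads, num_gst)
weights = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)  # softmax per head

style = np.tanh(weights @ gst_tokens)                          # [num_heads, token_dim]
style_embeddings = style.reshape(1, 1, num_heads * token_dim)  # [1, 1, 256]

assert style_embeddings.shape == (1, 1, 256)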
Example #5
    def initialize(self,
                   inputs,
                   input_lengths,
                   num_speakers,
                   speaker_id,
                   mel_targets=None,
                   linear_targets=None,
                   loss_coeff=None,
                   rnn_decoder_test_mode=False,
                   is_randomly_initialized=False):

        is_training = linear_targets is not None
        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(speaker_id,
                                                   self.num_speakers,
                                                   hp.enc_prenet_sizes[-1],
                                                   "before_highway")
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(speaker_id, self.num_speakers, hp.dec_rnn_size,
                                      "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:
                        deep_dense = lambda x, dim: tf.layers.dense(
                            x, dim, activation=tf.nn.softsign)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly

                elif hp.model_type == 'simple':
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None

                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type))

            else:
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet')  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            # Attention
            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            dec_prenet_outputs = DecoderPrenetWrapper(
                GRUCell(hp.attention_state_size), speaker_embed, is_training,
                hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(hp.attention_size,
                                                        encoder_outputs,
                                                        normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs,
                                                     scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(hp.attention_size,
                                                    encoder_outputs,
                                                    shift_width=shift_width)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            attention_cell = AttentionWrapper(
                dec_prenet_outputs,
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            # [N, T_in, attention_size+attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell, embed_to_concat=speaker_embed)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.dec_rnn_size),
                    ResidualWrapper(GRUCell(hp.dec_rnn_size)),
                    ResidualWrapper(GRUCell(hp.dec_rnn_size)),
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(" [!] Shape {} and {} should be equal". \
                                format(shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.reduction_factor,
                                            rnn_decoder_test_mode)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            # [N, T_out, postnet_depth=256]
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
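
A condensed sketch of the 'deepvoice' initial-state trick above, under assumed toy sizes: the speaker embedding is projected with softsign dense layers to each decoder GRU state size, and those projections replace the corresponding zero_state entries before decoding starts.

import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell, MultiRNNCell

dec_rnn_size, dec_layer_num = 256, 2
speaker_embed = tf.placeholder(tf.float32, [None, 16])  # [N, speaker_embedding_size]
batch_size = tf.shape(speaker_embed)[0]

decoder_rnns = MultiRNNCell([GRUCell(dec_rnn_size) for _ in range(dec_layer_num)],
                            state_is_tuple=True)
init_state = list(decoder_rnns.zero_state(batch_size, tf.float32))

for idx in range(dec_layer_num):
    # learned, speaker-dependent initial state instead of zeros
    init_state[idx] = tf.layers.dense(speaker_embed, dec_rnn_size,
                                      activation=tf.nn.softsign)

decoder_init_state = tuple(init_state)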
Example #6
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   identities=None,
                   id_num=0):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
            inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
                steps in the input time series, and values are character IDs
            input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
                of each sequence in inputs.
            mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
                of steps in the output time series, M is num_mels, and values are entries in the mel
                spectrogram. Only needed for training.
            linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
                of steps in the output time series, F is num_freq, and values are entries in the linear
                spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams
            # Embeddings
            embedding_num = len(symbols2)

            embedding_text_table = tf.get_variable(
                'embedding', [embedding_num, hp.embedding_text_channels],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            embedded_text_inputs = tf.nn.embedding_lookup(
                embedding_text_table, inputs)  # [N, T_in, 256]

            if identities is not None and id_num > 1:
                embedding_id_table = tf.get_variable(
                    'embedding_id', [id_num, hp.embedding_id_channels],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                embedded_id_inputs = tf.nn.embedding_lookup(
                    embedding_id_table, identities)  # [N, embedding_id_channels]
                embedded_id_inputs = tf.expand_dims(
                    embedded_id_inputs, 1)  # [N, 1, embedding_id_channels]
                embedded_id_inputs = tf.tile(
                    embedded_id_inputs,
                    [1, tf.shape(inputs)[1], 1])  # [N, T_in, embedding_id_channels]
                embedded_inputs = tf.concat(
                    [embedded_text_inputs, embedded_id_inputs],
                    2)  # [N, T_in, text + id channels]
                log('multi-speaker')
            else:
                embedded_inputs = embedded_text_inputs
                log('single speaker')

            # Encoder
            prenet_outputs = prenet(embedded_inputs,
                                    is_training)  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(GRUCell(256)),
                ResidualWrapper(GRUCell(256))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                print('training')
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.identities = identities
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('embedding:                 %d' % embedded_inputs.shape[-1])
            log('prenet out:                %d' % prenet_outputs.shape[-1])
            log('encoder out:               %d' % encoder_outputs.shape[-1])
            log('attention out:             %d' % attention_cell.output_size)
            log('concat attn & out:         %d' % concat_cell.output_size)
            log('decoder cell out:          %d' % decoder_cell.output_size)
            log('decoder out (%d frames):   %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('decoder out (1 frame):     %d' % mel_outputs.shape[-1])
            log('postnet out:               %d' % post_outputs.shape[-1])
            log('linear out:                %d' % linear_outputs.shape[-1])
Example #7
    def presentation_transformer(self, inputs, inputs_actual_length):
        with tf.variable_scope('presentation_layer', reuse=tf.AUTO_REUSE):
            with tf.name_scope('structure_presentation_layer'):
                # Forward direction
                fw_cell = GRUCell(num_units=self.hidden_num)
                fw_drop_cell = DropoutWrapper(fw_cell,
                                              output_keep_prob=self.keep_prob)
                # Backward direction
                bw_cell = GRUCell(num_units=self.hidden_num)
                bw_drop_cell = DropoutWrapper(bw_cell,
                                              output_keep_prob=self.keep_prob)

                # The dynamic RNN takes a 3-D tensor [batch_size, n_steps, n_input]; its output is a tuple whose elements each have that same shape.
                if self.is_train and not self.is_extract:
                    output, _ = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw=fw_drop_cell,
                        cell_bw=bw_drop_cell,
                        inputs=inputs,
                        sequence_length=inputs_actual_length,
                        dtype=tf.float32)
                else:
                    output, _ = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw=fw_cell,
                        cell_bw=bw_cell,
                        inputs=inputs,
                        sequence_length=inputs_actual_length,
                        dtype=tf.float32)

                # The output tuple has length 2, one hidden-state sequence per direction; concatenate the two at every time step into a single output.
                structure_output = tf.concat(output, axis=2)
                structure_output = self.layer_normalization(structure_output)

            with tf.name_scope('transformer_layer'):
                transformer_output = self.encoder_stack(
                    structure_output, self.is_train)

            with tf.name_scope('global_attention_layer'):
                w_omega = tf.get_variable(
                    name='w_omega',
                    shape=[self.hidden_num * 2, self.attention_num],
                    initializer=tf.random_normal_initializer())
                b_omega = tf.get_variable(
                    name='b_omega',
                    shape=[self.attention_num],
                    initializer=tf.random_normal_initializer())
                u_omega = tf.get_variable(
                    name='u_omega',
                    shape=[self.attention_num],
                    initializer=tf.random_normal_initializer())

                v = tf.tanh(
                    tf.tensordot(transformer_output, w_omega, axes=1) +
                    b_omega)

                vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
                alphas = tf.nn.softmax(vu, name='alphas')  # (B,T) shape

                # tf.expand_dims adds a dimension at the specified axis
                global_attention_output = tf.reduce_sum(
                    transformer_output * tf.expand_dims(alphas, -1), 1)

        return global_attention_output
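
A numpy sketch (toy sizes, hypothetical variable names) of the global attention pooling above: score each time step with a small tanh projection, softmax the scores over time, and take the weighted sum of the sequence.

import numpy as np

B, T, D, A = 2, 5, 8, 6            # batch, time, 2*hidden_num, attention_num
x = np.random.randn(B, T, D)       # stand-in for transformer_output
w_omega = np.random.randn(D, A)
b_omega = np.random.randn(A)
u_omega = np.random.randn(A)

v = np.tanh(x @ w_omega + b_omega)                            # [B, T, A]
vu = v @ u_omega                                              # [B, T], one score per step
alphas = np.exp(vu) / np.exp(vu).sum(axis=1, keepdims=True)   # softmax over time
pooled = (x * alphas[..., None]).sum(axis=1)                  # [B, D]

assert pooled.shape == (B, D)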
Example #8
File: tacotron.py  Project: MLCogUP/nspeech
    def initialize(self, text_inputs, input_lengths, speaker_ids, mel_targets=None, linear_targets=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          text_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          speaker_ids: int32 Tensor containing ids of specific speakers
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference'):
            is_training = linear_targets is not None
            batch_size = tf.shape(text_inputs)[0]
            hp = self._hparams
            vocab_size = len(symbols)
            embedded_inputs = embedding(text_inputs, vocab_size, hp.embedding_dim)  # [N, T_in, embd_size]

            # extract speaker embedding if multi-speaker
            with tf.variable_scope('speaker'):
                if hp.num_speakers > 1:
                    speaker_embedding = tf.get_variable('speaker_embed',
                                                        shape=(hp.num_speakers, hp.speaker_embed_dim),
                                                        dtype=tf.float32)
                    # TODO: what about special initializer=tf.truncated_normal_initializer(stddev=0.5)?
                    speaker_embd = tf.nn.embedding_lookup(speaker_embedding, speaker_ids)
                else:
                    speaker_embd = None
            # Encoder
            prenet_outputs = prenet(inputs=embedded_inputs,
                                    drop_rate=hp.drop_rate if is_training else 0.0,
                                    is_training=is_training,
                                    layer_sizes=hp.encoder_prenet,
                                    scope="prenet")  # [N, T_in, 128]
            encoder_outputs = cbhg(prenet_outputs, input_lengths,
                                   speaker_embd=speaker_embd,
                                   is_training=is_training,
                                   K=hp.encoder_cbhg_banks,
                                   c=hp.encoder_cbhg_bank_sizes,  # [N, T_in, 256]
                                   scope='encoder_cbhg')

            # Attention Mechanism
            attention_cell = attention_decoder(encoder_outputs, hp.attention_dim, input_lengths, is_training,
                                               speaker_embd=speaker_embd, attention_type=hp.attention_type)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(attention_cell, hp.decoder_dim),  # 256
                ResidualWrapper(GRUCell(hp.decoder_dim)),  # 256
                ResidualWrapper(GRUCell(hp.decoder_dim))  # 256
            ], state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(text_inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(output_cell, helper, decoder_init_state),
                    maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing
            post_outputs = cbhg(mel_outputs, None,
                                speaker_embd=None,
                                is_training=is_training,
                                K=hp.post_cbhg_banks,
                                c=hp.post_cbhg_bank_sizes + [hp.num_mels],
                                scope='post_cbhg')  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = text_inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.audio = audio.inv_spectrogram_tensorflow(linear_outputs)
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            # TODO: later work around for getting info back?
            # log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % attention_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
Example #9
def Generator_GRU_CL_VL_TH(n_samples, charmap_len, seq_len=None, gt=None):
    with tf.variable_scope("Generator"):
        noise, noise_shape = get_noise()
        num_neurons = FLAGS.GEN_STATE_SIZE

        cells = []
        for l in range(FLAGS.GEN_GRU_LAYERS):
            cells.append(GRUCell(num_neurons))

        # this is separate to decouple train and test
        train_initial_states = create_initial_states(noise)
        inference_initial_states = create_initial_states(noise)

        sm_weight = tf.Variable(
            tf.random_uniform([num_neurons, charmap_len],
                              minval=-0.1,
                              maxval=0.1))
        sm_bias = tf.Variable(
            tf.random_uniform([charmap_len], minval=-0.1, maxval=0.1))

        embedding = tf.Variable(
            tf.random_uniform([charmap_len, num_neurons],
                              minval=-0.1,
                              maxval=0.1))

        char_input = tf.Variable(
            tf.random_uniform([num_neurons], minval=-0.1, maxval=0.1))
        char_input = tf.reshape(tf.tile(char_input, [n_samples]),
                                [n_samples, 1, num_neurons])

        if seq_len is None:
            seq_len = tf.placeholder(tf.int32,
                                     None,
                                     name="ground_truth_sequence_length")

        if gt is not None:  # ground truth provided: build the train op (plus a reusing inference op)
            train_pred = get_train_op(cells, char_input, charmap_len,
                                      embedding, gt, n_samples, num_neurons,
                                      seq_len, sm_bias, sm_weight,
                                      train_initial_states)
            inference_op = get_inference_op(cells,
                                            char_input,
                                            embedding,
                                            seq_len,
                                            sm_bias,
                                            sm_weight,
                                            inference_initial_states,
                                            num_neurons,
                                            charmap_len,
                                            reuse=True)
        else:
            inference_op = get_inference_op(cells,
                                            char_input,
                                            embedding,
                                            seq_len,
                                            sm_bias,
                                            sm_weight,
                                            inference_initial_states,
                                            num_neurons,
                                            charmap_len,
                                            reuse=False)
            train_pred = None

        return train_pred, inference_op
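
Not from the generator above: a small NumPy sketch (shapes are illustrative) of the tf.tile / tf.reshape pattern used for char_input, which copies one learned start vector to every sample in the batch as the first decoder input.

import numpy as np

num_neurons, n_samples = 4, 3
char_input = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32)    # one learned start vector

# Equivalent of tf.reshape(tf.tile(char_input, [n_samples]), [n_samples, 1, num_neurons]):
batched = np.tile(char_input, n_samples).reshape(n_samples, 1, num_neurons)

assert batched.shape == (n_samples, 1, num_neurons)
assert all(np.array_equal(batched[i, 0], char_input) for i in range(n_samples))  # same vector per sample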
Example #10
    def __init__(self,
                 src_vocab_sz,
                 tgt_vocab_sz,
                 size,
                 batch_size,
                 learn_rate,
                 train=True):
        """ Constructor for the Seq2SeqModel.
        Args:
          src_vocab_size: Number of source vocab tokens.
          tgt_vocab_size: Number of target vocab tokens.
          size: Size of each model layer.
          batch_size: Size of each training batch.
          learn_rate: Learning rate.
          train: Whether or not the model is for training.
    """
        self.PAD_ID = 0
        self.EOS_ID = 1
        self.src_vocab_sz = src_vocab_sz
        self.tgt_vocab_sz = tgt_vocab_sz
        self.embed_size = size
        self.enc_cell = GRUCell(size)
        self.dec_cell = GRUCell(size * 2)
        self.train = train

        # Initialize placeholders
        self.enc_inputs = tf.placeholder(shape=(None, None),
                                         dtype=tf.int32,
                                         name="enc_inputs")
        self.enc_inputs_len = tf.placeholder(shape=(None, ),
                                             dtype=tf.int32,
                                             name="enc_inputs_len")
        self.dec_targets = tf.placeholder(shape=(None, None),
                                          dtype=tf.int32,
                                          name="dec_targets")

        # Create embedding matrices
        self.src_embed_matrix = tf.Variable(tf.random_uniform(
            [self.src_vocab_sz, self.embed_size], -1.0, 1.0),
                                            dtype=tf.float32)
        self.tgt_embed_matrix = tf.Variable(tf.random_uniform(
            [self.tgt_vocab_sz, self.embed_size], -1.0, 1.0),
                                            dtype=tf.float32)

        # Prepare the encoder
        self.enc_inputs_embedded = tf.nn.embedding_lookup(
            self.src_embed_matrix, self.enc_inputs)
        enc_outputs, enc_output_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=self.enc_cell,
            cell_bw=self.enc_cell,
            inputs=self.enc_inputs_embedded,
            sequence_length=self.enc_inputs_len,
            dtype=tf.float32,
            time_major=True)
        self.enc_outputs = tf.concat(enc_outputs, 2)
        self.enc_state = tf.concat(enc_output_state, 1)

        # Prepare the decoder
        self.enc_max_time, self.batch_sz = tf.unstack(tf.shape(
            self.enc_inputs))
        self.dec_len = self.enc_inputs_len
        self.W = tf.Variable(tf.random_uniform([size * 2, tgt_vocab_sz], -1,
                                               1),
                             dtype=tf.float32)
        self.b = tf.Variable(tf.zeros([tgt_vocab_sz]), dtype=tf.float32)
        self.pad_slice = tf.zeros([self.batch_sz], dtype=tf.int32)
        self.eos_slice = tf.ones([self.batch_sz], dtype=tf.int32)
        self.pad_step_embedded = tf.nn.embedding_lookup(
            self.tgt_embed_matrix, self.pad_slice)
        self.eos_step_embedded = tf.nn.embedding_lookup(
            self.tgt_embed_matrix, self.eos_slice)

        def loop_fn(time, prev_output, prev_state, prev_loop_state):
            if prev_state is None:  # first call: emit the EOS/GO embedding with the encoder state
                elems_finished = (0 >= self.dec_len)
                _input = self.eos_step_embedded
                cell_state = self.enc_state
                return (elems_finished, _input, cell_state, None, None)
            else:

                def get_next_input():
                    out_logits = tf.add(tf.matmul(prev_output, self.W), self.b)
                    pred = tf.argmax(out_logits, axis=1)
                    # pred holds target-vocab ids, so look them up in the target embedding
                    return tf.nn.embedding_lookup(self.tgt_embed_matrix, pred)

                elems_finished = (time >= self.dec_len)
                finished_cond = tf.reduce_all(elems_finished)
                _input = tf.cond(finished_cond, lambda: self.pad_step_embedded,
                                 get_next_input)
                cell_state = prev_state
                output = prev_output
                loop_state = None
                return (elems_finished, _input, cell_state, output, loop_state)

        self.loop_function = loop_fn
        dec_outputs_ta, dec_state, _ = tf.nn.raw_rnn(self.dec_cell, loop_fn)
        self.dec_outputs = dec_outputs_ta.stack()
        self.dec_state = dec_state
        dec_max_time, dec_batch_sz, dec_dim = tf.unstack(
            tf.shape(self.dec_outputs))
        dec_outputs_flat = tf.reshape(self.dec_outputs, (-1, dec_dim))
        dec_logits_flat = tf.add(tf.matmul(dec_outputs_flat, self.W), self.b)
        self.dec_logits = tf.reshape(
            dec_logits_flat, (dec_max_time, dec_batch_sz, self.tgt_vocab_sz))
        self.dec_prediction = tf.argmax(self.dec_logits, 2)

        # Prepare the optimizer if training
        if self.train:
            stepwise_crossent = tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.one_hot(self.dec_targets,
                                  depth=self.tgt_vocab_sz,
                                  dtype=tf.float32),
                logits=self.dec_logits)
            self.loss = tf.reduce_mean(stepwise_crossent)
            self.opt = tf.train.AdamOptimizer(learn_rate).minimize(self.loss)

        self.saver = tf.train.Saver(tf.global_variables())
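
A worked sketch (NumPy, not the model above) of the greedy-feedback idea that loop_fn implements with tf.nn.raw_rnn: the first step feeds the EOS/GO embedding, and every later step feeds back the embedding of the argmax of the previous step's logits. The toy cell below is only a stand-in for the GRU, with the embedding and hidden sizes set equal so the update works.

import numpy as np

rng = np.random.default_rng(0)
vocab, embed = 6, 4                      # hidden size == embed size in this toy example
W = rng.normal(size=(embed, vocab))      # output projection, playing the role of self.W / self.b
b = np.zeros(vocab)
tgt_embed = rng.normal(size=(vocab, embed))

state = np.zeros(embed)
x = tgt_embed[1]                         # EOS_ID = 1 acts as the start symbol, as in loop_fn
for t in range(3):
    state = np.tanh(x + state)           # stand-in for the GRU cell update
    logits = state @ W + b
    pred = int(np.argmax(logits))        # greedy pick, like get_next_input()
    x = tgt_embed[pred]                  # feed the prediction back as the next input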
Example #11
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH],
                              name='batch_ph')
    target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

# Embedding layer
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform(
        [vocabulary_size, EMBEDDING_DIM], -1.0, 1.0),
                                 trainable=True)
    tf.summary.histogram('embeddings_var', embeddings_var)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

# (Bi-)RNN layer(-s)
rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE),
                        GRUCell(HIDDEN_SIZE),
                        inputs=batch_embedded,
                        sequence_length=seq_len_ph,
                        dtype=tf.float32)
tf.summary.histogram('RNN_outputs', rnn_outputs)
# rnn_outputs is a (fw, bw) tuple, where fw.shape == bw.shape == [batch_size, seq_len, n_hidden]

# Attention layer
with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention(rnn_outputs,
                                         ATTENTION_SIZE,
                                         return_alphas=True)
    tf.summary.histogram('alphas', alphas)

# Dropout
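
The attention(...) helper is not shown in this listing; the NumPy sketch below is an assumption about the common additive form it usually takes (score each time step with v·tanh(hW + b), softmax into alphas, return the alpha-weighted sum of the concatenated forward/backward outputs), not the actual implementation.

import numpy as np

def additive_attention(fw, bw, attention_size, rng=np.random.default_rng(0)):
    """fw, bw: [batch, time, hidden] BiRNN outputs -> ([batch, 2*hidden], [batch, time])."""
    h = np.concatenate([fw, bw], axis=-1)                                  # [B, T, 2H]
    W = rng.normal(scale=0.1, size=(h.shape[-1], attention_size))
    b = np.zeros(attention_size)
    v = rng.normal(scale=0.1, size=attention_size)
    scores = np.tanh(h @ W + b) @ v                                        # [B, T]
    alphas = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)    # softmax over time
    output = (h * alphas[..., None]).sum(axis=1)                           # [B, 2H]
    return output, alphas

out, alphas = additive_attention(np.ones((2, 5, 3)), np.zeros((2, 5, 3)), attention_size=4)
assert out.shape == (2, 6) and alphas.shape == (2, 5)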
Example #12
    def __init__(self, embedding_size, init_embed, hidden_size, \
                 attention_size, max_sent_len, keep_prob, just_embed = True):
        # training inputs
        self.input_x = tf.placeholder(tf.int32, [None, None, max_sent_len],
                                      name="input_x")
        self.sequence_length = tf.placeholder(tf.int32, [None, None],
                                              name="input_len")
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")

        self.input_em = tf.placeholder(tf.int32, [None, max_sent_len],
                                       name="input_x_em")
        self.sequence_len_em = tf.placeholder(tf.int32, [None],
                                              name="input_len_em")
        self.is_train = tf.placeholder(tf.int32, (), name="is_train")
        with tf.variable_scope('siamese_discriminator'):
            # embedding layer with initialization
            batch_size = tf.shape(self.input_x)[1]
            num_classes = tf.shape(self.input_x)[0]
            with tf.name_scope("pair_inps"):
                self.input, self.sequence_len, self.labels = self.all_class_flattener(
                    self.input_x, self.sequence_length, self.is_train)
            with tf.name_scope("flatten_input"):
                self.inter_inp = self.merge_sents(self.input)
                self.inner_lens = tf.reshape(self.sequence_len,
                                             [num_classes * batch_size * 4])
            with tf.name_scope("embedding"):
                # trainable embedding
                W = tf.Variable(init_embed, name="W", dtype=tf.float32)
                self.embedded_chars = tf.nn.embedding_lookup(W, self.inter_inp)
                self.embedded_chars_em = tf.nn.embedding_lookup(
                    W, self.input_em)
            # RNN layer + attention
            with tf.name_scope("bi-rnn"):
                self.gru1 = GRUCell(hidden_size)
                self.gru2 = GRUCell(hidden_size)
                rnn_outputs, _ = bi_rnn(self.gru1, self.gru2 ,\
                                        inputs=self.embedded_chars, sequence_length=self.inner_lens, \
                                        dtype=tf.float32)
                rnn_outputs_em, _ = bi_rnn(self.gru1, self.gru2 ,\
                                        inputs=self.embedded_chars_em, sequence_length=self.sequence_len_em , \
                                        dtype=tf.float32)

                self.attention_outputs, self.alphas = attention(
                    rnn_outputs, attention_size, return_alphas=True)

                self.attention_outputs_em, self.alphas_em = attention(
                    rnn_outputs_em, attention_size, return_alphas=True)
                self.output_em = tf.reduce_mean(self.attention_outputs_em,
                                                axis=0)
                drop_outputs = tf.nn.dropout(self.attention_outputs, keep_prob)
            with tf.name_scope('flattener'):
                self.drop_outputs = tf.reshape(
                    drop_outputs,
                    (num_classes * batch_size * 2, 2, -1))  #b,2,d

            with tf.name_scope('similarity_measure'):
                #
                self.d1 = d1 = self.distance(self.drop_outputs[:, 0],
                                             self.drop_outputs[:, 1])
                loss = self.labels * tf.square(d1) + (
                    1 - self.labels) * tf.square(tf.maximum((1 - d1), 0))
                self.loss = tf.div(tf.reduce_mean(loss), 2)
            with tf.name_scope("accuracy"):
                self.temp_sim = tf.subtract(
                    tf.ones_like(self.d1), tf.rint(self.d1),
                    name="temp_sim")  #auto threshold 0.5
                correct_predictions = tf.equal(self.temp_sim, self.labels)
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       "float"),
                                               name="accuracy")

        self.params = [
            param for param in tf.trainable_variables()
            if 'siamese_discriminator' in param.name
        ]
        for param in self.params:
            print(param.name)
        sd_optimizer = tf.train.AdamOptimizer(1e-4)
        grads_and_vars = sd_optimizer.compute_gradients(self.loss,
                                                        self.params,
                                                        aggregation_method=2)
        self.train_op = sd_optimizer.apply_gradients(grads_and_vars)
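
The similarity_measure block above is the standard contrastive loss; here is a small NumPy check with illustrative values of loss = y*d^2 + (1-y)*max(1-d, 0)^2, averaged and halved to match tf.div(tf.reduce_mean(loss), 2).

import numpy as np

d = np.array([0.2, 0.9, 1.4])            # pairwise distances d1
y = np.array([1.0, 0.0, 0.0])            # 1 = same class (pull together), 0 = different (push past margin 1)

loss = y * d**2 + (1 - y) * np.maximum(1 - d, 0)**2
total = loss.mean() / 2
print(total)                             # ~0.0083: only the first two pairs contribute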
Example #13
def rnn_layers(x,
               seq_length,
               training,
               hidden_num=100,
               layer_num=3,
               class_n=5,
               cell='LSTM',
               dtype=tf.float32):
    """Generate RNN layers.

    Args:
        x (Float): A 3D-Tensor of shape [batch_size,max_time,channel]
        seq_length (Int): A 1D-Tensor of shape [batch_size], real length of each sequence.
        training (Boolean): A 0D-Tensor indicating whether the model is in training mode.
        hidden_num (int, optional): Defaults to 100. Size of the hidden state; the forward
            and backward states are concatenated, so the final hidden state has size 2*hidden_num (200).
        layer_num (int, optional): Defaults to 3. Number of layers in the RNN.
        class_n (int, optional): Defaults to 5. Number of output classes.
        cell (str): One of 'LSTM', 'GRU' or 'BNLSTM'; the RNN cell to use.
            BNLSTM stands for batch-normalized LSTM cell.

    Returns:
         logits: A 3D Tensor of shape [batch_size, max_time, class_n]
    """

    cells_fw = list()
    cells_bw = list()
    for i in range(layer_num):
        if cell == 'LSTM':
            cell_fw = LSTMCell(hidden_num)
            cell_bw = LSTMCell(hidden_num)
        elif cell == 'GRU':
            cell_fw = GRUCell(hidden_num)
            cell_bw = GRUCell(hidden_num)
        elif cell == 'BNLSTM':
            cell_fw = BNLSTMCell(hidden_num, training=training)
            cell_bw = BNLSTMCell(hidden_num, training=training)
        else:
            raise ValueError("Cell type unrecognized.")
        cells_fw.append(cell_fw)
        cells_bw.append(cell_bw)
    multi_cells_fw = tf.nn.rnn_cell.MultiRNNCell(cells_fw)
    multi_cells_bw = tf.nn.rnn_cell.MultiRNNCell(cells_bw)
    with tf.variable_scope('BDGRU_rnn') as scope:
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=multi_cells_fw,
            cell_bw=multi_cells_bw,
            inputs=x,
            sequence_length=seq_length,
            dtype=dtype,
            scope=scope)
        lasth = tf.concat(outputs, 2, name='birnn_output_concat')
    # shape of lasth [batch_size,max_time,hidden_num*2]
    batch_size = tf.shape(lasth)[0]
    max_time = lasth.get_shape().as_list()[1]
    with tf.variable_scope('rnn_fnn_layer'):
        weight_out = _variable_on_cpu(
            name='weights',
            shape=[2, hidden_num],
            initializer=tf.truncated_normal_initializer(
                stddev=np.sqrt(2.0 / (2 * hidden_num))),
            dtype=dtype)
        biases_out = _variable_on_cpu(name='bias',
                                      shape=[hidden_num],
                                      initializer=tf.zeros_initializer(),
                                      dtype=dtype)
        weight_class = _variable_on_cpu(
            name='weights_class',
            shape=[hidden_num, class_n],
            initializer=tf.truncated_normal_initializer(
                stddev=np.sqrt(2.0 / hidden_num)),
            dtype=dtype)
        bias_class = _variable_on_cpu(name='bias_class',
                                      shape=[class_n],
                                      initializer=tf.zeros_initializer(),
                                      dtype=dtype)
        lasth_rs = tf.reshape(lasth, [batch_size, max_time, 2, hidden_num],
                              name='lasth_rs')
        lasth_output = tf.nn.bias_add(tf.reduce_sum(tf.multiply(
            lasth_rs, weight_out),
                                                    axis=2),
                                      biases_out,
                                      name='lasth_bias_add')
        lasth_output_rs = tf.reshape(lasth_output,
                                     [batch_size * max_time, hidden_num],
                                     name='lasto_rs')
        logits = tf.reshape(tf.nn.bias_add(
            tf.matmul(lasth_output_rs, weight_class), bias_class),
                            [batch_size, max_time, class_n],
                            name="rnn_logits_rs")
    return logits
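
A hedged usage sketch for rnn_layers (TF1-style graph mode assumed, with the function and its _variable_on_cpu helper importable from the example's own module; the placeholder shapes are illustrative). Note that max_time must be static here, because the function reads it via get_shape().

import tensorflow as tf  # TF1.x graph mode assumed

signal = tf.placeholder(tf.float32, shape=[None, 300, 1], name='signal')  # [batch, max_time, channel]
seq_len = tf.placeholder(tf.int32, shape=[None], name='seq_len')
training = tf.placeholder(tf.bool, shape=[], name='training')

logits = rnn_layers(signal, seq_len, training,
                    hidden_num=100, layer_num=3, class_n=5, cell='GRU')
# logits: [batch, 300, 5], ready for CTC or framewise softmax loss.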
Example #14
def CBHG(
    input_Pattern,
    input_Length,
    scope,
    is_Training,
    conv_Bank_Filter_Count=128,
    conv_Bank_Max_Kernal_Size=16,
    max_Pooling_Size=2,
    conv_Projection_Filter_Count_and_Kernal_Size_List=[(128, 3), (128, 3)],
    highway_Layer_Count=4,
    gru_Cell_Size=128,
):

    with tf.variable_scope(scope):
        with tf.variable_scope('conv_Bank'):
            #Convolution Bank
            bank_Layer_List = []
            for kernel_Size in range(1, conv_Bank_Max_Kernal_Size + 1):
                bank_Layer = Conv1D(input_Pattern,
                                    filter_Count=conv_Bank_Filter_Count,
                                    kernel_Size=kernel_Size,
                                    activation=tf.nn.relu,
                                    scope="conv1D_%d" % kernel_Size,
                                    is_Training=is_Training)
                bank_Layer_List.append(bank_Layer)
            conv_Bank_Activation = tf.concat(bank_Layer_List, axis=-1)

        #Max pooling
        max_Pooling_Activation = tf.layers.max_pooling1d(
            conv_Bank_Activation,
            pool_size=max_Pooling_Size,
            strides=1,
            padding='same')

        #Convolution Projections
        conv_Projection_Activation = max_Pooling_Activation
        for index, (filter_Count, kernel_Size) in enumerate(
                conv_Projection_Filter_Count_and_Kernal_Size_List):
            conv_Projection_Activation = Conv1D(conv_Projection_Activation,
                                                filter_Count=filter_Count,
                                                kernel_Size=kernel_Size,
                                                activation=tf.nn.relu,
                                                scope="projection_%d" % index,
                                                is_Training=is_Training)

        #Residual
        residual_Activation = conv_Projection_Activation + input_Pattern

        # Cell size correction (note: it is not clear why this projection sits before the highway layers).
        corrected_Residual_Activation = tf.layers.dense(
            residual_Activation,
            units=gru_Cell_Size,
            activation=None,
            use_bias=True,
            name="size_Correction")

        #Highways
        highway_Activation = corrected_Residual_Activation
        for index in range(highway_Layer_Count):
            highway_Activation = Highway_Net(input_Pattern=highway_Activation,
                                             scope="highway_%d" % index)

        #Bidirectional GRU
        output_Pattern_List, rnn_State_List = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=GRUCell(gru_Cell_Size),
            cell_bw=GRUCell(gru_Cell_Size),
            inputs=highway_Activation,
            sequence_length=input_Length,
            dtype=tf.float32)

        return tf.concat(output_Pattern_List, axis=2)
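
A hedged usage sketch of the CBHG function above (TF1-style; Conv1D and Highway_Net from the example's own module are assumed to be in scope, and the input width of 128 is chosen so the residual add matches the default projection size).

import tensorflow as tf  # TF1.x graph mode assumed

prenet_out = tf.placeholder(tf.float32, shape=[None, None, 128], name='prenet_out')  # [N, T_in, 128]
input_len = tf.placeholder(tf.int32, shape=[None], name='input_len')

encoder_out = CBHG(prenet_out, input_len, scope='encoder_cbhg', is_Training=True)
# encoder_out: [N, T_in, 2 * gru_Cell_Size] == [N, T_in, 256] with the defaults above.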
Example #15
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
        JA = config.max_answer_length
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            # Char-CNN Embedding
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            # Word Embedding
            if config.use_word_emb:
                with tf.variable_scope("emb_var") as scope, tf.device(
                        "/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    tf.get_variable_scope().reuse_variables()
                    self.word_emb_scope = scope
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            [word_emb_mat, self.new_emb_mat], 0)

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                # Concat Char-CNN Embedding and Word Embedding
                if config.use_char_emb:
                    xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                    qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

            # exact match
            if config.use_exact_match:
                emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1)
                xx = tf.concat([xx, emx], 3)  # [N, M, JX, di+1]
                emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1)
                qq = tf.concat([qq, emq], 2)  # [N, JQ, di+1]

        # 2 layer highway network on Concat Embedding
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        # Bidirection-LSTM (3rd layer on paper)
        cell = GRUCell(d) if config.GRU else BasicLSTMCell(d,
                                                           state_is_tuple=True)
        d_cell = SwitchableDropoutWrapper(
            cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), _ = bidirectional_dynamic_rnn(
                d_cell, d_cell, qq, q_len, dtype='float',
                scope='u1')  # [N, J, d], [N, d]
            u = tf.concat([fw_u, bw_u], 2)
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        # Attention Flow Layer (4th layer on paper)
        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell = AttentionCell(
                    cell,
                    u,
                    size=d,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
                first_cell = d_cell

        # Modeling layer (5th layer on paper)
            tp0 = p0
            for layer_idx in range(config.LSTM_num_layers - 1):
                (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                    first_cell,
                    first_cell,
                    p0,
                    x_len,
                    dtype='float',
                    scope="g_{}".format(layer_idx))  # [N, M, JX, 2d]
                p0 = tf.concat([fw_g0, bw_g0], 3)
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat([fw_g1, bw_g1], 3)  # [N, M, JX, 2d]

        # Self match layer
        with tf.variable_scope("SelfMatch"):
            s0 = tf.reshape(g1, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
            x_mask = tf.reshape(self.x_mask, [N * M, JX])
            first_cell = AttentionCell(cell,
                                       s0,
                                       size=d,
                                       mask=x_mask,
                                       is_train=self.is_train)
            (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(
                first_cell, first_cell, s0, x_len, dtype='float',
                scope='s')  # [N, M, JX, 2d]
            s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1

            # prepare for PtrNet
            encoder_output = tf.expand_dims(s1, 1)  # [N, M, JX, 2d]
            encoder_output = tf.expand_dims(
                tf.cast(self.x_mask,
                        tf.float32), -1) * encoder_output  # [N, M, JX, 2d]

            if config.GRU:
                encoder_state_final = tf.concat((fw_s_f, bw_s_f),
                                                1,
                                                name='encoder_concat')
            else:
                if isinstance(fw_s_f, LSTMStateTuple):
                    encoder_state_c = tf.concat((fw_s_f.c, bw_s_f.c),
                                                1,
                                                name='encoder_concat_c')
                    encoder_state_h = tf.concat((fw_s_f.h, bw_s_f.h),
                                                1,
                                                name='encoder_concat_h')
                    encoder_state_final = LSTMStateTuple(c=encoder_state_c,
                                                         h=encoder_state_h)
                elif isinstance(fw_s_f, tf.Tensor):
                    encoder_state_final = tf.concat((fw_s_f, bw_s_f),
                                                    1,
                                                    name='encoder_concat')
                else:
                    encoder_state_final = None
                    tf.logging.error("encoder_state_final not set")

            print("encoder_state_final:", encoder_state_final)

        with tf.variable_scope("output"):
            # eos_symbol = config.eos_symbol
            # next_symbol = config.next_symbol

            tf.assert_equal(
                M,
                1)  # currently dynamic M is not supported, thus we assume M==1
            answer_string = tf.placeholder(
                shape=(N, 1, JA + 1), dtype=tf.int32,
                name='answer_string')  # [N, M, JA + 1]
            answer_string_mask = tf.placeholder(
                shape=(N, 1, JA + 1), dtype=tf.bool,
                name='answer_string_mask')  # [N, M, JA + 1]
            answer_string_length = tf.placeholder(
                shape=(N, 1),
                dtype=tf.int32,
                name='answer_string_length',
            )  # [N, M]
            self.tensor_dict['answer_string'] = answer_string
            self.tensor_dict['answer_string_mask'] = answer_string_mask
            self.tensor_dict['answer_string_length'] = answer_string_length
            self.answer_string = answer_string
            self.answer_string_mask = answer_string_mask
            self.answer_string_length = answer_string_length

            answer_string_flattened = tf.reshape(answer_string,
                                                 [N * M, JA + 1])
            self.answer_string_flattened = answer_string_flattened  # [N * M, JA+1]
            print("answer_string_flattened:", answer_string_flattened)

            answer_string_length_flattened = tf.reshape(
                answer_string_length, [N * M])
            self.answer_string_length_flattened = answer_string_length_flattened  # [N * M]
            print("answer_string_length_flattened:",
                  answer_string_length_flattened)

            decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell(
                2 * d, state_is_tuple=True)

            with tf.variable_scope("Decoder"):
                decoder_train_logits = ptr_decoder(
                    decoder_cell,
                    tf.reshape(tp0, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    tf.reshape(encoder_output,
                               [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    encoder_final_state=encoder_state_final,
                    max_encoder_length=config.sent_size_th,
                    decoder_output_length=
                    answer_string_length_flattened,  # [N * M]
                    batch_size=N,  # N * M (M=1)
                    attention_proj_dim=self.config.decoder_proj_dim,
                    scope='ptr_decoder'
                )  # [batch_size, dec_len*, enc_seq_len + 1]

                self.decoder_train_logits = decoder_train_logits
                print("decoder_train_logits:", decoder_train_logits)
                self.decoder_train_softmax = tf.nn.softmax(
                    self.decoder_train_logits)
                self.decoder_inference = tf.argmax(
                    decoder_train_logits, axis=2,
                    name='decoder_inference')  # [N, JA + 1]

            self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1
            self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
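
Not from the model above: a tiny NumPy sketch of the exact-match feature trick used in the emb scope, where a boolean match indicator is cast to float, given a trailing channel dimension, and concatenated onto the token embeddings (the sentence dimension M is folded away here for brevity).

import numpy as np

N, JX, di = 2, 4, 3
xx = np.zeros((N, JX, di), dtype=np.float32)     # token embeddings for the context side
emx = np.array([[1, 0, 0, 1], [0, 1, 0, 0]])     # 1 where the context word appears in the question
xx = np.concatenate([xx, emx.astype(np.float32)[..., None]], axis=-1)
assert xx.shape == (N, JX, di + 1)               # embeddings plus one exact-match channel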
Example #16
                                          trainable=False)

            with tf.name_scope("word_embedding"):
                embeddings_eng = tf.get_variable(
                    "embeddings_eng", [voc_size_eng, SIZE_EMBED_DIM])
                embed_enc = tf.nn.embedding_lookup(embeddings_eng,
                                                   enc_input,
                                                   name="embed_enc")
                embeddings_kor = tf.get_variable(
                    "embeddings_kor", [voc_size_kor, SIZE_EMBED_DIM])
                embed_dec = tf.nn.embedding_lookup(embeddings_kor,
                                                   dec_input,
                                                   name="embed_dec")

            with tf.variable_scope("encoder_layer"):
                output_enc, state_enc = bi_rnn(GRUCell(SIZE_RNN_STATE),
                                               GRUCell(SIZE_RNN_STATE),
                                               inputs=embed_enc,
                                               sequence_length=enc_seq_len,
                                               dtype=tf.float32)

                state_enc_last = tf.concat([state_enc[0], state_enc[1]],
                                           axis=1)  # [batch, state*2]

                output_enc = tf.concat(output_enc,
                                       axis=2)  # [batch, max_eng, state*2]
                output_enc = tf.nn.dropout(output_enc,
                                           keep_prob=keep_prob,
                                           name="output_enc")
                assert output_enc.get_shape()[2] == SIZE_BiRNN_STATE
                assert state_enc_last.get_shape()[1] == SIZE_BiRNN_STATE
Example #17
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):

        is_training2 = linear_targets is not None  # this also ends up True at test time; is that intended???
        is_training = not rnn_decoder_test_mode

        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic borrowed from a transformer implementation.
                # The embedding for <PAD> (index 0) is fixed to zeros and never trained;
                # i.e. the first row of the variable created by get_variable above is never used.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(
                            speaker_id, self.num_speakers,
                            hp.enc_prenet_sizes[-1], "before_highway"
                        )  # 'enc_prenet_sizes': [f(256), f(128)]
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:
                        deep_dense = lambda x, dim: tf.layers.dense(
                            x, dim, activation=tf.nn.softsign
                        )  # softsign: x / (abs(x) + 1)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    # the 'simple' model feeds speaker_embed into DecoderPrenetWrapper and ConcatOutputAndAttentionWrapper and concatenates it there.
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type))
            else:
                # case: self.num_speakers == 1
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None  # initial state of the bidirectional GRU
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet'
            )  # 'enc_prenet_sizes': [f(256), f(128)],  dropout_prob = 0.5
            # ==> (N, T_in, 128)

            # enc_rnn_size = 128
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            self.hccho = encoder_outputs
            ##############
            # Attention
            ##############

            # For manaul control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            # single: attention_size = 128
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size, encoder_outputs, normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location-Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_mon_norm_hccho':
                attention_mechanism = BahdanauMonotonicAttention_hccho(
                    hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(hp.attention_size,
                                                        encoder_outputs,
                                                        normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs,
                                                     scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(hp.attention_size,
                                                    encoder_outputs,
                                                    shift_width=shift_width)
            else:
                raise Exception(" [!] Unknown attention type: {}".format(
                    hp.attention_type))

            # Combine DecoderPrenetWrapper and attention_mechanism into an AttentionWrapper.
            # carpedm20 re-implemented AttentionWrapper from the TensorFlow source, whereas Keith Ito used TensorFlow's AttentionWrapper as-is.
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_state_size),
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False
            )  # note output_attention=False; attention_layer_size is left unset, so the attention is the raw context vector.

            # attention_state_size = 256
            dec_prenet_outputs = DecoderPrenetWrapper(
                attention_cell, speaker_embed, is_training,
                hp.dec_prenet_sizes,
                hp.dropout_prob)  # dec_prenet_sizes =  [f(256), f(128)]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]

            # From the AttentionWrapperState that dec_prenet_outputs passes to the next cell (attention, cell_state, ...),
            # concatenate attention and output and emit that as the new output.
            # Since the output equals the cell_state here, the concat is [ output(=cell_state) | attention ].
            concat_cell = ConcatOutputAndAttentionWrapper(
                dec_prenet_outputs, embed_to_concat=speaker_embed
            )  # builds a new output by concatenating (output, attention, speaker_embed).

            # Decoder (layers specified bottom to top):  dec_rnn_size= 256
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                     ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
            for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor
            )  # could be extended to also emit a stop token, e.g. (hp.num_mels+1) * hp.reduction_factor
            decoder_init_state = output_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # calling zero_state here already includes the initial state passed to the AttentionWrapper above.

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied via the AttentionWrapper's initial_cell_state).
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training2:
                # rnn_decoder_test_mode is True in test mode and False in train mode
                helper = TacoTrainingHelper(
                    inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                    rnn_decoder_test_mode)  # inputs is only used to compute the batch size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(BasicDecoder(output_cell, helper, decoder_init_state),maximum_iterations=hp.max_iters)  # max_iters=200

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            # Since the MultiRNNCell has 3 layers, final_decoder_state is a tuple of length 3 ==> final_decoder_state[0]
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
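
Not from the model above: a NumPy shape check for the alignment transpose, showing how stacking the alignment history ([T_dec, N, T_enc]) and permuting with [1, 2, 0] yields [N, T_enc, T_dec], i.e. batch, encoder text length, decoder target length, as the comment says.

import numpy as np

T_dec, N, T_enc = 7, 2, 5
history = np.zeros((T_dec, N, T_enc))          # what alignment_history.stack() produces
alignments = history.transpose(1, 2, 0)        # same permutation as tf.transpose(..., [1, 2, 0])
assert alignments.shape == (N, T_enc, T_dec)   # batch, encoder steps, decoder steps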
Example #18
# Different placeholders
with tf.name_scope('Input'):
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='batch_ph')
    target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

# Embedding layer
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0),
                                 trainable=True)
    tf.summary.histogram('embeddings_var', embeddings_var)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

# (Bi-)RNN layer(-s)
rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE), GRUCell(HIDDEN_SIZE), inputs=batch_embedded,
                        sequence_length=seq_len_ph, dtype=tf.float32)
tf.summary.histogram('RNN_outputs', rnn_outputs)

# Attention layer
with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention.attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)
    tf.summary.histogram('alphas', alphas)

# Dropout
drop = tf.nn.dropout(attention_output, keep_prob_ph)

# Fully connected layer
with tf.name_scope('Fully_connected_layer'):
    # Hidden size is multiplied by 2 for Bi-RNN
    W = tf.Variable(tf.truncated_normal([HIDDEN_SIZE * 2, 1], stddev=0.1))
Example #19
# Different placeholders
with tf.name_scope('Input_layer'):
    input_x = tf.placeholder(tf.int32, [None, maxlen], name='input_x')
    output_y = tf.placeholder(tf.float32, [None], name='output_y')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

# Embedding layer
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform(
        [len(word_index) + 1, embedding_dim], -1.0, 1.0),
                                 trainable=True)
    tf.summary.histogram('embeddings_var', embeddings_var)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, input_x)

# BiDirectional RNN Layer
rnn_outputs, _ = bi_rnn(GRUCell(hidden_size),
                        GRUCell(hidden_size),
                        inputs=batch_embedded,
                        dtype=tf.float32)
tf.summary.histogram('RNN_outputs', rnn_outputs)

# Attention layer
with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention(rnn_outputs,
                                         attention_size,
                                         return_alphas=True)
    tf.summary.histogram('alphas', alphas)

# Dropout for attention layer
drop = tf.nn.dropout(attention_output, keep_prob)
Example #20
File: model.py  Project: Innosolve/CodaLab
    def build_graph(self):
        """ Build the main architecture of the graph. """
        random.seed(310)
        tf.set_random_seed(902)
        print("building graph")

        with tf.variable_scope('model', reuse=self.reuse):
            ### Lookup ELMo Embedding ###
            self.x_elmo = layers.Lambda(
                lambda inputs: ElmoEmbedding(inputs, elmo_model),
                output_shape=(1024, ))(self.x_elmo_input)

            shape = tf.shape(self.x_elmo)
            self.shape = shape
            #            self.glove = tf.Variable(tf.random_uniform([tf.shape(self.glove)[0], self.embed_dimensions], -1.0, 1.0),trainable=True)

            if self.glove_include:
                ### Lookup Glove Vectors ###
                batch_embedded = tf.nn.embedding_lookup(self.glove, self.x)
                batch_embedded = batch_embedded[:, -shape[1]:, :]

                ### Include POS ###
                if self.pos_include:
                    ### POS-TAG Embedding ###
                    embeddings_var = tf.Variable(tf.random_uniform(
                        [12, self.pos_dimensions], -1.0, 1.0),
                                                 trainable=True)
                    self.pos_embedding = tf.nn.embedding_lookup(
                        embeddings_var, self.pos)

                    self.pos_embedded = self.pos_embedding[:, -shape[1]:, :]
                    batch_embedded = tf.concat(
                        [batch_embedded, self.pos_embedded], axis=2)

                if self.layer_1_include:
                    hid = 2 * self.hidden_size

                    if self.layer_1 == 'lstm':
                        rnn_outputs, _ = bi_rnn(
                            LSTMCell(self.hidden_size,
                                     use_peepholes=self.peephole_1),
                            LSTMCell(self.hidden_size,
                                     use_peepholes=self.peephole_2),
                            inputs=batch_embedded,
                            dtype=tf.float32,
                            scope='rnn_1')

                        fw_outputs, bw_outputs = rnn_outputs
                        layer = tf.concat([fw_outputs, bw_outputs], axis=2)
                    elif self.layer_1 == 'gru':
                        rnn_outputs, _ = bi_rnn(GRUCell(self.hidden_size),
                                                GRUCell(self.hidden_size),
                                                inputs=batch_embedded,
                                                dtype=tf.float32,
                                                scope='rnn_1')

                        fw_outputs, bw_outputs = rnn_outputs
                        layer = tf.concat([fw_outputs, bw_outputs], axis=2)
                    else:
                        conv_layer = tf.layers.conv1d(
                            inputs=batch_embedded,
                            filters=self.hidden_size * 2,
                            kernel_size=self.kernel_size,
                            strides=1,
                            padding="same",
                            activation=tf.nn.relu)
                        layer = conv_layer
                else:
                    layer = batch_embedded
                    hid = self.hidden_size
                    if self.pos_include:
                        hid += self.pos_dimensions

            print(self.hidden_size)

            # FLAGS Including ELMO and Glove
            if self.glove_include and self.elmo:
                H_1 = tf.concat([layer, self.x_elmo], axis=2)
                hid += 1024
            elif self.glove_include:
                H_1 = layer
            elif self.elmo:
                H_1 = self.x_elmo
                hid = 1024

            if self.layer_2 == 'lstm':
                rnn_outputs_2, _ = bi_rnn(
                    LSTMCell(hid, use_peepholes=self.peephole_3),
                    LSTMCell(hid, use_peepholes=self.peephole_4),
                    inputs=H_1,
                    dtype=tf.float32,
                    scope='rnn_2')

                fw_outputs_2, bw_outputs_2 = rnn_outputs_2
                H = tf.concat([fw_outputs_2, bw_outputs_2], axis=2)
            elif self.layer_2 == 'gru':
                rnn_outputs_2, _ = bi_rnn(GRUCell(hid),
                                          GRUCell(hid),
                                          inputs=H_1,
                                          dtype=tf.float32,
                                          scope='rnn_2')

                fw_outputs_2, bw_outputs_2 = rnn_outputs_2
                H = tf.concat([fw_outputs_2, bw_outputs_2], axis=2)
            elif self.layer_2 == 'conv':
                conv_layer = tf.layers.conv1d(inputs=H_1,
                                              filters=hid,
                                              kernel_size=self.kernel_size,
                                              strides=1,
                                              padding="same",
                                              activation=tf.nn.relu)
                H = conv_layer
                hid = tf.cast(hid / 2, tf.int32)
            else:
                H = H_1
                hid = tf.cast(hid / 2, tf.int32)

            hid *= 2

            ### Ask whether there is a sequence with length 0 ###
            condition = tf.equal(tf.reduce_min(self.seq_len), 0)

            ### FLAG Including attention ###
            if self.attention:
                with tf.variable_scope('attention', reuse=self.reuse):
                    M = tf.tanh(
                        H)  # M = tanh(H)  (batch_size, seq_len, HIDDEN_SIZE)

                    dropout_layer_attention = tf.layers.dropout(
                        inputs=tf.reshape(M, [-1, hid]),
                        rate=self.attention_prob,
                        training=self.is_training,
                        seed=847)
                    self.dense = tf.layers.dense(
                        inputs=dropout_layer_attention,
                        units=self.num_attention,
                        use_bias=False)
                    ### Pool - Max or Mean ###
                    if self.pool_mean:
                        self.pool = tf.reduce_mean(self.dense, axis=1)
                    else:
                        self.pool = tf.reduce_max(self.dense, axis=1)

                    ### Setting for stride 2 ###
                    #self.alpha = tf.exp(tf.reshape(self.pool,
                    #         [-1, tf.cast(tf.round(tf.add(tf.div(tf.cast(shape[1], dtype = tf.float32), 2.0), 0.1)),
                    #                      dtype = tf.int32)]))
                    self.alpha = tf.exp(tf.reshape(self.pool, [-1, shape[1]]))

                    ### Masking the sequences ###
                    if self.mask:
                        with tf.variable_scope('mask', reuse=self.reuse):
                            self.alpha = tf.reverse(self.alpha, axis=[1])
                            mask = tf.sequence_mask(self.seq_len)
                            mask = tf.to_float(mask)

                            self.alpha = tf.cond(condition, lambda: self.alpha,
                                                 lambda: self.alpha * mask)
                            self.alpha = tf.reverse(self.alpha, axis=[1])

                    #### Softmax ####
                    self.alpha = self.alpha / tf.expand_dims(
                        tf.reduce_sum(self.alpha, axis=1), 1)

                    ### Derive the word with the highest attention ###
                    pos = tf.argmax(self.alpha, axis=1)
                    sparse_tensor = tf.string_split(self.x_elmo_input)
                    dense_tensor = tf.sparse_tensor_to_dense(sparse_tensor, '')
                    rg = tf.range(0, shape[0])
                    indices = tf.transpose([rg, tf.cast(pos, tf.int32)],
                                           [1, 0])
                    self.best_example = tf.gather_nd(dense_tensor, indices)

                    ### Computing weighted average ###
                    # r = tf.matmul(tf.transpose(H, [0, 2, 1]), tf.reshape(self.alpha,
                    #                                                      [-1, tf.cast(tf.round(tf.add(
                    #                                                          tf.div(tf.cast(shape[1], dtype=tf.float32),
                    #                                                                 2.0), 0.1)),
                    #                                                                   dtype=tf.int32), 1]))
                    r = tf.matmul(tf.transpose(H, [0, 2, 1]),
                                  tf.reshape(self.alpha, [-1, shape[1], 1]))
                    r = tf.squeeze(r, axis=2)
            else:
                with tf.variable_scope('rnn_average', reuse=self.reuse):
                    ### Take a simple mean of all the words (INCLUDING padding) ###
                    ### Masking the sequences ###
                    if self.mask:
                        with tf.variable_scope('mask', reuse=self.reuse):
                            self.alpha = tf.cond(
                                condition,
                                lambda: tf.tile(tf.expand_dims(shape[1], 0),
                                                tf.expand_dims(shape[0], 0)),
                                lambda: self.seq_len)
                            self.alpha = tf.reciprocal(tf.to_float(self.alpha))
                            self.alpha = tf.tile(tf.expand_dims(self.alpha, 1),
                                                 [1, shape[1]])

                            self.alpha = tf.reverse(self.alpha, axis=[1])
                            mask = tf.sequence_mask(self.seq_len)
                            mask = tf.to_float(mask)

                            self.alpha = tf.cond(condition, lambda: self.alpha,
                                                 lambda: self.alpha * mask)
                            self.alpha = tf.reverse(self.alpha, axis=[1])
                    else:
                        self.alpha = tf.tile(tf.expand_dims(shape[1], 0),
                                             tf.expand_dims(shape[0], 0))
                        self.alpha = tf.reciprocal(tf.to_float(self.alpha))
                        self.alpha = tf.tile(tf.expand_dims(self.alpha, 1),
                                             [1, shape[1]])

                    ### Necessarily here but serves no purpose - Derive the word with the highest attention ###
                    pos = tf.argmax(self.alpha, axis=1)
                    sparse_tensor = tf.string_split(self.x_elmo_input)
                    dense_tensor = tf.sparse_tensor_to_dense(sparse_tensor, '')
                    rg = tf.range(0, shape[0])
                    indices = tf.transpose([rg, tf.cast(pos, tf.int32)],
                                           [1, 0])
                    self.best_example = tf.gather_nd(dense_tensor, indices)

                    ### Computing average ###
                    r = tf.matmul(tf.transpose(H, [0, 2, 1]),
                                  tf.reshape(self.alpha, [-1, shape[1], 1]))
                    r = tf.squeeze(r, axis=2)

            self.h_star = tf.tanh(r)  # (batch , HIDDEN_SIZE)
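Example #20 looks up ELMo vectors through an ElmoEmbedding(inputs, elmo_model) helper that is not shown. A common way to write such a helper with the TF-Hub ELMo module is sketched below; the hub URL, the 'default' signature and the 'elmo' output key are assumptions about that helper, not taken from this code.

import tensorflow as tf
import tensorflow_hub as hub

# Assumed: the module is created once at graph-construction time and reused.
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)

def ElmoEmbedding(x, elmo_model):
    # x: 1-D string tensor of raw sentences; the "elmo" output is a
    # [batch, max_tokens, 1024] tensor of contextual token embeddings.
    return elmo_model(x, signature="default", as_dict=True)["elmo"]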
Example #21
0
File: model.py Project: Asteur/own-speech
 def define_sequence_model(self):
     seed = 12345
     np.random.seed(12345)
     layer_list = []
     with self.graph.as_default() as g:
         utt_length = tf.placeholder(tf.int32, shape=(None))
         g.add_to_collection(name="utt_length", value=utt_length)
         with tf.name_scope("input"):
             input_layer = tf.placeholder(dtype=tf.float32,
                                          shape=(None, None, self.n_in),
                                          name="input_layer")
             if self.dropout_rate != 0.0:
                 print "Using dropout to avoid overfitting and the dropout rate is", self.dropout_rate
                 is_training_drop = tf.placeholder(dtype=tf.bool,
                                                   shape=(),
                                                   name="is_training_drop")
                 input_layer_drop = dropout(input_layer,
                                            self.dropout_rate,
                                            is_training=is_training_drop)
                 layer_list.append(input_layer_drop)
                 g.add_to_collection(name="is_training_drop",
                                     value=is_training_drop)
             else:
                 layer_list.append(input_layer)
         g.add_to_collection("input_layer", layer_list[0])
         with tf.name_scope("hidden_layer"):
             basic_cell = []
             if "tanh" in self.hidden_layer_type:
                 is_training_batch = tf.placeholder(
                     dtype=tf.bool, shape=(), name="is_training_batch")
                 bn_params = {
                     "is_training": is_training_batch,
                     "decay": 0.99,
                     "updates_collections": None
                 }
                 g.add_to_collection("is_training_batch", is_training_batch)
             for i in range(len(self.hidden_layer_type)):
                 if self.dropout_rate != 0.0:
                     if self.hidden_layer_type[i] == "tanh":
                         new_layer = fully_connected(
                             layer_list[-1],
                             self.hidden_layer_size[i],
                             activation_fn=tf.nn.tanh,
                             normalizer_fn=batch_norm,
                             normalizer_params=bn_params)
                         new_layer_drop = dropout(
                             new_layer,
                             self.dropout_rate,
                             is_training=is_training_drop)
                         layer_list.append(new_layer_drop)
                     if self.hidden_layer_type[i] == "lstm":
                         basic_cell.append(
                             MyDropoutWrapper(BasicLSTMCell(
                                 num_units=self.hidden_layer_size[i]),
                                              self.dropout_rate,
                                              self.dropout_rate,
                                              is_training=is_training_drop))
                     if self.hidden_layer_type[i] == "gru":
                         basic_cell.append(
                             MyDropoutWrapper(GRUCell(
                                 num_units=self.hidden_layer_size[i]),
                                              self.dropout_rate,
                                              self.dropout_rate,
                                              is_training=is_training_drop))
                 else:
                     if self.hidden_layer_type[i] == "tanh":
                         new_layer = fully_connected(
                             layer_list[-1],
                             self.hidden_layer_size[i],
                             activation_fn=tf.nn.tanh,
                             normalizer_fn=batch_norm,
                             normalizer_params=bn_params)
                         layer_list.append(new_layer)
                     if self.hidden_layer_type[i] == "lstm":
                         basic_cell.append(
                             LayerNormBasicLSTMCell(
                                 num_units=self.hidden_layer_size[i]))
                     if self.hidden_layer_type[i] == "gru":
                         basic_cell.append(
                             LayerNormGRUCell(
                                 num_units=self.hidden_layer_size[i]))
             multi_cell = MultiRNNCell(basic_cell)
             rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
                 multi_cell,
                 layer_list[-1],
                 dtype=tf.float32,
                 sequence_length=utt_length)
             layer_list.append(rnn_outputs)
         with tf.name_scope("output_layer"):
             if self.output_type == "linear":
                 output_layer = tf.layers.dense(rnn_outputs, self.n_out)
             #  stacked_rnn_outputs=tf.reshape(rnn_outputs,[-1,self.n_out])
             #  stacked_outputs=tf.layers.dense(stacked_rnn_outputs,self.n_out)
             #  output_layer=tf.reshape(stacked_outputs,[-1,utt_length,self.n_out])
             g.add_to_collection(name="output_layer", value=output_layer)
         with tf.name_scope("training_op"):
             if self.optimizer == "adam":
                 self.training_op = tf.train.AdamOptimizer()
Example #22
0
    def _build_graph(self):
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(now)
        print("Build Graph...")
        print()

        self.xavier_init = tf.contrib.layers.xavier_initializer()

        self.embed_dim = 100
        self.state_dim = 100
        self.bi_state_dim = self.state_dim * 2
        self.attend_dim = 250
        self.feat_dim = self.bi_state_dim
        self.fc_dim = 150

        print("embed_dim : %d" % self.embed_dim)
        print("state_dim : %d" % self.state_dim)
        print("bi_state_dim : %d" % self.bi_state_dim)
        print("attend_dim : %d" % self.attend_dim)
        print("feat_dim : %d" % self.feat_dim)
        print("fc_dim : %d" % self.fc_dim)
        print()

        with tf.device(self.dev):
            with tf.variable_scope("input_placeholders"):
                self.enc_input = tf.placeholder(tf.int32,
                                                shape=[None, None],
                                                name="enc_input")
                self.enc_seq_len = tf.placeholder(tf.int32,
                                                  shape=[
                                                      None,
                                                  ],
                                                  name="enc_seq_len")
                self.targets = tf.placeholder(tf.int32,
                                              shape=[
                                                  None,
                                              ],
                                              name="targets")
                self.batch_size = tf.placeholder(tf.int32,
                                                 shape=[],
                                                 name="batch_size")
                self.keep_prob = tf.placeholder(tf.float32, name="keep_prob")

            with tf.variable_scope("words_embedding"):
                self.embeddings = tf.get_variable(
                    "embeddings", [self.voc_size, self.embed_dim],
                    initializer=self.xavier_init)
                self.embed_in = tf.nn.embedding_lookup(self.embeddings,
                                                       self.enc_input,
                                                       name="embed_in")

                self.pad_mask = tf.sequence_mask(self.enc_seq_len,
                                                 self.input_len_max,
                                                 dtype=tf.float32,
                                                 name="pad_mask1")

            with tf.variable_scope("rnn_encoder_layer"):
                self.output_enc, self.state_enc = bi_rnn(
                    GRUCell(self.state_dim),
                    GRUCell(self.state_dim),
                    inputs=self.embed_in,
                    sequence_length=self.enc_seq_len,
                    dtype=tf.float32)

                self.state_enc = tf.concat(
                    [self.state_enc[0], self.state_enc[1]],
                    axis=1,
                    name="state_enc1")
                assert self.state_enc.get_shape()[1] == self.bi_state_dim

                self.output_enc = tf.concat(
                    self.output_enc, axis=2)  # [batch, max_eng, state*2]
                self.output_enc = tf.nn.dropout(self.output_enc,
                                                keep_prob=self.keep_prob,
                                                name="output_enc1")
                print("output_enc.get_shape() : %s" %
                      (self.output_enc.get_shape()))
                assert self.output_enc.get_shape()[2] == self.bi_state_dim

            with tf.variable_scope("attention_layer"):
                self.rows = 30
                self.W_s1 = tf.get_variable(
                    "W_s1", [1, 1, self.feat_dim, self.attend_dim],
                    initializer=self.xavier_init)
                self.bias_s1 = tf.get_variable("bias_s1", [self.attend_dim])
                self.W_s2 = tf.get_variable("W_s2",
                                            [self.attend_dim, self.rows],
                                            initializer=self.xavier_init)

                self.identity = tf.reshape(
                    tf.tile(tf.diag(tf.ones(self.rows)), [self.batch_size, 1]),
                    [self.batch_size, self.rows, self.rows],
                    name="identity")

                self.output_enc_ex = tf.reshape(
                    self.output_enc,
                    [-1, self.input_len_max, 1, self.feat_dim])
                self.context_att = tf.nn.conv2d(self.output_enc_ex,
                                                self.W_s1,
                                                strides=[1, 1, 1, 1],
                                                padding="SAME")

                self.context_att = tf.tanh(tf.nn.bias_add(
                    self.context_att, self.bias_s1),
                                           name="context_att")
                print("context_att.get_shape() : %s" %
                      (self.context_att.get_shape()))

                # attention
                self.attention_tot = tf.matmul(
                    tf.reshape(self.context_att, [-1, self.attend_dim]),
                    self.W_s2)
                self.attention_tot = tf.reshape(
                    self.attention_tot, [-1, self.input_len_max, self.rows])
                self.attention_tot = tf.nn.softmax(
                    self.attention_tot, dim=1) * tf.reshape(
                        self.pad_mask, [-1, self.input_len_max, 1])
                self.attention_tot = tf.nn.softmax(self.attention_tot, dim=1)
                print("attention_tot.get_shape() : %s" %
                      (self.attention_tot.get_shape()))

                self.attention = tf.reduce_sum(self.attention_tot, axis=2)
                self.attention = tf.reshape(
                    self.attention,
                    [self.batch_size, self.input_len_max]) * self.pad_mask
                self.attention = tf.nn.softmax(self.attention)
                print("attention.get_shape() : %s" %
                      (self.attention.get_shape()))

                self.attention_tot_T = tf.transpose(self.attention_tot,
                                                    [0, 2, 1],
                                                    name="attention_tot_T")
                self.AA_t = tf.matmul(self.attention_tot_T,
                                      self.attention_tot) - self.identity
                print("AA_t.get_shape() : %s" % (self.AA_t.get_shape()))

                # penalty
                self.P = tf.square(tf.norm(self.AA_t, axis=[-2, -1],
                                           ord="fro"))
                self.P = tf.reduce_mean(self.P, name="P")

                # context..
                self.context = tf.reduce_sum(
                    self.output_enc *
                    tf.reshape(self.attention, [-1, self.input_len_max, 1]),
                    axis=1,
                    name="context")
                print("context.get_shape() : %s" % (self.context.get_shape()))
                assert self.context.get_shape()[1] == self.feat_dim

            with tf.variable_scope("dense_layer"):
                self.W_out1 = tf.get_variable("W_out1",
                                              [self.feat_dim, self.fc_dim],
                                              initializer=self.xavier_init)
                self.bias_out1 = tf.get_variable("bias_out1", [self.fc_dim])
                self.W_out2 = tf.get_variable("W_out2",
                                              [self.fc_dim, self.target_size],
                                              initializer=self.xavier_init)
                self.bias_out2 = tf.get_variable("bias_out2",
                                                 [self.target_size])

                self.fc = tf.nn.xw_plus_b(self.context, self.W_out1,
                                          self.bias_out1)
                self.fc = tf.tanh(self.fc)
                print("fc.get_shape() : %s" % (self.fc.get_shape()))

                self.y_hat = tf.nn.xw_plus_b(self.fc,
                                             self.W_out2,
                                             self.bias_out2,
                                             name="y_hat")
                print("y_hat.get_shape() : %s" % (self.y_hat.get_shape()))

            with tf.variable_scope("train_optimization"):
                self.train_vars = tf.trainable_variables()

                print()
                print("trainable_variables")
                for varvar in self.train_vars:
                    print(varvar)
                print()

                self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.y_hat, labels=self.targets)
                self.loss = tf.reduce_mean(self.loss, name="loss")
                self.loss_l2 = tf.add_n([
                    tf.nn.l2_loss(v)
                    for v in self.train_vars if "bias" not in v.name
                ]) * 0.0001
                self.loss = self.loss + self.loss_l2 + self.P

                self.predict = tf.argmax(tf.nn.softmax(self.y_hat), 1)
                self.predict = tf.cast(tf.reshape(self.predict,
                                                  [self.batch_size, 1]),
                                       tf.int32,
                                       name="predict")

                self.target_label = tf.cast(
                    tf.reshape(self.targets, [self.batch_size, 1]), tf.int32)
                self.correct = tf.equal(self.predict, self.target_label)
                self.accuracy = tf.reduce_mean(tf.cast(self.correct,
                                                       tf.float32),
                                               name="accuracy")

                self.global_step = tf.Variable(0,
                                               name="global_step",
                                               trainable=False)
                self.decay_rate = tf.maximum(0.00007,
                                             tf.train.exponential_decay(
                                                 self.lr,
                                                 self.global_step,
                                                 1000,
                                                 0.9,
                                                 staircase=True),
                                             name="decay_rate")
                self.opt = tf.train.AdamOptimizer(
                    learning_rate=self.decay_rate)
                self.grads_and_vars = self.opt.compute_gradients(
                    self.loss, self.train_vars)
                self.grads_and_vars = [(tf.clip_by_norm(g, 0.5), v)
                                       for g, v in self.grads_and_vars]
                self.grads_and_vars = [
                    (tf.add(g, tf.random_normal(tf.shape(g), stddev=0.001)), v)
                    for g, v in self.grads_and_vars
                ]

                self.train_op = self.opt.apply_gradients(
                    self.grads_and_vars,
                    global_step=self.global_step,
                    name="train_op")

            # Summaries for loss and lr
            self.loss_summary = tf.summary.scalar("loss", self.loss)
            self.accuracy_summary = tf.summary.scalar("accuracy",
                                                      self.accuracy)
            self.lr_summary = tf.summary.scalar("lr", self.decay_rate)

            # Output directory for models and summaries
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
            self.out_dir = os.path.abspath(
                os.path.join("./model/rnn_self_att", timestamp))
            print("LOGDIR = %s" % self.out_dir)
            print()

            # Train Summaries
            self.train_summary_op = tf.summary.merge(
                [self.loss_summary, self.accuracy_summary, self.lr_summary])
            self.train_summary_dir = os.path.join(self.out_dir, "summary",
                                                  "train")
            self.train_summary_writer = tf.summary.FileWriter(
                self.train_summary_dir, self.sess.graph)

            # Test summaries
            self.test_summary_op = tf.summary.merge(
                [self.loss_summary, self.accuracy_summary, self.lr_summary])
            self.test_summary_dir = os.path.join(self.out_dir, "summary",
                                                 "test")
            self.test_summary_writer = tf.summary.FileWriter(
                self.test_summary_dir, self.sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            self.checkpoint_dir = os.path.abspath(
                os.path.join(self.out_dir, "checkpoints"))
            self.checkpoint_prefix = os.path.join(self.checkpoint_dir,
                                                  "model-step")
            if self.makedir:
                if not os.path.exists(self.checkpoint_dir):
                    os.makedirs(self.checkpoint_dir)

            self.saver = tf.train.Saver(tf.global_variables(),
                                        max_to_keep=None)
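The penalty self.P above is the squared Frobenius norm of A·Aᵀ − I, which pushes the rows of the self-attention matrix toward attending to different positions. A small NumPy check of the same quantity, with arbitrarily chosen shapes, is sketched here.

import numpy as np

rows, seq_len = 30, 50                       # rows matches self.rows; seq_len is arbitrary
A = np.random.rand(rows, seq_len)
A = A / A.sum(axis=1, keepdims=True)         # each row behaves like a softmax over time

AAt = A @ A.T                                # [rows, rows]
penalty = np.linalg.norm(AAt - np.eye(rows), ord="fro") ** 2
print(penalty)                               # zero only when the rows are orthonormal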
Example #23
0
    for i in r:
        sample = data[i: i + long]
        a.append(sample[:-1, :5])
        b.append(sample[:-1, 5:10])
        c.append(sample[-1][1])
    return a, b, c


x = tf.placeholder(shape=[batch_size, long - 1, 5], dtype=tf.float16)
y = tf.placeholder(shape=[batch_size, long - 1, 5], dtype=tf.float16)
z_ = tf.placeholder(shape=[batch_size], dtype=tf.float16)

X = tf.nn.sigmoid(x) - 0.5
Y = tf.nn.sigmoid(y) - 0.5

gru_x = GRUCell(num_units=8, reuse=tf.AUTO_REUSE, activation=tf.nn.elu)
state_x = gru_x.zero_state(batch_size, dtype=tf.float16)
with tf.variable_scope('RNN_x'):
    for timestep in range(long - 1):
        if timestep == 1:
            tf.get_variable_scope().reuse_variables()
        (cell_output_x, state_x) = gru_x(X[:, timestep], state_x)
    out_put_x = state_x

gru_y = GRUCell(num_units=8, reuse=tf.AUTO_REUSE, activation=tf.nn.elu)
state_y = gru_y.zero_state(batch_size, dtype=tf.float16)
with tf.variable_scope('RNN_y'):
    for timestep in range(long - 1):  # be careful
        if timestep == 1:
            tf.get_variable_scope().reuse_variables()
        (cell_output_y, state_y) = gru_y(Y[:, timestep], state_y)
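The manual per-timestep loop above, with explicit variable reuse from the second step on, is essentially what tf.nn.dynamic_rnn does internally. A sketch of the equivalent calls, reusing X, Y and GRUCell from this example, is given below; separate scope names are used so the sketch does not collide with the unrolled variables, and clean float16 support in the stock GRUCell is an assumption.

with tf.variable_scope('RNN_x_dyn'):
    _, out_put_x_dyn = tf.nn.dynamic_rnn(
        GRUCell(num_units=8, activation=tf.nn.elu),
        X, dtype=tf.float16)    # final state, same role as out_put_x above

with tf.variable_scope('RNN_y_dyn'):
    _, out_put_y_dyn = tf.nn.dynamic_rnn(
        GRUCell(num_units=8, activation=tf.nn.elu),
        Y, dtype=tf.float16)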
Example #24
0
num_samples = tf.shape(inputs)[0] # useful for later

# embedding
We = np.random.randn(V, embedding_dim).astype(np.float32)

# output layer
Wo = init_weight(hidden_layer_size, K).astype(np.float32)
bo = np.zeros(K).astype(np.float32)

# make them tensorflow variables
tfWe = tf.Variable(We)
tfWo = tf.Variable(Wo)
tfbo = tf.Variable(bo)

# make the rnn unit
rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu)


# get the output
x = tf.nn.embedding_lookup(tfWe, inputs)

# converts x from a tensor of shape N x T x D
# into a list of length T, where each element is a tensor of shape N x D
x = tf.unstack(x, sequence_length, 1)

# get the rnn output
outputs, states = get_rnn_output(rnn_unit, x, dtype=tf.float32)


# outputs are now of size (T, N, M)
# so make it (N, T, M)
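The last two comments describe reordering the RNN outputs from time-major to batch-major before the output projection. One way to finish that step, sketched under the assumption that the logits are produced with tfWo and tfbo defined above, is:

outputs = tf.transpose(outputs, (1, 0, 2))               # (T, N, M) -> (N, T, M)

# flatten batch and time, then project onto the K output classes
outputs = tf.reshape(outputs, (-1, hidden_layer_size))   # (N*T, M)
logits = tf.matmul(outputs, tfWo) + tfbo                 # (N*T, K)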
Example #25
0
def cbhg(inputs,
         input_lengths,
         is_training,
         bank_size,
         bank_channel_size,
         maxpool_width,
         highway_depth,
         rnn_size,
         proj_sizes,
         proj_width,
         scope,
         before_highway=None,
         encoder_rnn_init_state=None):
    """
    Args:
        inputs: input tensor
        input_lengths: length of input tensor
        is_training: Batch Normalization option in Conv1D
        scope: network or model name
        K: kernel size range
        projections: projection layers option
        depth: dimensionality option of Highway net and Bidirectical GRU's output
    The layers in the code are staked in the order in which they came out.
    """

    batch_size = tf.shape(inputs)[0]
    with tf.variable_scope(scope):
        with tf.variable_scope('conv_bank'):

            conv_outputs = tf.concat(
                [
                    conv1d(inputs, k, 128, tf.nn.relu, is_training,
                           'conv1d_%d' % k) for k in range(1, bank_size + 1)
                ],  # 1D convolution layers, one per kernel size k = 1..bank_size
                axis=-1
            )  # Convolution bank: concatenate on the last axis to stack channels from all convolutions

        # Maxpooling:
        maxpool_output = tf.layers.max_pooling1d(
            conv_outputs, pool_size=maxpool_width, strides=1,
            padding='same')  #1D Maxpooling layer(strides=1, width=2)

        # Two projection layers:
        proj1_output = conv1d(maxpool_output, proj_width, proj_sizes[0],
                              tf.nn.relu, is_training,
                              'proj_1')  # 1st Conv1D projection
        proj2_output = conv1d(proj1_output, proj_width, proj_sizes[1], None,
                              is_training, 'proj_2')  # 2nd Conv1D projection

        # Residual connection:
        if before_highway is not None:
            expanded_before_highway = tf.expand_dims(before_highway, [1])
            tiled_before_highway = tf.tile(expanded_before_highway,
                                           [1, tf.shape(proj2_output)[1], 1])
            highway_input = proj2_output + inputs + tiled_before_highway
        else:
            highway_input = proj2_output + inputs

        # Handle dimensionality mismatch:
        if highway_input.shape[2] != rnn_size:
            highway_input = tf.layers.dense(highway_input, rnn_size)

        # 4-layer HighwayNet:
        for idx in range(highway_depth):
            highway_input = highwaynet(highway_input, 'highway_%d' %
                                       (idx + 1))  #make 4 Highway net layers
        rnn_input = highway_input

        # Bidirectional RNN
        if encoder_rnn_init_state is not None:
            initial_state_fw, initial_state_bw = tf.split(
                encoder_rnn_init_state, 2, 1)
        else:
            initial_state_fw, initial_state_bw = None, None

        outputs, states = tf.nn.bidirectional_dynamic_rnn(  #make Bidirectional GRU
            GRUCell(rnn_size),
            GRUCell(rnn_size),
            rnn_input,
            sequence_length=input_lengths,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            dtype=tf.float32)
        return tf.concat(
            outputs, axis=2)  # Concat forward sequence and backward sequence
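For reference, a hypothetical call of the cbhg() above as an encoder module; every size below is illustrative only and is not taken from this code.

encoder_outputs = cbhg(
    prenet_outputs,          # e.g. [N, T_in, 128] output of a prenet (assumed)
    input_lengths,
    is_training,
    bank_size=16,
    bank_channel_size=128,
    maxpool_width=2,
    highway_depth=4,
    rnn_size=128,
    proj_sizes=[128, 128],
    proj_width=3,
    scope='encoder_cbhg')    # -> [N, T_in, 2 * rnn_size]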
Example #26
0
def GNN(label, data, batch_size, hidden_size, n_steps, num_category, graph):

    gru_cell = GRUCell(hidden_size)
    w_in = weights('in_' + label, hidden_size, 0)
    h0 = tf.reshape(
        tf.matmul(data[:, 0, :], w_in),
        [batch_size, hidden_size])  #initialize h0 [batchsize, hidden_state]
    for i in range(1, num_category):
        w_in = weights('in_' + label, hidden_size, i)
        h0 = tf.concat([
            h0,
            tf.reshape(tf.matmul(data[:, i, :], w_in),
                       [batch_size, hidden_size])
        ], 1)
    h0 = tf.reshape(h0, [batch_size, num_category, hidden_size
                         ])  # h0: [batchsize, num_category, hidden_state]
    ini = h0
    h0 = tf.nn.tanh(h0)

    state = h0
    sum_graph = tf.reduce_sum(graph, reduction_indices=1)
    enable_node = tf.cast(tf.cast(sum_graph, dtype=bool), dtype=tf.float32)

    with tf.variable_scope("gnn"):
        for step in range(n_steps):
            if step > 0: tf.get_variable_scope().reuse_variables()
            # state = state * mask_x
            x = message_pass(label, state, hidden_size, batch_size,
                             num_category, graph)
            # x = tf.reshape(x, [batch_size*num_category, hidden_size])
            # state = tf.reshape(state, [batch_size*num_category, hidden_size])
            (x_new, state_new) = gru_cell(x[0], state[0])
            state_new = tf.transpose(state_new, (1, 0))
            state_new = tf.multiply(state_new, enable_node[0])
            state_new = tf.transpose(state_new, (1, 0))
            for i in range(1, batch_size):
                (x_, state_) = gru_cell(
                    x[i],
                    state[i])  # #input of GRUCell must be 2 rank, not 3 rank
                state_ = tf.transpose(state_, (1, 0))
                state_ = tf.multiply(state_, enable_node[i])
                state_ = tf.transpose(state_, (1, 0))
                state_new = tf.concat([state_new, state_], 0)
            # x = tf.reshape(x, [batch_size, num_category, hidden_size])
            state = tf.reshape(state_new,
                               [batch_size, num_category, hidden_size
                                ])  # #restore: 2 rank to 3 rank
            # state = state * mask_x
            # state = tf.nn.dropout(state, keep_prob)

    # w_out_image = weights('out_image', hidden_size, 0)
    # b_out_image = biases('out_image', hidden_size, 0)
    # output = tf.reshape(tf.matmul(state[:, 0, :], w_out_image) + b_out_image, [batch_size, 2048]) #initialize output : [batchsize, 2048]
    # for i in range(1, num_category):
    #     w_out_image = weights('out_image', hidden_size, i)
    #     b_out_image = biases('out_image', hidden_size, i)
    #     output = tf.concat([output, tf.reshape(
    #         tf.matmul(state[:, i, :], w_out_image) + b_out_image,
    #                        [batch_size, 2048])], 1)
    # output = tf.reshape(output, [batch_size, num_category, 2048])
    # output = tf.nn.tanh(output)

    return state, ini
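GNN() above relies on weights() and message_pass() helpers that are not shown. A heavily simplified sketch of a message-passing step, assuming that messages are simply neighbour states aggregated through the adjacency matrix graph and passed through a shared linear map, could look like this; the real helpers may well differ.

def message_pass(label, state, hidden_size, batch_size, num_category, graph):
    # state: [batch, num_category, hidden], graph: [batch, num_category, num_category]
    w_msg = tf.get_variable('msg_' + label, [hidden_size, hidden_size])
    neighbour_sum = tf.matmul(graph, state)                          # aggregate neighbour states
    messages = tf.tensordot(neighbour_sum, w_msg, axes=[[2], [0]])   # [batch, num_category, hidden]
    return messages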
Example #27
0
        def __graph__():
            with tf.name_scope('input'):
                x_input = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, sequence_width, sequence_height],
                    name='x_input')
                y_input = tf.placeholder(dtype=tf.float32,
                                         shape=[None, num_classes],
                                         name='y_input')

            # state = tf.placeholder(dtype=tf.float32, shape=[None, self.cell_size * self.num_layers],
            #                        name='initial_state')
            p_keep = tf.placeholder(dtype=tf.float32, name='p_keep')

            learning_rate = tf.placeholder(dtype=tf.float32,
                                           name='learning_rate')

            hidden_size = int(sequence_width)
            # seq_len = tf.Variable(tf.constant(hidden_size),name='seq_len')

            rnn_outputs, _ = bi_rnn(GRUCell(hidden_size),
                                    GRUCell(hidden_size),
                                    inputs=x_input,
                                    sequence_length=None,
                                    dtype=tf.float32)
            tf.summary.histogram('RNN_outputs', rnn_outputs)

            # Attention layer
            with tf.name_scope('Attention_layer'):
                attention_output, alphas = attention(
                    input=rnn_outputs,
                    hidden_size=self.sequence_width,
                    attention_size=ATTENTION_SIZE,
                    return_alpha=True)
                tf.summary.histogram('alphas', alphas)

            # dropout
            drop = tf.nn.dropout(attention_output, keep_prob=p_keep)

            # fully connected layer
            with tf.name_scope('Fully_connected_layer'):
                W = tf.Variable(tf.truncated_normal(
                    [hidden_size * 2, self.num_classes], stddev=0.1),
                                name='W')
                b = tf.Variable(tf.constant(0.0, shape=[self.num_classes]),
                                name='b')
                y_hat = tf.nn.xw_plus_b(drop, W, b)
                # y_hat=tf.squeeze(y_hat)
                tf.summary.histogram('W', W)

            with tf.name_scope('loss'):
                loss = svm_loss(labels=y_input,
                                logits=y_hat,
                                num_classes=self.num_classes,
                                penalty_parameter=self.svm_c,
                                weight=W)
            tf.summary.scalar('loss', loss)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=learning_rate).minimize(loss=loss)

            with tf.name_scope('accuracy'):
                predicted_class = tf.sign(y_hat)
                predicted_class = tf.identity(predicted_class,
                                              name='predicted_class')
                with tf.name_scope('correct_prediction'):
                    correct = tf.equal(tf.argmax(predicted_class, 1),
                                       tf.argmax(y_input, 1))
                with tf.name_scope('accuracy'):
                    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
            tf.summary.scalar('accuracy', accuracy)

            merged = tf.summary.merge_all()

            # set class properties
            self.x_input = x_input
            self.y_input = y_input
            self.p_keep = p_keep
            self.loss = loss
            self.optimizer = optimizer
            # self.state=state
            # self.states=states
            self.learning_rate = learning_rate
            self.predicted_class = predicted_class
            self.accuracy = accuracy
            self.merged = merged
Example #28
0
def main(model, T, n_iter, n_batch, n_hidden, capacity, comp, FFT,
         learning_rate, decay, learning_rate_decay, norm, grid_name):
    learning_rate = float(learning_rate)
    decay = float(decay)

    # --- Set data params ----------------
    n_input = 10
    n_output = 9
    n_sequence = 10
    n_train = n_iter * n_batch
    n_test = n_batch

    n_steps = T + 20
    n_classes = 9

    # --- Create data --------------------
    train_x, train_y = copying_data(T, n_train, n_sequence)
    test_x, test_y = copying_data(T, n_test, n_sequence)

    # --- Create graph and compute gradients ----------------------
    with tf.name_scope('inputs'):
        x = tf.placeholder("int32", [None, n_steps], name='x_input')
        y = tf.placeholder("int64", [None, n_steps], name='y_input')

    input_data = tf.one_hot(x, n_input, dtype=tf.float32)

    # --- Input to hidden layer ----------------------
    #with tf.name_scope('layer'):

    if model == "LSTM":
        cell = BasicLSTMCell(n_hidden, state_is_tuple=True, forget_bias=1)
        hidden_out, _ = tf.nn.dynamic_rnn(cell, input_data, dtype=tf.float32)
    elif model == "GRU":
        cell = GRUCell(n_hidden,
                       kernel_initializer=tf.orthogonal_initializer())
        hidden_out, _ = tf.nn.dynamic_rnn(cell, input_data, dtype=tf.float32)
    elif model == "RUM":
        cell = RUMCell(n_hidden, T_norm=norm)
        hidden_out, _ = tf.nn.dynamic_rnn(cell, input_data, dtype=tf.float32)
    elif model == "ARUM":
        cell = ARUMCell(n_hidden, T_norm=norm)
        hidden_out, _ = tf.nn.dynamic_rnn(cell, input_data, dtype=tf.float32)
    elif model == "EUNN":
        cell = EUNNCell(n_hidden, capacity, FFT, comp)
        hidden_out, _ = tf.nn.dynamic_rnn(cell, input_data, dtype=tf.float32)
    elif model == "GORU":
        cell = GORUCell(n_hidden, capacity, FFT)
        hidden_out, _ = tf.nn.dynamic_rnn(cell, input_data, dtype=tf.float32)
    elif model == "RNN":
        cell = BasicRNNCell(n_hidden)
        hidden_out, _ = tf.nn.dynamic_rnn(cell, input_data, dtype=tf.float32)

    # --- Hidden Layer to Output ----------------------

    V_init_val = np.sqrt(6.) / np.sqrt(n_output + n_input)

    V_weights = tf.get_variable("V_weights",
                                shape=[n_hidden, n_classes],
                                dtype=tf.float32,
                                initializer=tf.random_uniform_initializer(
                                    -V_init_val, V_init_val))

    V_bias = tf.get_variable("V_bias",
                             shape=[n_classes],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.01))

    hidden_out_list = tf.unstack(hidden_out, axis=1)
    temp_out = tf.stack([tf.matmul(i, V_weights) for i in hidden_out_list])
    output_data = tf.nn.bias_add(tf.transpose(temp_out, [1, 0, 2]), V_bias)

    # --- evaluate process ----------------------
    with tf.name_scope('evaluate'):
        with tf.name_scope('cost'):
            cost = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=output_data, labels=y))
            tf.summary.scalar('cost', cost)
        with tf.name_scope('correnct_pred'):
            correct_pred = tf.equal(tf.argmax(output_data, 2), y)
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            tf.summary.scalar('accuracy', accuracy)

    # --- Initialization ----------------------
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                          decay=decay).minimize(cost)
    init = tf.global_variables_initializer()

    print("\n###")
    sumz = 0
    for i in tf.global_variables():
        print(i.name, i.shape, np.prod(np.array(i.get_shape().as_list())))
        sumz += np.prod(np.array(i.get_shape().as_list()))
    print("# parameters: ", sumz)
    print("###\n")

    # --- save result ----------------------
    filename = "./output/copying/"
    if grid_name is not None:
        filename += grid_name + "/"
    filename += "T=" + str(T) + "/"
    research_filename = filename + "researchModels" + "/" + model + "_N=" + str(
        n_hidden) + "_lambda=" + str(learning_rate) + "_decay=" + str(
            decay) + "/"
    filename += model + "_N=" + str(n_hidden) + "_lambda=" + str(
        learning_rate) + "_decay=" + str(decay)
    if norm is not None:
        filename += "_norm=" + str(norm)
    filename = filename + ".txt"

    if not os.path.exists(os.path.dirname(filename)):
        try:
            os.makedirs(os.path.dirname(filename))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    if not os.path.exists(os.path.dirname(research_filename)):
        try:
            os.makedirs(os.path.dirname(research_filename))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
    if not os.path.exists(
            os.path.dirname(research_filename + "/modelCheckpoint/")):
        try:
            os.makedirs(
                os.path.dirname(research_filename + "/modelCheckpoint/"))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
    f = open(filename, 'w')
    f.write("########\n\n")
    f.write("## \tModel: %s with N=%d" % (model, n_hidden))
    f.write("\n\n")
    f.write("########\n\n")

    # --- Training Loop ----------------------
    saver = tf.train.Saver()
    mx2 = 0
    step = 0
    with tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                          allow_soft_placement=False)) as sess:
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter("./logs/", sess.graph)

        sess.run(init)

        steps = []
        losses = []
        accs = []

        while step < n_iter:

            batch_x = train_x[step * n_batch:(step + 1) * n_batch]
            batch_y = train_y[step * n_batch:(step + 1) * n_batch]

            sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})

            # Run the merged summaries once per step and evaluate accuracy / loss.
            result = sess.run(merged, feed_dict={x: batch_x, y: batch_y})
            writer.add_summary(result, step)

            acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y})
            loss = sess.run(cost, feed_dict={x: batch_x, y: batch_y})

            print("Iter " + str(step) + ", Minibatch Loss= " + \
               "{:.6f}".format(loss) + ", Training Accuracy= " + \
               "{:.5f}".format(acc))

            steps.append(step)
            losses.append(loss)
            accs.append(acc)
            if step == 0:
                f.write("%d\t%f\t%f\n" % (step, loss, acc))
            step += 1
            if step % 200 == 199:
                f.write("%d\t%f\t%f\n" % (step, loss, acc))

            if step % 10000 == 0:
                saver.save(sess, research_filename + "/modelCheckpoint/")

            if step % 1000 == 0:
                if model == "GRU": tmp = "gru"
                if model == "RUM": tmp = "rum"
                if model == "ARUM": tmp = "arum"
                if model == "GRU" or model == "RUM" or model == "ARUM":
                    kernel = [
                        v for v in tf.global_variables()
                        if v.name == "rnn/" + tmp + "_cell/gates/kernel:0"
                    ][0]
                    bias = [
                        v for v in tf.global_variables()
                        if v.name == "rnn/" + tmp + "_cell/gates/bias:0"
                    ][0]
                    k, b = sess.run([kernel, bias])
                    np.save(research_filename + "/kernel_" + str(step), k)
                    np.save(research_filename + "/bias_" + str(step), b)
                if model == "RUM" or model == "ARUM":
                    kernel_emb = [
                        v for v in tf.global_variables()
                        if v.name == "rnn/" + tmp + "_cell/candidate/kernel:0"
                    ][0]
                    bias_emb = [
                        v for v in tf.global_variables()
                        if v.name == "rnn/" + tmp + "_cell/candidate/bias:0"
                    ][0]
                    k_emb, b_emb = sess.run([kernel_emb, bias_emb])
                    np.save(research_filename + "/kernel_emb_" + str(step),
                            k_emb)
                    np.save(research_filename + "/bias_emb_" + str(step),
                            b_emb)

                    #result = sess.run(merged,feed_dict={x: batch_x, y: batch_y})
                    #writer.add_summary(result, step)
        print("Optimization Finished!")

        # --- test ----------------------
        test_acc = sess.run(accuracy, feed_dict={x: test_x, y: test_y})
        test_loss = sess.run(cost, feed_dict={x: test_x, y: test_y})
        #tf.scalar_summary('test_loss',test_loss)
        #result = sess.run(merged,feed_dict={x: batch_x, y: batch_y})
        #writer.add_summary(result, step)
        f.write("Test result: Loss= " + "{:.6f}".format(test_loss) + \
           ", Accuracy= " + "{:.5f}".format(test_acc))
Example #29
0
    def __init__(self,
                 sequence_length,
                 num_classes,
                 text_vocab_size,
                 text_embedding_size,
                 hidden_size=800,
                 l2_reg_lambda=0.0):
        # Placeholders for input, output and dropout
        self.input_text = tf.placeholder(tf.int32,
                                         shape=[None, sequence_length],
                                         name='input_text')
        self.input_y = tf.placeholder(tf.float32,
                                      shape=[None, num_classes],
                                      name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name='dropout_keep_prob')
        self.dropout_keep_prob_lstm = tf.placeholder(tf.float32,
                                                     name='dropout_keep_prob_lstm')
        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("text-embedding"):
            self.W_text = tf.Variable(tf.random_uniform(
                [text_vocab_size, text_embedding_size], -1.0, 1.0),
                                      name="W_text")
            self.text_embedded_chars = tf.nn.embedding_lookup(
                self.W_text, self.input_text)

        # (Bi-)RNN layer(-s)
        self.rnn_outputs, _ = bi_rnn(
            tf.nn.rnn_cell.DropoutWrapper(GRUCell(hidden_size),
                                          self.dropout_keep_prob_lstm),
            tf.nn.rnn_cell.DropoutWrapper(GRUCell(hidden_size),
                                          self.dropout_keep_prob_lstm),
            inputs=self.text_embedded_chars,
            dtype=tf.float32)
        print(self.rnn_outputs)
        tf.summary.histogram('RNN_outputs', self.rnn_outputs)

        # Concatenate the forward and backward output tensors
        rnn_outputs = tf.concat([self.rnn_outputs[0], self.rnn_outputs[1]], 2)
        # Reduce over the time dimension
        rnn_outputs = tf.reduce_sum(rnn_outputs, 1)

        # Dropout
        self.drop = tf.nn.dropout(rnn_outputs, self.dropout_keep_prob)

        # Fully connected layer
        with tf.name_scope('Fully_connected_layer'):
            W = tf.Variable(
                tf.truncated_normal(
                    [hidden_size * 2, num_classes],
                    stddev=0.1))  # Hidden size is multiplied by 2 for Bi-RNN
            b = tf.Variable(tf.constant(0., shape=[num_classes]))
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                   "float"),
                                           name="accuracy")
Example #30
0
    def __init__(self, config, is_training=False):
        self.config = config
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        self.hidden_size = hidden_size = config.hidden_size
        self.num_layers = 1
        vocab_size = config.vocab_size
        self.max_grad_norm = config.max_grad_norm
        self.use_lstm = config.use_lstm

        # Placeholders for inputs.
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.initial_state = array_ops.zeros(
            tf.stack([self.batch_size, self.num_steps]),
            dtype=tf.float32).set_shape([None, self.num_steps])

        embedding = tf.get_variable(
            'embedding', [self.config.vocab_size, self.config.hidden_size])

        # Set up ACT cell and inner rnn-type cell for use inside the ACT cell.
        with tf.variable_scope("rnn"):
            if self.use_lstm:
                inner_cell = BasicLSTMCell(self.config.hidden_size)
            else:
                inner_cell = GRUCell(self.config.hidden_size)

        with tf.variable_scope("ACT"):

            act = ACTCell(self.config.hidden_size,
                          inner_cell,
                          config.epsilon,
                          max_computation=config.max_computation,
                          batch_size=self.batch_size)

        inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        inputs = [
            tf.squeeze(single_input, [1])
            for single_input in tf.split(inputs, self.config.num_steps, 1)
        ]

        self.outputs, final_state = static_rnn(act, inputs, dtype=tf.float32)

        # Softmax to get probability distribution over vocab.
        output = tf.reshape(tf.concat(self.outputs, 1), [-1, hidden_size])
        softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        self.logits = tf.matmul(
            output,
            softmax_w) + softmax_b  # dim (numsteps*batchsize, vocabsize)

        loss = sequence_loss_by_example([self.logits],
                                        [tf.reshape(self.targets, [-1])],
                                        [tf.ones([batch_size * num_steps])],
                                        vocab_size)

        # Add up loss and retrieve batch-normalised ponder cost: sum N + sum Remainder.
        ponder_cost = act.calculate_ponder_cost(
            time_penalty=self.config.ponder_time_penalty)
        self.cost = (tf.reduce_sum(loss) / batch_size) + ponder_cost
        self.final_state = self.outputs[-1]

        if is_training:
            self.lr = tf.Variable(0.0, trainable=False)
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                              self.max_grad_norm)
            optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))