def build_generator(self, inputs):
        if ModeKeys.is_predict_one(self.mode):
            self.attention_bias = None
        else:
            self.attention_bias = model_utils.get_padding_bias(
                inputs)  # [batch, 1, 1, src_len]
        self.encoder_outputs = self.encode(
            inputs, self.attention_bias)  # [batch, src_len, hidden_size]
        if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
            fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
            fake_decoder_outputs = self.decoder_stack(fake_decoder_inputs,
                                                      self.encoder_outputs,
                                                      None, None, None)

        if self.is_train:
            # if self.mode == tf.estimator.ModeKeys.TRAIN:
            tf.logging.info("!!!!!! using rl predict in traning !!!!!!")
            decoded_ids, decoded_logits, log_probs = self.rl_predict(
                self.encoder_outputs, self.attention_bias)
            return decoded_ids, decoded_logits, log_probs
        else:
            tf.logging.info(
                "!!!!!!! using argmax_predict in prediction/evaluation !!!!!!!!"
            )
            decoded_ids, decoded_logits = self.argmax_predict(
                self.encoder_outputs, self.attention_bias)
            return decoded_ids, decoded_logits, None  # no log-probs outside RL training
Example #2
    def build_padding_rollout_generator(self, real_inputs, gen_samples, max_len, given_num):
        with tf.variable_scope(self.name_scope, initializer=self.initializer, reuse=tf.AUTO_REUSE):
            if ModeKeys.is_predict_one(self.mode):
                self.attention_bias = None
            else:
                self.attention_bias = model_utils.get_padding_bias(real_inputs)
            self.encoder_outputs = self.encode(real_inputs, self.attention_bias)

            def condition(given_num, _):
                return given_num < max_len

            def inner_loop(given_num, given_y):
                logits = self.decode(given_y, self.encoder_outputs, self.attention_bias)
                next_logits = logits[:, given_num, :]  # [batch, decoder_vocab_size]
                next_probs = tf.nn.softmax(next_logits)
                log_probs = tf.log(next_probs)
                next_sample = tf.multinomial(log_probs, num_samples=1)
                next_sample = tf.cast(next_sample, dtype=tf.int32)
                given_y = tf.concat([given_y[:, :given_num], next_sample], axis=1)
                given_y = tf.pad(given_y, [[0, 0], [0, max_len - given_num - 1]])
                return given_num + 1, given_y

            given_y = gen_samples[:, :given_num]
            init_given_y = tf.pad(given_y, [[0, 0], [0, max_len - given_num]])
            init_given_num = given_num

            given_num, roll_sample = tf.while_loop(
                cond=condition,
                body=inner_loop,
                loop_vars=[init_given_num, init_given_y],
                shape_invariants=[init_given_num.get_shape(),
                                  tf.TensorShape([None, None])]
            )
            return roll_sample
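A hypothetical usage sketch of the rollout above, in the SeqGAN style: for every prefix length the tail of a generated sample is re-completed by multinomial sampling and the completed sequence is scored by a discriminator. The names generator, discriminator, gen_samples and real_inputs are assumptions, not part of this snippet.

# Hypothetical sketch: Monte-Carlo rollouts for per-prefix rewards (SeqGAN style).
# Assumes max_len is a Python int and discriminator.get_loss returns a [batch, 1]
# reconstruction loss, as in the get_loss examples further down this page.
rollout_losses = []
for given_num in range(1, max_len):
    # Re-complete the sample from position given_num onwards.
    roll_sample = generator.build_padding_rollout_generator(
        real_inputs, gen_samples, max_len, tf.constant(given_num))
    # Lower reconstruction loss means a better rollout; it can be turned into a
    # reward as in build_no_teacher_discriminator below.
    rollout_losses.append(discriminator.get_loss(roll_sample, real_inputs))
rollout_losses = tf.concat(rollout_losses, axis=1)  # [batch, max_len - 1]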
    def build_no_teacher_discriminator(self,
                                       origin_inputs,
                                       gen_target,
                                       real_loss,
                                       margin=1.0):
        fake_attention_bias = model_utils.get_padding_bias(
            gen_target)  # [batch, 1, 1, src_len]
        fake_encoder_outputs = self.encode(
            gen_target, fake_attention_bias)  # [batch, src_len, hidden_size]
        _, fake_logits = self.argmax_predict(fake_encoder_outputs,
                                             fake_attention_bias)
        fake_xentropy, fake_weights = metrics.padded_cross_entropy_loss(
            fake_logits, origin_inputs, self.params.label_smoothing,
            self.params.target_vocab_size)  # [batch, origin_length]
        fake_loss = tf.reduce_sum(fake_xentropy, axis=1) / tf.reduce_sum(
            fake_weights, axis=1)
        tf.identity(fake_loss[:5], "fake_loss")

        mean_fake_loss = tf.reduce_mean(fake_loss, name="mean_fake_loss")
        tf.summary.scalar("mean_fake_loss", mean_fake_loss)

        rewards = 1 / tf.maximum(margin, fake_loss /
                                 (real_loss + 1e-12) - 1)  # [batch]
        tf.identity(rewards[:5], "rewards")

        mean_wards = tf.reduce_mean(rewards, name="mean_wards")
        tf.summary.scalar("mean_wards", mean_wards)
        return rewards
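To make the reward formula concrete with the default margin=1.0: whenever the generated sentence is at most twice as hard to reconstruct as the real one (fake_loss / real_loss - 1 <= 1), the ratio is clipped to the margin and the reward saturates at 1 / margin = 1.0; with fake_loss = 3 * real_loss the reward drops to 1 / (3 - 1) = 0.5, and it keeps shrinking as fake_loss grows.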
    def build_pretrain(self, inputs, targets):
        # initializer = tf.variance_scaling_initializer(
        #     self.params.initializer_gain, mode="fan_avg", distribution="uniform")
        #
        # with tf.variable_scope("Transformer", initializer=initializer, reuse=tf.AUTO_REUSE):
        if ModeKeys.is_predict_one(self.mode):
            attention_bias = None
        else:
            attention_bias = model_utils.get_padding_bias(
                inputs)  # [batch, 1, 1, src_len]

        encoder_outputs = self.encode(
            inputs, attention_bias)  # [batch, src_len, hidden_size]

        if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
            fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
            fake_decoder_outputs = self.decoder_stack(fake_decoder_inputs,
                                                      encoder_outputs, None,
                                                      None, None)

        if targets is None:
            prediction, _ = self.argmax_predict(encoder_outputs,
                                                attention_bias)
            return prediction
        else:
            logits = self.decode(
                targets, encoder_outputs,
                attention_bias)  # [batch, tgt_len, vocab_size]
            return logits
 def build_generator(self, inputs):
     with tf.variable_scope("Transformer", initializer=self._initializer, reuse=tf.AUTO_REUSE):
         self.attention_bias = model_utils.get_padding_bias(inputs)  # [batch, 1, 1, src_len]
         self.encoder_outputs = self.encode(inputs, self.attention_bias)  # [batch, src_len, hidden_size]
         tf.logging.info("!!!!!!! using argmax_predict in generator !!!!!!!!")
         decoded_ids = self.argmax_predict(self.encoder_outputs, self.attention_bias)
         return decoded_ids
Example #6
  def __call__(self, inputs, targets=None):
    """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If target is none, then generate output sequence one token at a time.
        returns a dictionary {
          output: [batch_size, decoded length]
          score: [batch_size, float]}
    """
    # Variance scaling is used here because it seems to work in many problems.
    # Other reasonable initializers may also work just as well.
    initializer = tf.variance_scaling_initializer(
        self.params.initializer_gain, mode="fan_avg", distribution="uniform")
    with tf.variable_scope("Transformer", initializer=initializer):
      # Calculate attention bias for encoder self-attention and decoder
      # multi-headed attention layers.
      attention_bias = model_utils.get_padding_bias(inputs)

      # Run the inputs through the encoder layer to map the symbol
      # representations to continuous representations.
      encoder_outputs = self.encode(inputs, attention_bias)

      # Generate output sequence if targets is None, or return logits if target
      # sequence is known.
      if targets is None:
        return self.predict(encoder_outputs, attention_bias)
      else:
        logits = self.decode(targets, encoder_outputs, attention_bias)
        return logits
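A hedged usage sketch of this __call__ interface; the Transformer class name and constructor signature are assumptions based on the surrounding snippets, not taken from this code.

# Hypothetical usage of the __call__ above (TF1-style graph code).
params = model_params.TransformerBaseParams()
model = Transformer(params, is_train=True)  # assumed constructor

inputs = tf.constant([[4, 5, 6, 0, 0]], dtype=tf.int32)   # 0-padded source ids
targets = tf.constant([[7, 8, 9, 1, 0]], dtype=tf.int32)  # 0-padded target ids

logits = model(inputs, targets)  # training: [batch, target_length, vocab_size]
# At inference time the same object is called without targets, typically in a
# separate graph/estimator mode so the "Transformer" scope is not re-entered:
#   predictions = model(inputs)  # {"output": [batch, decoded_length], "score": [batch]}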
    def build_pretrain_mono(self, inputs, targets):
        inputs_length = tf.argmin(inputs, axis=-1) + 1  # position of the first 0 (pad) + 1; assumes ids > 0 and 0-padding
        max_len = inputs_length[tf.argmax(inputs_length)]  # length of the longest sentence in the batch
        batch_size = tf.shape(inputs)[0]

        pad_inputs = tf.zeros([0, max_len], dtype=tf.int32)
        def inner_loop(i, pad_inputs):
            ori_length = inputs_length[i]
            ori_input = tf.reshape(inputs[i][:ori_length], [1, -1])
            pad_input = tf.pad(ori_input, [[0,0], [0, max_len - ori_length]])
            pad_inputs = tf.concat([pad_inputs, pad_input], axis=0)   
            return i + 1, pad_inputs
        _, pad_inputs = tf.while_loop(
            cond=lambda i,_: i < batch_size,
            body=inner_loop,
            loop_vars=[tf.constant(0), pad_inputs],
            shape_invariants=[
                tf.TensorShape([]),
                tf.TensorShape([None, None])]
        )
        with tf.variable_scope("Transformer", initializer=self._initializer, reuse=tf.AUTO_REUSE):
            attention_bias = model_utils.get_padding_bias(pad_inputs)  # [batch, 1, 1, src_len]
            encoder_outputs = self.encode(pad_inputs, attention_bias)  # [batch, src_len, hidden_size]
            #encoder_outputs = tf.stop_gradient(encoder_outputs)
            if targets is None:
                prediction = self.argmax_predict(encoder_outputs, attention_bias)
                return prediction
            else:
                tf.logging.info("!!! mono decoder by techer forcing !!!")
                logits = self.decode(targets, encoder_outputs, attention_bias)  # [batch, tgt_len, vocab_size]
                return logits
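The per-row re-padding loop above can usually be written without tf.while_loop; a hedged vectorized sketch, relying on the same assumption the loop makes (ids are positive and 0 marks padding, so the first 0 in a row ends the sentence):

# Hypothetical vectorized alternative to the re-padding while_loop above.
inputs_length = tf.cast(tf.argmin(inputs, axis=-1), tf.int32) + 1  # first 0 + 1
max_len = tf.reduce_max(inputs_length)                             # longest row
mask = tf.sequence_mask(inputs_length, maxlen=max_len, dtype=inputs.dtype)
pad_inputs = inputs[:, :max_len] * mask  # zero out everything after each sentence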
Example #8
    def build_generator(self, inputs):
        # Calculate attention bias for encoder self-attention and decoder
        # multi-headed attention layers.
        if ModeKeys.is_predict_one(self.mode):
            self.attention_bias = None
        else:
            self.attention_bias = model_utils.get_padding_bias(
                inputs)  # [batch, 1, 1, src_len]

        # Run the inputs through the encoder layer to map the symbol
        # representations to continuous representations.
        self.encoder_outputs = self.encode(
            inputs, self.attention_bias)  # [batch, src_len, hidden_size]

        # get encdec_attenion k/v just for predict_one_encoder
        if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
            fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
            fake_decoder_outputs = self.decoder_stack(fake_decoder_inputs,
                                                      self.encoder_outputs,
                                                      None, None, None)

        # Generate output sequence if targets is None, or return logits if target
        # sequence is known.
        if self.is_train:
            tf.logging.info("!!!!!! using rl predict in traning !!!!!!")
            return self.rl_predict(self.encoder_outputs, self.attention_bias)
        else:
            tf.logging.info(
                "!!!!!!! using argmax_predict in inference !!!!!!!!")
            return self.argmax_predict(self.encoder_outputs,
                                       self.attention_bias)
Example #9
    def get_bleu(self, gen_targets, real_inputs):
        with tf.variable_scope(self.name_scope, initializer=self.initializer, reuse=tf.AUTO_REUSE):
            attention_bias = model_utils.get_padding_bias(gen_targets)
            encoder_outputs = self.encode(gen_targets, attention_bias)

            logits = self.decode(real_inputs, encoder_outputs, attention_bias)
            prediction = tf.argmax(logits, axis=-1)  # [batch, ori_inp_len]
            bleu = tf.py_func(metrics.compute_bleu_batch, (real_inputs, prediction), tf.float32)
            return tf.reshape(bleu, (-1, 1))  # [batch, 1]
Example #10
 def get_loss(self, origin_inputs, targets):
     with tf.variable_scope("Discriminator", initializer=self._initializer, reuse=tf.AUTO_REUSE):
         attention_bias = model_utils.get_padding_bias(targets)  # [batch, 1, 1, src_len]
         encoder_outputs = self.encode(targets, attention_bias)  # [batch, src_len, hidden_size]
         logits = self.decode(origin_inputs, encoder_outputs, attention_bias)
         xentropy, weights = metrics.padded_cross_entropy_loss(
             logits, origin_inputs, self.params.label_smoothing,
             self.params.target_vocab_size)  # [batch, origin_length]
         self.loss = tf.reduce_sum(xentropy, axis=1) / tf.reduce_sum(weights, axis=1) # [batch]
         #prediction = self.argmax_predict(encoder_outputs, attention_bias) # [batch, max_len]
         return tf.reshape(self.loss, (-1, 1))  # [batch, 1]
Example #11
    def call(self, inputs, targets: Optional[np.ndarray] = None):
        attention_bias = model_utils.get_padding_bias(inputs)
        encoder_outputs = self._encode(inputs, attention_bias)

        if targets is None:
            logits = self._decode(encoder_outputs, targets, attention_bias)
            #raise Exception()
            return logits  #self.predict(encoder_outputs, attention_bias)
        else:
            logits = self._decode(encoder_outputs, targets, attention_bias)
            return logits
Example #12
    def get_loss(self, gen_targets, real_inputs):
        with tf.variable_scope(self.name_scope, initializer=self.initializer, reuse=tf.AUTO_REUSE):
            attention_bias = model_utils.get_padding_bias(gen_targets)
            encoder_outputs = self.encode(gen_targets, attention_bias)

            logits = self.decode(real_inputs, encoder_outputs, attention_bias)

            xentropy, weights = metrics.padded_cross_entropy_loss(logits, real_inputs,
                                                                  self.params.label_smoothing,
                                                                  self.params.target_vocab_size)
            loss = tf.reduce_sum(xentropy, axis=1) / tf.reduce_sum(weights, axis=1)  # [batch]
            return tf.reshape(loss, (-1, 1))
Example #13
 def build_pretrain(self, inputs, targets):
     self.init_embed("Transformer")
     with tf.variable_scope("Transformer", initializer=self._initializer, reuse=tf.AUTO_REUSE):
         attention_bias = model_utils.get_padding_bias(inputs)  # [batch, 1, 1, src_len]
         encoder_outputs = self.encode(inputs, attention_bias)  # [batch, src_len, hidden_size]
         if targets is None:
             prediction = self.argmax_predict(encoder_outputs, attention_bias)
             return prediction
         else:
             tf.logging.info("!!!!!!!!!! pretrain decoder !!!!!!!!!!!!!!!!!!")
             logits = self.decode(targets, encoder_outputs, attention_bias)  # [batch, tgt_len, vocab_size]
             return logits
    def test_get_padding_bias(self):
        x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
        bias = model_utils.get_padding_bias(x)
        bias_shape = tf.shape(bias)
        flattened_bias = tf.reshape(bias, [3, 5])
        with self.test_session() as sess:
            flattened_bias, bias_shape = sess.run((flattened_bias, bias_shape))

        self.assertAllEqual(
            [[0, NEG_INF, NEG_INF, NEG_INF, 0],
             [0, 0, NEG_INF, NEG_INF, NEG_INF], [NEG_INF, 0, 0, NEG_INF, 0]],
            flattened_bias)
        self.assertAllEqual([3, 1, 1, 5], bias_shape)
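For reference, a minimal sketch of a padding-bias helper consistent with this test (the real model_utils.get_padding_bias may differ in details): id 0 is treated as padding and mapped to a large negative bias so the corresponding attention logits are effectively masked out.

NEG_INF = -1e9  # large negative constant; assumed to match the NEG_INF used in the test above

def get_padding_bias_sketch(x):
    # 1.0 at padding positions (id 0), 0.0 at real tokens.
    padding = tf.cast(tf.equal(x, 0), tf.float32)
    bias = padding * NEG_INF  # [batch, length]
    # Reshape to [batch, 1, 1, length] so the bias broadcasts over heads and query positions.
    return tf.expand_dims(tf.expand_dims(bias, axis=1), axis=1)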
Example #15
    def __call__(self, inputs, targets=None, eos_id=None):
        """Calculate target logits or inferred target sequences.

        Args:
          inputs: int tensor with shape [batch_size, input_length].
          targets: None or int tensor with shape [batch_size, target_length].

        Returns:
          If targets is defined, then return logits for each word in the target
          sequence. float tensor with shape [batch_size, target_length, vocab_size]
          If target is none, then generate output sequence one token at a time.
            returns a dictionary {
              output: [batch_size, decoded length]
              score: [batch_size, float]}
        """
        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        initializer = tf.variance_scaling_initializer(
            self.params.initializer_gain, mode="fan_avg", distribution="uniform")
        with tf.variable_scope("Transformer", initializer=initializer):
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.
            attention_bias = model_utils.get_padding_bias(inputs)

            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.
            encoder_outputs = self.encode(inputs, attention_bias)

            # Generate output sequence if targets is None, or return logits if target
            # sequence is known.
            if targets is None:
                return self.predict(encoder_outputs, attention_bias, eos_id)
                # initial_ids = tf.zeros([1], dtype=tf.int32)

                # Create cache storing decoder attention values for each layer.
                cache = {
                    "layer_%d" % layer: {
                        "k": tf.zeros([1, 0, self.params.hidden_size]),
                        'w': tf.constant([])
                    } for layer in range(self.params.num_hidden_layers)}

                # Add encoder output and attention bias to the cache.
                cache["encoder_outputs"] = encoder_outputs
                cache["encoder_decoder_attention_bias"] = attention_bias
                self._get_symbols_to_logits_fn(10)(tf.constant([[1], [2]], dtype=tf.int32), 0, cache)
                return self._get_symbols_to_logits_fn(10)(tf.constant([[1], [1]], dtype=tf.int32), 1, cache)
            else:
                logits = self.decode(targets, encoder_outputs, attention_bias)
                return logits
Example #16
 def get_real_loss(self, origin_inputs, origin_target):
     with tf.variable_scope("Discriminator",
                            initializer=self._initializer,
                            reuse=tf.AUTO_REUSE):
         real_attention_bias = model_utils.get_padding_bias(
             origin_target)  # [batch, 1, 1, src_len]
         real_encoder_outputs = self.encode(
             origin_target,
             real_attention_bias)  # [batch, src_len, hidden_size]
         real_logits = self.decode(origin_inputs, real_encoder_outputs,
                                   real_attention_bias)
         real_xentropy, real_weights = metrics.padded_cross_entropy_loss(
             real_logits, origin_inputs, self.params.label_smoothing,
             self.params.target_vocab_size)
         self.real_loss = tf.reduce_sum(real_xentropy) / tf.reduce_sum(
             real_weights)  # scalar
         return self.real_loss
Example #17
    def inference(self, inputs, targets=None, reuse=None):
        with tf.variable_scope(self.name_scope,
                               initializer=self.initializer,
                               reuse=reuse):

            if ModeKeys.is_predict_one(self.mode):
                attention_bias = None
            else:
                attention_bias = model_utils.get_padding_bias(inputs)
            encoder_outputs = self.encode(inputs, attention_bias)
            if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
                fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
                fake_decoder_outputs = self.decoder_stack(
                    fake_decoder_inputs, encoder_outputs, None, None, None)
            if targets is None:
                return self.predict(encoder_outputs, attention_bias)
            else:
                logits = self.decode(targets, encoder_outputs, attention_bias)
                return logits
Example #18
    def get_fake_loss(self, origin_inputs, gen_targets):
        inputs_length = tf.argmin(gen_targets, axis=-1) + 1  # position of the first 0 (pad) + 1; assumes ids > 0 and 0-padding
        max_len = inputs_length[tf.argmax(inputs_length)]  # length of the longest generated sentence
        batch_size = tf.shape(gen_targets)[0]

        pad_gen_targets = tf.zeros([0, max_len], dtype=tf.int32)

        def inner_loop(i, pad_inputs):
            ori_length = inputs_length[i]
            ori_input = tf.reshape(gen_targets[i][:ori_length], [1, -1])
            pad_input = tf.pad(ori_input, [[0, 0], [0, max_len - ori_length]])
            pad_inputs = tf.concat([pad_inputs, pad_input], axis=0)
            return i + 1, pad_inputs

        _, pad_gen_targets = tf.while_loop(
            cond=lambda i, _: i < batch_size,
            body=inner_loop,
            loop_vars=[tf.constant(0), pad_gen_targets],
            shape_invariants=[
                tf.TensorShape([]),
                tf.TensorShape([None, None])
            ])
        gen_targets = pad_gen_targets

        with tf.variable_scope("Discriminator",
                               initializer=self._initializer,
                               reuse=tf.AUTO_REUSE):
            fake_attention_bias = model_utils.get_padding_bias(
                gen_targets)  # [batch, 1, 1, src_len]
            fake_encoder_outputs = self.encode(
                gen_targets,
                fake_attention_bias)  # [batch, src_len, hidden_size]
            fake_logits = self.decode(origin_inputs, fake_encoder_outputs,
                                      fake_attention_bias)
            fake_xentropy, fake_weights = metrics.padded_cross_entropy_loss(
                fake_logits, origin_inputs, self.params.label_smoothing,
                self.params.target_vocab_size)  # [batch, origin_length]
            self.fake_loss = tf.reduce_sum(fake_xentropy) / tf.reduce_sum(
                fake_weights)
            #fake_prediction = self.argmax_predict(fake_encoder_outputs, fake_attention_bias) # [batch, max_len]
            return self.fake_loss
Example #19
    def call(self, inputs, targets: Optional[np.ndarray] = None):
        attention_bias = model_utils.get_padding_bias(inputs)
        encoder_outputs, enc_ponders, enc_remainders = self._encode(
            inputs, attention_bias)
        logits, dec_ponders, dec_remainders = self._decode(
            encoder_outputs, targets, attention_bias)

        if targets is None:
            raise Exception()
        enc_act_loss = tf.reduce_mean(enc_ponders + enc_remainders)
        dec_act_loss = tf.reduce_mean(dec_ponders + dec_remainders)
        act_loss = self.hparams['act_loss_weight'] * (enc_act_loss +
                                                      dec_act_loss)
        if self.is_train:
            with tf.contrib.summary.record_summaries_every_n_global_steps(10):
                tf.contrib.summary.scalar('summary/ponder_times_enc',
                                          tf.reduce_mean(enc_ponders))
                tf.contrib.summary.scalar('summary/ponder_times_dec',
                                          tf.reduce_mean(dec_ponders))

        return logits, act_loss
Example #20
    def __call__(self, feature, targets=None):
        """
        :param feature:
        :param targets:
        :return:
        """
        initializer = tf.variance_scaling_initializer(
            scale=self.params.get('initializer_gain'),
            mode='fan_avg',
            distribution='uniform')

        with tf.variable_scope('transformer', initializer=initializer):
            #   [batch_size, 1, 1, length]
            attention_bias = model_utils.get_padding_bias(feature)

            encoder_outputs = self.encode(feature, attention_bias)

            if targets is None:
                return self.predict(encoder_outputs, attention_bias)

            logits = self.decode(targets, encoder_outputs, attention_bias)
            return logits
Example #21
    def build_pretrain(self, inputs, targets):
        # initializer = tf.variance_scaling_initializer(
        #     self.params.initializer_gain, mode="fan_avg", distribution="uniform")
        #
        # with tf.variable_scope("Transformer", initializer=initializer, reuse=tf.AUTO_REUSE):

        # Calculate attention bias for encoder self-attention and decoder
        # multi-headed attention layers.
        if ModeKeys.is_predict_one(self.mode):
            attention_bias = None
        else:
            attention_bias = model_utils.get_padding_bias(
                inputs)  # [batch, 1, 1, src_len]

        # Run the inputs through the encoder layer to map the symbol
        # representations to continuous representations.
        encoder_outputs = self.encode(
            inputs, attention_bias)  # [batch, src_len, hidden_size]

        # get encdec_attenion k/v just for predict_one_encoder
        if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
            fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
            fake_decoder_outputs = self.decoder_stack(fake_decoder_inputs,
                                                      encoder_outputs, None,
                                                      None, None)

        # Generate output sequence if targets is None, or return logits if target
        # sequence is known.
        if targets is None:
            tf.logging.info(
                "!!!!!!!!!!!prediction using argmax prediction!!!!!!!!!!!!!")
            prediction, _ = self.argmax_predict(encoder_outputs,
                                                attention_bias)
            return prediction
        else:
            logits = self.decode(
                targets, encoder_outputs,
                attention_bias)  # [batch, tgt_len, vocab_size]
            return logits
Example #22
    def forward(self, inputs, targets=None):
        """Calculate target logits or inferred target sequences.

        Args:
         inputs: int tensor with shape [batch_size, input_length].
         targets: None or int tensor with shape [batch_size, target_length].

        Returns:
          If targets is defined, then return logits for each word in the target
          sequence. float tensor with shape [batch_size, target_length, vocab_size]
          If target is none, then generate output sequence one token at a time.
          returns a dictionary {
            output: [batch_size, decoded length]
            score: [batch_size, float]}
        """
        attention_bias = model_utils.get_padding_bias(inputs)
        encoder_outputs = self.encode(inputs, attention_bias)
        if targets is None:
            return self.predict(encoder_outputs, attention_bias)
        else:
            logits = self.decode(targets, encoder_outputs, attention_bias)
            return logits
Example #23

if __name__ == "__main__":
    import os
    tf.enable_eager_execution()
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    params = model_params.TransformerBaseParams()
    x_inputs = tf.constant([[1, 2, 3, 0, 0], [3, 4, 5, 6, 8]], dtype=tf.int32)

    Enc_Embedding = embedding_layer.EmbeddingWeights(params.source_vocab_size,
                                                     params.hidden_size,
                                                     "source_embedding")
    embedded_inputs = Enc_Embedding(
        x_inputs, not ModeKeys.is_predict_one(ModeKeys.TRAIN))
    print(embedded_inputs.shape)
    attention_bias = model_utils.get_padding_bias(x_inputs)
    print(attention_bias.shape)
    encoder_stack = EncoderStack(params, is_train=True, mode=ModeKeys.TRAIN)
    enc_out = encoder_stack(embedded_inputs, attention_bias, None)
    print(enc_out.shape)
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        10)
    self_attention_bias = decoder_self_attention_bias[:, :, 0:1, :1]
    print(self_attention_bias)
    attention_bias = model_utils.get_padding_bias(x_inputs)
    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros([2, 0, params.hidden_size]),
            "v": tf.zeros([2, 0, params.hidden_size]),
        }
        for layer in range(params.num_hidden_layers)