Example No. 1
    def __call__(self, inputs, targets=None):
        """Calculate target logits or inferred target sequences.
    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].
    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If target is none, then return encoder outputs.
    """
        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        initializer = tf.variance_scaling_initializer(scale=1.0,
                                                      mode="fan_avg",
                                                      distribution="uniform")
        with tf.variable_scope("Transformer",
                               initializer=initializer,
                               reuse=tf.AUTO_REUSE):
            # Return sentence embeddings if targets is None, or distance logits
            # if the target sequence is known.
            if targets is None:
                # Calculate attention bias for encoder self-attention and decoder
                # multi-headed attention layers.
                attention_bias = model_utils.get_padding_bias(inputs)
                # Run the inputs through the encoder layer to map the symbol
                # representations to continuous representations.
                encoder_outputs = self.encode(inputs, attention_bias)
                # Add attention model to generate sentence embedding:
                attention_outputs, alpha = self.full_attention_layer(
                    encoder_outputs)
                return attention_outputs

            else:
                src_attention_bias = model_utils.get_padding_bias(inputs)
                src_encoder_outputs = self.encode(inputs, src_attention_bias)
                src_attention_outputs, _ = self.full_attention_layer(
                    src_encoder_outputs)

                tgt_attention_bias = model_utils.get_padding_bias(targets)
                tgt_encoder_outputs = self.encode(targets, tgt_attention_bias)
                tgt_attention_outputs, _ = self.full_attention_layer(
                    tgt_encoder_outputs)

                print(src_attention_outputs.get_shape().as_list())
                print(tgt_attention_outputs.get_shape().as_list())
                logits = tf.reduce_sum(tf.square(
                    tf.subtract(src_attention_outputs, tgt_attention_outputs)),
                                       1,
                                       keepdims=True)
                logits = tf.reshape(logits, [-1], name="logits")

                return logits
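
For intuition, the distance "logits" above collapse each source/target pair of sentence embeddings into a single squared L2 distance. A minimal TF 2 sketch of the same reduction on made-up toy embeddings:

import tensorflow as tf

# Toy sentence embeddings with shape (batch_size, hidden_size).
src = tf.constant([[1.0, 2.0, 3.0],
                   [0.0, 0.0, 1.0]])
tgt = tf.constant([[1.0, 2.0, 5.0],
                   [0.0, 1.0, 1.0]])

# Same reduction as in the example: squared L2 distance per pair.
logits = tf.reduce_sum(tf.square(src - tgt), axis=1, keepdims=True)
logits = tf.reshape(logits, [-1])
print(logits.numpy())  # [4. 1.]
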
Example No. 2
 def decoder_train(self, x, y):
     ## x: (batch_size, enc_len) , y: (batch_size, dec_len)
     dec_bias = model_utils.get_decoder_self_attention_bias(
         self.max_dec_len)
     attention_bias = model_utils.get_padding_bias(x)
     # Encoder
     encoder_emb_inp = self.build_embed(x, encoder=True, reuse=False)
     encoder_outputs = self.build_encoder(x,
                                          encoder_emb_inp,
                                          attention_bias,
                                          reuse=False)
     # Decoder
     batch_size = tf.shape(x)[0]
     start_tokens = tf.fill([batch_size, 1], self.bos_idx)  # 2: <s> ID
     target_slice_last_1 = tf.slice(y, [0, 0],
                                    [batch_size, self.max_dec_len - 1])
     decoder_inputs = tf.concat([start_tokens, target_slice_last_1],
                                axis=1)  ## shift to right
     decoder_emb_inp = self.build_embed(decoder_inputs,
                                        encoder=False,
                                        reuse=True)
     decoder_outputs = self.build_decoder(decoder_emb_inp,
                                          encoder_outputs,
                                          dec_bias,
                                          attention_bias,
                                          reuse=False)
     train_prob = self.build_output(decoder_outputs, reuse=False)
     return encoder_outputs, decoder_inputs, train_prob
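
The "shift to right" step above prepends the <s> token and drops the last target position. A minimal TF 2 sketch of just that step, assuming a BOS id of 2 (as in the comment above) and a toy decoder length of 4 with made-up values:

import tensorflow as tf

bos_idx = 2                                  # assumed <s> id
y = tf.constant([[5, 6, 7, 0],
                 [8, 9, 0, 0]])              # (batch_size, max_dec_len) target ids
max_dec_len = y.shape[1]

batch_size = tf.shape(y)[0]
start_tokens = tf.fill([batch_size, 1], bos_idx)   # column of <s> tokens
decoder_inputs = tf.concat([start_tokens, y[:, :max_dec_len - 1]], axis=1)
print(decoder_inputs.numpy())                # [[2 5 6 7] [2 8 9 0]]
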
Example No. 3
    def call(self, inputs, targets):
        """Calculate target logits or inferred target sequences.
        
        Args:
            inputs: int tensor with shape [batch_size, input_length].
            targets: None or int tensor with shape [batch_size, target_length].

        Returns:
            If targets is defined, then return logits for each word in the target sequence. 
            float tensor with shape [batch_size, target_length, vocab_size]
            If target is none, then generate output sequence one token at a time.
                returns a dictionary {
                    output: [batch_size, decoded length]
                    score: [batch_size, float]}
        """
        # No real explanation for this; in short, it helps.
        # initializer = tf.variance_scaling_initializer(
        # self.params["initializer_gain"], mode="fan_avg", distribution="uniform")
        # with tf.variable_scope("Transformer", initializer=initializer):
        # All padding positions are marked with -1e9; all other positions are 0.

        attention_bias = model_utils.get_padding_bias(inputs)
        # Run the inputs through the encoder to get the sentence encoding.
        encoder_outputs = self.encode(
            inputs, attention_bias)  # batch,length,hidden_size

        # For prediction.
        logits = self.decode(targets, encoder_outputs, attention_bias)
        return logits
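
The comments above describe the padding bias: -1e9 at padded positions, 0 elsewhere. It works because the bias is added to the attention scores before the softmax, so padded key positions receive essentially zero attention weight. A toy sketch:

import tensorflow as tf

NEG_INF = -1e9
scores = tf.constant([[2.0, 1.0, 0.5, 0.5]])        # raw attention scores for one query
bias = tf.constant([[0.0, 0.0, NEG_INF, NEG_INF]])  # last two key positions are padding
weights = tf.nn.softmax(scores + bias, axis=-1)
print(weights.numpy())  # ~[[0.73 0.27 0.   0.  ]] -- padding gets no attention
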
Example No. 4
def translate():
    source = ["Es un gran honor conocerte aqui.", 
              "Me gustaría hablar contigo sobre lo que ocurrió ayer en la escuela.",
              "Tom tiene una hermana que puede hablar francés.",
              "Soy un estudiante de la Universidad."]

    source = [load_data.preprocess_sentence(s) for s in source]
    input_tensor = [[inp_lang.word2idx[s] for s in sp.split(' ')] for sp in source]
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, 
                        maxlen=params["max_length_input"], padding='post')    

    # Run through the encoder
    attention_bias = model_utils.get_padding_bias(input_tensor)   # batch, 1, 1, length
    encoder_outputs = model.encode(input_tensor, attention_bias)  # batch,length,hidden_size
    print("---------Decoder-----------")

    # Decode
    IDS = predict(encoder_outputs, attention_bias)
        
    for i in range(len(source)):
        word = " ".join([targ_lang.idx2word[w] for w in IDS[i]])
        print("----------")
        print(source[i])
        print(word)
        print("----------\n")
Example No. 5
    def Embedding(self, x):

        # args:   x shape: [ batch_size, length]
        # return: [batch_size, length, hidden_size]
        hparams = self.hparams
        if hparams['embedding_model'] == 'transformer':

            self.embedding_layer = embedding_layer.EmbeddingSharedWeights(
                hparams["vocab_size"], hparams["hidden_size"])

            embedded_inputs = self.embedding_layer(x)
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(embedded_inputs)[1]
                pos_encoding = model_utils.get_position_encoding(
                    length, hparams["hidden_size"])
                encoder_inputs = embedded_inputs + pos_encoding

            if self.hparams['train']:
                encoder_inputs = tf.nn.dropout(
                    encoder_inputs,
                    rate=self.hparams["layer_postprocess_dropout"])

            self.inputs_padding = model_utils.get_padding(x)
            self.attention_bias = model_utils.get_padding_bias(x)
            return encoder_inputs
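
For reference, a minimal NumPy sketch of the standard sinusoidal position encoding from "Attention Is All You Need"; it illustrates the idea and is not guaranteed to match model_utils.get_position_encoding exactly:

import numpy as np

def sinusoidal_position_encoding(length, hidden_size):
    # Each position gets sin/cos signals at geometrically spaced frequencies.
    positions = np.arange(length)[:, None]                 # (length, 1)
    dims = np.arange(hidden_size // 2)[None, :]            # (1, hidden_size // 2)
    angles = positions / np.power(1.0e4, 2.0 * dims / hidden_size)
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)

print(sinusoidal_position_encoding(4, 8).shape)  # (4, 8)
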
Example No. 6
 def __call__(self, inputs, padnum, pos):
     initializer = tf.variance_scaling_initializer(1,
                                                   mode="fan_avg",
                                                   distribution="uniform")
     with tf.variable_scope("Transformer", initializer=initializer):
         attention_bias = model_utils.get_padding_bias(padnum)
         encoderout = self.encode(inputs, attention_bias, padnum, pos)
     return encoderout
Example No. 7
  def call(self, inputs, training):
    """Calculate target logits or inferred target sequences.

    Args:
      inputs: input tensor list of size 1 or 2.
        First item, inputs: int tensor with shape [batch_size, input_length].
        Second item (optional), targets: None or int tensor with shape
          [batch_size, target_length].
      training: boolean, whether in training mode or not.

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If targets is None, then generate the output sequence one token at a time.
        returns a dictionary {
          outputs: [batch_size, decoded length]
          scores: [batch_size, float]}
      Even when float16 is used, the output tensor(s) are always float32.

    Raises:
      NotImplementedError: If the padded decode method is used on CPUs/GPUs.
    """
    if len(inputs) == 2:
      inputs, targets = inputs[0], inputs[1]
    else:
      # Decoding path.
      inputs, targets = inputs[0], None
      if self.params["padded_decode"]:
        if not self.params["num_replicas"]:
          raise NotImplementedError(
              "Padded decoding on CPU/GPUs is not supported.")
        decode_batch_size = int(self.params["decode_batch_size"] /
                                self.params["num_replicas"])
        inputs.set_shape([
            decode_batch_size, self.params["decode_max_length"]
        ])

    # Variance scaling is used here because it seems to work in many problems.
    # Other reasonable initializers may also work just as well.
    with tf.name_scope("Transformer"):
      # Calculate attention bias for encoder self-attention and decoder
      # multi-headed attention layers.
      attention_bias = model_utils.get_padding_bias(inputs)

      # Run the inputs through the encoder layer to map the symbol
      # representations to continuous representations.
      encoder_outputs = self.encode(inputs, attention_bias, training)
      # Generate output sequence if targets is None, or return logits if target
      # sequence is known.
      if targets is None:
        return self.predict(encoder_outputs, attention_bias, training)
      else:
        logits = self.decode(targets, encoder_outputs, attention_bias, training)
        return logits
Example No. 8
  def test_get_padding_bias(self):
    x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    bias = model_utils.get_padding_bias(x)
    bias_shape = tf.shape(bias)
    flattened_bias = tf.reshape(bias, [3, 5])

    self.assertAllEqual([[0, NEG_INF, NEG_INF, NEG_INF, 0],
                         [0, 0, NEG_INF, NEG_INF, NEG_INF],
                         [NEG_INF, 0, 0, NEG_INF, 0]],
                        flattened_bias)
    self.assertAllEqual([3, 1, 1, 5], bias_shape)
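
A minimal sketch that reproduces the behavior this test expects, assuming padding tokens have id 0 (an illustration, not the library implementation itself):

import tensorflow as tf

NEG_INF = -1e9

def padding_bias_sketch(x):
    # 1.0 at padding positions (token id 0), 0.0 elsewhere.
    padding = tf.cast(tf.equal(x, 0), tf.float32)
    # Large negative bias so softmax assigns padded positions ~zero weight,
    # reshaped so it broadcasts over [batch_size, num_heads, query_len, key_len].
    return tf.reshape(padding * NEG_INF,
                      [tf.shape(x)[0], 1, 1, tf.shape(x)[1]])

x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
print(padding_bias_sketch(x).shape)  # (3, 1, 1, 5)
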
Example No. 9
  def test_get_padding_bias(self):
    x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    bias = model_utils.get_padding_bias(x)
    bias_shape = tf.shape(input=bias)
    flattened_bias = tf.reshape(bias, [3, 5])
    with self.test_session() as sess:
      flattened_bias, bias_shape = sess.run((flattened_bias, bias_shape))

    self.assertAllEqual([[0, NEG_INF, NEG_INF, NEG_INF, 0],
                         [0, 0, NEG_INF, NEG_INF, NEG_INF],
                         [NEG_INF, 0, 0, NEG_INF, 0]],
                        flattened_bias)
    self.assertAllEqual([3, 1, 1, 5], bias_shape)
Example No. 10
  def test_get_padding_bias(self):
    x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    bias = model_utils.get_padding_bias(x)
    bias_shape = tf.shape(bias)
    flattened_bias = tf.reshape(bias, [3, 5])
    with self.test_session() as sess:
      flattened_bias, bias_shape = sess.run((flattened_bias, bias_shape))

    self.assertAllEqual([[0, NEG_INF, NEG_INF, NEG_INF, 0],
                         [0, 0, NEG_INF, NEG_INF, NEG_INF],
                         [NEG_INF, 0, 0, NEG_INF, 0]],
                        flattened_bias)
    self.assertAllEqual([3, 1, 1, 5], bias_shape)
Example No. 11
    def call(self, inputs, training):
        if len(inputs) == 2:
            inputs, targets = inputs[0], inputs[1]
        else:
            inputs, targets = inputs[0], None

        with tf.name_scope('Transformer'):
            attention_bias = model_utils.get_padding_bias(inputs)
            encoder_outputs = self.encode(inputs, attention_bias, training)
            if targets is None:
                return self.predict(encoder_outputs, attention_bias, training)
            else:
                logits = self.decode(targets, encoder_outputs, attention_bias, training)
                return logits
Example No. 12
    def decoder_infer(self, x):
        dec_bias = model_utils.get_decoder_self_attention_bias(
            self.max_dec_len)
        attention_bias = model_utils.get_padding_bias(x)
        # Encoder
        encoder_emb_inp = self.build_embed(x, encoder=True, reuse=True)
        encoder_outputs = self.build_encoder(x,
                                             encoder_emb_inp,
                                             attention_bias,
                                             reuse=True)
        # Decoder
        batch_size = tf.shape(x)[0]
        start_tokens = tf.fill([batch_size, 1], self.bos_idx)  # 2: <s> ID
        next_decoder_inputs = tf.concat([
            start_tokens,
            tf.zeros([batch_size, self.max_dec_len - 1], dtype=tf.int32)
        ],
                                        axis=1)  ## batch_size, dec_len
        # predict output with loop. [encoder_outputs, decoder_inputs (filled next token)]
        for i in range(1, self.max_dec_len):
            decoder_emb_inp = self.build_embed(next_decoder_inputs,
                                               encoder=False,
                                               reuse=True)
            decoder_outputs = self.build_decoder(decoder_emb_inp,
                                                 encoder_outputs,
                                                 dec_bias,
                                                 attention_bias,
                                                 reuse=True)
            logits = self.build_output(decoder_outputs, reuse=True)
            next_decoder_inputs = self._filled_next_token(
                next_decoder_inputs, logits, i)

        # slice start_token
        decoder_input_start_1 = tf.slice(next_decoder_inputs, [0, 1],
                                         [batch_size, self.max_dec_len - 1])
        output_token = tf.concat(
            [decoder_input_start_1,
             tf.zeros([batch_size, 1], dtype=tf.int32)],
            axis=1)
        return output_token
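
The helper _filled_next_token is not shown here. A plausible sketch (an assumption about its behavior, not the original code) takes the greedy prediction from output position i - 1 and writes it into column i of the running decoder inputs:

import tensorflow as tf

def _filled_next_token_sketch(decoder_inputs, logits, position):
    # Hypothetical helper. decoder_inputs: (batch, max_dec_len) int32;
    # logits: (batch, max_dec_len, vocab_size). The output at position - 1
    # predicts the token that belongs at input position `position`.
    next_token = tf.argmax(logits[:, position - 1, :], axis=-1, output_type=tf.int32)
    max_dec_len = tf.shape(decoder_inputs)[1]
    mask = tf.one_hot(position, depth=max_dec_len, dtype=tf.int32)  # selects column
    return decoder_inputs * (1 - mask) + next_token[:, None] * mask
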
Example No. 13
    def __call__(self, inputs, input_types, targets=None):
        """Calculate target logits or inferred target sequences.
        Args:
          inputs: int tensor with shape [batch_size, input_length].
          targets: None or int tensor with shape [batch_size, target_length].
        Returns:
          If targets is defined, then return logits for each word in the target
          sequence. float tensor with shape [batch_size, target_length, vocab_size]
          If target is none, then generate output sequence one token at a time.
            returns a dictionary {
              output: [batch_size, decoded length]
              score: [batch_size, float]}
        """
        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        initializer = tf.variance_scaling_initializer(
            self.params["initializer_gain"],
            mode="fan_avg",
            distribution="uniform")
        # initializer = tf.truncated_normal_initializer(stddev=self.params["initializer_range"])
        with tf.variable_scope("Transformer",
                               reuse=tf.AUTO_REUSE,
                               initializer=initializer):
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.
            attention_bias = model_utils.get_padding_bias(inputs)

            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.
            encoder_outputs = self.encode(inputs, attention_bias, input_types)

            # Generate output sequence if targets is None, or return logits if target
            # sequence is known.
            if targets is None:
                return self.predict(encoder_outputs,
                                    attention_bias), encoder_outputs
            else:
                logits = self.decode(targets, encoder_outputs, attention_bias)
                return logits, encoder_outputs
Example No. 14
    tf_pred_res = tf_sess.run(tf_pred,
                              feed_dict={tf_input_x_raw: my_input_x_raw})
    print("tf prediction:")
    with printoptions(threshold=2000):
        print(tf_pred_res)

    k_transformer = KTransformer(params)
    k_input_x_raw = Input(shape=(_seq_len_x, ))
    k_input_y_raw = Input(shape=(_seq_len_y, ))

    k_embedded_inputs = k_transformer.embedding_softmax_layer(k_input_x_raw)
    k_pos_encoding = k_model_utils.get_position_encoding(
        seq_len_x, k_transformer.params.hidden_size)
    k_embedding_inputs = k_embedded_inputs + k_pos_encoding

    k_attention_bias = k_model_utils.get_padding_bias(k_input_x_raw)
    k_encoder_outputs = k_transformer.encode(k_input_x_raw,
                                             k_attention_bias,
                                             train=False)

    k_output = k_transformer([k_input_x_raw, k_input_y_raw], train=False)

    tf_sess.run(tf.global_variables_initializer())
    tf_sess.run(get_assign_list(k_transformer))

    k_run = K.function([k_input_x_raw, k_input_y_raw], [k_output])
    k_res = k_run([my_input_x_raw, my_input_y_raw])[0]
    print("k output:")
    with printoptions(precision=3, suppress=True):
        print(k_res)