def __call__(self, inputs, targets=None):
        """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If targets is None, then generate the output sequence one token at a time.
        returns a dictionary {
          output: [batch_size, decoded length]
          score: [batch_size, float]}
    """
        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        initializer = tf.variance_scaling_initializer(
            self.params["initializer_gain"],
            mode="fan_avg",
            distribution="uniform")
        with tf.variable_scope("Transformer", initializer=initializer):
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.
            attention_bias = model_utils.get_padding_bias(inputs)

            ###domyounglee 2020.2.12
            self.cls_attention_bias = model_utils.get_padding_bias(
                tf.cast(tf.equal(inputs, 2), tf.int64))
            if targets is not None:
                self.cls_dec_attention_bias = model_utils.get_cls_dec_attention_bias(
                    tf.cast(tf.equal(targets, 2), tf.int64))
            else:
                self.cls_dec_attention_bias = None

            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.
            encoder_outputs = self.encode(inputs, attention_bias)

            # Generate output sequence if targets is None, or return logits if target
            # sequence is known.
            if targets is None:
                return self.predict(encoder_outputs, attention_bias)
            else:
                logits = self.decode(targets,
                                     encoder_outputs,
                                     attention_bias,
                                     cls_attention_bias=None,
                                     cls_dec_attention_bias=None,
                                     identity_mask=None)
                return logits
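
A minimal usage sketch (not part of the snippet above): model is assumed to be an already-constructed Transformer instance exposing this __call__, and token id 0 is assumed to be the padding id, as get_padding_bias implies.

    import tensorflow as tf

    inputs = tf.constant([[4, 7, 9, 0, 0]], dtype=tf.int64)   # [batch_size, input_length]
    targets = tf.constant([[5, 6, 0]], dtype=tf.int64)        # [batch_size, target_length]

    logits = model(inputs, targets)   # training/eval: [batch_size, target_length, vocab_size]
    decoded = model(inputs)           # inference: dict of decoded ids and scores, per the docstring
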
Example #2
  def __call__(self, inputs, targets=None):
    """Calculate target logits or inferred target sequences.

    Args:
      inputs:
        int tensor with shape [batch_size, 3, input_length],
        where [old source seq, old target seq, new source seq]
        in the 2nd dimension
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If targets is None, then generate the output sequence one token at a time.
        returns a dictionary {
          output: [batch_size, decoded length]
          score: [batch_size, float]}
    """
    # Variance scaling is used here because it seems to work in many problems.
    # Other reasonable initializers may also work just as well.
    initializer = tf.variance_scaling_initializer(
        self.params["initializer_gain"], mode="fan_avg", distribution="uniform")
    with tf.variable_scope("TransformerDifre", initializer=initializer):
      # Extract each element from inputs
      inputs_oldsrc = inputs[:,0,:]
      inputs_oldtrg = inputs[:,1,:]
      inputs_newsrc = inputs[:,2,:]

      # Calculate attention bias for encoder self-attention and decoder
      # multi-headed attention layers.
      attention_bias_oldsrc = model_utils.get_padding_bias(inputs_oldsrc)
      attention_bias_oldtrg = model_utils.get_padding_bias(inputs_oldtrg)
      attention_bias_newsrc = model_utils.get_padding_bias(inputs_newsrc)

      # Run the inputs through the encoder layer to map the symbol
      # representations to continuous representations.
      src_encoder_outputs = self.src_encode(
        inputs_oldsrc, attention_bias_oldsrc)
      diff_encoder_outputs = self.diff_encode(
        inputs_newsrc, src_encoder_outputs, attention_bias_newsrc)

      # Generate output sequence if targets is None, or return logits if target
      # sequence is known.
      if targets is None:
        return self.predict(
          inputs_oldtrg, diff_encoder_outputs, attention_bias_oldtrg)
      else:
        logits = self.decode(
          targets, inputs_oldtrg, diff_encoder_outputs, attention_bias_oldtrg)
        return logits
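
A hedged sketch (not from the snippet) of how the [batch_size, 3, input_length] input might be assembled before calling this model; old_src, old_tgt and new_src are names invented here, each an int tensor of shape [batch_size, input_length] padded to a common length.

    import tensorflow as tf

    # Hypothetical assembly of the 3-way input; the tensor names are illustrative only.
    inputs = tf.stack([old_src, old_tgt, new_src], axis=1)   # [batch_size, 3, input_length]
    logits = model(inputs, targets)                          # or model(inputs) for inference
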
Example #3
    def __call__(self, inputs, targets=None):
        """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If targets is None, then generate the output sequence one token at a time.
        returns a dictionary {
          output: [batch_size, decoded length]
          score: [batch_size, float]}
    """
        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        initializer = tf.variance_scaling_initializer(
            self.params["initializer_gain"],
            mode="fan_avg",
            distribution="uniform")
        with tf.variable_scope("Transformer", initializer=initializer):
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.
            src_attention_bias = model_utils.get_padding_bias(
                inputs)  # used for 1.src_encode self-att; 2.decode en-de att.
            if targets is not None:  # tc modified
                tgt_attention_bias = model_utils.get_padding_bias(
                    targets)  # only used for tgt_encode self-att.
            else:
                tgt_attention_bias = None

            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.
            (encoder_outputs, latent_sample, prior_mu, prior_logvar, recog_mu,
             recog_logvar) = self.encode(inputs, src_attention_bias, targets,
                                         tgt_attention_bias)  # tc modified

            # Generate output sequence if targets is None, or return logits if target
            # sequence is known.
            if targets is None:
                logits = self.predict(encoder_outputs, src_attention_bias,
                                      latent_sample)
            else:
                logits = self.decode(targets, encoder_outputs,
                                     src_attention_bias, latent_sample)

            return logits, latent_sample, prior_mu, prior_logvar, recog_mu, recog_logvar
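
The extra return values (prior_mu, prior_logvar, recog_mu, recog_logvar) presumably feed a KL regularizer in a CVAE-style objective. Below is a sketch of the standard closed-form KL divergence between two diagonal Gaussians, KL(q || p); it is not taken from this repository.

    import tensorflow as tf

    def gaussian_kl(recog_mu, recog_logvar, prior_mu, prior_logvar):
        # KL( N(recog_mu, exp(recog_logvar)) || N(prior_mu, exp(prior_logvar)) ),
        # summed over the latent dimension and averaged over the batch.
        kl = 0.5 * tf.reduce_sum(
            prior_logvar - recog_logvar
            + (tf.exp(recog_logvar) + tf.square(recog_mu - prior_mu)) / tf.exp(prior_logvar)
            - 1.0,
            axis=-1)
        return tf.reduce_mean(kl)
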
Example #4
  def __call__(self, inputs, targets=None):
    """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If targets is None, then generate the output sequence one token at a time.
        returns a dictionary {
          output: [batch_size, decoded length]
          score: [batch_size, float]}
    """
    # Variance scaling is used here because it seems to work in many problems.
    # Other reasonable initializers may also work just as well.
    initializer = tf.variance_scaling_initializer(
        self.params.initializer_gain, mode="fan_avg", distribution="uniform")
    with tf.variable_scope("Transformer", initializer=initializer):
      # Calculate attention bias for encoder self-attention and decoder
      # multi-headed attention layers.
      attention_bias = model_utils.get_padding_bias(inputs)

      # Run the inputs through the encoder layer to map the symbol
      # representations to continuous representations.
      encoder_outputs = self.encode(inputs, attention_bias)

      # Generate output sequence if targets is None, or return logits if target
      # sequence is known.
      if targets is None:
        return self.predict(encoder_outputs, attention_bias)
      else:
        logits = self.decode(targets, encoder_outputs, attention_bias)
        return logits
Example #5
 def predict(self, inputs, **kwargs):
     source, targets = inputs[0], inputs[1]
     with tf.name_scope("Transformer_Predict"):
         attention_bias = model_utils.get_padding_bias(source)
         encoder_outputs = self.encode(source, attention_bias, self.params['train'])
         logits = self.decode(targets, encoder_outputs, attention_bias, self.params['train'])
         return logits
Example #6
    def encode_no_lookup(self, embedded_inputs, inputs_mask):
        """Encoder step for transformer given already-embedded inputs

      Args:
        embedded_inputs: float tensor with shape [batch_size, input_length, emb_size].
        inputs_mask: int tensor with shape [batch_size, input_length]; zero marks padding.

      Returns:
        float tensor with shape [batch_size, input_length, hidden_size]
      """
        with tf.name_scope("encode"):
            # Prepare inputs to the layer stack by adding positional encodings and
            # applying dropout.
            inputs_padding = model_utils.get_padding(inputs_mask)
            attention_bias = model_utils.get_padding_bias(inputs_mask)

            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(embedded_inputs)[1]
                pos_encoding = model_utils.get_position_encoding(
                    length, self.params.hidden_size)
                encoder_inputs = embedded_inputs + pos_encoding

            if self.train:
                encoder_inputs = tf.nn.dropout(
                    encoder_inputs, 1 - self.params.layer_postprocess_dropout)

            return self.encoder_stack(encoder_inputs, attention_bias,
                                      inputs_padding)
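
A hedged usage sketch for encode_no_lookup (not from the snippet): embedding_layer is a hypothetical lookup producing [batch_size, input_length, emb_size], and the token-id tensor doubles as the mask because get_padding treats id 0 as padding.

    import tensorflow as tf

    token_ids = tf.constant([[4, 7, 9, 0, 0]])              # 0 marks padding
    embedded = embedding_layer(token_ids)                   # [1, 5, emb_size], hypothetical layer
    encoded = model.encode_no_lookup(embedded, token_ids)   # [1, 5, hidden_size]
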
Example #7
    def __call__(self, inputs):
        """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If targets is None, then generate the output sequence one token at a time.
        returns a dictionary {
          output: [batch_size, decoded length]
          score: [batch_size, float]}
    """
        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        initializer_gain = 1.
        initializer = tf.variance_scaling_initializer(initializer_gain,
                                                      mode="fan_avg",
                                                      distribution="uniform")
        with tf.variable_scope("Transformer", initializer=initializer):
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.
            attention_bias = model_utils.get_padding_bias(inputs)

            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.
            encoder_outputs = self.encode(inputs, attention_bias)

            return encoder_outputs
Example #8
    def call(self, inputs):
        """Calculate target logits or inferred target sequences.

    Args:
      inputs: input tensor list of size 3:
        First item, input_ids: int tensor with shape [batch_size, input_length].
        Second item, final_hidden: float tensor of precomputed encoder outputs,
          used in place of self.encode.
        Third item, targets: None or int tensor with shape
          [batch_size, target_length].
        (The training flag is read from self.train rather than passed in.)

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If targets is None, then generate the output sequence one token at a time.
        returns a dictionary {
          outputs: [batch_size, decoded length]
          scores: [batch_size, float]}
      Even when float16 is used, the output tensor(s) are always float32.

    Raises:
      NotImplementedError: If the padded decode method is used on CPU/GPUs.
    """
        input_ids, final_hidden, targets = inputs
        training = self.train
        # if len(inputs) == 2:
        #   inputs, targets = inputs[0], inputs[1]
        # else:
        #   inputs, targets = inputs[0], None
        #   if self.params["padded_decode"]:
        #     if not self.params["num_replicas"]:
        #       raise NotImplementedError(
        #           "Padded decoding on CPU/GPUs is not supported.")
        #     decode_batch_size = int(self.params["decode_batch_size"] /
        #                             self.params["num_replicas"])
        #     inputs = tf.reshape(
        #         inputs, [decode_batch_size, self.params["decode_max_length"]])

        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        with tf.name_scope("Transformer"):
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.

            attention_bias = model_utils.get_padding_bias(input_ids)

            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.

            #encoder_outputs = self.encode(inputs, attention_bias, training)
            encoder_outputs = final_hidden

            # Generate output sequence if targets is None, or return logits if target
            # sequence is known.
            if targets is None:
                return self.predict(encoder_outputs, attention_bias, training)
            else:
                logits = self.decode(targets, encoder_outputs, attention_bias,
                                     training)
                return logits
Example #9
    def test_get_padding_bias(self):
        x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
        bias = model_utils.get_padding_bias(x)
        bias_shape = tf.shape(bias)
        flattened_bias = tf.reshape(bias, [3, 5])
        with self.test_session() as sess:
            flattened_bias, bias_shape = sess.run((flattened_bias, bias_shape))

        self.assertAllEqual(
            [[0, NEG_INF, NEG_INF, NEG_INF, 0],
             [0, 0, NEG_INF, NEG_INF, NEG_INF], [NEG_INF, 0, 0, NEG_INF, 0]],
            flattened_bias)
        self.assertAllEqual([3, 1, 1, 5], bias_shape)
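
A minimal sketch of what get_padding_bias evidently computes, consistent with the expected values in this test (0 at real tokens, NEG_INF at pad id 0, output shape [batch_size, 1, 1, length]); the actual model_utils implementation may differ in its details.

    import tensorflow as tf

    NEG_INF = -1e9   # assumed value of the test's constant

    def get_padding_bias_sketch(x):
        # 1.0 where x is the padding id 0, else 0.0.
        padding = tf.cast(tf.equal(x, 0), tf.float32)
        # Large negative bias at padded positions so softmax attention ignores them.
        bias = padding * NEG_INF
        # Insert two axes so the bias broadcasts over num_heads and query length.
        return tf.expand_dims(tf.expand_dims(bias, axis=1), axis=1)
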
Example #10
    def call(self, inputs, training):
        """Calculate target logits or inferred target sequences.

    Args:
      inputs: input tensor list of size 1 or 2.
        First item, inputs: int tensor with shape [batch_size, input_length].
        Second item (optional), targets: None or int tensor with shape
          [batch_size, target_length].
      training: boolean, whether in training mode or not.

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If targets is None, then generate the output sequence one token at a time.
        returns a dictionary {
          outputs: [batch_size, decoded length]
          scores: [batch_size, float]}
      Even when float16 is used, the output tensor(s) are always float32.
    """
        if len(inputs) == 2:
            inputs, targets = inputs[0], inputs[1]
        else:
            inputs, targets = inputs[0], None

        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        with tf.name_scope("Transformer"):
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.
            attention_bias = model_utils.get_padding_bias(inputs)

            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.
            encoder_outputs = self.encode(inputs, attention_bias, training)
            # Generate output sequence if targets is None, or return logits if target
            # sequence is known.
            if targets is None:
                return self.predict(encoder_outputs, attention_bias, training)
            else:
                logits = self.decode(targets, encoder_outputs, attention_bias,
                                     training)
                return logits
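
A hedged usage sketch for this Keras-style call (model is assumed to be a constructed instance exposing the call above; the list-valued input follows the docstring):

    train_logits = model([inputs, targets], training=True)   # [batch_size, target_length, vocab_size]
    decoded = model([inputs], training=False)                # {"outputs": ..., "scores": ...}
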
Example #11
    def __call__(self, inputs, targets=None):
        """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length].  (tensor shape: batch size × input length)
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, then return logits for each word in the target      (training stage: returns target logits)
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If targets is None, then generate the output sequence one token at a time.  (inference stage: returns the predicted sequence)
        returns a dictionary {
          output: [batch_size, decoded length]
          score: [batch_size, float]}
    """
        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        # Define how variables are initialized: a uniform distribution.
        initializer = tf.variance_scaling_initializer(
            self.params["initializer_gain"],
            mode="fan_avg",
            distribution="uniform")
        with tf.variable_scope("Transformer", initializer=initializer):
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.
            # Get an attention bias tensor matching the input shape: padded positions
            # (token id 0) receive a large negative value (-1e9), other positions 0
            # (it effectively acts as a mask).
            attention_bias = model_utils.get_padding_bias(inputs)

            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.
            # Encode the inputs into continuous representations.
            encoder_outputs = self.encode(inputs, attention_bias)

            # Generate output sequence if targets is None, or return logits if target
            # sequence is known.
            # With targets (training), return logits over the target tokens;
            # without targets (inference), return the predicted sequence.
            if targets is None:
                return self.predict(encoder_outputs, attention_bias)
            else:
                logits = self.decode(targets, encoder_outputs, attention_bias)
                return logits
Example #12
    def __call__(self, inputs, targets=None):
        """Calculate target logits or inferred target sequences.
    # __init__ builds these layers; __call__ runs them on the inputs.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If targets is None, then generate the output sequence one token at a time.
        returns a dictionary {
          output: [batch_size, decoded length]
          score: [batch_size, float]}
    """
        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        initializer = tf.variance_scaling_initializer(  # initializer; applied through the enclosing variable scope
            self.params["initializer_gain"],
            mode="fan_avg",
            distribution="uniform")
        with tf.variable_scope("Transformer", initializer=initializer):
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.
            attention_bias = model_utils.get_padding_bias(
                inputs)  # get the attention bias matrix:
            # 0 at non-padding positions, negative infinity at padding positions;
            # two extra dimensions are inserted, apparently for num_heads and query length.

            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.
            encoder_outputs = self.encode(inputs,
                                          attention_bias)  # encode the inputs

            # Generate output sequence if targets is None, or return logits if target
            # sequence is known.
            if targets is None:  # no target sequence given, so run prediction
                return self.predict(encoder_outputs, attention_bias)
            else:  # target sequence given, so this is training or evaluation
                logits = self.decode(targets, encoder_outputs, attention_bias)
                return logits
Example #13
def transformer_encoder(inputs, lengths):
    # Set up estimator and params
    params = model_params.BASE_PARAMS
    params["default_batch_size"] = K
    params["max_length"] = 500
    params["vocab_size"] = VOCABULARY_SIZE + 1
    params["filter_size"] = 256
    params["num_hidden_layers"] = 2
    params["num_heads"] = 2
    params["hidden_size"] = EMBEDDING_SIZE

    model = transformer.Transformer(params, tf.estimator.ModeKeys.TRAIN)
    initializer = tf.variance_scaling_initializer(
        model.params["initializer_gain"],
        mode="fan_avg",
        distribution="uniform")
    with tf.variable_scope("Transformer", initializer=initializer):
        # Calculate attention bias for encoder self-attention and decoder
        # multi-headed attention layers.
        attention_bias = model_utils.get_padding_bias(inputs)
        # Run the inputs through the encoder layer to map the symbol
        # representations to continuous representations.
        encoder = model.encode(inputs, attention_bias)
        return tf.reduce_mean(encoder, 1)
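
A hedged usage sketch for transformer_encoder (the placeholder names below are invented for illustration; note that the lengths argument is accepted but not used by the snippet):

    import tensorflow as tf

    token_ids = tf.placeholder(tf.int64, shape=[None, None], name="token_ids")   # 0 = padding
    seq_lengths = tf.placeholder(tf.int32, shape=[None], name="seq_lengths")     # currently unused
    sentence_vectors = transformer_encoder(token_ids, seq_lengths)               # [batch_size, EMBEDDING_SIZE]
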
Example #14
    tf_res = tf_sess.run(tf_output,
                         feed_dict={
                             tf_input_x_raw: my_input_x_raw,
                             tf_input_y_raw: my_input_y_raw
                         })
    print("tf output:")
    with printoptions(precision=3, suppress=True):
        print(tf_res)

    tf_embedded_inputs = tf_transformer.embedding_softmax_layer(tf_input_x_raw)
    tf_pos_encoding = tf_model_utils.get_position_encoding(
        seq_len_x, tf_transformer.params.hidden_size)
    tf_embedding_inputs = tf_embedded_inputs + tf_pos_encoding

    tf_attention_bias = tf_model_utils.get_padding_bias(tf_input_x_raw)
    tf_encoder_outputs = tf_transformer.encode(tf_input_x_raw,
                                               tf_attention_bias)

    tf_pred = tf_transformer(tf_input_x_raw)["outputs"]
    tf_pred_res = tf_sess.run(tf_pred,
                              feed_dict={tf_input_x_raw: my_input_x_raw})
    print("tf prediction:")
    with printoptions(threshold=2000):
        print(tf_pred_res)

    k_transformer = KTransformer(params)
    k_input_x_raw = Input(shape=(_seq_len_x, ))
    k_input_y_raw = Input(shape=(_seq_len_y, ))

    k_embedded_inputs = k_transformer.embedding_softmax_layer(k_input_x_raw)