def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float
      tensor with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    decoder_inputs = self.embedding_softmax_layer(targets)
    with tf.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element
      decoder_inputs = tf.pad(
          decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params["hidden_size"])
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(
        decoder_inputs, encoder_outputs, decoder_self_attention_bias,
        attention_bias)
    logits = self.embedding_softmax_layer.linear(outputs)
    return logits
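# For context: a minimal sketch of what model_utils.get_position_encoding is
# assumed to compute here -- the sinusoidal position encoding from "Attention
# Is All You Need", with sin and cos channels concatenated along the hidden
# dimension. The signature and timescale defaults are assumptions, not
# confirmed by this snippet.
import math

import tensorflow as tf


def get_position_encoding(length, hidden_size,
                          min_timescale=1.0, max_timescale=1e4):
  """Returns a [length, hidden_size] tensor of sinusoidal position signals."""
  position = tf.cast(tf.range(length), tf.float32)
  num_timescales = hidden_size // 2
  # Geometric progression of wavelengths from min_timescale to max_timescale.
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (tf.cast(num_timescales, tf.float32) - 1))
  inv_timescales = min_timescale * tf.exp(
      tf.cast(tf.range(num_timescales), tf.float32) *
      -log_timescale_increment)
  # Outer product of positions and inverse timescales: [length, hidden/2].
  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
  return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)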
def _get_symbols_to_logits_fn(self, max_decode_length, training):
  """Returns a decoding function that calculates logits of the next tokens."""
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params["hidden_size"])
  timing_signal = tf.cast(timing_signal, self.params["dtype"])
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length, dtype=self.params["dtype"])

  # TODO(b/139770046): Refactor code with better naming of i.
  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1].
      i: Loop index.
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Set decoder input to the last generated IDs
    decoder_input = ids[:, -1:]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.embedding_softmax_layer(decoder_input)

    if self.params["padded_decode"]:
      timing_signal_shape = timing_signal.shape.as_list()
      decoder_input += tf.slice(timing_signal, [i, 0],
                                [1, timing_signal_shape[1]])

      bias_shape = decoder_self_attention_bias.shape.as_list()
      self_attention_bias = tf.slice(
          decoder_self_attention_bias, [0, 0, i, 0],
          [bias_shape[0], bias_shape[1], 1, bias_shape[3]])
    else:
      decoder_input += timing_signal[i:i + 1]
      self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]

    decoder_outputs = self.decoder_stack(
        decoder_input,
        cache.get("encoder_outputs"),
        self_attention_bias,
        cache.get("encoder_decoder_attention_bias"),
        training=training,
        cache=cache,
        decode_loop_step=i if self.params["padded_decode"] else None)
    logits = self.embedding_softmax_layer(decoder_outputs, mode="linear")
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
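# A sketch of how symbols_to_logits_fn is typically consumed: the cache is
# seeded with empty per-layer key/value tensors plus the encoder results, and
# the function is handed to a beam search loop. The beam_search module, EOS_ID,
# the parameter names, and the k/v cache layout below are assumptions modeled
# on the TensorFlow official Transformer (the same layout appears in the
# predict() snippet later in this section); the padded_decode path would
# instead pre-allocate fixed-length caches.
def _sketch_predict(self, encoder_outputs, encoder_decoder_attention_bias):
  batch_size = tf.shape(encoder_outputs)[0]
  max_decode_length = 2 * tf.shape(encoder_outputs)[1]  # hypothetical budget
  symbols_to_logits_fn = self._get_symbols_to_logits_fn(
      max_decode_length, training=False)
  cache = {
      "layer_%d" % layer: {
          "k": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
          "v": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
      } for layer in range(self.params["num_hidden_layers"])
  }
  cache["encoder_outputs"] = encoder_outputs
  cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias
  decoded_ids, scores = beam_search.sequence_beam_search(
      symbols_to_logits_fn=symbols_to_logits_fn,
      initial_ids=tf.zeros([batch_size], dtype=tf.int32),
      initial_cache=cache,
      vocab_size=self.params["vocab_size"],
      beam_size=self.params["beam_size"],
      alpha=self.params["alpha"],
      max_decode_length=max_decode_length,
      eos_id=EOS_ID)
  # Keep the top beam and drop the initial dummy ID.
  return decoded_ids[:, 0, 1:], scores[:, 0]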
def decode(self, _, inputs, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    inputs: int tensor (old dst sentence) with shape
      [batch_size, input_length].
    encoder_outputs: continuous representation of diff sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by adding positional encoding and
    # applying dropout.
    decoder_inputs = self.embedding_softmax_layer(inputs)
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params["hidden_size"])
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(
        decoder_inputs, encoder_outputs, decoder_self_attention_bias,
        attention_bias)
    logits = self.embedding_softmax_layer.linear(outputs)
    return logits
def decode(self, start_tokens, targets, encoder_outputs, attention_bias):
  """Generate outputs for each position in the target sequence."""
  with tf.name_scope("decode"):
    with tf.name_scope("shift_targets"):
      # Prepend the start tokens and drop the last target element, so each
      # position is predicted from the tokens before it.
      decoder_inputs = tf.concat(
          [tf.expand_dims(start_tokens, axis=1), targets[:, :-1]], axis=1)
    decoder_inputs = self.decoder_embedding_layer(decoder_inputs)
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params["hidden_size"])
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    decoder_outputs = self.decoder_stack(
        decoder_inputs, encoder_outputs, decoder_self_attention_bias,
        attention_bias)
    outputs = self.output_embedding_layer(decoder_outputs)
    return outputs
def test_get_decoder_self_attention_bias(self):
  length = 5
  bias = model_utils.get_decoder_self_attention_bias(length)
  with self.test_session() as sess:
    bias = sess.run(bias)

  self.assertAllEqual(
      [[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
         [0, 0, NEG_INF, NEG_INF, NEG_INF],
         [0, 0, 0, NEG_INF, NEG_INF],
         [0, 0, 0, 0, NEG_INF],
         [0, 0, 0, 0, 0]]]], bias)
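# A plausible implementation of get_decoder_self_attention_bias, reconstructed
# from the expected value in the test above: a lower-triangular "visible" mask
# reshaped to [1, 1, length, length], with NEG_INF everywhere a query would
# attend to a future position. Treat this as a sketch consistent with the
# test, not the verified source of model_utils.
import tensorflow as tf


def get_decoder_self_attention_bias(length, dtype=tf.float32):
  neg_inf = -1e9  # assumed value of NEG_INF
  # Ones on and below the diagonal mark the positions each step may see.
  valid_locs = tf.linalg.band_part(
      tf.ones([length, length], dtype=dtype), -1, 0)
  valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
  # Zero bias for visible positions, a large negative bias for future ones.
  return neg_inf * (1.0 - valid_locs)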
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  # Timing signal with shape [length, hidden_size].
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params["hidden_size"])
  # Self-attention bias with shape [1, 1, length, length].
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Given the IDs of the tokens predicted so far, this function uses the
    decoder and the cached encoder information to predict the next token.
    `ids` holds the already-predicted tokens and `i` is the position to be
    predicted next. The cache exists because training decodes in a single
    pass, whereas inference decodes word by word and so runs the decoder many
    times; every step reuses the same encoder information, which is therefore
    stored up front.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]. Ignoring batch_size, note that these
        are not the IDs of a whole sentence, but of the candidate tokens from
        the start up to the current position.
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Set decoder input to the last generated IDs: the candidate tokens at
    # the current position, with shape [batch_size * beam_size, 1].
    decoder_input = ids[:, -1:]

    # Preprocess decoder input by getting embeddings and adding timing signal,
    # giving shape [batch_size * beam_size, 1, hidden_size]. This step shows
    # that at inference time the decoder input is just the last predicted
    # token, treated as a sentence of length 1, embedded and fed to the
    # decoder.
    decoder_input = self.embedding_softmax_layer(decoder_input)
    # Add the timing signal for position i.
    decoder_input += timing_signal[i:i + 1]

    # Self-attention bias slice with shape [1, 1, 1, i + 1].
    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
    # Decode; the output tensor has the same shape as decoder_input, namely
    # [batch_size * beam_size, 1, hidden_size].
    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    # Softmax projection from [batch_size * beam_size, 1, hidden_size] to
    # [batch_size * beam_size, 1, vocab_size].
    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    # Drop the length-1 middle dimension: [batch_size * beam_size, 1,
    # vocab_size] becomes [batch_size * beam_size, vocab_size].
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
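# A toy check (hypothetical values) of the per-step bias slice used above.
# For length 3 and loop index i = 1, the slice keeps row i of the causal mask
# and only its first i + 1 columns, so every retained entry is 0: at step i,
# all previously generated positions 0..i are visible and nothing is masked.
bias = model_utils.get_decoder_self_attention_bias(3)  # [1, 1, 3, 3]
step_bias = bias[:, :, 1:2, :2]                        # [1, 1, 1, 2]
# Row 1 of the full mask is [0, 0, NEG_INF]; the slice is [[0, 0]].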
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params["hidden_size"])
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Set decoder input to the last generated IDs
    decoder_input = ids[:, -1:]

    ### domyounglee 2020.2.12
    # Note: cls_dec_bias is computed here but never passed to decoder_stack
    # below, which receives cls_dec_attention_bias=None instead.
    cls_dec_bias = model_utils.get_cls_dec_attention_bias(
        tf.cast(tf.equal(decoder_input, 2), tf.int64))
    #self.cls_attention_bias=None

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.embedding_softmax_layer(decoder_input)
    decoder_input += timing_signal[i:i + 1]

    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
    decoder_outputs = self.decoder_stack(
        decoder_input,
        cache.get("encoder_outputs"),
        self_attention_bias,
        cache.get("encoder_decoder_attention_bias"),
        cls_attention_bias=None,
        cls_dec_attention_bias=None,
        identity_mask=None,
        cache=cache)
    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
def decode(self, targets, encoder_outputs, attention_bias, training):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float
      tensor with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
    training: boolean, whether in training mode or not.

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    decoder_inputs = self.embedding_softmax_layer(targets)
    decoder_inputs = tf.cast(decoder_inputs, self.params["dtype"])
    attention_bias = tf.cast(attention_bias, self.params["dtype"])
    with tf.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element. A row of
      # padding is prepended along the second dimension and the trailing EOS
      # is dropped, so the input apparently carries no BOS token.
      decoder_inputs = tf.pad(
          decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(
          length, self.params["hidden_size"])
      pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
      decoder_inputs += pos_encoding
    if training:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, rate=self.params["layer_postprocess_dropout"])

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length, dtype=self.params["dtype"])
    outputs = self.decoder_stack(
        decoder_inputs,
        encoder_outputs,
        decoder_self_attention_bias,
        attention_bias,
        training=training)
    logits = self.embedding_softmax_layer(outputs, mode="linear")
    logits = tf.cast(logits, tf.float32)
    return logits
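# A toy illustration (made-up values) of the "shift_targets" padding trick:
# padding one step at the front of the time axis and dropping the last step
# right-shifts the embeddings, so position t is predicted only from targets
# before t, with an all-zero vector standing in for BOS.
import tensorflow as tf

x = tf.constant([[[1.], [2.], [3.]]])  # [batch=1, target_length=3, hidden=1]
shifted = tf.pad(x, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
# shifted == [[[0.], [1.], [2.]]]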
def predict(self, start_tokens, encoder_outputs,
            encoder_decoder_attention_bias):
  """Return predicted sequence."""
  with tf.name_scope('decode'):
    batch_size = tf.shape(encoder_outputs)[0]
    max_decode_length = self.params['sequence_length']
    timing_signal = model_utils.get_position_encoding(
        max_decode_length, self.params['hidden_size'])
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length)

    # Create cache storing decoder attention values for each layer.
    cache = {
        'layer_%d' % layer: {
            'k': tf.zeros([batch_size, 0, self.params['hidden_size']]),
            'v': tf.zeros([batch_size, 0, self.params['hidden_size']])
        } for layer in range(self.params['num_hidden_layers'])
    }

    # Add encoder output and attention bias to the cache.
    cache['encoder_outputs'] = encoder_outputs
    cache['encoder_decoder_attention_bias'] = encoder_decoder_attention_bias

    # Forward decoder_inputs to decoder_stack max_decode_length times
    # instead of applying beam search.
    decoder_outputs = tf.zeros([batch_size, 0, self.params['output_size']])
    decoder_inputs = tf.expand_dims(start_tokens, axis=1)
    for i in range(max_decode_length):
      decoder_inputs = self.decoder_embedding_layer(decoder_inputs)
      decoder_inputs += timing_signal[i:i + 1]
      self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
      decoder_inputs = self.decoder_stack(
          decoder_inputs, cache.get('encoder_outputs'), self_attention_bias,
          cache.get('encoder_decoder_attention_bias'), cache)
      decoder_inputs = self.output_embedding_layer(decoder_inputs)
      decoder_outputs = tf.concat([decoder_outputs, decoder_inputs], axis=1)
  return decoder_outputs
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params.hidden_size)
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Set decoder input to the last generated IDs
    decoder_input = ids[:, -1:]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.embedding_softmax_layer(decoder_input)
    decoder_input += timing_signal[i:i + 1]

    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float
      tensor with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    decoder_inputs = self.embedding_softmax_layer(targets)
    with tf.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element
      decoder_inputs = tf.pad(
          decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params.hidden_size)
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params.layer_postprocess_dropout)

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(
        decoder_inputs, encoder_outputs, decoder_self_attention_bias,
        attention_bias)
    logits = self.embedding_softmax_layer.linear(outputs)
    return logits