def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  decoder_inputs = self.embedding_softmax_layer(targets)
  # Shift targets one position to the right and drop the last element.
  # nd.pad only supports 4-D/5-D inputs, so add a leading axis, pad, then
  # reshape back before slicing.
  decoder_inputs = nd.expand_dims(decoder_inputs, axis=0)
  decoder_inputs = nd.pad(data=decoder_inputs,
                          mode="constant",
                          constant_value=0,
                          pad_width=(0, 0, 0, 0, 1, 0, 0, 0))
  decoder_inputs = nd.reshape(
      data=decoder_inputs, shape=decoder_inputs.shape[1:])[:, :-1, :]
  length = decoder_inputs.shape[1]
  decoder_inputs = decoder_inputs + model_utils.get_position_encoding(
      length, self.param.hidden_size, targets.context)
  if self.train:
    decoder_inputs = self.dropout_output(decoder_inputs)
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      length, targets.context)
  outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                               decoder_self_attention_bias, attention_bias)
  logits = self.embedding_softmax_layer.linear(outputs)
  return logits
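The expand/pad/reshape dance above exists only because MXNet's nd.pad requires a 4-D or 5-D input; the effect is the usual teacher-forcing shift. A minimal NumPy sketch (illustration only, not part of the model code) of the same shift:

import numpy as np

# Toy tensors: batch_size=1, target_length=3, hidden_size=2.
emb = np.arange(6, dtype=np.float32).reshape(1, 3, 2)
# Prepend an all-zero embedding and drop the last position, so the decoder at
# step t only ever sees embeddings of targets earlier than t.
shifted = np.pad(emb, ((0, 0), (1, 0), (0, 0)))[:, :-1, :]
print(shifted[0, 0])  # -> [0. 0.]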
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence."""
  with tf.name_scope("decode"):
    decoder_inputs = self.decoder_embedding_layer(
        targets, not ModeKeys.is_predict_one(self.mode))
    with tf.name_scope("shift_targets"):
      decoder_inputs = tf.pad(
          decoder_inputs,
          [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # [batch, tgt_seq_len, embed_size]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params.hidden_size)
    if self.is_train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params.layer_postprocess_dropout)

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    logits = self.decoder_softmax_layer.linear(outputs)
    return logits
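All of these variants call model_utils.get_position_encoding(length, hidden_size), which returns a [length, hidden_size] sinusoidal signal. A version-agnostic sketch of that standard encoding (the repos' model_utils implementations may differ in details such as timescale arguments or dtype helpers; hidden_size is assumed to be an even Python int):

import math
import tensorflow as tf

def get_position_encoding_sketch(length, hidden_size,
                                 min_timescale=1.0, max_timescale=1.0e4):
  """Sinusoidal position encoding of shape [length, hidden_size] (sketch)."""
  position = tf.cast(tf.range(length), tf.float32)
  num_timescales = hidden_size // 2
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      max(num_timescales - 1, 1))
  inv_timescales = min_timescale * tf.exp(
      tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
  # First half of the channels are sines, second half cosines.
  return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)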
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.param.hidden_size, mx.gpu())
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length, mx.gpu())

  def symbols_to_logits_fn(ids, i, cache):
    # The decoder input is the last token of the current decoded sequences.
    decoder_input = ids[:, -1:]
    decoder_input = self.embedding_softmax_layer(decoder_input)
    decoder_input = decoder_input + timing_signal[i:i + 1]
    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    logits = nd.squeeze(logits, axis=1)
    return logits, cache

  return symbols_to_logits_fn
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence."""
  with tf.name_scope("decode"):
    decoder_inputs = self.decoder_embedding_layer(
        targets, not ModeKeys.is_predict_one(self.mode))
    with tf.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element.
      decoder_inputs = tf.pad(decoder_inputs,
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params.hidden_size)
    if self.is_train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params.layer_postprocess_dropout)

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    # Changed from the shared embedding_softmax_layer:
    # logits = self.embedding_softmax_layer.linear(outputs)
    logits = self.decoder_softmax_layer.linear(outputs)
    return logits
def test_get_decoder_self_attention_bias(self):
  length = 5
  bias = model_utils.get_decoder_self_attention_bias(length)
  with self.test_session() as sess:
    bias = sess.run(bias)

  self.assertAllEqual(
      [[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
         [0, 0, NEG_INF, NEG_INF, NEG_INF],
         [0, 0, 0, NEG_INF, NEG_INF],
         [0, 0, 0, 0, NEG_INF],
         [0, 0, 0, 0, 0]]]],
      bias)
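The test pins down what the bias must look like: a [1, 1, length, length] tensor that is 0 on and below the diagonal and NEG_INF above it, so position i can only attend to positions <= i. A sketch of an implementation consistent with that expectation (the actual model_utils version may differ, e.g. in dtype or context handling):

import tensorflow as tf

NEG_INF = -1e9

def get_decoder_self_attention_bias_sketch(length):
  """Causal self-attention bias of shape [1, 1, length, length] (sketch)."""
  # Lower-triangular matrix of ones: 1 where attention is allowed.
  valid_locs = tf.linalg.band_part(tf.ones([length, length]), -1, 0)
  # 0 where allowed, NEG_INF where masked (future positions).
  bias = NEG_INF * (1.0 - valid_locs)
  return tf.reshape(bias, [1, 1, length, length])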
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  if ModeKeys.is_predict_one(self.mode):
    timing_signal = model_utils.get_position_encoding(
        self.params.max_length, self.params.hidden_size)
    timing_signal = tf.slice(
        timing_signal, [0, 0],
        [max_decode_length + 1, self.params.hidden_size],
        name='slice_timing_signal')
  else:
    timing_signal = model_utils.get_position_encoding(
        max_decode_length + 1,
        self.params.hidden_size)  # [max_decode_length + 1, hidden_size]

  if ModeKeys.is_predict_one(self.mode):
    decoder_self_attention_bias = None
  else:
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length)  # [1, 1, max_decode_length, max_decode_length]

  def symbols_to_logits_fn(ids, i, cache):
    decoder_input = ids[:, -1:]  # [batch, 1]
    decoder_input = self.decoder_embedding_layer(
        decoder_input,
        not ModeKeys.is_predict_one(self.mode))  # [batch, 1, hidden_size]
    if ModeKeys.is_predict_one(self.mode):
      # Zero the embedding at step 0 so the first step sees only the position
      # encoding, mimicking the zero-padded <BOS> used during training.
      decoder_input = decoder_input * (1 - tf.to_float(tf.equal(i, 0)))
    slice_pos_encoding = tf.slice(
        timing_signal, [i, 0], [1, self.params.hidden_size],
        name='slice_pos_encoding')  # [1, hidden_size]
    decoder_input += slice_pos_encoding

    if decoder_self_attention_bias is None:
      self_attention_bias = None
    else:
      self_attention_bias = decoder_self_attention_bias[
          :, :, i:i + 1, :i + 1]  # [1, 1, 1, time_step]

    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    logits = self.decoder_softmax_layer.linear(decoder_outputs)
    logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
    return logits, cache

  return symbols_to_logits_fn
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  # shape: (max_decode_length + 1, hidden_size)
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params["hidden_size"])
  # shape: (1, 1, max_decode_length, max_decode_length)
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Set decoder input to the last generated IDs.
    # shape: (batch_size * beam_size, 1)
    decoder_input = ids[:, -1:]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    # shape: (batch_size * beam_size, 1, hidden_size)
    decoder_input = self.embedding_softmax_layer(decoder_input)
    decoder_input += timing_signal[i:i + 1]

    # Decoder self-attention bias for the current step.
    # shape: (1, 1, 1, i + 1)
    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]

    # Each single-token query attends over the cached keys/values and produces
    # one output step.
    # shape: (batch_size * beam_size, 1, hidden_size)
    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)

    # shape: (batch_size * beam_size, 1, vocab_size)
    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    # Changed from the shared embedding_softmax_layer to a separate decoder
    # embedding layer:
    # decoder_inputs = self.embedding_softmax_layer(targets)
    decoder_inputs = self.decoder_embedding_layer(
        targets, not ModeKeys.is_predict_one(self.mode))
    with tf.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element.
      decoder_inputs = tf.pad(
          decoder_inputs,
          [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # [batch, tgt_seq_len, embed_size]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params.hidden_size)
    if self.is_train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params.layer_postprocess_dropout)

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    # Changed from the shared embedding_softmax_layer:
    # logits = self.embedding_softmax_layer.linear(outputs)
    logits = self.decoder_softmax_layer.linear(outputs)
    return logits
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.compat.v1.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    decoder_inputs = self.embedding_softmax_layer(targets)
    with tf.compat.v1.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element.
      decoder_inputs = tf.pad(
          tensor=decoder_inputs,
          paddings=[[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.compat.v1.name_scope("add_pos_encoding"):
      length = tf.shape(input=decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params.hidden_size)
    if self.train:
      mlperf_log.transformer_print(
          key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
          value=self.params.layer_postprocess_dropout)
      # The (1 - keep_prob) form below comes from the TF2 upgrade script and is
      # equivalent to rate = layer_postprocess_dropout.
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - (1 - self.params.layer_postprocess_dropout))

    with tf.compat.v1.tpu.bfloat16_scope():
      decoder_inputs = tf.cast(decoder_inputs, tf.bfloat16)
      # encoder_outputs = tf.cast(encoder_outputs, tf.bfloat16)
      # attention_bias = tf.cast(attention_bias, tf.bfloat16)

      # Run values
      decoder_self_attention_bias = tf.cast(
          model_utils.get_decoder_self_attention_bias(length), tf.bfloat16)
      outputs = self.decoder_stack(
          decoder_inputs, encoder_outputs, decoder_self_attention_bias,
          attention_bias)
      logits = self.embedding_softmax_layer.linear(outputs)
      logits = tf.cast(logits, tf.float32)
    return logits
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    # shape: (batch_size, target_length, hidden_size)
    decoder_inputs = self.embedding_softmax_layer(targets)
    with tf.name_scope("shift_targets"):
      # Shift decoder_inputs one token to the right: the first position becomes
      # an all-zero embedding (acting as <BOS>) and the last element (<EOS>) is
      # removed.
      decoder_inputs = tf.pad(decoder_inputs,
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params["hidden_size"])
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    # Run values
    # shape: [1, 1, target_length, target_length]
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    # shape: (batch_size, target_length, hidden_size)
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    # shape: (batch_size, target_length, vocab_size)
    logits = self.embedding_softmax_layer.linear(outputs)
    return logits
def _decode(self, encoder_outputs, targets, attention_bias):
  decoder_inputs = self.embedding_layer(targets)
  decoder_inputs = tf.pad(decoder_inputs,
                          [[0, 0], [1, 0], [0, 0]])[:, :-1, :]

  # Add positional encoding.
  length = tf.shape(decoder_inputs)[1]
  decoder_inputs += model_utils.get_position_encoding(
      length, self.hparams['num_units'])

  if self.is_train:
    decoder_inputs = self.decoder_embedding_dropout(decoder_inputs)

  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      length)
  outputs, dec_ponders, dec_remainders = self.decoder_stack(
      decoder_inputs, encoder_outputs, decoder_self_attention_bias,
      attention_bias)
  logits = self.embedding_layer.linear(outputs)
  return logits, dec_ponders, dec_remainders
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params.hidden_size)
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Set decoder input to the last generated IDs.
    decoder_input = ids[:, -1:]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.embedding_softmax_layer(decoder_input)
    decoder_input += timing_signal[i:i + 1]

    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
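symbols_to_logits_fn is designed to be driven by an external search loop (beam search in the original repos). The eager-style greedy sketch below, with hypothetical names greedy_decode_sketch, initial_ids and eos_id, only illustrates the calling convention: at step i the function receives all ids decoded so far plus the cache of encoder outputs and per-layer attention values, and returns next-token logits together with the updated cache.

import tensorflow as tf

def greedy_decode_sketch(symbols_to_logits_fn, initial_ids, cache,
                         max_decode_length, eos_id):
  """Minimal greedy loop around symbols_to_logits_fn (sketch, eager mode)."""
  ids = initial_ids  # [batch, 1], typically all zeros standing in for <BOS>
  for i in range(max_decode_length):
    # logits: [batch, vocab_size]; cache is updated with new decoder k/v.
    logits, cache = symbols_to_logits_fn(ids, i, cache)
    next_id = tf.argmax(logits, axis=-1, output_type=tf.int32)  # [batch]
    ids = tf.concat([ids, next_id[:, tf.newaxis]], axis=1)
    if bool(tf.reduce_all(tf.equal(next_id, eos_id))):
      break  # every sequence in the batch has emitted EOS
  return ids  # [batch, <= max_decode_length + 1]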
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    decoder_inputs = self.embedding_softmax_layer(targets)
    with tf.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element.
      decoder_inputs = tf.pad(decoder_inputs,
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params.hidden_size)
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params.layer_postprocess_dropout)

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    logits = self.embedding_softmax_layer.linear(outputs)
    return logits
def decode(self, targets, encoder_outputs, attention_bias):
  """
  :param targets: [batch_size, target_length]
  :param encoder_outputs: [batch_size, input_length, hidden_size]
  :param attention_bias: [batch_size, 1, 1, input_length]
  :return: [batch_size, target_length, vocab_size]
  """
  with tf.name_scope('decode'):
    # [batch_size, target_length, hidden_size]
    decoder_inputs = self.embedding_layer(targets)
    with tf.name_scope('shift_targets'):
      # Pad a zero embedding at the head of the sequence and drop the last
      # (eos) position.
      decoder_inputs = tf.pad(decoder_inputs,
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope('add_pos_embedding'):
      length = tf.shape(decoder_inputs)[1]
      position_decode = model_utils.get_position_encoding(
          length, self.params.get('hidden_size'))
      decoder_inputs = tf.add(decoder_inputs, position_decode)

    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1. - self.params.get('encoder_decoder_dropout'))

    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    # [batch_size, target_length, vocab_size]
    logits = self.embedding_layer.linear(outputs)
    return logits
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  if ModeKeys.is_predict_one(self.mode):
    timing_signal = model_utils.get_position_encoding(
        self.params.max_length, self.params.hidden_size)
    timing_signal = tf.slice(
        timing_signal, [0, 0],
        [max_decode_length + 1, self.params.hidden_size],
        name='slice_timing_signal')
  else:
    timing_signal = model_utils.get_position_encoding(
        max_decode_length + 1,
        self.params.hidden_size)  # [max_decode_length + 1, hidden_size]

  if ModeKeys.is_predict_one(self.mode):
    decoder_self_attention_bias = None
  else:
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length)  # [1, 1, max_decode_length, max_decode_length]

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Set decoder input to the last generated IDs.
    decoder_input = ids[:, -1:]  # [batch, 1]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    # Changed from the shared embedding_softmax_layer to a separate decoder
    # embedding layer.
    decoder_input = self.decoder_embedding_layer(
        decoder_input,
        not ModeKeys.is_predict_one(self.mode))  # [batch, 1, hidden_size]
    if ModeKeys.is_predict_one(self.mode):
      # Zero the embedding at step 0, mimicking the zero-padded <BOS> used
      # during training.
      decoder_input = decoder_input * (1 - tf.to_float(tf.equal(i, 0)))

    # Add position embedding.
    # decoder_input += timing_signal[i:i + 1]
    slice_pos_encoding = tf.slice(
        timing_signal, [i, 0], [1, self.params.hidden_size],
        name='slice_pos_encoding')  # [1, hidden_size]
    decoder_input += slice_pos_encoding

    if decoder_self_attention_bias is None:
      self_attention_bias = None
    else:
      self_attention_bias = decoder_self_attention_bias[
          :, :, i:i + 1, :i + 1]  # [1, 1, 1, time_step]
      # self_attention_bias = decoder_self_attention_bias[:, :, :i + 1, :i + 1]

    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    logits = self.decoder_softmax_layer.linear(decoder_outputs)
    # logits = tf.squeeze(logits, axis=[1])
    logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
    return logits, cache

  return symbols_to_logits_fn
os.environ["CUDA_VISIBLE_DEVICES"] = "0" params = model_params.TransformerBaseParams() x_inputs = tf.constant([[1, 2, 3, 0, 0], [3, 4, 5, 6, 8]], dtype=tf.int32) Enc_Embedding = embedding_layer.EmbeddingWeights(params.source_vocab_size, params.hidden_size, "source_embedding") embedded_inputs = Enc_Embedding( x_inputs, not ModeKeys.is_predict_one(ModeKeys.TRAIN)) print(embedded_inputs.shape) attention_bias = model_utils.get_padding_bias(x_inputs) print(attention_bias.shape) encoder_stack = EncoderStack(params, is_train=True, mode=ModeKeys.TRAIN) enc_out = encoder_stack(embedded_inputs, attention_bias, None) print(enc_out.shape) decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias( 10) self_attention_bias = decoder_self_attention_bias[:, :, 0:1, :1] print(self_attention_bias) attention_bias = model_utils.get_padding_bias(x_inputs) cache = { "layer_%d" % layer: { "k": tf.zeros([2, 0, params.hidden_size]), "v": tf.zeros([2, 0, params.hidden_size]), } for layer in range(params.num_hidden_layers) } dec_input = tf.constant([[2], [3]], dtype=tf.int32) decoder_stack = DecoderStack(params, is_train=True, mode=ModeKeys.TRAIN) dec_out = decoder_stack(dec_input, enc_out, self_attention_bias, attention_bias, cache) print(dec_out.shape)