def _encode(self, input_dict):
  if len(self.layers) == 0:
    # prepare encoder graph
    self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
        self.params["src_vocab_size"], self.params["hidden_size"],
        pad_vocab_to_eight=self.params.get('pad_embeddings_2_eight', False))

    for _ in range(self.params['encoder_layers']):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          self.params["hidden_size"], self.params["num_heads"],
          self.params["attention_dropout"], self.mode == "train")
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          self.params["hidden_size"], self.params["filter_size"],
          self.params["relu_dropout"], self.mode == "train")

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, self.params,
                                   self.mode == "train"),
          PrePostProcessingWrapper(feed_forward_network, self.params,
                                   self.mode == "train")])

    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(self.params["hidden_size"])

  # actual encoder part
  with tf.name_scope("encode"):
    #inputs = input_dict['src_sequence']
    inputs = input_dict['source_tensors'][0]

    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(inputs)
    inputs_padding = utils.get_padding(inputs)

    inputs_attention_bias = utils.get_padding_bias(inputs)
    #inputs_attention_bias = tf.cast(utils.get_padding_bias(inputs),
    #                                dtype=self.params['dtype'])

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = utils.get_position_encoding(
          length, self.params["hidden_size"])
      encoder_inputs = embedded_inputs + tf.cast(x=pos_encoding,
                                                 dtype=embedded_inputs.dtype)

    if self.mode == "train":
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    encoded = self._call(encoder_inputs, inputs_attention_bias, inputs_padding)
    return {'outputs': encoded,
            'inputs_attention_bias': inputs_attention_bias,
            'state': None,
            'src_lengths': input_dict['source_tensors'][1],
            'embedding_softmax_layer': self.embedding_softmax_layer,
            'encoder_input': inputs}
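# A minimal sketch (not part of the original source) of what utils.get_padding
# and utils.get_padding_bias are assumed to compute above: padding positions
# (token id 0) are marked with 1.0, and the bias turns them into a large
# negative value that is added to the attention logits so padded tokens get
# essentially zero attention weight. The function names below are illustrative
# stand-ins, not the repo's API.
import tensorflow as tf

_NEG_INF = -1e9

def sketch_get_padding(x):
  """Returns 1.0 at padding positions (id == 0), 0.0 elsewhere. x: [batch, length]."""
  return tf.cast(tf.equal(x, 0), tf.float32)

def sketch_get_padding_bias(x):
  """Returns an attention bias of shape [batch, 1, 1, length] with -1e9 at padding."""
  padding = sketch_get_padding(x)
  return tf.expand_dims(tf.expand_dims(padding * _NEG_INF, axis=1), axis=1)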
def _positional_encoding(x, dtype):
  """Return a positional encoding matching the time and feature dims of x."""
  length = tf.shape(x)[1]
  features_count = tf.shape(x)[2]
  # get_position_encoding expects an even number of features, so round the
  # requested width up to the next even value, then slice back to the
  # original width so the result broadcasts against x.
  padded_features_count = features_count + features_count % 2
  pos_encoding = utils.get_position_encoding(length, padded_features_count)
  position_encoding = tf.cast(pos_encoding, dtype)
  position_encoding = position_encoding[:, :features_count]
  return position_encoding
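# A minimal numpy sketch (not part of the original source) of the sinusoidal
# signal that utils.get_position_encoding is assumed to produce: half of the
# channels carry sin, the other half cos, with geometrically spaced
# wavelengths, which is why the helper above rounds the feature count up to an
# even number. The function name is an illustrative stand-in, not the repo's API.
import numpy as np

def sketch_position_encoding(length, hidden_size,
                             min_timescale=1.0, max_timescale=1.0e4):
  """Returns a [length, hidden_size] array of sin/cos timing signals."""
  position = np.arange(length, dtype=np.float32)
  num_timescales = hidden_size // 2
  log_timescale_increment = (
      np.log(max_timescale / min_timescale) / max(num_timescales - 1, 1))
  inv_timescales = min_timescale * np.exp(
      np.arange(num_timescales, dtype=np.float32) * -log_timescale_increment)
  scaled_time = position[:, None] * inv_timescales[None, :]
  return np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)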
def decode_pass(self, targets, encoder_outputs, inputs_attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence.
      int tensor with shape [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence.
      float tensor with shape [batch_size, input_length, hidden_size]
    inputs_attention_bias: float tensor with shape
      [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  # Prepare inputs to decoder layers by shifting targets, adding positional
  # encoding and applying dropout.
  decoder_inputs = self.embedding_softmax_layer(targets)
  with tf.name_scope("shift_targets"):
    # Shift targets to the right, and remove the last element
    decoder_inputs = tf.pad(
        decoder_inputs, [[0, 0], [1, 0], [0, 0]],
    )[:, :-1, :]
  with tf.name_scope("add_pos_encoding"):
    length = tf.shape(decoder_inputs)[1]
    # decoder_inputs += utils.get_position_encoding(
    #     length, self.params["hidden_size"])
    decoder_inputs += tf.cast(
        utils.get_position_encoding(length, self.params["hidden_size"]),
        dtype=self.params['dtype'],
    )
  if self.mode == "train":
    decoder_inputs = tf.nn.dropout(
        decoder_inputs,
        1 - self.params["layer_postprocess_dropout"],
    )

  # Run values
  decoder_self_attention_bias = utils.get_decoder_self_attention_bias(length)

  # do decode
  outputs = self._call(
      decoder_inputs=decoder_inputs,
      encoder_outputs=encoder_outputs,
      decoder_self_attention_bias=decoder_self_attention_bias,
      attention_bias=inputs_attention_bias,
  )

  logits = self.embedding_softmax_layer.linear(outputs)
  return logits
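# A small illustration (not part of the original source) of the shift_targets
# step above: padding one zero step on the left of the time axis and dropping
# the last step makes the decoder input at position t the embedding of target
# t-1, with an all-zeros vector acting as the start-of-sequence input.
import numpy as np

targets_emb = np.arange(1, 7, dtype=np.float32).reshape(1, 3, 2)   # [1, 3, 2]
shifted = np.pad(targets_emb, ((0, 0), (1, 0), (0, 0)))[:, :-1, :]
# targets_emb[0] = [[1, 2], [3, 4], [5, 6]]
# shifted[0]     = [[0, 0], [1, 2], [3, 4]]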
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  timing_signal = utils.get_position_encoding(
      max_decode_length + 1, self.params["hidden_size"],
  )
  decoder_self_attention_bias = utils.get_decoder_self_attention_bias(
      max_decode_length,
  )

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences.
        int tensor with shape [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output,
        encoder-decoder attention bias, and previous decoder attention values.

    Returns:
      Tuple of (logits with shape [batch_size * beam_size, vocab_size],
      updated cache values)
    """
    # Set decoder input to the last generated IDs
    decoder_input = ids[:, -1:]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.embedding_softmax_layer(decoder_input)
    decoder_input += tf.cast(x=timing_signal[i:i + 1],
                             dtype=decoder_input.dtype)

    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
    decoder_outputs = self._call(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache,
    )
    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    logits = tf.squeeze(logits, axis=[1])
    return tf.cast(logits, tf.float32), cache

  return symbols_to_logits_fn
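# A toy greedy-decoding loop (not part of the original source) showing how a
# symbols_to_logits_fn with this signature is typically driven: the search
# routine is assumed to call it once per step with the ids decoded so far, the
# step index, and a cache of encoder outputs / attention state. The fake
# logits function and the vocabulary size below are illustrative only.
import numpy as np

def toy_symbols_to_logits_fn(ids, i, cache):
  vocab_size = 5
  # pretend the model always prefers (last_id + 1) mod vocab_size
  next_id = (ids[:, -1] + 1) % vocab_size
  logits = np.full((ids.shape[0], vocab_size), -1e9, dtype=np.float32)
  logits[np.arange(ids.shape[0]), next_id] = 0.0
  return logits, cache

ids = np.zeros((2, 1), dtype=np.int64)   # start symbol 0 for a batch of 2
cache = {}
for i in range(4):
  logits, cache = toy_symbols_to_logits_fn(ids, i, cache)
  next_ids = np.argmax(logits, axis=-1)[:, None]
  ids = np.concatenate([ids, next_ids], axis=1)
# ids is now [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]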
def _encode(self, input_dict):
  training = (self.mode == "train")
  if len(self.layers) == 0:
    # prepare encoder graph
    self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
        self.params["src_vocab_size"], self.params["hidden_size"],
        pad_vocab_to_eight=self.params.get('pad_embeddings_2_eight', False),
    )

    for _ in range(self.params['encoder_layers']):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          hidden_size=self.params["hidden_size"],
          num_heads=self.params["num_heads"],
          attention_dropout=self.params["attention_dropout"],
          train=training,
          regularizer=self.regularizer,
          batch_size=self.batch_size,
          num_feature=self.num_features)
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          hidden_size=self.params["hidden_size"],
          filter_size=self.params["filter_size"],
          relu_dropout=self.params["relu_dropout"],
          train=training,
          #num_features=self.num_features,
          #batch_size=self.batch_size,
          regularizer=self.regularizer)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, self.params, training),
          PrePostProcessingWrapper(feed_forward_network, self.params, training)
      ])

    # final normalization layer.
    print("Encoder:", self.norm_params["type"], self.mode)
    if self.norm_params["type"] == "batch_norm":
      self.output_normalization = Transformer_BatchNorm(
          training=training, params=self.norm_params)
    else:
      self.output_normalization = LayerNormalization(
          hidden_size=self.params["hidden_size"], params=self.norm_params)

  # actual encoder part
  with tf.name_scope("encode"):
    inputs, src_lengths = input_dict['source_tensors']
    #inputs = input_dict['source_tensors'][0]

    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(inputs)

    if self.params["remove_padding"]:
      inputs_padding = utils.get_padding(inputs)
      #inputs_padding = utils.get_padding(inputs, dtype=self._params["dtype"])
    else:
      inputs_padding = None

    inputs_attention_bias = utils.get_padding_bias(inputs)
    inputs_attention_bias = tf.transpose(inputs_attention_bias,
                                         [0, 1, 3, 2, 4])
    # inputs_attention_bias = utils.get_padding_bias(
    #     inputs, dtype=self._params["dtype"])

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = utils.get_position_encoding(
          length, self.params["hidden_size"],
      )
      #encoder_inputs = embedded_inputs + tf.cast(x=pos_encoding,
      #                                           dtype=embedded_inputs.dtype)
      pos_encoding = tf.cast(x=pos_encoding, dtype=embedded_inputs.dtype)
      pos_encoding_exp = pos_encoding[None, :, None, :]
      encoder_inputs = embedded_inputs + pos_encoding_exp

    if self.mode == "train":
      encoder_inputs = tf.nn.dropout(
          encoder_inputs,
          keep_prob=1.0 - self.params["layer_postprocess_dropout"],
      )

    encoded = self._call(encoder_inputs, inputs_attention_bias, inputs_padding)
    return {
        'outputs': encoded,
        'inputs_attention_bias': inputs_attention_bias,
        'state': None,
        'src_lengths': src_lengths,
        #'src_lengths': input_dict['source_tensors'][1],
        'embedding_softmax_layer': self.embedding_softmax_layer,
        'encoder_input': inputs
    }
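# A minimal numpy sketch (not part of the original source) of the layer
# normalization that output_normalization is assumed to apply in the
# non-batch_norm branch above: normalize each position's hidden vector to zero
# mean / unit variance over the last axis, then apply a learned scale and bias
# (fixed to 1 and 0 here). The repo's LayerNormalization also takes norm_params
# this sketch ignores.
import numpy as np

def sketch_layer_norm(x, epsilon=1e-6, scale=1.0, bias=0.0):
  mean = x.mean(axis=-1, keepdims=True)
  var = x.var(axis=-1, keepdims=True)
  return scale * (x - mean) / np.sqrt(var + epsilon) + bias

x = np.random.randn(2, 4, 8).astype(np.float32)   # [batch, time, hidden]
y = sketch_layer_norm(x)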