def test_get_padding(self):
    x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    padding = model_utils.get_padding(x, padding_value=0)
    with self.test_session() as sess:
      padding = sess.run(padding)
      self.assertAllEqual([[0, 1, 1, 1, 0], [0, 0, 1, 1, 1], [1, 0, 0, 1, 0]],
                          padding)
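# The test above pins down the expected behavior of model_utils.get_padding:
# a float mask that is 1 at positions equal to padding_value and 0 elsewhere.
# A minimal sketch of such a helper under that assumption (TF 1.x graph-style
# code, like the rest of this file; the `dtype` argument is an assumption
# taken from later call sites rather than from this test):
import tensorflow as tf


def get_padding(x, padding_value=0, dtype=tf.float32):
  """Returns a mask that is 1.0 at padding positions and 0.0 elsewhere.

  Args:
    x: int tensor with shape [batch_size, length] containing token ids.
    padding_value: token id used for padding.
    dtype: dtype of the returned mask.
  """
  with tf.name_scope("padding"):
    # Positions whose id equals the padding value are marked with 1.0.
    return tf.cast(tf.equal(x, padding_value), dtype=dtype)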
def _encode(self, input_dict):
    if len(self.layers) == 0:
      # prepare encoder graph
      self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
          self.params["src_vocab_size"], self.params["hidden_size"],
          pad_vocab_to_eight=self.params.get('pad_embeddings_2_eight', False))

      for _ in range(self.params['encoder_layers']):
        # Create sublayers for each layer.
        self_attention_layer = attention_layer.SelfAttention(
            self.params["hidden_size"], self.params["num_heads"],
            self.params["attention_dropout"], self.mode == "train")
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            self.params["hidden_size"], self.params["filter_size"],
            self.params["relu_dropout"], self.mode == "train")

        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, self.params,
                                     self.mode == "train"),
            PrePostProcessingWrapper(feed_forward_network, self.params,
                                     self.mode == "train")])

      # Create final layer normalization layer.
      self.output_normalization = LayerNormalization(self.params["hidden_size"])

    # actual encoder part
    with tf.name_scope("encode"):
      # inputs = input_dict['src_sequence']
      inputs = input_dict['source_tensors'][0]

      # Prepare inputs to the layer stack by adding positional encodings and
      # applying dropout.
      embedded_inputs = self.embedding_softmax_layer(inputs)
      inputs_padding = utils.get_padding(inputs)
      inputs_attention_bias = utils.get_padding_bias(inputs)
      # inputs_attention_bias = tf.cast(utils.get_padding_bias(inputs),
      #                                 dtype=self.params['dtype'])

      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(embedded_inputs)[1]
        pos_encoding = utils.get_position_encoding(
            length, self.params["hidden_size"])
        encoder_inputs = embedded_inputs + tf.cast(x=pos_encoding,
                                                   dtype=embedded_inputs.dtype)

      if self.mode == "train":
        encoder_inputs = tf.nn.dropout(
            encoder_inputs, 1 - self.params["layer_postprocess_dropout"])

      encoded = self._call(encoder_inputs, inputs_attention_bias,
                           inputs_padding)
      return {'outputs': encoded,
              'inputs_attention_bias': inputs_attention_bias,
              'state': None,
              'src_lengths': input_dict['source_tensors'][1],
              'embedding_softmax_layer': self.embedding_softmax_layer,
              'encoder_input': inputs}
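# The encoder above relies on utils.get_position_encoding for the additive
# positional signal. The repo's own implementation is not shown here; the
# sketch below is the standard sinusoidal encoding from "Attention Is All You
# Need", and the min/max timescale defaults (and the even hidden_size
# requirement) are assumptions, not taken from this file.
import math

import tensorflow as tf


def get_position_encoding(length, hidden_size,
                          min_timescale=1.0, max_timescale=1.0e4):
  """Returns a [length, hidden_size] tensor of sinusoidal position signals."""
  position = tf.cast(tf.range(length), tf.float32)
  num_timescales = hidden_size // 2  # assumes hidden_size is even
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (tf.cast(num_timescales, tf.float32) - 1))
  inv_timescales = min_timescale * tf.exp(
      tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
  # First half of the channels uses sine, second half uses cosine.
  return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)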
def decode_pass(self, targets, encoder_outputs, encoder_outputs_b,
                inputs_attention_bias):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence. int tensor with shape
        [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence. float
        tensor with shape [batch_size, input_length, hidden_size]
      encoder_outputs_b: continuous representation of input sequence which
        includes the source embeddings. float tensor with shape
        [batch_size, input_length, hidden_size]
      inputs_attention_bias: float tensor with shape
        [batch_size, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    # Prepare inputs to decoder layers by applying embedding
    # and adding positional encoding.
    decoder_inputs = self.embedding_softmax_layer(targets)

    if self.position_embedding_layer is not None:
      with tf.name_scope("add_pos_encoding"):
        pos_input = tf.range(0, tf.shape(decoder_inputs)[1], delta=1,
                             dtype=tf.int32, name='range')
        pos_encoding = self.position_embedding_layer(pos_input)
        decoder_inputs = decoder_inputs + tf.cast(x=pos_encoding,
                                                  dtype=decoder_inputs.dtype)

    if self.mode == "train":
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, self.params["embedding_dropout_keep_prob"])

    # Mask the paddings in the target.
    inputs_padding = get_padding(targets, padding_value=self._pad_sym,
                                 dtype=decoder_inputs.dtype)
    decoder_inputs *= tf.expand_dims(1.0 - inputs_padding, 2)

    # Do decode.
    logits = self._call(
        decoder_inputs=decoder_inputs,
        encoder_outputs_a=encoder_outputs,
        encoder_outputs_b=encoder_outputs_b,
        input_attention_bias=inputs_attention_bias)
    return logits
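# decode_pass consumes inputs_attention_bias, which the encoders build with
# get_padding_bias: padding positions receive a large negative value that is
# added to the attention logits before the softmax, so attention weight on
# padding goes to ~0. A sketch under the same assumptions as the get_padding
# helper earlier in this section (which it reuses); the res_rank/pad_sym
# handling mirrors the call in the convolutional encoder below and is an
# assumption about the real utility.
import tensorflow as tf


def get_padding_bias(x, res_rank=4, pad_sym=0):
  """Attention bias that is -1e9 at padding positions and 0 elsewhere.

  Returns [batch_size, 1, 1, length] when res_rank=4 (broadcast over heads
  and query positions) or [batch_size, 1, length] when res_rank=3.
  """
  with tf.name_scope("attention_bias"):
    padding = get_padding(x, padding_value=pad_sym)  # get_padding sketch above
    bias = padding * -1e9
    if res_rank == 3:
      return tf.expand_dims(bias, axis=1)
    return tf.expand_dims(tf.expand_dims(bias, axis=1), axis=1)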
def _encode(self, input_dict):
    training = (self.mode == "train")

    if len(self.layers) == 0:
      # prepare encoder graph
      self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
          self.params["src_vocab_size"],
          self.params["hidden_size"],
          pad_vocab_to_eight=self.params.get('pad_embeddings_2_eight', False),
      )

      for _ in range(self.params['encoder_layers']):
        # Create sublayers for each layer.
        self_attention_layer = attention_layer.SelfAttention(
            hidden_size=self.params["hidden_size"],
            num_heads=self.params["num_heads"],
            attention_dropout=self.params["attention_dropout"],
            train=training,
            regularizer=self.regularizer,
            batch_size=self.batch_size,
            num_feature=self.num_features)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            hidden_size=self.params["hidden_size"],
            filter_size=self.params["filter_size"],
            relu_dropout=self.params["relu_dropout"],
            train=training,
            # num_features=self.num_features,
            # batch_size=self.batch_size,
            regularizer=self.regularizer)

        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, self.params,
                                     training),
            PrePostProcessingWrapper(feed_forward_network, self.params,
                                     training)
        ])

      # final normalization layer
      print("Encoder:", self.norm_params["type"], self.mode)
      if self.norm_params["type"] == "batch_norm":
        self.output_normalization = Transformer_BatchNorm(
            training=training, params=self.norm_params)
      else:
        self.output_normalization = LayerNormalization(
            hidden_size=self.params["hidden_size"], params=self.norm_params)

    # actual encoder part
    with tf.name_scope("encode"):
      inputs, src_lengths = input_dict['source_tensors']
      # inputs = input_dict['source_tensors'][0]

      # Prepare inputs to the layer stack by adding positional encodings and
      # applying dropout.
      embedded_inputs = self.embedding_softmax_layer(inputs)
      if self.params["remove_padding"]:
        inputs_padding = utils.get_padding(inputs)
        # inputs_padding = utils.get_padding(inputs, dtype=self._params["dtype"])
      else:
        inputs_padding = None
      inputs_attention_bias = utils.get_padding_bias(inputs)
      inputs_attention_bias = tf.transpose(inputs_attention_bias,
                                           [0, 1, 3, 2, 4])
      # inputs_attention_bias = utils.get_padding_bias(
      #     inputs, dtype=self._params["dtype"])

      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(embedded_inputs)[1]
        pos_encoding = utils.get_position_encoding(
            length,
            self.params["hidden_size"],
        )
        # encoder_inputs = embedded_inputs + tf.cast(x=pos_encoding,
        #                                            dtype=embedded_inputs.dtype)
        pos_encoding = tf.cast(x=pos_encoding, dtype=embedded_inputs.dtype)
        pos_encoding_exp = pos_encoding[None, :, None, :]
        encoder_inputs = embedded_inputs + pos_encoding_exp

      if self.mode == "train":
        encoder_inputs = tf.nn.dropout(
            encoder_inputs,
            keep_prob=1.0 - self.params["layer_postprocess_dropout"],
        )

      encoded = self._call(encoder_inputs, inputs_attention_bias,
                           inputs_padding)
      return {
          'outputs': encoded,
          'inputs_attention_bias': inputs_attention_bias,
          'state': None,
          'src_lengths': src_lengths,
          # 'src_lengths': input_dict['source_tensors'][1],
          'embedding_softmax_layer': self.embedding_softmax_layer,
          'encoder_input': inputs
      }
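# Both Transformer encoder variants above wrap every sublayer in
# PrePostProcessingWrapper. The repo's class is not reproduced here; the
# sketch below only shows the usual pattern it implements (pre-normalization,
# sublayer, dropout, residual add). The layer-norm-only pre-processing is an
# assumption: the second encoder can also be configured with the batch-norm
# variant selected via norm_params, which this sketch does not cover.
class PrePostProcessingWrapper(object):
  """Wraps a sublayer with pre-normalization, dropout and a residual add."""

  def __init__(self, layer, params, train):
    self.layer = layer
    self.postprocess_dropout = params["layer_postprocess_dropout"]
    self.train = train
    # Pre-processing: normalize the input before the wrapped sublayer.
    self.layer_norm = LayerNormalization(params["hidden_size"])

  def __call__(self, x, *args, **kwargs):
    y = self.layer_norm(x)                 # pre-process
    y = self.layer(y, *args, **kwargs)     # wrapped sublayer
    if self.train:
      y = tf.nn.dropout(y, 1.0 - self.postprocess_dropout)
    return x + y                           # post-process: residual connection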
def _encode(self, input_dict):
    inputs = input_dict['source_tensors'][0]
    source_length = input_dict['source_tensors'][1]

    with tf.variable_scope("encode"):
      # prepare encoder graph
      if len(self.layers) == 0:
        knum_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[0]
        kwidth_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[1]

        with tf.variable_scope("embedding"):
          self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
              vocab_size=self._src_vocab_size,
              hidden_size=self._src_emb_size,
              pad_vocab_to_eight=self._pad2eight,
              init_var=0.1,
              embed_scale=False,
              pad_sym=self._pad_sym,
              mask_paddings=True)

        with tf.variable_scope("pos_embedding"):
          self.position_embedding_layer = embedding_layer.EmbeddingSharedWeights(
              vocab_size=self.params.get("max_input_length", MAX_INPUT_LENGTH),
              hidden_size=self._src_emb_size,
              pad_vocab_to_eight=self._pad2eight,
              init_var=0.1,
              embed_scale=False,
              pad_sym=self._pad_sym,
              mask_paddings=True)

        # linear projection before cnn layers
        self.layers.append(
            ffn_wn_layer.FeedFowardNetworkNormalized(
                self._src_emb_size,
                knum_list[0],
                dropout=self.params["embedding_dropout_keep_prob"],
                var_scope_name="linear_mapping_before_cnn_layers",
                mode=self.mode,
                normalization_type=self.normalization_type,
                regularizer=self.regularizer,
                init_var=self.init_var))

        for i in range(len(knum_list)):
          in_dim = knum_list[i] if i == 0 else knum_list[i - 1]
          out_dim = knum_list[i]

          # A linear projection is needed for residual connections if the
          # input and output of a cnn layer do not match.
          if in_dim != out_dim:
            linear_proj = ffn_wn_layer.FeedFowardNetworkNormalized(
                in_dim,
                out_dim,
                var_scope_name="linear_mapping_cnn_" + str(i + 1),
                dropout=1.0,
                mode=self.mode,
                normalization_type=self.normalization_type,
                regularizer=self.regularizer,
                init_var=self.init_var)
          else:
            linear_proj = None

          conv_layer = conv_wn_layer.Conv1DNetworkNormalized(
              in_dim,
              out_dim,
              kernel_width=kwidth_list[i],
              mode=self.mode,
              layer_id=i + 1,
              hidden_dropout=self.params["hidden_dropout_keep_prob"],
              conv_padding="SAME",
              decode_padding=False,
              activation=self.conv_activation,
              normalization_type=self.normalization_type,
              regularizer=self.regularizer,
              init_var=self.init_var)

          self.layers.append([linear_proj, conv_layer])

        # linear projection after cnn layers
        self.layers.append(
            ffn_wn_layer.FeedFowardNetworkNormalized(
                knum_list[-1],
                self._src_emb_size,
                dropout=1.0,
                var_scope_name="linear_mapping_after_cnn_layers",
                mode=self.mode,
                normalization_type=self.normalization_type,
                regularizer=self.regularizer,
                init_var=self.init_var))

      encoder_inputs = self.embedding_softmax_layer(inputs)
      inputs_attention_bias = get_padding_bias(
          inputs, res_rank=3, pad_sym=self._pad_sym)

      with tf.name_scope("add_pos_encoding"):
        pos_input = tf.range(0, tf.shape(encoder_inputs)[1], delta=1,
                             dtype=tf.int32, name='range')
        pos_encoding = self.position_embedding_layer(pos_input)
        encoder_inputs = encoder_inputs + tf.cast(x=pos_encoding,
                                                  dtype=encoder_inputs.dtype)

      if self.mode == "train":
        encoder_inputs = tf.nn.dropout(
            encoder_inputs, self.params["embedding_dropout_keep_prob"])

      # Mask the paddings in the input given to cnn layers.
      inputs_padding = get_padding(inputs, self._pad_sym,
                                   dtype=encoder_inputs.dtype)
      padding_mask = tf.expand_dims(1 - inputs_padding, 2)
      encoder_inputs *= padding_mask

      outputs, outputs_b, final_state = self._call(encoder_inputs, padding_mask)

      return {
          'outputs': outputs,
          'outputs_b': outputs_b,
          'inputs_attention_bias_cs2s': inputs_attention_bias,
          'state': final_state,
          'src_lengths': source_length,  # should it include paddings or not?
          'embedding_softmax_layer': self.embedding_softmax_layer,
          'encoder_input': inputs
      }
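# For orientation: all three _encode variants above expect
# input_dict['source_tensors'] to be a (token ids, sequence lengths) pair and
# return a dict whose 'outputs'/'src_lengths' entries feed the decoder. A
# hypothetical usage sketch; `encoder` is assumed to be an already-constructed
# encoder object, and the placeholder shapes are illustrative only.
src_ids = tf.placeholder(tf.int32, shape=[None, None])  # [batch, time] token ids
src_len = tf.placeholder(tf.int32, shape=[None])        # [batch] lengths
input_dict = {'source_tensors': [src_ids, src_len]}
encoder_output = encoder._encode(input_dict)
# encoder_output['outputs']     -> encoded representations for the decoder
# encoder_output['src_lengths'] -> src_len, passed through unchanged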