def _encode(self, input_dict):
  if len(self.layers) == 0:
    # prepare encoder graph
    self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
        self.params["src_vocab_size"], self.params["hidden_size"],
        pad_vocab_to_eight=self.params.get('pad_embeddings_2_eight', False))

    for _ in range(self.params['encoder_layers']):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          self.params["hidden_size"], self.params["num_heads"],
          self.params["attention_dropout"], self.mode == "train")
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          self.params["hidden_size"], self.params["filter_size"],
          self.params["relu_dropout"], self.mode == "train")

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, self.params,
                                   self.mode == "train"),
          PrePostProcessingWrapper(feed_forward_network, self.params,
                                   self.mode == "train")
      ])

    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(self.params["hidden_size"])

  # actual encoder part
  with tf.name_scope("encode"):
    inputs = input_dict['source_tensors'][0]

    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(inputs)
    inputs_padding = utils.get_padding(inputs)
    inputs_attention_bias = utils.get_padding_bias(inputs)

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = utils.get_position_encoding(
          length, self.params["hidden_size"])
      encoder_inputs = embedded_inputs + tf.cast(x=pos_encoding,
                                                 dtype=embedded_inputs.dtype)

    if self.mode == "train":
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    encoded = self._call(encoder_inputs, inputs_attention_bias, inputs_padding)

    return {'outputs': encoded,
            'inputs_attention_bias': inputs_attention_bias,
            'state': None,
            'src_lengths': input_dict['source_tensors'][1],
            'embedding_softmax_layer': self.embedding_softmax_layer,
            'encoder_input': inputs}
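
# The encoder above adds a fixed sinusoidal position signal to the embedded
# inputs via utils.get_position_encoding. For reference, below is a minimal
# NumPy sketch of the standard Transformer encoding it is assumed to follow;
# this is an illustrative re-implementation, not the repo's actual helper.
import numpy as np

def sinusoidal_position_encoding_sketch(length, hidden_size,
                                        min_timescale=1.0,
                                        max_timescale=1.0e4):
  """Returns a [length, hidden_size] array of concatenated sin/cos signals."""
  position = np.arange(length, dtype=np.float32)
  num_timescales = hidden_size // 2
  # Geometric progression of timescales, as in "Attention Is All You Need".
  log_increment = (np.log(max_timescale / min_timescale) /
                   max(num_timescales - 1, 1))
  inv_timescales = min_timescale * np.exp(
      np.arange(num_timescales, dtype=np.float32) * -log_increment)
  scaled_time = position[:, None] * inv_timescales[None, :]
  # First half of the channels carries sin, the second half cos.
  return np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
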
def _encode(self, input_dict):
  training = (self.mode == "train")

  if len(self.layers) == 0:
    # prepare encoder graph
    self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
        self.params["src_vocab_size"], self.params["hidden_size"],
        pad_vocab_to_eight=self.params.get('pad_embeddings_2_eight', False),
    )

    for _ in range(self.params['encoder_layers']):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          hidden_size=self.params["hidden_size"],
          num_heads=self.params["num_heads"],
          attention_dropout=self.params["attention_dropout"],
          train=training,
          regularizer=self.regularizer,
          batch_size=self.batch_size,
          num_feature=self.num_features)
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          hidden_size=self.params["hidden_size"],
          filter_size=self.params["filter_size"],
          relu_dropout=self.params["relu_dropout"],
          train=training,
          regularizer=self.regularizer)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, self.params, training),
          PrePostProcessingWrapper(feed_forward_network, self.params, training)
      ])

    # final normalization layer.
    print("Encoder:", self.norm_params["type"], self.mode)
    if self.norm_params["type"] == "batch_norm":
      self.output_normalization = Transformer_BatchNorm(
          training=training, params=self.norm_params)
    else:
      self.output_normalization = LayerNormalization(
          hidden_size=self.params["hidden_size"], params=self.norm_params)

  # actual encoder part
  with tf.name_scope("encode"):
    inputs, src_lengths = input_dict['source_tensors']

    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(inputs)

    if self.params["remove_padding"]:
      inputs_padding = utils.get_padding(inputs)
    else:
      inputs_padding = None

    inputs_attention_bias = utils.get_padding_bias(inputs)
    inputs_attention_bias = tf.transpose(inputs_attention_bias,
                                         [0, 1, 3, 2, 4])

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = utils.get_position_encoding(
          length, self.params["hidden_size"],
      )
      pos_encoding = tf.cast(x=pos_encoding, dtype=embedded_inputs.dtype)
      pos_encoding_exp = pos_encoding[None, :, None, :]
      encoder_inputs = embedded_inputs + pos_encoding_exp

    if self.mode == "train":
      encoder_inputs = tf.nn.dropout(
          encoder_inputs,
          keep_prob=1.0 - self.params["layer_postprocess_dropout"],
      )

    encoded = self._call(encoder_inputs, inputs_attention_bias, inputs_padding)

    return {
        'outputs': encoded,
        'inputs_attention_bias': inputs_attention_bias,
        'state': None,
        'src_lengths': src_lengths,
        'embedding_softmax_layer': self.embedding_softmax_layer,
        'encoder_input': inputs
    }
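
# Both encoder variants above derive an additive attention bias from the
# input ids: padded positions receive a large negative value so that softmax
# assigns them near-zero attention weight. Below is a minimal sketch of that
# idea, assuming pad id 0 and a plain [batch, length] ids tensor (the
# multi-feature variant above additionally transposes the bias to fit its
# 5-D layout); the repo's utils.get_padding_bias may differ in details.
import tensorflow as tf

_NEG_INF = -1e9

def padding_bias_sketch(ids, pad_id=0):
  """Returns a [batch, 1, 1, length] bias to add to attention logits."""
  is_pad = tf.cast(tf.equal(ids, pad_id), tf.float32)  # 1.0 at padded steps
  bias = is_pad * _NEG_INF                              # large negative value
  return tf.expand_dims(tf.expand_dims(bias, axis=1), axis=1)
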
def _encode(self, input_dict):
  inputs = input_dict['source_tensors'][0]
  source_length = input_dict['source_tensors'][1]

  with tf.variable_scope("encode"):
    # prepare encoder graph
    if len(self.layers) == 0:
      knum_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[0]
      kwidth_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[1]

      with tf.variable_scope("embedding"):
        self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
            vocab_size=self._src_vocab_size,
            hidden_size=self._src_emb_size,
            pad_vocab_to_eight=self._pad2eight,
            init_var=0.1,
            embed_scale=False,
            pad_sym=self._pad_sym,
            mask_paddings=True)

      with tf.variable_scope("pos_embedding"):
        self.position_embedding_layer = embedding_layer.EmbeddingSharedWeights(
            vocab_size=self.params.get("max_input_length", MAX_INPUT_LENGTH),
            hidden_size=self._src_emb_size,
            pad_vocab_to_eight=self._pad2eight,
            init_var=0.1,
            embed_scale=False,
            pad_sym=self._pad_sym,
            mask_paddings=True)

      # linear projection before cnn layers
      self.layers.append(
          ffn_wn_layer.FeedFowardNetworkNormalized(
              self._src_emb_size,
              knum_list[0],
              dropout=self.params["embedding_dropout_keep_prob"],
              var_scope_name="linear_mapping_before_cnn_layers",
              mode=self.mode,
              normalization_type=self.normalization_type,
              regularizer=self.regularizer,
              init_var=self.init_var))

      for i in range(len(knum_list)):
        in_dim = knum_list[i] if i == 0 else knum_list[i - 1]
        out_dim = knum_list[i]

        # linear projection is needed for residual connections if
        # input and output of a cnn layer do not match
        if in_dim != out_dim:
          linear_proj = ffn_wn_layer.FeedFowardNetworkNormalized(
              in_dim,
              out_dim,
              var_scope_name="linear_mapping_cnn_" + str(i + 1),
              dropout=1.0,
              mode=self.mode,
              normalization_type=self.normalization_type,
              regularizer=self.regularizer,
              init_var=self.init_var)
        else:
          linear_proj = None

        conv_layer = conv_wn_layer.Conv1DNetworkNormalized(
            in_dim,
            out_dim,
            kernel_width=kwidth_list[i],
            mode=self.mode,
            layer_id=i + 1,
            hidden_dropout=self.params["hidden_dropout_keep_prob"],
            conv_padding="SAME",
            decode_padding=False,
            activation=self.conv_activation,
            normalization_type=self.normalization_type,
            regularizer=self.regularizer,
            init_var=self.init_var)

        self.layers.append([linear_proj, conv_layer])

      # linear projection after cnn layers
      self.layers.append(
          ffn_wn_layer.FeedFowardNetworkNormalized(
              knum_list[-1],
              self._src_emb_size,
              dropout=1.0,
              var_scope_name="linear_mapping_after_cnn_layers",
              mode=self.mode,
              normalization_type=self.normalization_type,
              regularizer=self.regularizer,
              init_var=self.init_var))

    encoder_inputs = self.embedding_softmax_layer(inputs)
    inputs_attention_bias = get_padding_bias(inputs, res_rank=3,
                                             pad_sym=self._pad_sym)

    with tf.name_scope("add_pos_encoding"):
      pos_input = tf.range(0,
                           tf.shape(encoder_inputs)[1],
                           delta=1,
                           dtype=tf.int32,
                           name='range')
      pos_encoding = self.position_embedding_layer(pos_input)
      encoder_inputs = encoder_inputs + tf.cast(x=pos_encoding,
                                                dtype=encoder_inputs.dtype)

    if self.mode == "train":
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, self.params["embedding_dropout_keep_prob"])

    # mask the paddings in the input given to cnn layers
    inputs_padding = get_padding(inputs, self._pad_sym,
                                 dtype=encoder_inputs.dtype)
    padding_mask = tf.expand_dims(1 - inputs_padding, 2)
    encoder_inputs *= padding_mask

    outputs, outputs_b, final_state = self._call(encoder_inputs, padding_mask)

    return {
        'outputs': outputs,
        'outputs_b': outputs_b,
        'inputs_attention_bias_cs2s': inputs_attention_bias,
        'state': final_state,
        'src_lengths': source_length,  # should it include paddings or not?
        'embedding_softmax_layer': self.embedding_softmax_layer,
        'encoder_input': inputs
    }
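
# The ConvS2S encoder above configures its stack from a single
# "conv_nchannels_kwidth" parameter: a list of (num_channels, kernel_width)
# pairs, one per convolutional block, which the zip(*...) idiom splits into
# two parallel sequences. A small illustration with a hypothetical value
# (the pairs shown here are examples, not a recommended configuration):
conv_nchannels_kwidth = [(512, 3), (512, 3), (1024, 5), (1024, 5)]
knum_list = list(zip(*conv_nchannels_kwidth))[0]    # (512, 512, 1024, 1024)
kwidth_list = list(zip(*conv_nchannels_kwidth))[1]  # (3, 3, 5, 5)
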
def _decode(self, input_dict):
  targets = input_dict['target_tensors'][0] \
      if 'target_tensors' in input_dict else None

  encoder_outputs = input_dict['encoder_output']['outputs']
  encoder_outputs_b = input_dict['encoder_output'].get(
      'outputs_b', encoder_outputs)
  inputs_attention_bias = input_dict['encoder_output'].get(
      'inputs_attention_bias_cs2s', None)

  with tf.name_scope("decode"):
    # prepare decoder layers
    if len(self.layers) == 0:
      knum_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[0]
      kwidth_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[1]

      normalization_type = self.params.get("normalization_type", "weight_norm")
      conv_activation = self.params.get("conv_activation", gated_linear_units)

      # preparing embedding layers
      with tf.variable_scope("embedding"):
        if 'embedding_softmax_layer' in input_dict['encoder_output'] \
            and self.params['shared_embed']:
          self.embedding_softmax_layer = \
              input_dict['encoder_output']['embedding_softmax_layer']
        else:
          self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
              vocab_size=self._tgt_vocab_size,
              hidden_size=self._tgt_emb_size,
              pad_vocab_to_eight=self._pad2eight,
              init_var=0.1,
              embed_scale=False,
              pad_sym=self._pad_sym,
              mask_paddings=True)

      if self.params.get("pos_embed", True):
        with tf.variable_scope("pos_embedding"):
          if 'position_embedding_layer' in input_dict['encoder_output'] \
              and self.params['shared_embed']:
            self.position_embedding_layer = \
                input_dict['encoder_output']['position_embedding_layer']
          else:
            self.position_embedding_layer = embedding_layer.EmbeddingSharedWeights(
                vocab_size=self.params.get("max_input_length",
                                           MAX_INPUT_LENGTH),
                hidden_size=self._tgt_emb_size,
                pad_vocab_to_eight=self._pad2eight,
                init_var=0.1,
                embed_scale=False,
                pad_sym=self._pad_sym,
                mask_paddings=True)
      else:
        self.position_embedding_layer = None

      # linear projection before cnn layers
      self.layers.append(
          ffn_wn_layer.FeedFowardNetworkNormalized(
              self._tgt_emb_size,
              knum_list[0],
              dropout=self.params["embedding_dropout_keep_prob"],
              var_scope_name="linear_mapping_before_cnn_layers",
              mode=self.mode,
              normalization_type=normalization_type))

      for i in range(self.params['decoder_layers']):
        in_dim = knum_list[i] if i == 0 else knum_list[i - 1]
        out_dim = knum_list[i]

        # linear projection is needed for residual connections if
        # input and output of a cnn layer do not match
        if in_dim != out_dim:
          linear_proj = ffn_wn_layer.FeedFowardNetworkNormalized(
              in_dim,
              out_dim,
              var_scope_name="linear_mapping_cnn_" + str(i + 1),
              dropout=1.0,
              mode=self.mode,
              normalization_type=normalization_type)
        else:
          linear_proj = None

        conv_layer = conv_wn_layer.Conv1DNetworkNormalized(
            in_dim,
            out_dim,
            kernel_width=kwidth_list[i],
            mode=self.mode,
            layer_id=i + 1,
            hidden_dropout=self.params["hidden_dropout_keep_prob"],
            conv_padding="VALID",
            decode_padding=True,
            activation=conv_activation,
            normalization_type=normalization_type)

        att_layer = attention_wn_layer.AttentionLayerNormalized(
            out_dim,
            embed_size=self._tgt_emb_size,
            layer_id=i + 1,
            add_res=True,
            mode=self.mode)

        self.layers.append([linear_proj, conv_layer, att_layer])

      # linear projection after cnn layers
      self.layers.append(
          ffn_wn_layer.FeedFowardNetworkNormalized(
              knum_list[self.params['decoder_layers'] - 1],
              self.params.get("out_emb_size", self._tgt_emb_size),
              dropout=1.0,
              var_scope_name="linear_mapping_after_cnn_layers",
              mode=self.mode,
              normalization_type=normalization_type))

      if not self.params['shared_embed']:
        self.layers.append(
            ffn_wn_layer.FeedFowardNetworkNormalized(
                self.params.get("out_emb_size", self._tgt_emb_size),
                self._tgt_vocab_size,
                dropout=self.params["out_dropout_keep_prob"],
                var_scope_name="linear_mapping_to_vocabspace",
                mode=self.mode,
                normalization_type=normalization_type))
      else:
        # if embedding is shared,
        # the shared embedding is used as the final linear projection
        # to vocab space
        self.layers.append(None)

    if targets is None:
      return self.predict(encoder_outputs, encoder_outputs_b,
                          inputs_attention_bias)
    else:
      logits = self.decode_pass(targets, encoder_outputs, encoder_outputs_b,
                                inputs_attention_bias)

    return {
        "logits": logits,
        "outputs": [tf.argmax(logits, axis=-1)],
        "final_state": None,
        "final_sequence_lengths": None
    }
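
# The decoder above defaults its conv_activation to gated_linear_units. A
# gated linear unit splits the convolution output in half along the channel
# axis and gates one half with the sigmoid of the other, as in the ConvS2S
# paper. A minimal sketch, assuming the last axis holds 2 * out_dim channels;
# this is an illustrative re-implementation, not necessarily identical to
# the gated_linear_units helper referenced above.
import tensorflow as tf

def gated_linear_units_sketch(x):
  """x: [..., 2 * channels] -> [..., channels] computed as A * sigmoid(B)."""
  a, b = tf.split(x, num_or_size_splits=2, axis=-1)
  return a * tf.sigmoid(b)
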