def BuildNetwork(self, learningRate):
    """Assemble the full graph: CNN encoder, attention pooling, LSTM decoder.

    Args:
        learningRate: step size handed to the Adam optimizer.

    Side effects:
        Creates the four input placeholders on ``self``, fills
        ``self.parameters`` with every intermediate tensor (string-keyed),
        and sets ``self.train`` to the optimizer's minimize op.

    NOTE(review): the original indentation was lost; the 'Encoder' name_scope
    is assumed to cover only the conv/attention stack. This affects op-name
    prefixes only, not the math — confirm against checkpoints if names matter.
    """
    # Acoustic input: fixed 1000 frames x 40 features per utterance.
    self.dataInput = tensorflow.placeholder(
        dtype=tensorflow.float32, shape=[self.batchSize, 1000, 40], name='dataInput')
    # Valid (unpadded) frame count of each sample.
    self.dataSeqInput = tensorflow.placeholder(
        dtype=tensorflow.int32, shape=[self.batchSize], name='dataSeqInput')
    # Target token ids; time dimension varies per batch.
    self.labelInput = tensorflow.placeholder(
        dtype=tensorflow.int32, shape=[self.batchSize, None], name='labelInput')
    # Valid length of each label sequence.
    self.labelSeqInput = tensorflow.placeholder(
        dtype=tensorflow.int32, shape=[self.batchSize], name='labelSeqInput')

    # Token embedding table: 50-symbol vocabulary; embedding width matches the
    # decoder LSTM size (2 * hiddenNoduleNumbers).
    self.parameters['EmbeddingDictionary'] = tensorflow.Variable(
        initial_value=tensorflow.truncated_normal([50, 2 * self.hiddenNoduleNumbers]),
        dtype=tensorflow.float32, name='EmbeddingDictionary')

    with tensorflow.name_scope('Encoder'):
        # 3x3 conv stack; max-pool strides [2, 1] halve the time axis while
        # keeping the frequency axis.
        self.parameters['Layer1st_Conv'] = tensorflow.layers.conv2d(
            inputs=self.dataInput[:, :, :, tensorflow.newaxis], filters=8,
            kernel_size=[3, 3], strides=[1, 1], padding='SAME', name='Layer1st_Conv')
        self.parameters['Layer1st_MaxPooling'] = tensorflow.layers.max_pooling2d(
            inputs=self.parameters['Layer1st_Conv'], pool_size=[3, 3],
            strides=[2, 1], padding='SAME', name='Layer1st_MaxPooling')
        self.parameters['Layer2nd_Conv'] = tensorflow.layers.conv2d(
            inputs=self.parameters['Layer1st_MaxPooling'], filters=16,
            kernel_size=[3, 3], strides=[1, 1], padding='SAME', name='Layer2nd_Conv')
        self.parameters['Layer2nd_MaxPooling'] = tensorflow.layers.max_pooling2d(
            inputs=self.parameters['Layer2nd_Conv'], pool_size=[3, 3],
            strides=[2, 1], padding='SAME', name='Layer2nd_MaxPooling')
        self.parameters['Layer3rd_Conv'] = tensorflow.layers.conv2d(
            inputs=self.parameters['Layer2nd_MaxPooling'], filters=16,
            kernel_size=[3, 3], strides=[1, 1], padding='SAME', name='Layer3rd_Conv')

        ###############################################################################
        # Attention pooling over the conv feature map; the helper returns a dict
        # whose 'FinalResult' entry is one summary vector per sample.
        self.parameters['AttentionList'] = CNN_StandardAttention_Initializer(
            inputData=self.parameters['Layer3rd_Conv'], inputSeq=self.dataSeqInput,
            attentionScope=None, hiddenNoduleNumber=16, scopeName='CSA')
        self.parameters['AttentionResult'] = self.parameters['AttentionList']['FinalResult']

    ###############################################################################
    # Project the attended encoder summary into the decoder's initial LSTM state.
    self.parameters['DecoderInitialState_C'] = tensorflow.layers.dense(
        inputs=self.parameters['AttentionResult'], units=2 * self.hiddenNoduleNumbers,
        activation=None, name='DecoderInitialState_C')
    self.parameters['DecoderInitialState_H'] = tensorflow.layers.dense(
        inputs=self.parameters['AttentionResult'], units=2 * self.hiddenNoduleNumbers,
        activation=None, name='DecoderInitialState_H')
    self.parameters['DecoderInitialState'] = rnn.LSTMStateTuple(
        c=self.parameters['DecoderInitialState_C'],
        h=self.parameters['DecoderInitialState_H'])

    ###############################################################################
    # Greedy decoding: every sequence starts with token id 40 and stops at id 0.
    # NOTE(review): a GreedyEmbeddingHelper is used even though labelInput exists,
    # so ground-truth tokens are never fed back (no teacher forcing) — confirm
    # this is intended during training.
    self.parameters['Helper'] = seq2seq.GreedyEmbeddingHelper(
        embedding=self.parameters['EmbeddingDictionary'],
        start_tokens=tensorflow.ones(self.batchSize, dtype=tensorflow.int32) * 40,
        end_token=0)
    self.parameters['Decoder_Cell'] = rnn.LSTMCell(num_units=2 * self.hiddenNoduleNumbers)
    self.parameters['Decoder'] = seq2seq.BasicDecoder(
        cell=self.parameters['Decoder_Cell'], helper=self.parameters['Helper'],
        initial_state=self.parameters['DecoderInitialState'])
    # Decode at most max(labelSeqInput) steps so logits can line up with labels.
    # NOTE(review): if the greedy helper emits end_token early, the logits' time
    # dimension can be shorter than labelInput's — verify upstream padding.
    self.parameters['DecoderOutput'], self.parameters['DecoderFinalState'], self.parameters['DecoderSeqLen'] = \
        seq2seq.dynamic_decode(
            decoder=self.parameters['Decoder'], output_time_major=False,
            maximum_iterations=tensorflow.reduce_max(self.labelSeqInput))
    # DecoderOutput[0] is the rnn_output field of the BasicDecoderOutput tuple.
    self.parameters['Logits'] = tensorflow.layers.dense(
        inputs=self.parameters['DecoderOutput'][0], units=50, activation=None,
        name='Logits')

    # self.parameters['Mask'] = tensorflow.to_float(tensorflow.not_equal(self.labelInput, 0))
    # NOTE(review): the loss is an unmasked mean — padded label positions also
    # contribute (the Mask line above was left disabled); confirm intended.
    self.parameters['Loss'] = tensorflow.reduce_mean(
        tensorflow.nn.softmax_cross_entropy_with_logits_v2(
            labels=tensorflow.one_hot(self.labelInput, depth=50, dtype=tensorflow.float32),
            logits=self.parameters['Logits']), name='Loss')
    self.train = tensorflow.train.AdamOptimizer(
        learning_rate=learningRate).minimize(self.parameters['Loss'])
def decoder(self, memory):
    """
    Implementation of the Tacotron decoder network.

    Arguments:
        memory (tf.Tensor):
            The output states of the encoder RNN concatenated over time. Its shape
            is expected to be shape=(B, T_sent, 2 * encoder.n_gru_units) with B
            being the batch size, T_sent being the number of tokens in the sentence
            including the EOS token.

    Returns:
        tf.tensor:
            Generated reduced Mel. spectrogram. The shape is
            shape=(B, T_spec // r, n_mels * r), with B being the batch size, T_spec
            being the number of frames in the spectrogram and r being the reduction
            factor.

    Side effects:
        Stores the stacked attention alignment history on
        ``self.alignment_history`` for the summary image.
    """
    with tf.variable_scope('decoder2'):
        # Query the current batch size.
        batch_size = tf.shape(memory)[0]

        # Query the number of layers for the decoder RNN.
        n_decoder_layers = self.hparams.decoder.n_gru_layers
        # Query the number of units for the decoder cells.
        n_decoder_units = self.hparams.decoder.n_decoder_gru_units
        # Query the number of units for the attention cell.
        n_attention_units = self.hparams.decoder.n_attention_units

        # General attention mechanism parameters that are the same for all mechanisms.
        mechanism_params = {
            'num_units': n_attention_units,
            'memory': memory,
        }

        if model_params.attention.mechanism == LocalLuongAttention:
            # Update the parameters with additional parameters for the local
            # attention case.
            mechanism_params.update({
                'attention_mode': model_params.attention.luong_local_mode,
                'score_mode': model_params.attention.luong_local_score,
                'd': model_params.attention.luong_local_window_D,
                'force_gaussian': model_params.attention.luong_force_gaussian,
                'const_batch_size': 16
            })

        # Create the attention mechanism.
        attention_mechanism = model_params.attention.mechanism(
            **mechanism_params)

        # Create the attention RNN cell (CuDNN-compatible variant when forced).
        if model_params.force_cudnn:
            attention_cell = tfcrnn.CudnnCompatibleGRUCell(
                num_units=n_attention_units)
        else:
            attention_cell = tf.nn.rnn_cell.GRUCell(
                num_units=n_attention_units)

        # Apply the pre-net to each decoder input as show in [1], figure 1.
        attention_cell = PrenetWrapper(attention_cell,
                                       self.hparams.decoder.pre_net_layers,
                                       self.is_training())

        # Select the attention wrapper needed for the current attention mechanism.
        if model_params.attention.mechanism == LocalLuongAttention:
            wrapper = AdvancedAttentionWrapper
        else:
            wrapper = tfc.seq2seq.AttentionWrapper

        # Connect the attention cell with the attention mechanism.
        wrapped_attention_cell = wrapper(
            cell=attention_cell,
            attention_mechanism=attention_mechanism,
            attention_layer_size=n_attention_units,
            alignment_history=True,
            output_attention=True,
            initial_cell_state=None
        )  # => (B, T_sent, n_attention_units) = (B, T_sent, 256)

        # Stack several GRU cells and apply a residual connection after each cell.
        # Before the input reaches the decoder RNN it passes through the attention cell.
        cells = [wrapped_attention_cell]
        for i in range(n_decoder_layers):
            # Create a decoder GRU cell.
            if model_params.force_cudnn:
                # => (B, T_spec, n_decoder_units) = (B, T_spec, 256)
                cell = tfcrnn.CudnnCompatibleGRUCell(
                    num_units=n_decoder_units)
            else:
                # => (B, T_spec, n_decoder_units) = (B, T_spec, 256)
                cell = tf.nn.rnn_cell.GRUCell(num_units=n_decoder_units)

            # => (B, T_spec, n_decoder_units) = (B, T_spec, 256)
            cell = tf.nn.rnn_cell.ResidualWrapper(cell)
            cells.append(cell)

        # => (B, T_spec, n_decoder_units) = (B, T_spec, 256)
        decoder_cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)

        # Project the final cells output to the decoder target size.
        # => (B, T_spec, target_size * reduction) = (B, T_spec, 80 * reduction)
        output_cell = tfc.rnn.OutputProjectionWrapper(
            cell=decoder_cell,
            output_size=self.hparams.decoder.target_size * self.hparams.reduction,
            # activation=tf.nn.sigmoid
        )

        decoder_initial_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        if self.is_training():
            # During training we do not stop decoding manually. The decoder
            # automatically decodes as many time steps as are contained in the
            # ground truth data.
            maximum_iterations = None

            # Unfold the reduced spectrogram in order to grab the r'th ground
            # truth frames.
            mel_targets = tf.reshape(self.inp_mel_spec,
                                     [batch_size, -1, self.hparams.n_mels])

            # Create a custom training helper for feeding ground truth frames
            # during training.
            helper = TacotronTrainingHelper(
                batch_size=batch_size,
                outputs=mel_targets,
                input_size=self.hparams.decoder.target_size,
                reduction_factor=self.hparams.reduction,
            )
        elif self._mode == Mode.EVAL:
            # During evaluation we stop decoding after the same number of frames
            # the ground truth has.
            maximum_iterations = tf.shape(self.inp_mel_spec)[1]

            # Create a custom inference helper that handles proper evaluation
            # data feeding.
            helper = TacotronInferenceHelper(
                batch_size=batch_size,
                input_size=self.hparams.decoder.target_size)
        else:
            # During inference we stop decoding after `maximum_iterations` frames.
            maximum_iterations = self.hparams.decoder.maximum_iterations // self.hparams.reduction

            # Create a custom inference helper that handles proper inference
            # data feeding.
            helper = TacotronInferenceHelper(
                batch_size=batch_size,
                input_size=self.hparams.decoder.target_size)

        decoder = seq2seq.BasicDecoder(cell=output_cell,
                                       helper=helper,
                                       initial_state=decoder_initial_state)

        # Start decoding.
        decoder_outputs, final_state, final_sequence_lengths = seq2seq.dynamic_decode(
            decoder=decoder,
            output_time_major=False,
            impute_finished=False,
            maximum_iterations=maximum_iterations)

        # decoder_outputs => type=BasicDecoderOutput, (rnn_output, _)
        # final_state => type=AttentionWrapperState, (attention_wrapper_state, _, _)
        # final_sequence_lengths.shape = (B)

        # Create an attention alignment summary image.
        self.alignment_history = final_state[0].alignment_history.stack()

        # shape => (B, T_spec // r, n_mels * r)
        return decoder_outputs.rnn_output
def init_decoder_variable(self):
    """Build the decoder sub-graph for the current mode.

    In 'train' mode: teacher-forced decoding plus the masked sequence loss,
    a loss summary, and the optimizer graph (via ``self.init_optimizer``).
    In 'decode' mode: greedy or beam-search inference, storing predictions on
    ``self.decoder_pred_decode``.
    """
    # Building decoder_cell and decoder_initial_state
    self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell()

    # Initialize decoder embeddings to have variance=1.
    sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
    initializer = tf.random_uniform_initializer(-sqrt3, sqrt3, dtype=self.dtype)

    self.decoder_embeddings = tf.get_variable(
        name='embedding',
        shape=[self.num_decoder_symbols, self.embedding_size],
        initializer=initializer, dtype=self.dtype)

    # Input projection layer to feed embedded inputs to the cell
    # ** Essential when use_residual=True to match input/output dims
    input_layer = Dense(self.hidden_units, dtype=self.dtype, name='input_projection')

    # Output projection layer to convert cell_outputs to logits
    output_layer = Dense(self.num_decoder_symbols, name='output_projection')

    if self.mode == 'train':
        # decoder_inputs_embedded: [batch_size, max_time_step + 1, embedding_size]
        self.decoder_inputs_embedded = tf.nn.embedding_lookup(
            params=self.decoder_embeddings, ids=self.decoder_inputs_train)

        # Embedded inputs having gone through input projection layer
        self.decoder_inputs_embedded = input_layer(self.decoder_inputs_embedded)

        # Helper to feed inputs for training: read inputs from dense ground
        # truth vectors
        training_helper = seq2seq.TrainingHelper(
            inputs=self.decoder_inputs_embedded,
            sequence_length=self.decoder_inputs_length_train,
            time_major=False,
            name='training_helper')

        training_decoder = seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            helper=training_helper,
            initial_state=self.decoder_initial_state,
            output_layer=output_layer)
        # output_layer=None)

        # Maximum decoder time_steps in current batch
        max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train)

        # decoder_outputs_train: BasicDecoderOutput
        #     namedtuple(rnn_outputs, sample_id)
        # decoder_outputs_train.rnn_output:
        #     [batch_size, max_time_step + 1, num_decoder_symbols]
        #     if output_time_major=False
        #     [max_time_step + 1, batch_size, num_decoder_symbols]
        #     if output_time_major=True
        # decoder_outputs_train.sample_id: [batch_size], tf.int32
        (self.decoder_outputs_train, self.decoder_last_state_train,
         self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
             decoder=training_decoder,
             output_time_major=False,
             impute_finished=True,
             maximum_iterations=max_decoder_length))

        # More efficient to do the projection on the batch-time-concatenated tensor
        # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols]
        # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output)
        self.decoder_logits_train = tf.identity(
            self.decoder_outputs_train.rnn_output)

        # Use argmax to extract decoder symbols to emit
        self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                            axis=-1,
                                            name='decoder_pred_train')

        # masks: masking for valid and padded time steps,
        # [batch_size, max_time_step + 1]
        masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train,
                                 maxlen=max_decoder_length,
                                 dtype=self.dtype, name='masks')

        # Computes per word average cross-entropy over a batch
        # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits'
        # by default
        self.loss = seq2seq.sequence_loss(
            logits=self.decoder_logits_train,
            targets=self.decoder_targets_train,
            weights=masks,
            average_across_timesteps=True,
            average_across_batch=True,
        )

        # Training summary for the current batch_loss
        tf.summary.scalar('loss', self.loss)

        # Contruct graphs for minimizing loss
        self.init_optimizer()

    elif self.mode == 'decode':
        # Start_tokens: [batch_size,] `int32` vector
        start_tokens = tf.ones([
            self.batch_size,
        ], tf.int32) * self.start_token
        end_token = self.end_token

        def embed_and_input_proj(inputs):
            # Embed then project, matching the training-time input pipeline.
            return input_layer(
                tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

        if not self.use_beamsearch_decode:
            # Helper to feed inputs for greedy decoding: uses the argmax of
            # the output
            decoding_helper = seq2seq.GreedyEmbeddingHelper(
                start_tokens=start_tokens,
                end_token=end_token,
                embedding=embed_and_input_proj)
            # Basic decoder performs greedy decoding at each time step
            print("building greedy decoder..")
            inference_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=decoding_helper,
                initial_state=self.decoder_initial_state,
                output_layer=output_layer)
        else:
            # Beamsearch is used to approximately find the most likely
            # translation
            print("building beamsearch decoder..")
            inference_decoder = beam_search_decoder.BeamSearchDecoder(
                cell=self.decoder_cell,
                embedding=embed_and_input_proj,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=self.decoder_initial_state,
                beam_width=self.beam_width,
                output_layer=output_layer,
            )

        (self.decoder_outputs_decode, self.decoder_last_state_decode,
         self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode(
             decoder=inference_decoder,
             output_time_major=False,
             # impute_finished=True,	# error occurs
             maximum_iterations=self.max_decode_step))

        if not self.use_beamsearch_decode:
            # decoder_outputs_decode.sample_id: [batch_size, max_time_step]
            # Or use argmax to find decoder symbols to emit:
            # self.decoder_pred_decode = tf.argmax(
            #     self.decoder_outputs_decode.rnn_output,
            #     axis=-1, name='decoder_pred_decode')

            # Here, we use expand_dims to be compatible with the result of the
            # beamsearch decoder
            # decoder_pred_decode: [batch_size, max_time_step, 1]
            # (output_major=False)
            self.decoder_pred_decode = tf.expand_dims(
                self.decoder_outputs_decode.sample_id, -1)
        else:
            # Use beam search to approximately find the most likely translation
            # decoder_pred_decode: [batch_size, max_time_step, beam_width]
            # (output_major=False)
            self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
def create_model(model, labels, decoder_inputs, batch_size, model_type="decode",
                 sep_positions=None):
    """Creates a classification model.

    Args:
        model: the BERT model from modeling.py
        labels: ground truth paragraph order
        decoder_inputs: the input to the decoder if used
        batch_size: the batch size
        model_type: one of decode, pooled, attn
        sep_positions: (optional) for "pooled" indecies of SEP tokens

    Returns:
        tuple of (loss, per_example_loss, logits, probabilities) for model
    """
    output_layer = model.get_pooled_output()
    hidden_size = output_layer.shape[-1].value
    tpu_batch_size = tf.shape(output_layer)[0]
    num_labels = 5  # GOOGLE-INTERNAL TODO(daniter) this shouldn't be hardcoded

    with tf.variable_scope("paragraph_reconstruct"):
        if model_type == "decode":
            # LSTM decoder seeded with the pooled [CLS] vector as both c and h.
            lstm_cell = tf.nn.rnn_cell.LSTMCell(
                num_units=hidden_size, use_peepholes=True, state_is_tuple=True)

            def sample_fn(x):
                # Feed back the argmax label as the next decoder input (as a float).
                return tf.to_float(tf.reshape(tf.argmax(x, axis=-1), (-1, 1)))

            helper = FixedSizeInferenceHelper(
                sample_fn=sample_fn,
                sample_shape=[1],
                sample_dtype=tf.float32,
                start_inputs=decoder_inputs[:, 0],
                end_fn=None)

            # Decoder
            project_layer = tf.layers.Dense(
                num_labels, use_bias=False, name="output_projection")
            my_decoder = contrib_seq2seq.BasicDecoder(
                lstm_cell, helper,
                tf.nn.rnn_cell.LSTMStateTuple(output_layer, output_layer),
                output_layer=project_layer)

            # Dynamic decoding: exactly num_labels (5) steps, one per paragraph.
            outputs, _, _ = contrib_seq2seq.dynamic_decode(
                my_decoder, swap_memory=True, scope="paragraph_reconstruct",
                maximum_iterations=5)
            logits = outputs.rnn_output

            cross_ent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=logits)
            per_example_loss = cross_ent
            loss = tf.reduce_sum(cross_ent) / tf.to_float(batch_size)
            probabilities = tf.nn.softmax(logits, axis=-1)
            # GOOGLE-INTERAL: TODO(daniter) currently neither of these actually train
        elif model_type == "pooled":
            # Classify each paragraph from its [SEP] token embedding.
            token_embeddings = model.get_sequence_output()
            # sep positions come out batch by batch so we need to add the batch
            # index; we do that explicitly here since we don't know the batch
            # size in the record decoder
            batch_idx = tf.range(tpu_batch_size)
            batch_idx = tf.reshape(batch_idx, [tpu_batch_size, 1])
            batch_idx = tf.tile(batch_idx, [1, 5])  # double check
            batch_idx = tf.reshape(batch_idx, [tpu_batch_size, 5, 1])
            # batch_idx = tf.Print(batch_idx, [batch_idx],
            #                      message="batch_idx", summarize=999999)
            sep_positions = tf.concat([batch_idx, sep_positions], axis=2)
            # sep_positions = tf.Print(sep_positions, [sep_positions],
            #                          message="sep_positions", summarize=999999)
            sep_vecs = tf.gather_nd(token_embeddings, sep_positions)
            sep_vecs = tf.reshape(sep_vecs, [tpu_batch_size, 5, hidden_size])
            # sep_vecs = tf.Print(sep_vecs, [sep_vecs], message="sep_vecs",
            #                     summarize=999999)
            logits = tf.layers.dense(
                inputs=sep_vecs, units=num_labels, name="output_projection")
            # logits = tf.Print(logits, [logits], message="logits", summarize=999999)
            cross_ent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=logits)
            per_example_loss = cross_ent
            loss = tf.reduce_sum(cross_ent) / tf.to_float(batch_size)
            probabilities = tf.nn.softmax(logits, axis=-1)
        elif model_type == "attn":
            # Attend from learned position embeddings over the token sequence.
            # change size to match sequence embedding size
            input_consts = tf.constant([0, 1, 2, 3, 4])
            position_encoding = tf.broadcast_to(input_consts, [tpu_batch_size, 5])
            # position_encoding = tf.to_float(
            #     tf.reshape(position_encoding, (-1, 5, 1)))
            token_type_table = tf.get_variable(
                name="attention_embedding",
                shape=[5, 512],  # don't hardcode
                initializer=tf.truncated_normal_initializer(stddev=0.02))
            # This vocab will be small so we always do one-hot here, since it is
            # always faster for a small vocabulary.
            flat_token_type_ids = tf.reshape(position_encoding, [-1])
            one_hot_ids = tf.one_hot(flat_token_type_ids, depth=5)
            token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
            token_type_embeddings = tf.reshape(token_type_embeddings,
                                               [tpu_batch_size, 5, 512])

            token_embeddings = model.get_sequence_output()
            attn = modeling.attention_layer(token_type_embeddings, token_embeddings)
            attn = tf.reshape(attn, (-1, 5, 512))  # head size
            logits = tf.layers.dense(
                inputs=attn, units=num_labels, name="output_projection")
            cross_ent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=logits)
            per_example_loss = cross_ent
            loss = tf.reduce_sum(cross_ent) / tf.to_float(batch_size)
            probabilities = tf.nn.softmax(logits, axis=-1)

    return (loss, per_example_loss, logits, probabilities)
def _build_decoder_action(model, dialogue_state, hparams, start_token, end_token,
                          output_layer):
    """Build the decoder for action states.

    Constructs a teacher-forced training decoder over the right-shifted action
    sequence, plus an inference decoder (beam search in PREDICT mode with a
    positive beam width, sampling in self-play modes, greedy otherwise).

    Returns:
        (logits_train, logits_infer, sample_id_train, sample_id_infer)
    """
    data_iterator = model.iterator

    # Resolve textual start/end tokens to vocabulary ids.
    go_id = tf.cast(
        model.vocab_table.lookup(tf.constant(start_token)), tf.int32)
    eos_id = tf.cast(
        model.vocab_table.lookup(tf.constant(end_token)), tf.int32)
    go_tokens = tf.fill([model.batch_size], go_id)
    # kb is not used again

    ## Decoder.
    with tf.variable_scope("action_decoder") as decoder_scope:
        # The cell is initialized with the last layer of the last hidden state.
        cell, decoder_initial_state = _build_action_decoder_cell(
            model, hparams, dialogue_state, model.global_gpu_num)
        model.global_gpu_num += 1

        # --- Training branch (also used for eval / mutable train). ---
        # Shift the action sequence right: prepend a zero step and drop the
        # final one, so the input at step t predicts the action at step t.
        shifted_action = tf.pad(
            data_iterator.action, tf.constant([[0, 0], [1, 0]]),
            "CONSTANT", constant_values=0)[:, :-1]
        train_inputs = tf.nn.embedding_lookup(model.embedding_decoder,
                                              shifted_action)

        train_helper = seq2seq.TrainingHelper(
            train_inputs, data_iterator.action_len, time_major=False)
        train_decoder = seq2seq.BasicDecoder(
            cell, train_helper, decoder_initial_state, output_layer)
        train_outputs, _, _ = seq2seq.dynamic_decode(
            train_decoder,
            output_time_major=False,
            swap_memory=True,
            scope=decoder_scope)
        sample_id_train = train_outputs.sample_id
        logits_train = train_outputs.rnn_output

        # --- Inference branch. ---
        beam_width = hparams.beam_width
        use_beam = model.mode == tf.estimator.ModeKeys.PREDICT and beam_width > 0
        if use_beam:
            infer_decoder = seq2seq.BeamSearchDecoder(
                cell=cell,
                embedding=model.embedding_decoder,
                start_tokens=go_tokens,
                end_token=eos_id,
                initial_state=decoder_initial_state,
                beam_width=beam_width,
                output_layer=output_layer,
                length_penalty_weight=hparams.length_penalty_weight)
        else:
            if model.mode in dialogue_utils.self_play_modes:
                # Self-play samples from the output distribution.
                infer_helper = seq2seq.SampleEmbeddingHelper(
                    model.embedding_decoder, go_tokens, eos_id)
            else:
                infer_helper = seq2seq.GreedyEmbeddingHelper(
                    model.embedding_decoder, go_tokens, eos_id)
            infer_decoder = seq2seq.BasicDecoder(
                cell, infer_helper, decoder_initial_state,
                output_layer=output_layer  # applied per timestep
            )
        infer_outputs, _, _ = seq2seq.dynamic_decode(
            infer_decoder,
            maximum_iterations=hparams.len_action,
            output_time_major=False,
            swap_memory=True,
            scope=decoder_scope)

        if use_beam:
            # Beam search exposes predicted_ids only; per-step logits are
            # unavailable, so emit a no-op in their place.
            logits_infer = tf.no_op()
            sample_id_infer = infer_outputs.predicted_ids
        else:
            logits_infer = infer_outputs.rnn_output
            sample_id_infer = infer_outputs.sample_id

    return logits_train, logits_infer, sample_id_train, sample_id_infer
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder.

    In 'train' mode this builds the teacher-forced decoder plus the masked
    (and reward-weighted) sequence losses; in 'decode' mode it builds the
    greedy or beam-search inference decoder.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        (self.decoder_cell,
         self.decoder_initial_state) = self.build_decoder_cell(
             encoder_outputs, encoder_state)

        # Build the decoder embedding table.
        with tf.device(_get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrained_embedding:
                # Placeholder + assign op so pretrained vectors can be fed in
                # later via decoder_embeddings_init.
                self.decoder_embeddings = tf.Variable(tf.constant(
                    0.0,
                    shape=(self.target_vocab_size, self.embedding_size)),
                    trainable=True,
                    name='embeddings')
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.target_vocab_size, self.embedding_size))
                self.decoder_embeddings_init = self.decoder_embeddings.assign(
                    self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)

        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection')

        if self.mode == 'train':
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings, ids=self.decoder_inputs_train)
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))

            # TrainingHelper feeds the ground-truth (or predicted) token as
            # the next-step input.
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper')

            # The output_layer is deliberately NOT applied here: projecting at
            # every time step is slow, so one batched projection is done on the
            # full output below. This trick requires passing the scope argument
            # to dynamic_decode.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state
                # output_layer=self.decoder_output_projection
                # output projection layer mapping rnn_size to vocab_size
            )

            # Maximum number of decoder time steps in the current batch.
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)

            outputs, self.final_state, _ = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                # When True, copies the last state and zeroes the outputs past
                # the end of each sequence; more stable (correct final
                # state/outputs, last finished step ignored in backprop) but
                # slower.
                impute_finished=True,
                # Maximum decoding steps. For training this is usually
                # decoder_inputs_length; for inference set any desired cap —
                # decoding stops at <eos> or at this limit.
                maximum_iterations=max_decoder_length,
                # Number of loop iterations run in parallel.
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)

            # Single batched projection over all time steps (see note above).
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output)
            # Mask marking valid (non-padding) time steps.
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks')

            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(
                    decoder_logits_train, (1, 0, 2))

            self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')
            self.train_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.decoder_inputs,      # ground truth y
                logits=decoder_logits_train      # predicted y_
            )

            self.masks_rewards = self.masks * self.rewards

            # Reward-weighted sequence loss.
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,  # [batch_size, sequence_length, num_decoder_symbols]
                targets=self.decoder_inputs,  # [batch_size, sequence_length]; not one-hot
                weights=self.masks_rewards,   # mask: filters padding out of the loss
                average_across_timesteps=True,
                average_across_batch=True)

            # Plain sequence loss.
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,  # [batch_size, sequence_length, num_decoder_symbols]
                targets=self.decoder_inputs,  # [batch_size, sequence_length]; not one-hot
                weights=self.masks,           # mask: filters padding out of the loss
                average_across_timesteps=True,
                average_across_batch=True)

            self.loss_add = self.loss + self.add_loss

        elif self.mode == 'decode':
            start_tokens = tf.tile([WordSequence.START], [self.batch_size])
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                # Embedding lookup used by the inference helper / decoder.
                return tf.nn.embedding_lookup(self.decoder_embeddings, inputs)

            if not self.use_beamsearch_decode:
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection)
            else:
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection)

            if self.max_decode_step is not None:
                max_decoder_step = self.max_decode_step
            else:
                # Heuristic cap: four times the longest encoder input.
                max_decoder_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4)

            self.decoder_outputs_decode, self.final_state, _ = seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=self.time_major,
                maximum_iterations=max_decoder_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)

            if not self.use_beamsearch_decode:
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            else:
                self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                # Reorder predicted_ids from [batch, time, beam] to
                # [batch, beam, time].
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode, perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def decode(self, encoder_outputs, encoder_state, source_sequence_length):
    """Build the Bahdanau-attention decoder: training branch plus greedy or
    beam-search inference branch.

    Args:
        encoder_outputs: encoder hidden states used as attention memory.
        encoder_state: final encoder state used to initialise the decoder.
        source_sequence_length: valid length of each encoder input.
    """
    with tf.variable_scope("Decoder") as scope:
        beam_width = self.beam_width
        decoder_type = self.decoder_type
        seq_max_len = self.seq_max_len
        batch_size = tf.shape(encoder_outputs)[0]
        if self.path_embed_method == "lstm":
            self.decoder_cell = self._build_decode_cell()

        if self.decoder_type == "beam" and self.beam_width > 0:
            # Beam search needs memory, lengths and state tiled beam_width
            # times along the batch dimension.
            beam_memory = seq2seq.tile_batch(
                encoder_outputs, multiplier=self.beam_width)
            beam_source_sequence_length = seq2seq.tile_batch(
                source_sequence_length, multiplier=self.beam_width)
            beam_encoder_state = seq2seq.tile_batch(
                encoder_state, multiplier=self.beam_width)
            beam_batch_size = batch_size * self.beam_width

            attention_mechanism = seq2seq.BahdanauAttention(
                self.hidden_layer_dim, beam_memory,
                memory_sequence_length=beam_source_sequence_length)
            self.beam_decoder_cell = seq2seq.AttentionWrapper(
                self.decoder_cell, attention_mechanism,
                attention_layer_size=self.hidden_layer_dim)
            self.beam_decoder_initial_state = self.beam_decoder_cell.zero_state(
                beam_batch_size, tf.float32).clone(cell_state=beam_encoder_state)

        # Non-tiled attention cell, always built (used by training and greedy
        # inference). The rebindings below are no-ops kept from the original.
        memory = encoder_outputs
        source_sequence_length = source_sequence_length
        encoder_state = encoder_state

        attention_mechanism = seq2seq.BahdanauAttention(
            self.hidden_layer_dim, memory,
            memory_sequence_length=source_sequence_length)
        self.decoder_cell = seq2seq.AttentionWrapper(
            self.decoder_cell, attention_mechanism,
            attention_layer_size=self.hidden_layer_dim)
        self.decoder_initial_state = self.decoder_cell.zero_state(
            batch_size, tf.float32).clone(cell_state=encoder_state)

        projection_layer = Dense(self.word_vocab_size, use_bias=False)

        """For training the model"""
        if self.mode == "train":
            # Teacher forcing on the embedded target inputs.
            decoder_train_helper = tf.contrib.seq2seq.TrainingHelper(
                self.decoder_train_inputs_embedded, self.decoder_train_length)
            decoder_train = seq2seq.BasicDecoder(
                self.decoder_cell, decoder_train_helper,
                self.decoder_initial_state, projection_layer)
            decoder_outputs_train, decoder_states_train, decoder_seq_len_train = seq2seq.dynamic_decode(
                decoder_train)
            decoder_logits_train = decoder_outputs_train.rnn_output
            self.decoder_logits_train = tf.reshape(
                decoder_logits_train, [batch_size, -1, self.word_vocab_size])

        """For test the model"""
        # if self.mode == "infer" or self.if_pred_on_dev:
        if decoder_type == "greedy":
            # NOTE(review): the start-token id is hard-coded to 1 via tf.ones —
            # confirm it matches the vocabulary's GO symbol.
            decoder_infer_helper = seq2seq.GreedyEmbeddingHelper(
                self.word_embeddings,
                tf.ones([batch_size], dtype=tf.int32), self.EOS)
            decoder_infer = seq2seq.BasicDecoder(
                self.decoder_cell, decoder_infer_helper,
                self.decoder_initial_state, projection_layer)
        elif decoder_type == "beam":
            decoder_infer = seq2seq.BeamSearchDecoder(
                cell=self.beam_decoder_cell,
                embedding=self.word_embeddings,
                start_tokens=tf.ones([batch_size], dtype=tf.int32),
                end_token=self.EOS,
                initial_state=self.beam_decoder_initial_state,
                beam_width=beam_width,
                output_layer=projection_layer)

        decoder_outputs_infer, decoder_states_infer, decoder_seq_len_infer = seq2seq.dynamic_decode(
            decoder_infer, maximum_iterations=seq_max_len)

        if decoder_type == "beam":
            # Beam search exposes predicted_ids only; no per-step logits.
            self.decoder_logits_infer = tf.no_op()
            self.sample_id = decoder_outputs_infer.predicted_ids
        elif decoder_type == "greedy":
            self.decoder_logits_infer = decoder_outputs_infer.rnn_output
            self.sample_id = decoder_outputs_infer.sample_id
def _build_main_graph(self, xs, xlens, ys, ylens):
    """Build the word model: bi-directional RNN encoder + Luong-attention decoder.

    Args:
        xs: source token ids.
        xlens: source sequence lengths.
        ys: target token ids (used only when mode is not 'infer').
        ylens: target sequence lengths (used only when mode is not 'infer').

    Returns:
        (logits, ids, lengths) from dynamic decoding.

    NOTE(review): original indentation was lost; the 'encoder'/'decoder'
    variable scopes are assumed nested under 'word_model' — verify against
    existing checkpoints before relying on variable names.
    """
    with tf.variable_scope('word_model', reuse=self._reuse_vars):
        embeds = self._variable(
            'embeddings',
            dtype=tf.float32,
            shape=[self._word_symbols, self._word_embedding_size])

        with tf.variable_scope('encoder', reuse=self._reuse_vars):
            # Split the layer budget between the forward and backward stacks.
            fw_cells = self._rnn_cells(self._word_model_rnn_hidden_size,
                                       self._word_model_rnn_layers // 2)
            bw_cells = self._rnn_cells(self._word_model_rnn_hidden_size,
                                       self._word_model_rnn_layers // 2)
            batch_input_embeds = tf.nn.embedding_lookup(embeds, xs)
            rnn_out, rnn_state = tf.nn.bidirectional_dynamic_rnn(
                fw_cells, bw_cells, batch_input_embeds, xlens, dtype=tf.float32)

        with tf.variable_scope('decoder', reuse=self._reuse_vars):
            # Attention only consumes encoder outputs.
            attention = seq2seq.LuongAttention(
                self._decoder_attention_size, tf.concat(rnn_out, -1), xlens)
            cells = self._rnn_cells(self._word_model_rnn_hidden_size,
                                    self._word_model_rnn_layers)
            cells = seq2seq.AttentionWrapper(cells, attention)
            decode_init_state = cells.zero_state(self._batch_size, tf.float32)

            # This layer sits just before softmax. It seems that if an
            # activation is placed here, the network will not converge well.
            # Why?
            def apply_dropout(v):
                # Only active in 'train' mode; KEEP is the keep probability
                # defined outside this block.
                if self._mode == 'train':
                    return tf.nn.dropout(v, KEEP)
                else:
                    return v

            # NOTE(review): apply_dropout is passed as kernel_regularizer, but
            # a Dense kernel_regularizer is expected to return a scalar loss,
            # not a dropped-out weight tensor — this looks like a misuse of the
            # API. Confirm whether weight dropout was actually intended here.
            final_projection = tf.layers.Dense(
                self._word_symbols,
                kernel_regularizer=apply_dropout,
                use_bias=False)

            if self._mode != 'infer':
                # Teacher forcing: feed the ground-truth target embeddings.
                batch_target_embeds = tf.nn.embedding_lookup(embeds, ys)
                helper = seq2seq.TrainingHelper(batch_target_embeds, ylens)
                decoder = seq2seq.BasicDecoder(cells, helper, decode_init_state,
                                               final_projection)
                (logits, ids), state, lengths = seq2seq.dynamic_decode(decoder)
                return logits, ids, lengths
            else:
                # Greedy decoding from the start token, capped at twice the
                # source length.
                helper = seq2seq.GreedyEmbeddingHelper(
                    embeds,
                    tf.tile([self._start_token], [self._batch_size]),
                    self._end_token)
                decoder = seq2seq.BasicDecoder(cells, helper, decode_init_state,
                                               final_projection)
                max_iters = tf.reduce_max(xlens) * 2
                (logits, ids), state, lengths = seq2seq.dynamic_decode(
                    decoder, maximum_iterations=max_iters)
                return logits, ids, lengths
def build_decoder(self):
    """Build the training decoder and the greedy-inference decoder.

    Both decoders share one stacked dropout-LSTM cell and one output
    projection; the inference decoder opens the "decoder" variable
    scope with reuse=True so it creates no new weights.
    """
    with tf.variable_scope("Decoder"):
        with tf.name_scope("Decoder_cell"):
            def make_cell(size, keep):
                # A single LSTM layer with dropout on its outputs.
                return tf.nn.rnn_cell.DropoutWrapper(
                    tf.nn.rnn_cell.BasicLSTMCell(size),
                    output_keep_prob=keep)

            # Stack n_layers identical cells.
            stacked_cell = tf.nn.rnn_cell.MultiRNNCell(
                [make_cell(self.lstm_size, self.keep_prob)
                 for _ in range(self.n_layers)])

        with tf.name_scope("Decoder_Dense"):
            # Projection from RNN output to vocabulary logits.
            vocab_projection = tf.layers.Dense(
                units=self.data.decoder_vocab_size,
                kernel_initializer=tf.truncated_normal_initializer(
                    mean=0.0, stddev=0.1))

        with tf.variable_scope("decoder"):
            # Teacher forcing: feed ground-truth target embeddings.
            train_helper = tcs.TrainingHelper(
                inputs=self.decoder_input_embedding,
                sequence_length=self.decoder_target_sequence_length,
                time_major=False)
            train_decoder = tcs.BasicDecoder(
                cell=stacked_cell,
                helper=train_helper,
                initial_state=self.encoder_final_state,
                output_layer=vocab_projection)
            (self.training_decoder_output,
             self.training_decoder_final_state,
             self.training_decoder_final_sequence_lengths) = tcs.dynamic_decode(
                decoder=train_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=self.decoder_max_target_sequence_length)

        # Inference path: greedy decoding, reusing the training weights.
        with tf.variable_scope("decoder", reuse=True):
            # One <GO> token per batch element.
            go_tokens = tf.tile(
                tf.constant([self.data.decoder_word_to_int['<GO>']],
                            dtype=tf.int32),
                [self.batch_size],
                name='start_tokens')
            infer_helper = tcs.GreedyEmbeddingHelper(
                embedding=self.decoder_embedding,
                start_tokens=go_tokens,
                end_token=self.data.decoder_word_to_int['<EOS>'])
            infer_decoder = tcs.BasicDecoder(
                cell=stacked_cell,
                helper=infer_helper,
                initial_state=self.encoder_final_state,
                output_layer=vocab_projection)
            (self.predicting_decoder_output,
             self.predicting_decoder_final_state,
             self.predicting_decoder_final_sequence_lengths) = tcs.dynamic_decode(
                decoder=infer_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=self.decoder_max_target_sequence_length)
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder: multi-layers of LSTMs with global attention
    mechanism.

    Args:
        encoder_outputs: encoder outputs, consumed by the attention
            mechanism inside build_decoder_cell.
        encoder_state: final encoder state used to derive the decoder's
            initial state.

    Returns:
        (logits, sample_id, final_context_state); logits and sample_id
        are also stored on self.
    """
    # Target-side start/end token ids.
    sos_id_2 = tf.cast(self.dict_lab2idx_tgt[self.SOS], tf.int32)
    eos_id_2 = tf.cast(self.dict_lab2idx_tgt[self.EOS], tf.int32)
    # Projection from decoder output to target-vocabulary logits.
    self.output_layer = Dense(self.vocab_size_tgt, name='output_projection')
    # Decoder.
    with tf.variable_scope("decoder") as decoder_scope:
        cell, decoder_initial_state = self.build_decoder_cell(
            encoder_outputs, encoder_state, self.seq_len_src)
        # Train
        if self.mode != 'INFER':
            # tf helper for embedding decoder for training
            # (feed back target and not predicted value)
            helper = s2s.TrainingHelper(self.word_embeddings_tgt,
                                        self.seq_len_tgt)
            # decoder cell
            decoder_cell = s2s.BasicDecoder(cell, helper,
                                            decoder_initial_state,
                                            output_layer=self.output_layer)
            # Dynamic decoding
            outputs, final_context_state, _ = s2s.dynamic_decode(
                decoder_cell,
                maximum_iterations=self.maximum_iterations,
                swap_memory=False,
                impute_finished=True,
                scope=decoder_scope)
            # Ouputs of decoding
            sample_id = outputs.sample_id
            logits = outputs.rnn_output
        else:
            start_tokens = tf.fill([self.batch_size], sos_id_2)
            end_token = eos_id_2
            # tf helper for embedding decoder for inference (feed back
            # predicted value and not target like in training)
            # NOTE: there must be a bug in the tf helper or in the way I am
            # feeding the inputs as the training gets very high accuracy
            # (close to perfect translation) (with TrainingHelper) but the
            # performance drops with GreedyEmbeddingHelper (at inference
            # time) with inaccurate translations.. My guess is that there is
            # a shift in the targets somewhere and that TrainingHelper feeds
            # to the decoder at time t the target t, so the decoder is
            # learning identity. But then GreedyEmbeddingHelper does not do
            # the same because it feeds the predicted last target, hence
            # since the NN learned identity it repeats the same word like if
            # it did not learn anything.
            # This is essentially the reason why I was not able to move
            # further and finish training CoVe vectors and then reproduce
            # the results.
            helper = s2s.GreedyEmbeddingHelper(self.embedding_tgt,
                                               start_tokens, end_token)
            decoder_cell = s2s.BasicDecoder(cell, helper,
                                            decoder_initial_state,
                                            output_layer=self.output_layer)
            # Dynamic decoding
            outputs, final_context_state, _ = s2s.dynamic_decode(
                decoder_cell,
                maximum_iterations=self.maximum_iterations,
                impute_finished=False,
                swap_memory=False,
                scope=decoder_scope)
            logits = outputs.rnn_output
            sample_id = outputs.sample_id
    self.logits = logits
    self.sample_id = sample_id
    return logits, sample_id, final_context_state
def _model(self):
    """Build the seq2seq graph in its own tf.Graph.

    Returns:
        (graph, loss, train_op, predicting_logits).
    """
    graph = tf.Graph()
    with graph.as_default():
        # Word-vector table. NOTE(review): initialised to zeros and
        # trainable=False — presumably loaded from a pretrained matrix
        # elsewhere via an assign; confirm against the caller.
        embedding = tf.Variable(np.zeros(
            shape=[self.num_words, self.embedding_size], dtype=np.float32),
            trainable=False, name='embedding')
        lr = tf.placeholder(tf.float32, [], name='learning_rate')
        # Input data.
        x_input = tf.placeholder(tf.int32, [None, None], name='x_input')
        # Length of each X sequence.
        x_sequence_length = tf.placeholder(tf.int32, [None], name='x_length')
        # Map one-hot/id-encoded inputs to dense vectors.
        x_embedding = tf.nn.embedding_lookup(embedding, x_input)
        y_input = tf.placeholder(tf.int32, [None, None], name='y_input')
        # Length of each Y sequence.
        y_sequence_length = tf.placeholder(tf.int32, [None], name='y_length')
        # Embed Y as well (decoder inputs).
        y_embedding = tf.nn.embedding_lookup(embedding, y_input)
        batch_size = tf.placeholder(tf.int32, [], name='batch_size')
        # batch_size = tf.shape(x_input)[0]
        # GRU instead of LSTM; 4 stacked cells on each side.
        encoder_cell = rnn.MultiRNNCell(
            [rnn.GRUCell(128, activation=tf.tanh) for _ in range(4)])
        decoder_cell = rnn.MultiRNNCell(
            [rnn.GRUCell(128, activation=tf.tanh) for _ in range(4)])
        # Run the encoder.
        output, encoder_state = tf.nn.dynamic_rnn(
            cell=encoder_cell,
            inputs=x_embedding,
            initial_state=encoder_cell.zero_state(batch_size, tf.float32),
            sequence_length=x_sequence_length)
        attention_mechanism = seq2seq.BahdanauAttention(
            64, output, x_sequence_length)
        attention_cell = seq2seq.AttentionWrapper(decoder_cell,
                                                  attention_mechanism)
        decoder_cell = rnn.OutputProjectionWrapper(attention_cell, 64,
                                                   activation=tf.tanh)
        # Wrap the encoder state into the attention cell's state shape.
        encoder_state = decoder_cell.zero_state(
            batch_size, tf.float32).clone(cell_state=encoder_state)
        output_layer = tf.layers.Dense(
            self.num_words,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1))
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            # Define the training decoder.
            # NOTE(review): the decoder is teacher-forced with y_embedding
            # while the loss below also targets y_input — confirm y_input
            # already carries the <GO>-shifted form, otherwise the model
            # would be trained to copy its input.
            training_helper = seq2seq.TrainingHelper(
                inputs=y_embedding, sequence_length=y_sequence_length)
            training_decoder = seq2seq.BasicDecoder(
                decoder_cell, training_helper, encoder_state, output_layer)
            # With impute_finished=True, once a sequence has emitted <EOS>
            # no further computation is done: the state is held fixed and
            # the outputs are zeroed.
            training_output, _, _ = seq2seq.dynamic_decode(
                training_decoder,
                # +2 to allow for <GO> and <EOS>.
                maximum_iterations=self.max_sentence_length + 2,
                impute_finished=True)
            # Predict decoder (greedy, fed from its own previous output).
            predict_helper = seq2seq.GreedyEmbeddingHelper(
                embedding, tf.fill([batch_size], self.word2index['GO']),
                self.word2index['EOS'])
            predict_decoder = seq2seq.BasicDecoder(
                decoder_cell, predict_helper, encoder_state, output_layer)
            predict_output, _, _ = seq2seq.dynamic_decode(
                predict_decoder,
                maximum_iterations=self.max_sentence_length + 2,
                impute_finished=True)
        # Loss function.
        training_logits = tf.identity(training_output.rnn_output,
                                      name='training_logits')
        predicting_logits = tf.identity(predict_output.rnn_output,
                                        name='predicting')
        # Mask out padded positions beyond each target's length.
        masks = tf.sequence_mask(y_sequence_length, dtype=tf.float32,
                                 name='mask')
        with tf.variable_scope('optimization'):
            loss = seq2seq.sequence_loss(training_logits, y_input, masks)
            optimizer = tf.train.AdamOptimizer(lr)
            gradients = optimizer.compute_gradients(loss)
            # Clip element-wise to [-5, 5]; drop None gradients.
            capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
                                for grad, var in gradients
                                if grad is not None]
            train_op = optimizer.apply_gradients(capped_gradients)
    return graph, loss, train_op, predicting_logits
def BuildNetwork(self, learningRate):
    """Build the encoder/decoder graph and the Adam training op.

    Every intermediate tensor is stored in self.parameters under a
    descriptive key; self.train holds the final training op.

    Args:
        learningRate: learning rate for the Adam optimizer.
    """
    #############################################################################
    # Input Data
    #############################################################################
    self.dataInput = tensorflow.placeholder(
        dtype=tensorflow.float32,
        shape=[None, None, self.featureShape],
        name='dataInput')
    self.labelInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                             shape=[None, None],
                                             name='labelInput')
    self.dataLenInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                               shape=[None],
                                               name='dataLenInput')
    self.labelLenInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                                shape=[None],
                                                name='labelLenInput')
    #############################################################################
    # Batch Parameters
    #############################################################################
    self.parameters['BatchSize'], self.parameters[
        'TimeStep'], _ = tensorflow.unstack(
            tensorflow.shape(input=self.dataInput, name='DataShape'))
    self.parameters['LabelStep'] = tensorflow.shape(input=self.labelInput,
                                                    name='LabelShape')[1]
    ###################################################################################################
    # Encoder: stacked bidirectional LSTM.
    ###################################################################################################
    with tensorflow.variable_scope('Encoder'):
        self.parameters[
            'Encoder_Cell_Forward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters[
            'Encoder_Cell_Backward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters['Encoder_Output'], self.parameters['Encoder_FinalState'] = \
            tensorflow.nn.bidirectional_dynamic_rnn(
                cell_fw=self.parameters['Encoder_Cell_Forward'],
                cell_bw=self.parameters['Encoder_Cell_Backward'],
                inputs=self.dataInput,
                sequence_length=self.dataLenInput,
                dtype=tensorflow.float32)
    if self.attention is None:
        # No attention: the decoder starts from the concatenated
        # forward/backward encoder states, layer by layer.
        self.parameters['Decoder_InitalState'] = []
        for index in range(self.rnnLayers):
            self.parameters[
                'Encoder_Cell_Layer%d' % index] = rnn.LSTMStateTuple(
                    c=tensorflow.concat([
                        self.parameters['Encoder_FinalState'][index][0].c,
                        self.parameters['Encoder_FinalState'][index][1].c
                    ], axis=1),
                    h=tensorflow.concat([
                        self.parameters['Encoder_FinalState'][index][0].h,
                        self.parameters['Encoder_FinalState'][index][1].h
                    ], axis=1))
            self.parameters['Decoder_InitalState'].append(
                self.parameters['Encoder_Cell_Layer%d' % index])
        self.parameters['Decoder_InitalState'] = tuple(
            self.parameters['Decoder_InitalState'])
    else:
        # With attention: the cell state c of every decoder layer is the
        # attention pooling result; h still comes from the encoder.
        self.attentionList = self.attention(
            dataInput=self.parameters['Encoder_Output'],
            scopeName=self.attentionName,
            hiddenNoduleNumber=2 * self.hiddenNodules,
            attentionScope=self.attentionScope,
            blstmFlag=True)
        self.parameters['Decoder_InitalState'] = []
        for index in range(self.rnnLayers):
            self.parameters[
                'Encoder_Cell_Layer%d' % index] = rnn.LSTMStateTuple(
                    c=self.attentionList['FinalResult'],
                    h=tensorflow.concat([
                        self.parameters['Encoder_FinalState'][index][0].h,
                        self.parameters['Encoder_FinalState'][index][1].h
                    ], axis=1))
            self.parameters['Decoder_InitalState'].append(
                self.parameters['Encoder_Cell_Layer%d' % index])
        self.parameters['Decoder_InitalState'] = tuple(
            self.parameters['Decoder_InitalState'])
    #############################################################################
    # Decoder Label Pretreatment: embed the int label ids.
    # NOTE(review): VOCABULAR is a module-level constant (vocabulary size).
    #############################################################################
    self.parameters['DecoderEmbedding'] = tensorflow.Variable(
        initial_value=tensorflow.truncated_normal(
            shape=[VOCABULAR, self.hiddenNodules * 2],
            stddev=0.1,
            name='DecoderEmbedding'))
    self.parameters[
        'DecoderEmbeddingResult'] = tensorflow.nn.embedding_lookup(
            params=self.parameters['DecoderEmbedding'],
            ids=self.labelInput,
            name='DecoderEmbeddingResult')
    #############################################################################
    # Decoder: teacher-forced stacked LSTM.
    #############################################################################
    self.parameters['Decoder_Helper'] = seq2seq.TrainingHelper(
        inputs=self.parameters['DecoderEmbeddingResult'],
        sequence_length=self.labelLenInput,
        name='Decoder_Helper')
    with tensorflow.variable_scope('Decoder'):
        self.parameters['Decoder_FC'] = Dense(VOCABULAR)
        self.parameters[
            'Decoder_Cell'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules * 2)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters['Decoder'] = seq2seq.BasicDecoder(
            cell=self.parameters['Decoder_Cell'],
            helper=self.parameters['Decoder_Helper'],
            initial_state=self.parameters['Decoder_InitalState'],
            output_layer=self.parameters['Decoder_FC'])
        self.parameters['Decoder_Logits'], self.parameters[
            'Decoder_FinalState'], self.parameters[
                'Decoder_FinalSeq'] = seq2seq.dynamic_decode(
                    decoder=self.parameters['Decoder'])
    with tensorflow.name_scope('Loss'):
        # Flatten targets and logits, then take sparse cross-entropy.
        self.parameters['TargetsReshape'] = tensorflow.reshape(
            tensor=self.labelInput, shape=[-1], name='TargetsReshape')
        self.parameters['Decoder_Reshape'] = tensorflow.reshape(
            self.parameters['Decoder_Logits'].rnn_output, [-1, VOCABULAR],
            name='Decoder_Reshape')
        self.parameters[
            'Cost'] = tensorflow.losses.sparse_softmax_cross_entropy(
                labels=self.parameters['TargetsReshape'],
                logits=self.parameters['Decoder_Reshape'])
        self.train = tensorflow.train.AdamOptimizer(
            learning_rate=learningRate).minimize(self.parameters['Cost'])
def _build_decoder(self):
    """Build the dialog decoder graph.

    Three Bahdanau attention mechanisms (word-, utterance- and
    context-level encoder outputs) feed one GRU cell via
    EAttentionWrapper. Training uses teacher forcing; inference uses
    greedy embedding decoding. Results are stored on self.logits and
    self.sample_id (logits is tf.no_op() at inference, matching the
    greedy-infer convention used elsewhere in this file).
    """
    with tf.variable_scope("dialog_decoder"):
        with tf.variable_scope("decoder_output_projection"):
            # Fully-connected projection; units = vocabulary size.
            output_layer = layers_core.Dense(
                self.config.vocab_size,
                use_bias=False,
                name="output_projection")
        with tf.variable_scope("decoder_rnn"):
            # One attention mechanism per encoder granularity.
            attn_mech = tc_seq2seq.BahdanauAttention(
                self.config.dec_hidden_size, self.word_outputs, None)
            attn_mech1 = tc_seq2seq.BahdanauAttention(
                self.config.dec_hidden_size, self.uttn_outputs, None)
            attn_mech2 = tc_seq2seq.BahdanauAttention(
                self.config.dec_hidden_size, self.encoder_outputs, None)
            self.att1 = attn_mech.batch_size
            # Fix: att2/att3 previously copied attn_mech.batch_size
            # (copy-paste slip); report each mechanism's own batch size.
            self.att2 = attn_mech1.batch_size
            self.att3 = attn_mech2.batch_size
            dec_cell = GRUCell(self.config.dec_hidden_size)
            dec_cell = EAttentionWrapper(
                dec_cell, [attn_mech, attn_mech1, attn_mech2],
                attention_layer_size=[
                    self.config.dec_hidden_size,
                    self.config.dec_hidden_size,
                    self.config.dec_hidden_size
                ])
            dec_init_state = dec_cell.zero_state(
                batch_size=self.batch_size, dtype=tf.float32)
        # Training or Eval
        if self.mode != ModelMode.infer:
            # Not infer: decode with teacher forcing, turn by turn.
            resp_emb_inp = tf.nn.embedding_lookup(
                self.decoder_embeddings, self.target_input)
            helper = tc_seq2seq.TrainingHelper(resp_emb_inp,
                                               self.target_length)
            decoder = tc_seq2seq.BasicDecoder(
                cell=dec_cell,
                helper=helper,
                initial_state=dec_init_state,  # zero state of the wrapper
                output_layer=output_layer  # vocabulary projection
            )
            dec_outputs, dec_state, _ = tc_seq2seq.dynamic_decode(
                decoder)
            sample_id = dec_outputs.sample_id
            logits = dec_outputs.rnn_output
        else:
            start_tokens = tf.fill([self.batch_size], self.config.sos_idx)
            end_token = self.config.eos_idx
            maximum_iterations = tf.to_int32(self.config.infer_max_len)
            helper = tc_seq2seq.GreedyEmbeddingHelper(
                self.decoder_embeddings,
                start_tokens=start_tokens,
                end_token=tf.constant(end_token, dtype=tf.int32))
            decoder = tc_seq2seq.BasicDecoder(
                cell=dec_cell,
                helper=helper,
                initial_state=dec_init_state,
                output_layer=output_layer  # vocabulary projection
            )
            dec_outputs, dec_state, _ = tc_seq2seq.dynamic_decode(
                decoder, maximum_iterations=maximum_iterations)
            # No logits at inference time; only the sampled ids matter.
            logits = tf.no_op()
            sample_id = dec_outputs.sample_id
        self.logits = logits
        self.sample_id = sample_id
def get_model(config, embeddings=None, num_words=None, stitch_inputs=None):
    """Build an RNN language model (decoder-only seq2seq over x -> y).

    Args:
        config: dict of hyper-parameters (rnn_size, num_layers, ...).
        embeddings: optional pretrained embedding matrix; when None a
            fresh trainable table of shape [num_words, embedding_size]
            is created.
        num_words: vocabulary size; required when embeddings is None.
            NOTE(review): the sampled-softmax path also uses num_words as
            num_classes — confirm it is set whenever sampled_softmax > 0,
            even with pretrained embeddings.
        stitch_inputs: optional dict of externally supplied input
            tensors; when given, per-example losses are exposed and no
            optimizer is built.

    Returns:
        (inputs, outputs) dicts of placeholder/result tensors.
    """
    inputs = dict()
    outputs = dict()
    if stitch_inputs is None:
        inputs['x'] = tf.placeholder(tf.int32, shape=[None, None], name="x")
        inputs['y'] = tf.placeholder(tf.int32, shape=[None, None], name="y")
        inputs['seq_length'] = tf.placeholder(tf.int32, shape=[None],
                                              name="seq_length")
    else:
        inputs['x'] = stitch_inputs['x']
        inputs['y'] = stitch_inputs['y']
        inputs['seq_length'] = stitch_inputs['seq_length']
    if embeddings is None:
        logging.info('initialize embeddings')
        embeddings = tf.get_variable(
            name="embedding",
            shape=[num_words, config['embedding_size']],
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(stddev=0.1),
            trainable=True)
    else:
        logging.info('use pretrained embeddings')
        logging.info('embeddings trainable: {}'.format(
            config.get('embedding_trainable', False)))
        embeddings = tf.get_variable(
            "embeddings",
            shape=embeddings.shape,
            initializer=tf.constant_initializer(embeddings),
            trainable=config.get('embedding_trainable', False))
    # Dropout keep-probabilities default to 1 (no dropout) at inference.
    inputs['input_keep_prob'] = tf.placeholder_with_default(
        tf.constant(1, dtype=tf.float32), shape=[], name="input_keep_prob")
    inputs['output_keep_prob'] = tf.placeholder_with_default(
        tf.constant(1, dtype=tf.float32), shape=[], name="output_keep_prob")
    inputs['learning_rate'] = tf.placeholder_with_default(tf.constant(
        config['learning_rate'], dtype=tf.float32),
        shape=[],
        name="learning_rate")
    batch_size = tf.shape(inputs['x'])[0]

    def create_cell():
        # One RNN layer with dropout; cell type chosen by config.
        rnn_cell_type = config.get('rnn_cell', 'lnlstm')
        if rnn_cell_type == 'lstm':
            logging.info('Use LSTMBlockCell cell')
            _cell = tf.contrib.rnn.LSTMBlockCell(config['rnn_size'])
        else:
            logging.info('Use LayerNormBasicLSTMCell cell')
            _cell = tf.contrib.rnn.LayerNormBasicLSTMCell(config['rnn_size'])
        _cell = tf.nn.rnn_cell.DropoutWrapper(
            _cell,
            input_keep_prob=inputs['input_keep_prob'],
            output_keep_prob=inputs['output_keep_prob'])
        return _cell

    cells = [create_cell() for _ in range(config['num_layers'])]
    cell = tf.nn.rnn_cell.MultiRNNCell(cells)
    x_embedded = tf.nn.embedding_lookup(embeddings, inputs['x'])
    # Teacher forcing over the embedded x sequence.
    helper = seq2seq.TrainingHelper(x_embedded, inputs['seq_length'])
    projection_layer = Dense(embeddings.shape[0],
                             name='projection_layer',
                             use_bias=True,
                             dtype=tf.float32)
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)
    mask = sequence_mask(inputs['seq_length'], dtype=tf.float32)
    # No output_layer here: the projection is applied (or sampled) below
    # on the whole decoded tensor at once.
    decoder = seq2seq.BasicDecoder(cell, helper,
                                   initial_state=initial_state)
    decode_output, _, _ = seq2seq.dynamic_decode(decoder,
                                                 impute_finished=True,
                                                 swap_memory=config.get(
                                                     'swap_memory', False))
    if config.get('sampled_softmax', 0) > 0:
        # Sampled softmax: reuse the projection layer's weights directly.
        projection_layer.build(input_shape=decode_output.rnn_output.shape)

        def _sampled_loss(labels, logits):
            return tf.nn.sampled_softmax_loss(
                tf.transpose(projection_layer.kernel),
                projection_layer.bias,
                tf.expand_dims(labels, -1),
                logits,
                num_sampled=config['sampled_softmax'],
                num_classes=num_words)
        softmax_loss_function = _sampled_loss
        logits_input = decode_output.rnn_output
    else:
        softmax_loss_function = None
        logits_input = projection_layer(decode_output.rnn_output)
    # Per-token losses (no averaging), masked beyond each length.
    losses = seq2seq.sequence_loss(logits_input,
                                   inputs['y'],
                                   mask,
                                   softmax_loss_function=softmax_loss_function,
                                   average_across_batch=False,
                                   average_across_timesteps=False)
    outputs['total_loss'] = tf.reduce_sum(losses)
    outputs['num_tokens'] = tf.reduce_sum(mask)
    outputs['loss'] = outputs['total_loss'] / outputs['num_tokens']
    outputs['perplexity'] = tf.exp(outputs['loss'], name='perplexity')
    if stitch_inputs is not None:
        # Expose per-example losses/perplexities for external scoring.
        losses = seq2seq.sequence_loss(logits_input, inputs['y'], mask,
                                       average_across_batch=False)
        outputs['losses'] = tf.identity(losses, name='losses')
        outputs['perplexities'] = tf.exp(losses, name='perplexities')
    if stitch_inputs is None:
        with tf.variable_scope('Optimizer'):
            optimizer_name = config.get('optimizer', 'sgd')
            if optimizer_name == 'adam':
                logging.info('use adam optimizer')
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=inputs['learning_rate'])
            else:
                logging.info('use sgd optimizer')
                optimizer = tf.contrib.opt.MomentumWOptimizer(
                    weight_decay=config['weight_decay'],
                    learning_rate=inputs['learning_rate'],
                    momentum=config['momentum'])
            if config.get('aggregation_method', 'default') == 'experimental':
                logging.info('use gradient aggregation method: experimental')
                gradient_var_pairs = optimizer.compute_gradients(
                    outputs['total_loss'],
                    var_list=tf.trainable_variables(),
                    aggregation_method=tf.AggregationMethod.
                    EXPERIMENTAL_ACCUMULATE_N)
            else:
                logging.info('use gradient aggregation method: default')
                gradient_var_pairs = optimizer.compute_gradients(
                    outputs['total_loss'], var_list=tf.trainable_variables())
            # Drop variables with no gradient, then clip by global norm.
            vars = [x[1] for x in gradient_var_pairs if x[0] is not None]
            gradients = [x[0] for x in gradient_var_pairs if x[0] is not None]
            gc = config.get('gradient_clipping', 120.0)
            gradients, _ = tf.clip_by_global_norm(gradients, clip_norm=gc)
            outputs['train_op'] = optimizer.apply_gradients(
                zip(gradients, vars))
    return inputs, outputs
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder.

    Train mode: teacher forcing, with the output projection applied once
    on the whole decoded tensor. Decode mode: greedy or beam-search
    decoding from WordSequence.START to WordSequence.END.

    Args:
        encoder_outputs: encoder outputs fed to build_decoder_cell.
        encoder_state: final encoder state fed to build_decoder_cell.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        # Building decoder_cell and decoder_initial_state
        (
            self.decoder_cell,
            self.decoder_initial_state
        ) = self.build_decoder_cell(encoder_outputs, encoder_state)
        # Decoder embedding, placed by _get_embed_device.
        with tf.device(_get_embed_device(self.target_vocab_size)):
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            elif self.pretrained_embedding:
                # Zero-initialised variable, filled via the assign op
                # below from a feed placeholder.
                self.decoder_embeddings = tf.Variable(
                    tf.constant(
                        0.0,
                        shape=(self.target_vocab_size,
                               self.embedding_size)
                    ),
                    trainable=True,
                    name='embeddings'
                )
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    (self.target_vocab_size, self.embedding_size)
                )
                self.decoder_embeddings_init = \
                    self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32
                )
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection'
        )
        if self.mode == 'train':
            # decoder_inputs_embedded:
            # [batch_size, max_time_step + 1, embedding_size]
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train
            )
            # Helper to feed inputs for training:
            # read inputs from dense ground truth vectors
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper'
            )
            # The output_layer is deliberately NOT applied here during
            # training: projecting at every time step is slow. Instead the
            # projection runs once over the whole decoded tensor below.
            # Note: for this trick to work, the `scope` argument of
            # dynamic_decode must be set.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                # output_layer=self.decoder_output_projection
            )
            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length
            )
            # decoder_outputs_train: BasicDecoderOutput
            # namedtuple(rnn_outputs, sample_id); rnn_output is
            # [batch, T+1, num_symbols] (or time-major transposed).
            (
                outputs,
                self.final_state,  # contain attention
                _  # self.final_sequence_lengths
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope
            )
            # More efficient to do the projection on the
            # batch-time-concatenated tensor: the official NMT repo
            # reports a 10~20% speedup from projecting once here (in
            # practice the author measured an even larger gain).
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output
            )
            # masks: masking for valid and padded time steps,
            # [batch_size, max_time_step + 1]
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks'
            )
            # Computes per word average cross-entropy over a batch;
            # internally calls sparse_softmax_cross_entropy_with_logits.
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(decoder_logits_train,
                                                    (1, 0, 2))
            self.decoder_pred_train = tf.argmax(
                decoder_logits_train, axis=-1,
                name='decoder_pred_train')
            # The variables below support special training schemes:
            # custom rewards are implemented by reweighting the masks.
            # train_entropy = cross entropy
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,
                    logits=decoder_logits_train)
            self.masks_rewards = self.masks * self.rewards
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss_add = self.loss + self.add_loss
        elif self.mode == 'decode':
            # Inference mode (not training).
            start_tokens = tf.tile(
                [WordSequence.START],
                [self.batch_size]
            )
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Input-layer projection wrapper: token ids -> embeddings."""
                return tf.nn.embedding_lookup(
                    self.decoder_embeddings,
                    inputs
                )

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding:
                # uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj
                )
                # Basic decoder performs greedy decoding at each time step
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection
                )
            else:
                # Beamsearch is used to approximately
                # find the most likely translation
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection,
                )
            # GreedyDecoder returns BasicDecoderOutput(rnn_output,
            # sample_id); BeamSearchDecoder returns
            # FinalBeamSearchDecoderOutput(predicted_ids,
            # beam_search_decoder_output).
            # Maximum output length: the official docs suggest
            # 2x the input length; changed here to 4x.
            # https://www.tensorflow.org/tutorials/seq2seq
            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                # Default: decode up to 4x the input length.
                max_decode_step = tf.round(tf.reduce_max(
                    self.encoder_inputs_length) * 4)
            (
                self.decoder_outputs_decode,
                self.final_state,
                _  # self.decoder_outputs_length_decode
            ) = (seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=self.time_major,
                # impute_finished=True,   # error occurs
                maximum_iterations=max_decode_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope
            ))
            if not self.use_beamsearch_decode:
                # decoder_outputs_decode.sample_id:
                # [batch_size, max_time_step]
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            else:
                # Beam search: predicted_ids is
                # [batch_size, max_time_step, beam_width]
                # (output_major=False); transposed below so beams come
                # before time steps.
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode,
                    perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def inference(self):
    """Build the encoder/decoder graph and return (logits, pred).

    Encoder: one bidirectional LSTM layer (256 units per direction).
    Decoder: two stacked LSTMs behind Luong attention; teacher forcing
    when self.is_train, otherwise sample-embedding decoding capped at
    10 steps.
    """
    with tf.variable_scope("embedding"):
        # Shared embedding table for encoder and decoder tokens.
        embedding = tf.get_variable(
            "embedding",
            shape=[self.vocab_size, self.embedding_size],
            initializer=tf.truncated_normal_initializer(stddev=0.1,
                                                        dtype=tf.float32))
        enc_embedded = tf.nn.embedding_lookup(embedding,
                                              self.encoder_input_data)
        dec_embedded = tf.nn.embedding_lookup(embedding,
                                              self.decoder_input_data)

    with tf.variable_scope("encoder"):
        def lstm_with_dropout():
            base = rnn.BasicLSTMCell(256)
            return rnn.DropoutWrapper(base,
                                      output_keep_prob=self.keep_prob)

        fw_cell = rnn.MultiRNNCell([lstm_with_dropout()])
        bw_cell = rnn.MultiRNNCell([lstm_with_dropout()])
        bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, enc_embedded,
            sequence_length=self.input_seq_len, dtype=tf.float32)
        # Concatenate forward/backward outputs on the feature axis.
        encoder_outputs = tf.concat(bi_outputs, -1)
        # Interleave fw/bw states per layer (single layer here) so the
        # two-layer decoder gets one state tuple per layer.
        state_parts = []
        for layer_id in range(1):  # layer_num
            state_parts.append(bi_state[0][layer_id])  # forward
            state_parts.append(bi_state[1][layer_id])  # backward
        encoder_state = tuple(state_parts)

    with tf.variable_scope("decoder"):
        def lstm_with_dropout_dec():
            base = rnn.BasicLSTMCell(256)
            return rnn.DropoutWrapper(base,
                                      output_keep_prob=self.keep_prob)

        dec_cell = rnn.MultiRNNCell([lstm_with_dropout_dec(),
                                     lstm_with_dropout_dec()])
        attention = seq2seq.LuongAttention(256, encoder_outputs,
                                           self.input_seq_len)
        dec_cell = seq2seq.AttentionWrapper(dec_cell, attention, 256)
        # Start from the wrapper's zero state, seeded with the encoder
        # state.
        init_state = dec_cell.zero_state(self.batch_size, dtype=tf.float32)
        init_state = init_state.clone(cell_state=encoder_state)
        projection = Dense(self.vocab_size, name="output_projection")
        if self.is_train:
            # Teacher forcing over the embedded decoder inputs.
            helper = seq2seq.TrainingHelper(dec_embedded,
                                            self.output_seq_len)
            decoder = seq2seq.BasicDecoder(dec_cell, helper, init_state,
                                           output_layer=projection)
            outputs, _, _ = seq2seq.dynamic_decode(decoder)
            logits = outputs.rnn_output
            pred = outputs.sample_id
        else:
            # SampleEmbedding decoding (a GreedyEmbeddingHelper with the
            # same arguments is the drop-in greedy alternative).
            helper = seq2seq.SampleEmbeddingHelper(
                embedding,
                start_tokens=[input_data.GO_ID] * self.batch_size,
                end_token=input_data.EOS_ID)
            decoder = seq2seq.BasicDecoder(dec_cell, helper, init_state,
                                           output_layer=projection)
            outputs, _, _ = seq2seq.dynamic_decode(decoder,
                                                   maximum_iterations=10)
            logits = outputs.rnn_output
            pred = outputs.sample_id
    return logits, pred
def get_train_model(hps, vocab_size, img_feature_dim):
    """Build the image-captioning training graph.

    The image feature is projected into the word-embedding space and
    prepended to the (shifted) caption embeddings; an LSTM decoder is then
    trained with teacher forcing via seq2seq.TrainingHelper.

    Args:
        hps: hyper-parameter object (batch_size, num_embedding_nodes,
            num_lstm_layers, num_lstm_nodes, cell_type, clip_lstm_grads,
            learning_rate, ...).
        vocab_size: size of the caption vocabulary.
        img_feature_dim: dimensionality of the image feature vector.

    Returns:
        ((img_feature, sentence, input_sentence_len, mask, keep_prob),
         (loss, accuracy, train_op), global_step)
    """
    batch_size = hps.batch_size
    img_feature = tf.placeholder(tf.float32, (batch_size, img_feature_dim))
    sentence = tf.placeholder(tf.int32, (batch_size, None))
    input_sentence_len = tf.placeholder(tf.int32, shape=(batch_size,))
    mask = tf.placeholder(tf.float32, (batch_size, None))
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    global_step = tf.Variable(tf.zeros([], tf.int64),
                              name='global_step',
                              trainable=False)

    # Prediction process:
    #   sentence: [a,b,c,d,e] is the ground truth.
    #   input:    [img,a,b,c,d,e] — the img_feature is reshaped to look like
    #   a word embedding so the two can be concatenated along axis 1.
    # The real inference scenario:
    #   predict #1: img_feature -> embedding_img  -> lstm -> a
    #   predict #2: a -> embedding_word -> lstm -> (b)
    #   predict #3: b -> embedding_word -> lstm -> (c)
    #   predict #4: c -> embedding_word -> lstm -> (d)
    #   predict #5: d -> embedding_word -> lstm -> (e)
    #   predict #6: e -> embedding_word -> lstm -> eos

    # Sets up the embedding layer.
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope('embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embeddings', [vocab_size, hps.num_embedding_nodes], tf.float32)
        # Drop the last token: it is shifted out by the prepended image slot
        # (the remainder is mostly padding anyway).
        embed_token_ids = tf.nn.embedding_lookup(embeddings, sentence[:, 0:-1])

    img_feature_embed_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('image_feature_embed',
                           initializer=img_feature_embed_init):
        embed_img = tf.layers.dense(img_feature, hps.num_embedding_nodes)
        embed_img = tf.layers.batch_normalization(embed_img)
        embed_img = tf.nn.relu(embed_img)
        # Expand on axis 1: embed_img becomes (batch_size, 1, embed_dim).
        embed_img = tf.expand_dims(embed_img, 1)

    # embed_token_ids is (batch_size, num_timesteps - 1, embed_dim); after the
    # concat embed_inputs is (batch_size, num_timesteps, embed_dim).
    embed_inputs = tf.concat([embed_img, embed_token_ids], axis=1)

    decoder_output_projection = layers.Dense(
        vocab_size,
        dtype=tf.float32,
        use_bias=False,
        name='decoder_output_projection'
    )

    # Sets up LSTM network.
    scale = 1.0 / math.sqrt(hps.num_embedding_nodes + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm_nn', initializer=lstm_init) as train_scope:
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = create_rnn_cell(hps.num_lstm_nodes[i], hps.cell_type)
            cell = dropout(cell, keep_prob)
            cells.append(cell)
        cell = tf.contrib.rnn.MultiRNNCell(cells)
        initial_state = cell.zero_state(hps.batch_size, tf.float32)
        # seq2seq helper that feeds the teacher-forced inputs.
        training_helper = seq2seq.TrainingHelper(
            inputs=embed_inputs,
            sequence_length=input_sentence_len,
            time_major=False,
            name='training_helper'
        )
        training_decoder = seq2seq.BasicDecoder(
            cell=cell,
            helper=training_helper,
            initial_state=initial_state
        )
        # Max decoder time_steps within the current batch.
        max_decoder_length = tf.reduce_max(
            input_sentence_len
        )
        (
            outputs,
            final_state,
            final_sequence_lengths
        ) = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length,
            swap_memory=True,
            scope=train_scope
        )
        # rnn_outputs, _ = tf.nn.dynamic_rnn(cell,
        #                                    embed_inputs,
        #                                    initial_state=initial_state)
        # dynamic_rnn also takes a sequence_length parameter, but it was
        # ignored here — the LSTM input has the image feature prepended, and
        # the mask placeholder marks data vs. padding positions instead.
        rnn_outputs = outputs.rnn_output
        print('rnn_outputs ', rnn_outputs)

    # The fully-connected projection acts on the last axis of
    # [batch_size, num_timesteps, num_lstm_nodes[-1]]; an earlier version
    # reshaped to 2-D and used explicit dense layers:
    # fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    # with tf.variable_scope('lstm_nn/fc', initializer=fc_init):
    #     rnn_outputs_2d = tf.reshape(rnn_outputs, [-1, hps.num_lstm_nodes[-1]])
    #     fc1 = tf.layers.dense(rnn_outputs_2d, hps.num_fc_nodes, name='fc1')
    #     fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
    #     fc1_dropout = tf.nn.relu(fc1_dropout)
    #     logits = tf.layers.dense(fc1_dropout, vocab_size, name='logits')
    decoder_logits_train = decoder_output_projection(
        outputs.rnn_output
    )
    masks = tf.sequence_mask(
        lengths=input_sentence_len,
        maxlen=max_decoder_length,
        dtype=tf.float32,
        name='masks'
    )
    with tf.variable_scope('loss'):
        # sequence_loss internally applies softmax to the logits, one-hot
        # encodes the targets, and computes the masked cross entropy.
        loss = seq2seq.sequence_loss(
            logits=decoder_logits_train,
            targets=sentence,
            weights=masks,
            average_across_timesteps=True,
            average_across_batch=True
        )
        # Accuracy is computed on the flattened predictions, weighted by the
        # separately-fed `mask` placeholder.
        # NOTE(review): loss uses `masks` derived from input_sentence_len
        # while accuracy uses the `mask` placeholder — presumably these agree;
        # verify against the feed_dict in the caller.
        logits_flatted = tf.reshape(decoder_logits_train, (-1, vocab_size))
        prediction = tf.argmax(logits_flatted, 1, output_type=tf.int32)
        # The logits were flattened, so the target sentence is flattened too.
        sentence_flatten = tf.reshape(sentence, [-1])
        mask_flatten = tf.reshape(mask, [-1])
        mask_flatten = tf.cast(mask_flatten, tf.float32)
        correct_prediction = tf.equal(prediction, sentence_flatten)
        print(correct_prediction.get_shape)
        print(mask_flatten.get_shape)
        correct_prediction_with_mask = tf.multiply(
            tf.cast(correct_prediction, tf.float32), mask_flatten)
        mask_sum = tf.reduce_sum(mask_flatten)
        accuracy = tf.reduce_sum(correct_prediction_with_mask) / mask_sum
        tf.summary.scalar('loss', loss)

    with tf.variable_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            logging.info("variable name: %s" % (var.name))
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(loss, tvars), hps.clip_lstm_grads)
        for grad, var in zip(grads, tvars):
            tf.summary.histogram('%s_grad' % (var.name), grad)
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars),
                                             global_step=global_step)

    return ((img_feature, sentence, input_sentence_len, mask, keep_prob),
            (loss, accuracy, train_op),
            global_step)
def build_decoder(self):
    """Assemble the decoder sub-graph.

    Train mode: teacher forcing (optionally with scheduled sampling whose
    probability ramps linearly to 1.0 over 2 * start_decay_step steps),
    followed by loss computation on the max_len-padded logits.
    Test mode: greedy or beam-search decoding, capped at para.max_len steps.
    Start token id is 1, end token id is 2.
    """
    print('build decoder...')
    with tf.variable_scope('decoder'):
        self.decoder_cell, self.decoder_initial_state = \
            self.build_decoder_cell()
        self.decoder_embedding = tf.get_variable(
            name='embedding',
            shape=[self.para.decoder_vocab_size, self.para.embedding_size],
            dtype=self.dtype)
        # Projects cell outputs to vocabulary logits.
        projection = Dense(units=self.para.decoder_vocab_size,
                           name='output_projection')

        if self.para.mode == 'train':
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embedding,
                ids=self.decoder_inputs)
            if self.para.scheduled_sampling != 0:
                # Sampling probability climbs linearly from 0 to 1 until
                # global_step reaches 2 * start_decay_step, then stays at 1.
                self.sampling_probability = tf.cond(
                    self.global_step < self.para.start_decay_step * 2,
                    lambda: tf.cast(
                        tf.divide(self.global_step,
                                  self.para.start_decay_step * 2),
                        dtype=self.dtype),
                    lambda: tf.constant(1.0, dtype=self.dtype),
                    name='sampling_probability')
                helper = seq2seq.ScheduledEmbeddingTrainingHelper(
                    inputs=self.decoder_inputs_embedded,
                    sequence_length=self.decoder_inputs_len,
                    embedding=self.decoder_embedding,
                    sampling_probability=self.sampling_probability,
                    name='training_helper')
            else:
                # Pure teacher forcing.
                helper = seq2seq.TrainingHelper(
                    inputs=self.decoder_inputs_embedded,
                    sequence_length=self.decoder_inputs_len,
                    name='training_helper')
            train_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=helper,
                initial_state=self.decoder_initial_state,
                output_layer=projection)
            longest = tf.reduce_max(self.decoder_inputs_len)
            self.decoder_outputs, decoder_states, decoder_outputs_len = \
                seq2seq.dynamic_decode(
                    decoder=train_decoder,
                    maximum_iterations=longest
                )
            # dynamic_decode stops at the batch maximum, so pad the time axis
            # out to max_len; the loss masks out the padded positions.
            raw_logits = self.decoder_outputs.rnn_output
            pad_amount = self.para.max_len - tf.shape(raw_logits)[1]
            self.rnn_output_padded = tf.pad(
                raw_logits, [[0, 0], [0, pad_amount], [0, 0]])
            self.loss = self.compute_loss(logits=self.rnn_output_padded,
                                          labels=self.decoder_targets)
        elif self.para.mode == 'test':
            go_ids = tf.fill([self.para.batch_size], 1)
            if self.para.beam_search != 0:
                test_decoder = seq2seq.BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=self.decoder_embedding,
                    start_tokens=go_ids,
                    end_token=2,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.para.beam_width,
                    output_layer=projection)
            else:
                greedy = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=go_ids,
                    end_token=2,
                    embedding=self.decoder_embedding)
                test_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=greedy,
                    initial_state=self.decoder_initial_state,
                    output_layer=projection)
            self.decoder_outputs, decoder_states, decoder_outputs_len = \
                seq2seq.dynamic_decode(
                    decoder=test_decoder,
                    maximum_iterations=self.para.max_len
                )
            if self.para.beam_search != 0:
                # Beam search already yields
                # [batch_size, <= max_len, beam_width].
                self.decoder_predicted_ids = \
                    self.decoder_outputs.predicted_ids
            else:
                # Add a trailing axis so the greedy result matches the
                # beam-search shape: [batch_size, max_len, 1].
                self.decoder_predicted_ids = tf.expand_dims(
                    input=self.decoder_outputs.sample_id,
                    axis=-1)
def build_decoder(self):
    """Build the decoder sub-graph.

    Train mode: teacher-forced decoding with an output projection, masked
    sequence loss, and an Adagrad training op (exponentially decayed
    learning rate, gradients clipped to global norm 5.0).
    Test mode: beam-search decoding on GPU 1.

    Sets (train): self.decoder_logits_train, self.loss, self.train_op.
    Sets (test): self.decoder_pred_test.
    """
    with tf.variable_scope("decoder"):
        decoder_cell, decoder_initial_state = self.build_decoder_cell()
        # start_tokens: [batch_size], fed to the decoder at step 0 during
        # inference.
        start_tokens = tf.ones([self.batch_size],
                               dtype=tf.int32) * data_util.ID_GO
        end_token = data_util.ID_EOS
        # Projects embeddings into the cell's input size, and cell outputs
        # into vocabulary logits.
        input_layer = Dense(self.state_size, dtype=tf.float32,
                            name="input_layer")
        output_layer = Dense(self.decoder_vocab_size,
                             name="output_projection")
        if self.mode == "train":
            # Feed the ground-truth decoder input token at every time step.
            decoder_input_lookup = tf.nn.embedding_lookup(
                self.embedding_matrix, self.decoder_input)
            decoder_input_lookup = input_layer(decoder_input_lookup)
            training_helper = seq2seq.TrainingHelper(
                inputs=decoder_input_lookup,
                sequence_length=self.decoder_train_len,
                name="training_helper")
            training_decoder = seq2seq.BasicDecoder(
                cell=decoder_cell,
                initial_state=decoder_initial_state,
                helper=training_helper,
                output_layer=output_layer)
            # decoder_outputs_train: BasicDecoderOutput
            #   .rnn_output: [batch_size, max_time_step + 1, vocab] when
            #                output_time_major=False
            #   .sample_id:  sampled token ids, tf.int32
            max_decoder_len = tf.reduce_max(self.decoder_train_len)
            decoder_outputs_train, final_state, _ = seq2seq.dynamic_decode(
                training_decoder,
                impute_finished=True,
                swap_memory=True,
                maximum_iterations=max_decoder_len)
            self.decoder_logits_train = tf.identity(
                decoder_outputs_train.rnn_output)
            # Mask selects valid positions and zeroes out the padding.
            # NOTE(review): the helper uses self.decoder_train_len but the
            # mask uses self.decoder_len — presumably train_len includes the
            # GO token; confirm the two lengths are consistent.
            weights = tf.sequence_mask(self.decoder_len,
                                       maxlen=max_decoder_len,
                                       dtype=tf.float32)
            # Cross-entropy over the whole sequence, ignoring padded steps.
            self.loss = seq2seq.sequence_loss(
                logits=self.decoder_logits_train,
                targets=self.decoder_target,
                weights=weights,
                average_across_batch=True,
                average_across_timesteps=True)
            tf.summary.scalar("loss", self.loss)
            # BUG FIX: the original wrote
            #   `with tf.variable_scope(...) and tf.device(...):`
            # `A and B` returns only B, so the variable scope was never
            # entered. Use the tuple form so both context managers apply.
            with tf.variable_scope("train_optimizer"), \
                    tf.device("/device:GPU:1"):
                # Adagrad with decayed lr; clip gradients to max norm 5.0;
                # global_step counts every iteration.
                params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, params)
                clipped_gradients, _ = tf.clip_by_global_norm(
                    gradients, 5.0)
                learning_rate = tf.train.exponential_decay(
                    self.lr, self.global_step, 100000, 0.96)
                opt = tf.train.AdagradOptimizer(learning_rate)
                self.train_op = opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)
        elif self.mode == "test":
            def embedding_proj(inputs):
                # Embedding lookup followed by the same input projection
                # used at training time.
                return input_layer(
                    tf.nn.embedding_lookup(self.embedding_matrix, inputs))

            inference_decoder = seq2seq.BeamSearchDecoder(
                cell=decoder_cell,
                embedding=embedding_proj,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=decoder_initial_state,
                beam_width=self.beam_depth,
                output_layer=output_layer)
            # BeamSearchDecoder returns FinalBeamSearchDecoderOutput:
            #   .predicted_ids: [batch_size, max_time_step, beam_width]
            #                   when output_time_major=False
            #   .beam_search_decoder_output:
            #       namedtuple(scores, predicted_ids, parent_ids)
            with tf.device("/device:GPU:1"):
                decoder_outputs, decoder_last_state, decoder_output_length = \
                    seq2seq.dynamic_decode(
                        decoder=inference_decoder,
                        output_time_major=False,
                        swap_memory=False,
                        maximum_iterations=self.max_iter)
                self.decoder_pred_test = decoder_outputs.predicted_ids
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
                   target_dict_dim, is_generating, beam_size,
                   max_generation_length):
    """Build a bi-LSTM encoder / attention-LSTM decoder translation graph.

    Args:
        embedding_dim: word-embedding dimensionality (source and target).
        encoder_size: units per encoder LSTM direction.
        decoder_size: units of the decoder LSTM.
        source_dict_dim / target_dict_dim: vocabulary sizes.
        is_generating: False builds the training graph (returns feeds + loss);
            True builds the beam-search inference graph
            (returns feeds + predicted_ids).
        beam_size: beam width for inference.
        max_generation_length: decode-step cap for inference.
    """
    src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
    src_sequence_length = tf.placeholder(tf.int32, shape=[
        None,
    ])
    src_embedding_weights = tf.get_variable("source_word_embeddings",
                                            [source_dict_dim, embedding_dim])
    src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)

    src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    # no peephole
    encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=src_forward_cell,
        cell_bw=src_reversed_cell,
        inputs=src_embedding,
        sequence_length=src_sequence_length,
        dtype=tf.float32)

    # concat the forward outputs and backward outputs
    encoded_vec = tf.concat(encoder_outputs, axis=2)

    # project the encoder outputs to size of decoder lstm
    # NOTE(review): encoded_vec's last axis is 2 * encoder_size (bi-LSTM
    # concat), but it is reshaped with embedding_dim * 2 here — this only
    # works when encoder_size == embedding_dim; confirm the calling config.
    encoded_proj = tf.contrib.layers.fully_connected(inputs=tf.reshape(
        encoded_vec, shape=[-1, embedding_dim * 2]),
        num_outputs=decoder_size,
        activation_fn=None,
        biases_initializer=None)
    encoded_proj_reshape = tf.reshape(
        encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])

    # get init state for decoder lstm's H: project the backward direction's
    # first-step output through a tanh layer.
    backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
    # NOTE(review): same encoder_size == embedding_dim assumption as above.
    decoder_boot = tf.contrib.layers.fully_connected(inputs=tf.reshape(
        backword_first, shape=[-1, embedding_dim]),
        num_outputs=decoder_size,
        activation_fn=tf.nn.tanh,
        biases_initializer=None)

    # prepare the initial state for decoder lstm: zero C, projected H.
    cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
    initial_state = LSTMStateTuple(cell_init, decoder_boot)

    # create decoder lstm cell; when generating, the attention memories are
    # tiled beam_size times to match the beam-search batch layout.
    decoder_cell = LSTMCellWithSimpleAttention(
        decoder_size,
        encoded_vec if not is_generating else seq2seq.tile_batch(
            encoded_vec, beam_size),
        encoded_proj_reshape if not is_generating else seq2seq.tile_batch(
            encoded_proj_reshape, beam_size),
        src_sequence_length if not is_generating else seq2seq.tile_batch(
            src_sequence_length, beam_size),
        forget_bias=0.0)

    output_layer = Dense(target_dict_dim, name='output_projection')

    if not is_generating:
        trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
        trg_sequence_length = tf.placeholder(tf.int32, shape=[
            None,
        ])
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])
        trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
                                               trg_word_idx)

        # Teacher forcing on the ground-truth target sequence.
        training_helper = seq2seq.TrainingHelper(
            inputs=trg_embedding,
            sequence_length=trg_sequence_length,
            time_major=False,
            name='training_helper')

        training_decoder = seq2seq.BasicDecoder(cell=decoder_cell,
                                                helper=training_helper,
                                                initial_state=initial_state,
                                                output_layer=output_layer)

        # get the max length of target sequence
        max_decoder_length = tf.reduce_max(trg_sequence_length)

        decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length)

        decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
        decoder_pred_train = tf.argmax(decoder_logits_train,
                                       axis=-1,
                                       name='decoder_pred_train')
        masks = tf.sequence_mask(lengths=trg_sequence_length,
                                 maxlen=max_decoder_length,
                                 dtype=tf.float32,
                                 name='masks')

        # place holder of label sequence
        lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])

        # compute the loss (masked cross entropy over the sequence)
        loss = seq2seq.sequence_loss(logits=decoder_logits_train,
                                     targets=lbl_word_idx,
                                     weights=masks,
                                     average_across_timesteps=True,
                                     average_across_batch=True)

        # return feeding list and loss operator
        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length,
            'trg_word_idx': trg_word_idx,
            'trg_sequence_length': trg_sequence_length,
            'lbl_word_idx': lbl_word_idx
        }, loss
    else:
        start_tokens = tf.ones([
            tf.shape(src_word_idx)[0],
        ], tf.int32) * START_TOKEN_IDX

        # share the same embedding weights with target word
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])

        inference_decoder = beam_search_decoder.BeamSearchDecoder(
            cell=decoder_cell,
            embedding=lambda tokens: tf.nn.embedding_lookup(
                trg_embedding_weights, tokens),
            start_tokens=start_tokens,
            end_token=END_TOKEN_IDX,
            # Tile the initial state per beam as well.
            initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
                tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
            beam_width=beam_size,
            output_layer=output_layer)

        decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
            decoder=inference_decoder,
            output_time_major=False,
            #impute_finished=True,# error occurs
            maximum_iterations=max_generation_length)

        predicted_ids = decoder_outputs_decode.predicted_ids
        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length
        }, predicted_ids
def build_model(self):
    """Build the seq2seq model.

    Encoder: single LSTM over query embeddings. Decoder: LSTM with Bahdanau
    attention, initialized from the encoder state. In "train" mode, builds a
    teacher-forced decoder, sequence loss, Adam train op and summaries; in
    "decode" mode, builds either a beam-search or a greedy decoder.
    """
    self.query_input = tf.placeholder(tf.int32, [None, None])
    self.query_length = tf.placeholder(tf.int32, [None])
    self.answer_input = tf.placeholder(tf.int32, [None, None])
    self.answer_target = tf.placeholder(tf.int32, [None, None])
    self.answer_length = tf.placeholder(tf.int32, [None])
    self.batch_size = array_ops.shape(self.query_input)[0]
    if self.mode == "train":
        self.max_decode_step = tf.reduce_max(self.answer_length)
        self.sequence_mask = tf.sequence_mask(self.answer_length,
                                              self.max_decode_step,
                                              dtype=tf.float32)
    elif self.mode == "decode":
        # Allow generated replies up to 10x the query length.
        self.max_decode_step = tf.reduce_max(self.query_length) * 10

    # Input and output embedding: map token ids to vectors; one matrix is
    # shared by queries and answers.
    self.embeddings_matrix = tf.Variable(tf.random_uniform([
        self.vocab_size, EMBEDDING_SIZE], -1.0, 1.0), dtype=tf.float32)
    self.query_embeddings = tf.nn.embedding_lookup(self.embeddings_matrix,
                                                   self.query_input)
    self.answer_embeddings = tf.nn.embedding_lookup(self.embeddings_matrix,
                                                    self.answer_input)

    # Encoder process.
    self.encoder_outputs, self.encoder_state = tf.nn.dynamic_rnn(
        rnn.BasicLSTMCell(ENCODER_HIDDEN_SIZE),
        self.query_embeddings,
        sequence_length=self.query_length,
        dtype=tf.float32)

    # For beam search, derive tiled copies of these tensors once and reuse
    # them below; in train mode they are just aliases.
    batch_size, encoder_outputs, encoder_state, encoder_length = (
        self.batch_size, self.encoder_outputs, self.encoder_state,
        self.query_length)
    if self.mode == "decode":
        batch_size = batch_size * BEAM_WIDTH
        encoder_outputs = seq2seq.tile_batch(t=self.encoder_outputs,
                                             multiplier=BEAM_WIDTH)
        encoder_state = nest.map_structure(
            lambda s: seq2seq.tile_batch(t=s, multiplier=BEAM_WIDTH),
            self.encoder_state)
        encoder_length = seq2seq.tile_batch(t=self.query_length,
                                            multiplier=BEAM_WIDTH)

    # Attention wrapper around the decoder cell.
    self.attention_mechanism = seq2seq.BahdanauAttention(
        num_units=ENCODER_HIDDEN_SIZE,
        memory=encoder_outputs,
        memory_sequence_length=encoder_length)
    self.decoder_cell = seq2seq.AttentionWrapper(
        rnn.BasicLSTMCell(DECODER_HIDDEN_SIZE),
        attention_mechanism=self.attention_mechanism,
        attention_layer_size=ATTENTION_SIZE)
    self.decoder_initial_state = self.decoder_cell.zero_state(
        batch_size=batch_size,
        dtype=tf.float32).clone(cell_state=encoder_state)
    self.decoder_dense = tf.layers.Dense(
        self.vocab_size,
        dtype=tf.float32,
        use_bias=False,
        kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                           stddev=0.1))

    # Training uses a TrainingHelper; decoding uses a greedy or beam-search
    # helper.
    if self.mode == "train":
        training_helper = seq2seq.TrainingHelper(
            inputs=self.answer_embeddings,
            sequence_length=self.answer_length)
        training_decoder = seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            helper=training_helper,
            initial_state=self.decoder_initial_state,
            output_layer=self.decoder_dense)
        decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=training_decoder,
            impute_finished=True,
            maximum_iterations=self.max_decode_step)
        self.decoder_logits = tf.identity(decoder_outputs.rnn_output)
        self.loss = seq2seq.sequence_loss(logits=decoder_outputs.rnn_output,
                                          targets=self.answer_target,
                                          weights=self.sequence_mask)
        self.sample_ids = decoder_outputs.sample_id
        self.optimizer = tf.train.AdamOptimizer(LR_RATE)
        self.train_op = self.optimizer.minimize(self.loss)
        tf.summary.scalar('loss', self.loss)
        self.summary_op = tf.summary.merge_all()
    elif self.mode == "decode":
        start_tokens = tf.ones([self.batch_size], tf.int32) * self.go
        end_token = self.eos
        # Note: the beam-search decoder takes the un-tiled batch_size — it
        # tiles internally. Either beam-search or greedy decoding works
        # here; they are equivalent when only one reply is requested.
        if USE_BEAMSEARCH:
            inference_decoder = seq2seq.BeamSearchDecoder(
                cell=self.decoder_cell,
                embedding=self.embeddings_matrix,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=self.decoder_initial_state,
                beam_width=BEAM_WIDTH,
                output_layer=self.decoder_dense)
            # Beam search returns (predicted_ids, beam_search_decoder_output):
            #   predicted_ids: [batch_size, decoder_targets_length, beam_size]
            #   beam_search_decoder_output: scores, predicted_ids, parent_ids
            decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=inference_decoder,
                maximum_iterations=self.max_decode_step)
            self.sample_ids = decoder_outputs.predicted_ids
            # Transpose so each row is one sentence.
            self.sample_ids = tf.transpose(self.sample_ids, perm=[0, 2, 1])
        else:
            decoding_helper = seq2seq.GreedyEmbeddingHelper(
                start_tokens=start_tokens,
                end_token=end_token,
                embedding=self.embeddings_matrix)
            inference_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=decoding_helper,
                initial_state=self.decoder_initial_state,
                output_layer=self.decoder_dense)
            # Without beam search the result is (rnn_outputs, sample_id):
            #   rnn_output: [batch_size, decoder_targets_length, vocab_size]
            #   sample_id: [batch_size, decoder_targets_length], tf.int32
            self.decoder_outputs_decode, self.final_state, _ = \
                seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    maximum_iterations=self.max_decode_step)
            self.sample_ids = self.decoder_outputs_decode.sample_id
def build_decoder(self):
    """Build the decoder-with-attention sub-graph.

    Train mode: embeds decoder inputs, projects them through an input layer,
    decodes with teacher forcing, computes masked sequence loss, accuracy
    metric, summaries, and the optimizer. Decode mode: greedy or beam-search
    decoding, both going through the same embed + input-projection path.
    """
    print("building decoder and attention..")
    with tf.variable_scope('decoder'):
        # Building decoder_cell and decoder_initial_state
        self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell(
        )
        # Input projection to the cell's hidden size; output projection to
        # vocabulary logits.
        input_layer = Dense(self.hidden_units,
                            dtype=self.dtype,
                            name='input_projection')
        # Output projection layer to convert cell_outputs to logits
        output_layer = Dense(self.num_decoder_symbols,
                             name='output_projection')
        if self.mode == 'train':
            initializer = tf.random_uniform_initializer(-math.sqrt(3),
                                                        math.sqrt(3),
                                                        dtype=self.dtype)
            self.decoder_embeddings = tf.get_variable(
                name='embedding',
                shape=[self.num_decoder_symbols, self.embedding_size],
                initializer=initializer,
                dtype=self.dtype)
            self.decoder_encoded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)
            self.decoder_inputs_encoded = input_layer(self.decoder_encoded)
            print(" Decoder input encoded is ",
                  self.decoder_inputs_encoded.shape)
            # Helper to feed inputs for training: read inputs from dense
            # ground truth vectors (teacher forcing).
            training_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_inputs_encoded,
                sequence_length=self.decoder_inputs_length_train,
                time_major=False,
                name='training_helper')
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                output_layer=output_layer)
            (self.decoder_outputs_train, self.decoder_last_state_train,
             self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
                 decoder=training_decoder,
                 output_time_major=False,
                 impute_finished=True,
                 swap_memory=True,
                 maximum_iterations=self.max_decoder_length))
            # More efficient to do the projection on the
            # batch-time-concatenated tensor:
            # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols]
            # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output)
            self.decoder_logits_train = tf.identity(
                self.decoder_outputs_train.rnn_output)
            # Use argmax to extract decoder symbols to emit
            self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')
            # masks: masking for valid and padded time steps,
            # [batch_size, max_time_step + 1]
            # NOTE(review): the helper uses decoder_inputs_length_train but
            # the mask uses decoder_inputs_length_masks — presumably these
            # are consistent; verify where both are assigned.
            masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length_masks,
                maxlen=self.max_decoder_length,
                dtype=self.dtype,
                name='masks')
            print("logits train shape is ", self.decoder_logits_train.shape)
            print("decoder_targets_train train shape is ",
                  self.decoder_targets_train.shape)
            # Masked cross entropy summed over time (averaged over batch).
            self.loss = tf.reduce_sum(
                seq2seq.sequence_loss(
                    logits=self.decoder_logits_train,
                    targets=self.decoder_targets_train,
                    weights=masks,
                    average_across_timesteps=False,
                    average_across_batch=True,
                ))
            # Compute predictions
            self.accuracy, self.accuracy_op = tf.metrics.accuracy(
                labels=self.decoder_targets_train,
                predictions=self.decoder_pred_train,
                name="accuracy")
            # Training summary for the current batch_loss
            tf.summary.scalar('loss', self.loss)
            tf.summary.scalar('teacher_forcing_accuracy', self.accuracy)
            # Contruct graphs for minimizing loss
            self.init_optimizer()
        elif self.mode == 'decode':
            self.decoder_embeddings = tf.get_variable(
                name='embedding',
                shape=[self.num_decoder_symbols, self.embedding_size],
                dtype=self.dtype)
            # Start_tokens: [batch_size,] `int32` vector
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * self.dest_start_token_index
            end_token = self.dest_eos_token_index

            def embed_and_input_proj(inputs):
                # Same embed + input-projection path used at training time.
                encoded_input = tf.nn.embedding_lookup(
                    self.decoder_embeddings, inputs)
                return input_layer(encoded_input)

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding: uses the argmax
                # of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step
                print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=output_layer)
            else:
                # Beamsearch is used to approximately find the most likely
                # translation
                print("building beamsearch decoder..")
                inference_decoder = beam_search_decoder.BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=output_layer,
                )
            (self.decoder_outputs_decode, self.decoder_last_state_decode,
             self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode(
                 decoder=inference_decoder,
                 output_time_major=False,
                 swap_memory=True,
                 maximum_iterations=self.max_decode_step))
            if not self.use_beamsearch_decode:
                # Here, we use expand_dims to be compatible with the result
                # of the beamsearch decoder.
                # decoder_pred_decode: [batch_size, max_time_step, 1]
                # (output_major=False)
                self.decoder_pred_decode = tf.expand_dims(
                    self.decoder_outputs_decode.sample_id, -1)
            else:
                # Use beam search to approximately find the most likely
                # translation.
                # decoder_pred_decode: [batch_size, max_time_step, beam_width]
                # (output_major=False)
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
def _build_decoder(model, encoder_outputs, encoder_state, hparams, start_token,
                   end_token, output_layer, aux_hidden_state):
    """build decoder for the seq2seq model.

    Builds BOTH the train/eval decoder (teacher forcing through the custom
    basic_decoder.BasicDecoder, which also receives encoder_outputs,
    iterator.turns and aux_hidden_state) and the inference decoder
    (beam search in PREDICT mode when beam_width > 0; otherwise sampling
    in self-play modes or greedy decoding).

    Args:
        model: model object carrying iterator, vocab_table, embeddings, mode.
        encoder_outputs / encoder_state: encoder results.
        hparams: hyper-parameters (num_layers, beam_width,
            length_penalty_weight, max_inference_len, ...).
        start_token / end_token: token STRINGS, looked up to ids below.
        output_layer: projection applied per timestep.
        aux_hidden_state: extra state forwarded to the custom train decoder.

    Returns:
        (logits_train, logits_infer, sample_id_train, sample_id_infer);
        logits_infer is a no-op in beam-search mode (beams have no logits).
    """
    iterator = model.iterator
    # Resolve the start/end token strings to int32 ids via the vocab table.
    start_token_id = tf.cast(
        model.vocab_table.lookup(tf.constant(start_token)), tf.int32)
    end_token_id = tf.cast(
        model.vocab_table.lookup(tf.constant(end_token)), tf.int32)
    start_tokens = tf.fill([model.batch_size], start_token_id)
    end_token = end_token_id

    ## Decoder.
    with tf.variable_scope("decoder") as decoder_scope:
        cell, decoder_initial_state = _build_decoder_cell(
            model, hparams, encoder_state, base_gpu=model.global_gpu_num)
        # Reserve GPU slots for this decoder's layers.
        model.global_gpu_num += hparams.num_layers
        # ## Train or eval
        decoder_emb_inp = tf.nn.embedding_lookup(model.embedding_decoder,
                                                 iterator.target)
        # Helper
        helper_train = help_py.TrainingHelper(
            decoder_emb_inp, iterator.dialogue_len, time_major=False)
        # Decoder (project-local BasicDecoder that also consumes the encoder
        # outputs, turn indices and the auxiliary hidden state).
        my_decoder_train = basic_decoder.BasicDecoder(
            cell,
            helper_train,
            decoder_initial_state,
            encoder_outputs,
            iterator.turns,
            output_layer=output_layer,
            aux_hidden_state=aux_hidden_state)
        # Dynamic decoding
        outputs_train, _, _ = seq2seq.dynamic_decode(
            my_decoder_train,
            output_time_major=False,
            swap_memory=True,
            scope=decoder_scope)
        sample_id_train = outputs_train.sample_id
        logits_train = outputs_train.rnn_output

        ## Inference
        # else:
        beam_width = hparams.beam_width
        length_penalty_weight = hparams.length_penalty_weight
        if model.mode == tf.estimator.ModeKeys.PREDICT and beam_width > 0:
            my_decoder_infer = seq2seq.BeamSearchDecoder(
                cell=cell,
                embedding=model.embedding_decoder,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=decoder_initial_state,
                beam_width=beam_width,
                output_layer=output_layer,
                length_penalty_weight=length_penalty_weight)
        else:
            # Helper: sampling during self-play, greedy otherwise.
            if model.mode in dialogue_utils.self_play_modes:
                helper_infer = seq2seq.SampleEmbeddingHelper(
                    model.embedding_decoder, start_tokens, end_token)
            else:  # inference
                helper_infer = seq2seq.GreedyEmbeddingHelper(
                    model.embedding_decoder, start_tokens, end_token)
            # Decoder
            my_decoder_infer = seq2seq.BasicDecoder(
                cell,
                helper_infer,
                decoder_initial_state,
                output_layer=output_layer  # applied per timestep
            )
        # Dynamic decoding; reuses decoder_scope so inference shares the
        # train decoder's variables.
        outputs_infer, _, _ = seq2seq.dynamic_decode(
            my_decoder_infer,
            maximum_iterations=hparams.max_inference_len,
            output_time_major=False,
            swap_memory=True,
            scope=decoder_scope)
        if model.mode == tf.estimator.ModeKeys.PREDICT and beam_width > 0:
            # Beam search outputs have no per-step logits.
            logits_infer = tf.no_op()
            sample_id_infer = outputs_infer.predicted_ids
        else:
            logits_infer = outputs_infer.rnn_output
            sample_id_infer = outputs_infer.sample_id
    return logits_train, logits_infer, sample_id_train, sample_id_infer
def build_decoder(self):
    """Build the decoder graph.

    In 'train' mode: builds a TrainingHelper-driven decoder, logits,
    per-token cross-entropy (plus reward-weighted variants for RL), and the
    masked sequence loss. In 'decode' mode: builds a greedy or beam-search
    inference decoder and the predicted-id tensors.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        # Building decoder_cell and decoder_initial_state
        (
            self.decoder_cell,
            self.decoder_initial_state
        ) = self.build_decoder_cell()

        # Decoder embedding: shared with the encoder or a fresh variable.
        if self.share_embedding:
            self.decoder_embeddings = self.encoder_embeddings
        else:
            with tf.device(_get_embed_device(self.target_vocab_size)):
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)
                # On Using Very Large Target Vocabulary
                # for Neural Machine Translation
                # https://arxiv.org/pdf/1412.2007v2.pdf

        # Input projection layer to feed embedded inputs to the cell.
        # ** Essential when use_residual=True to match input/output dims.
        hidden_units = self.hidden_units
        if self.bidirectional:
            hidden_units *= 2
        input_layer = layers.Dense(
            hidden_units, dtype=tf.float32,
            use_bias=False, name='input_projection')
        self.output_layer = layers.Dense(
            self.target_vocab_size, dtype=tf.float32,
            use_bias=False, name='output_projection')

        if self.mode == 'train':
            # decoder_inputs_embedded:
            # [batch_size, max_time_step + 1, embedding_size]
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)
            # Embedded inputs having gone through input projection layer.
            self.decoder_inputs_embedded = input_layer(
                self.decoder_inputs_embedded)

            # Helper to feed inputs for training:
            # reads inputs from dense ground-truth vectors.
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length_train,
                time_major=self.time_major,
                name='training_helper')

            # output_layer is deliberately NOT applied here: projecting at
            # every time step inside the decoder is slow. The projection is
            # done once over the full output after decoding. For this trick
            # to work, dynamic_decode's `scope` argument must be set.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                # output_layer=self.output_layer
            )

            # Maximum decoder time_steps in current batch.
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length_train)

            # decoder_outputs_train: BasicDecoderOutput
            #   rnn_output: [batch, T+1, dims] (or time-major transposed)
            #   sample_id: tf.int32 ids
            (
                outputs,
                self.final_state,   # contains attention state
                _                   # final_sequence_lengths (unused)
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)

            # Project all decoder outputs at once on the batch-time-
            # concatenated tensor; the official NMT repo reports a 10~20%
            # speedup from this over per-step projection.
            self.decoder_logits_train = self.output_layer(
                outputs.rnn_output)

            # masks: 1.0 for valid and 0.0 for padded time steps,
            # [batch_size, max_time_step + 1].
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length_train,
                maxlen=max_decoder_length,
                dtype=tf.float32, name='masks')

            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(
                    decoder_logits_train, (1, 0, 2))

            self.decoder_pred_train = tf.argmax(
                decoder_logits_train, axis=-1,
                name='decoder_pred_train')

            # The variables below are used for reinforcement-learning
            # training: per-token cross-entropy, optionally scaled by
            # per-sample rewards.
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_targets_train,
                    logits=decoder_logits_train)

            self.train_entropy_rewards = tf.multiply(
                self.train_entropy, self.rewards)
            self.train_entropy_rewards *= self.masks

            # Normalization mirrors seq2seq.sequence_loss with
            # average_across_timesteps and average_across_batch both True:
            # https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/contrib/seq2seq/python/ops/loss.py
            #   crossent = reduce_sum(crossent) / (reduce_sum(weights) + 1e-12)
            # NOTE(review): loss_without_rewards sums the UNMASKED entropy
            # but divides by the masked total_size — padded steps leak in;
            # confirm this is intentional.
            self.loss_without_rewards = tf.reduce_sum(self.train_entropy)
            self.loss_rewards = tf.reduce_sum(self.train_entropy_rewards)
            total_size = tf.reduce_sum(self.masks)
            total_size += 1e-12  # avoid division by 0 for all-0 weights
            self.loss_without_rewards /= total_size
            self.loss_rewards /= total_size

            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_targets_train,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            # Training summary for the current batch_loss.
            tf.summary.scalar('loss', self.loss)

        elif self.mode == 'decode':
            # Prediction (non-training) mode.
            start_tokens = tf.fill([self.batch_size], WordSequence.START)
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Wrapper: embedding lookup followed by input projection."""
                return input_layer(
                    tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

            if not self.use_beamsearch_decode:
                # Helper for greedy decoding: uses the argmax of the output.
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step.
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.output_layer)
            else:
                # Beam search approximately finds the most likely output.
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.output_layer,
                )

            # GreedyDecoder returns BasicDecoderOutput(rnn_output, sample_id);
            # BeamSearchDecoder returns FinalBeamSearchDecoderOutput
            # (predicted_ids: [batch, T, beam_width] when batch-major,
            #  beam_search_decoder_output: (scores, predicted_ids, parent_ids)).
            #
            # Potential max-length choice mentioned by the official docs:
            # maximum_iterations = tf.round(tf.reduce_max(source_sequence_length) * 2)
            # https://www.tensorflow.org/tutorials/seq2seq
            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                # Default: decode up to 4x the longest input length.
                max_decode_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4)

            (
                self.decoder_outputs_decode,
                self.final_state,
                _   # decoder_outputs_length_decode (unused)
            ) = (
                seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=self.time_major,
                    # impute_finished=True,   # error occurs
                    maximum_iterations=max_decode_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope))

            if not self.use_beamsearch_decode:
                # Greedy path: sample_id is [batch_size, max_time_step].
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            else:
                # Beam path: predicted_ids is
                # [batch_size, max_time_step, beam_width] when batch-major.
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                # Reorder to [batch, beam_width, time] for the caller.
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode, perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def __init__(self, vocab_size, hidden_size, dropout, num_layers,
             max_gradient_norm, batch_size, learning_rate, lr_decay_factor,
             max_target_length, max_source_length, decoder_mode=False):
    '''Build the seq2seq graph (encoder, decoder, optional training ops).

    vocab_size: number of vocab tokens
    hidden_size: dimension of hidden layers and embeddings
    dropout: input keep probability for the LSTM DropoutWrappers
        (NOTE(review): treated as keep_prob, not drop rate — confirm callers)
    num_layers: number of hidden layers
    max_gradient_norm: maximum gradient magnitude (element-wise clip)
    batch_size: number of training examples fed to network at once
    learning_rate: starting learning rate of network
    lr_decay_factor: amount by which to decay learning rate (kept for
        interface compatibility; not consumed here)
    max_target_length / max_source_length: sequence-length bounds
    decoder_mode: if True, build a beam-search inference decoder and no
        backward-pass nodes
    '''
    GO_ID = config.GO_ID
    EOS_ID = config.EOS_ID
    self.max_source_length = max_source_length
    self.max_target_length = max_target_length
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.global_step = tf.Variable(0, trainable=False)
    self.learning_rate = learning_rate

    # Placeholders: [batch, time] int32 token ids plus per-example lengths.
    self.encoder_inputs = tf.placeholder(shape=(None, None),
                                         dtype=tf.int32,
                                         name='encoder_inputs')
    self.source_lengths = tf.placeholder(shape=(None,),
                                         dtype=tf.int32,
                                         name='source_lengths')
    self.decoder_targets = tf.placeholder(shape=(None, None),
                                          dtype=tf.int32,
                                          name='decoder_targets')
    self.target_lengths = tf.placeholder(shape=(None,),
                                         dtype=tf.int32,
                                         name="target_lengths")

    with tf.variable_scope('embeddings') as scope:
        # One embedding matrix shared by encoder inputs and decoder targets.
        embeddings = tf.Variable(
            tf.random_uniform([vocab_size, hidden_size], -1.0, 1.0),
            dtype=tf.float32)
        encoder_inputs_embedded = tf.nn.embedding_lookup(
            embeddings, self.encoder_inputs)
        targets_embedding = tf.nn.embedding_lookup(
            embeddings, self.decoder_targets)

    def _make_cell():
        # A fresh cell per layer/direction. BUGFIX: the original built ONE
        # cell and did `MultiRNNCell([cell] * num_layers)` and reused the
        # same stack as both cell_fw and cell_bw, which shares weights
        # across layers/directions and breaks TF variable scoping.
        cell = rnn.LSTMCell(hidden_size)
        return rnn.DropoutWrapper(cell, input_keep_prob=dropout)

    with tf.variable_scope('encoder') as scope:
        encoder_cell_fw = rnn.MultiRNNCell(
            [_make_cell() for _ in range(num_layers)])
        encoder_cell_bw = rnn.MultiRNNCell(
            [_make_cell() for _ in range(num_layers)])
        _, encoder_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=encoder_cell_fw,
            cell_bw=encoder_cell_bw,
            sequence_length=self.source_lengths,
            inputs=encoder_inputs_embedded,
            dtype=tf.float32,
            time_major=False)

    with tf.variable_scope('decoder') as scope:
        decoder_cell = rnn.MultiRNNCell(
            [_make_cell() for _ in range(num_layers)])
        # TODO add attention
        # seq2seq.BahdanauAttention(num_units=, memory=encoder_output)
        # decoder_cell = seq2seq.AttentionWrapper(cell=decoder_cell,
        #                                         attention_mechanism=)

        # encoder_state is a (fw, bw) pair of MultiRNN states; use the
        # backward state for both modes. BUGFIX: the original inconsistently
        # used encoder_state[0] for beam search and encoder_state[-1] for
        # training.
        decoder_initial_state = encoder_state[-1]
        output_layer = Dense(vocab_size)
        beam_width = 2

        if decoder_mode:
            decoder = seq2seq.BeamSearchDecoder(
                # BUGFIX: the original omitted the mandatory `cell` argument.
                cell=decoder_cell,
                embedding=embeddings,
                # BUGFIX: was `GOD_ID`, an undefined name (NameError).
                start_tokens=tf.tile([GO_ID], [batch_size]),
                end_token=EOS_ID,
                # BeamSearchDecoder requires the state tiled to beam_width.
                initial_state=seq2seq.tile_batch(decoder_initial_state,
                                                 multiplier=beam_width),
                beam_width=beam_width,
                output_layer=output_layer)
        else:
            helper = seq2seq.TrainingHelper(
                inputs=targets_embedding,
                sequence_length=self.target_lengths)
            decoder = seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=helper,
                initial_state=decoder_initial_state,
                output_layer=output_layer)

        final_outputs, final_state, final_sequence_lengths = \
            seq2seq.dynamic_decode(decoder=decoder)
        if decoder_mode:
            # BUGFIX: beam output is FinalBeamSearchDecoderOutput, which has
            # no rnn_output; the original raised AttributeError here.
            self.predictions = final_outputs.predicted_ids
        else:
            self.logits = final_outputs.rnn_output

    if not decoder_mode:
        with tf.variable_scope("loss") as scope:
            # dynamic_decode stops at the longest sequence in the batch, so
            # logits must be padded out to max_target_length to line up with
            # the (fixed-width) targets placeholder.
            pad_size = self.max_target_length - \
                tf.reduce_max(final_sequence_lengths)
            self.logits = tf.pad(self.logits,
                                 [[0, 0], [0, pad_size], [0, 0]])
            weights = tf.sequence_mask(
                lengths=final_sequence_lengths,
                maxlen=self.max_target_length,
                dtype=tf.float32,
                name='weights')
            x_entropy_loss = seq2seq.sequence_loss(
                logits=self.logits,
                targets=self.decoder_targets,
                weights=weights)
            self.loss = tf.reduce_mean(x_entropy_loss)
            # BUGFIX: the ctor's learning_rate argument was stored but never
            # used — AdamOptimizer ran at its default rate.
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate)
            gradients = optimizer.compute_gradients(x_entropy_loss)
            # BUGFIX: guard against None gradients (variables not on the
            # loss path); tf.clip_by_value(None, ...) raises.
            capped_grads = [
                (tf.clip_by_value(grad, -max_gradient_norm,
                                  max_gradient_norm), var)
                for grad, var in gradients if grad is not None]
            self.train_op = optimizer.apply_gradients(
                capped_grads, global_step=self.global_step)
    self.saver = tf.train.Saver(tf.global_variables())
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder graph.

    'train' mode: TrainingHelper decoder, masked sequence loss, and a
    reward-weighted loss variant. 'decode' mode: greedy or beam-search
    inference decoder producing predicted ids.
    """
    with tf.variable_scope('decoder') as decoder_scope:
        (
            self.decoder_cell,
            self.decoder_initial_state
        ) = self.build_decoder_cell(encoder_outputs, encoder_state)

        # Decoder embedding; device (GPU/CPU) chosen by vocab size.
        with tf.device(_get_embed_device(self.target_vocab_size)):
            # Shared with the encoder...
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            # ...or separate but initialized from pretrained vectors,
            # loaded later via the assign op below...
            elif self.pretrained_embedding:
                self.decoder_embeddings = tf.Variable(
                    tf.constant(0.0,
                                shape=(self.target_vocab_size,
                                       self.embedding_size)),
                    trainable=True,
                    name='embeddings')
                self.decoder_embeddings_placeholder = tf.placeholder(
                    tf.float32,
                    shape=(self.target_vocab_size, self.embedding_size))
                self.decoder_embeddings_init = \
                    self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
            # ...or separate and freshly initialized.
            else:
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)

        # Output projection (fully connected layer) to vocab logits.
        self.decoder_output_projection = layers.Dense(
            self.target_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            name='decoder_output_projection')

        # Training mode.
        if self.mode == 'train':
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings,
                ids=self.decoder_inputs_train)
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length,
                time_major=self.time_major,
                name='training_helper')
            # No output_layer here: logits are projected once over the full
            # output after decoding (faster). For this trick to work,
            # dynamic_decode's `scope` argument must be set.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
            )
            # Maximum decoder time_steps in current batch.
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
            (
                outputs,
                self.final_state,   # contains attention state
                _                   # final_sequence_lengths (unused)
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)
            # Single post-hoc projection of all decoder outputs.
            self.decoder_logits_train = self.decoder_output_projection(
                outputs.rnn_output)
            # masks: 1.0 for valid and 0.0 for padded time steps,
            # [batch_size, max_time_step + 1].
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks')
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(
                    decoder_logits_train, (1, 0, 2))
            self.decoder_pred_train = tf.argmax(
                decoder_logits_train,
                axis=-1,
                name='decoder_pred_train')
            # Variables below support a custom (reward-weighted) training
            # objective. NOTE(review): labels/targets use
            # self.decoder_inputs, while a sibling implementation in this
            # file uses decoder_targets_train — confirm this is intentional
            # and not an off-by-one (inputs are typically targets shifted).
            self.train_entropy = \
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,
                    logits=decoder_logits_train)
            self.masks_rewards = self.masks * self.rewards
            self.loss_rewards = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks_rewards,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            # Combined objective with an externally supplied extra loss.
            self.loss_add = self.loss + self.add_loss

        # Prediction (non-training) mode.
        elif self.mode == 'decode':
            start_tokens = tf.tile([WordSequence.START],
                                   [self.batch_size])
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Embedding lookup wrapper (no input projection here)."""
                return tf.nn.embedding_lookup(self.decoder_embeddings,
                                              inputs)

            if not self.use_beamsearch_decode:
                # Greedy decoding: uses the argmax of the output.
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection)
            else:
                # Beam search approximately finds the most likely output.
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection,
                )

            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                # Default: decode up to 4x the longest input length.
                max_decode_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4)

            (
                self.decoder_outputs_decode,
                self.final_state,
                _   # decoder_outputs_length_decode (unused)
            ) = seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=self.time_major,
                # impute_finished=True,   # error occurs
                maximum_iterations=max_decode_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)

            if not self.use_beamsearch_decode:
                # Greedy: sample_id is [batch_size, max_time_step].
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            else:
                # Beam: predicted_ids is [batch, time, beam] (batch-major);
                # reordered below to [batch, beam, time].
                self.decoder_pred_decode = \
                    self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode, perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
def build_train_decoder(self):
    """Build the training decoder, its predictions, and the sequence loss.

    Supports two teacher-forcing regimes selected by self.train_mode:
    'ground_truth' (always feed the gold token) and 'scheduled_sampling'
    (feed the model's own sample with probability
    self.sampling_probability).

    Raises:
        NotImplementedError: for any other self.train_mode value.
    """
    self.decoder_inputs_embedded = tf.nn.embedding_lookup(
        params=self.embedding,
        ids=self.decoder_inputs_train)

    if self.train_mode == 'ground_truth':
        # Standard teacher forcing: decoder always sees the gold inputs.
        training_helper = seq2seq.TrainingHelper(
            inputs=self.decoder_inputs_embedded,
            sequence_length=self.decoder_inputs_length_train,
            time_major=False,
            name='training_helper')
    elif self.train_mode == 'scheduled_sampling':
        # Scheduled sampling: with sampling_probability, feed the model's
        # own (embedded) sample instead of the gold token.
        training_helper = seq2seq.ScheduledEmbeddingTrainingHelper(
            inputs=self.decoder_inputs_embedded,
            sequence_length=self.decoder_inputs_length_train,
            embedding=lambda inputs: tf.nn.embedding_lookup(
                self.embedding, inputs),
            sampling_probability=self.sampling_probability,
            name='scheduled_embedding_training_helper')
    else:
        raise NotImplementedError(
            'Train mode: {} is not yet implemented'.format(
                self.train_mode))

    training_decoder = seq2seq.BasicDecoder(
        cell=self.decoder_cell,
        helper=training_helper,
        initial_state=self.decoder_initial_state,
        output_layer=self.output_layer)
    # Longest target sequence in the current batch bounds decoding.
    max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train)

    (
        self.decoder_outputs_train,
        self.decoder_last_state_train,
        self.decoder_outputs_length_train
    ) = seq2seq.dynamic_decode(
        decoder=training_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_decoder_length)

    # NOTE(sdsuo): Not sure why this is necessary
    self.decoder_logits_train = tf.identity(
        self.decoder_outputs_train.rnn_output)

    # Use argmax to extract decoder symbols to emit.
    self.decoder_pred_train = tf.argmax(
        self.decoder_logits_train, axis=-1,
        name='decoder_pred_train')

    # masks: masking for valid and padded time steps,
    # [batch_size, max_time_step + 1].
    masks = tf.sequence_mask(
        lengths=self.decoder_inputs_length_train,
        maxlen=max_decoder_length,
        dtype=self.dtype,
        name='masks')

    # Computes per-word average cross-entropy over a batch. Internally
    # calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default.
    self.loss = seq2seq.sequence_loss(
        logits=self.decoder_logits_train,
        targets=self.decoder_targets_train,
        weights=masks,
        average_across_timesteps=True,
        average_across_batch=True)

    # Training summary for the current batch_loss.
    tf.summary.scalar('loss', self.loss)

    # Construct graphs for minimizing loss.
    self.init_optimizer()
def BuildNetwork(self, learningRate):
    """Build the whole graph: a BLSTM autoencoder (with its own loss and
    train op) plus a two-stage BLSTM regressor whose scalar prediction is
    trained with an MSE/RMSE/MAE loss.

    Args:
        learningRate: Adam learning rate used for both train ops.

    Side effects: fills self.parameters with every intermediate tensor and
    sets self.trainAE (autoencoder) and self.train (regressor) ops.
    """
    # 40-dim frame features, variable batch and time ("assumes e.g. fbank
    # frames — TODO confirm"); scalar-per-sample labels; per-sample lengths.
    self.dataInput = tensorflow.placeholder(
        dtype=tensorflow.float32, shape=[None, None, 40], name='DataInput')
    self.labelInput = tensorflow.placeholder(
        dtype=tensorflow.float32, name='LabelInput')
    self.seqInput = tensorflow.placeholder(
        dtype=tensorflow.int32, shape=[None], name='SeqInput')

    #############################################################################
    # Batch Parameters
    #############################################################################
    self.parameters['BatchSize'], self.parameters[
        'TimeStep'], _ = tensorflow.unstack(
            tensorflow.shape(input=self.dataInput, name='DataShape'))

    ###################################################################################################
    # Encoder (autoencoder branch): stacked BLSTM over the raw input.
    ###################################################################################################
    with tensorflow.variable_scope('Encoder_AE'):
        self.parameters[
            'Encoder_Cell_Forward_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters[
            'Encoder_Cell_Backward_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters['Encoder_Output_AE'], self.parameters['Encoder_FinalState_AE'] = \
            tensorflow.nn.bidirectional_dynamic_rnn(
                cell_fw=self.parameters['Encoder_Cell_Forward_AE'],
                cell_bw=self.parameters['Encoder_Cell_Backward_AE'],
                inputs=self.dataInput,
                sequence_length=self.seqInput,
                dtype=tensorflow.float32)

    if self.attention is None:
        # No attention: the decoder's initial state per layer is the
        # fw/bw-concatenated final LSTM state of the encoder.
        self.parameters['Decoder_InitalState_AE'] = []
        for index in range(self.rnnLayers):
            self.parameters[
                'Encoder_Cell_Layer%d_AE' % index] = rnn.LSTMStateTuple(
                    c=tensorflow.concat([
                        self.parameters['Encoder_FinalState_AE'][index]
                        [0].c, self.parameters['Encoder_FinalState_AE']
                        [index][1].c
                    ],
                                        axis=1),
                    h=tensorflow.concat([
                        self.parameters['Encoder_FinalState_AE'][index]
                        [0].h, self.parameters['Encoder_FinalState_AE']
                        [index][1].h
                    ],
                                        axis=1))
            self.parameters['Decoder_InitalState_AE'].append(
                self.parameters['Encoder_Cell_Layer%d_AE' % index])
        self.parameters['Decoder_InitalState_AE'] = tuple(
            self.parameters['Decoder_InitalState_AE'])
    else:
        # With attention: the attention summary replaces the cell state c
        # (h stays the concatenated final hidden state).
        self.attentionList = self.attention(
            dataInput=self.parameters['Encoder_Output_AE'],
            scopeName=self.attentionName,
            hiddenNoduleNumber=2 * self.hiddenNodules,
            attentionScope=self.attentionScope,
            blstmFlag=True)
        self.parameters['Decoder_InitalState_AE'] = []
        for index in range(self.rnnLayers):
            self.parameters[
                'Encoder_Cell_Layer%d_AE' % index] = rnn.LSTMStateTuple(
                    c=self.attentionList['FinalResult'],
                    h=tensorflow.concat([
                        self.parameters['Encoder_FinalState_AE'][index]
                        [0].h, self.parameters['Encoder_FinalState_AE']
                        [index][1].h
                    ],
                                        axis=1))
            self.parameters['Decoder_InitalState_AE'].append(
                self.parameters['Encoder_Cell_Layer%d_AE' % index])
        self.parameters['Decoder_InitalState_AE'] = tuple(
            self.parameters['Decoder_InitalState_AE'])

    #############################################################################
    # Decoder Label Pretreatment: teacher-forced with the raw input frames,
    # i.e. the decoder learns to reconstruct its own input (autoencoder).
    #############################################################################
    self.parameters['Decoder_Helper_AE'] = seq2seq.TrainingHelper(
        inputs=self.dataInput,
        sequence_length=self.seqInput,
        name='Decoder_Helper_AE')
    with tensorflow.variable_scope('Decoder_AE'):
        # Project decoder outputs back to the 40-dim input feature space.
        self.parameters['Decoder_FC_AE'] = Dense(40)
        # Decoder cells are 2x wide to match the concatenated fw+bw states.
        self.parameters[
            'Decoder_Cell_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules * 2)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters['Decoder_AE'] = seq2seq.BasicDecoder(
            cell=self.parameters['Decoder_Cell_AE'],
            helper=self.parameters['Decoder_Helper_AE'],
            initial_state=self.parameters['Decoder_InitalState_AE'],
            output_layer=self.parameters['Decoder_FC_AE'])
        self.parameters['Decoder_Logits_AE'], self.parameters[
            'Decoder_FinalState_AE'], self.parameters[
                'Decoder_FinalSeq_AE'] = seq2seq.dynamic_decode(
                    decoder=self.parameters['Decoder_AE'])

    #############################################################################
    # Losses (autoencoder): L1 reconstruction against the input frames;
    # [0] picks rnn_output from the BasicDecoderOutput namedtuple.
    #############################################################################
    self.parameters['Loss_AE'] = tensorflow.losses.absolute_difference(
        labels=self.dataInput,
        predictions=self.parameters['Decoder_Logits_AE'][0],
        weights=self.weight)
    self.trainAE = tensorflow.train.AdamOptimizer(
        learning_rate=learningRate).minimize(self.parameters['Loss_AE'])

    #############################################################################
    # DBLSTM regressor: first BLSTM over the raw input...
    #############################################################################
    with tensorflow.variable_scope('FirstBLSTM'):
        self.parameters[
            'First_Cell_Forward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters[
            'First_Cell_Backward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters['First_Output'], self.parameters['First_FinalState'] = \
            tensorflow.nn.bidirectional_dynamic_rnn(
                cell_fw=self.parameters['First_Cell_Forward'],
                cell_bw=self.parameters['First_Cell_Backward'],
                inputs=self.dataInput,
                sequence_length=self.seqInput,
                dtype=tensorflow.float32)

    if self.attention is None:
        # Sentence vector = concatenated fw/bw final h of the top layer.
        self.parameters['First_FinalOutput'] = tensorflow.concat([
            self.parameters['First_FinalState'][self.rnnLayers - 1][0].h,
            self.parameters['First_FinalState'][self.rnnLayers - 1][1].h
        ],
                                                                 axis=1)
    else:
        self.firstAttentionList = self.attention(
            dataInput=self.parameters['First_Output'],
            scopeName=self.attentionName + '_DBLTM',
            hiddenNoduleNumber=2 * self.hiddenNodules,
            attentionScope=self.attentionScope,
            blstmFlag=True)
        self.parameters['First_FinalOutput'] = self.firstAttentionList[
            'FinalResult']

    # Fuse the regressor's sentence vector with the autoencoder's attention
    # summary. NOTE(review): self.attentionList only exists when
    # self.attention is not None — presumably concatType is only set in
    # that configuration; verify against callers.
    if self.concatType == 'Concat':
        self.parameters['First_Concat'] = tensorflow.concat(
            [
                self.parameters['First_FinalOutput'],
                self.attentionList['FinalResult']
            ],
            axis=1,
            name='First_Concat')
    if self.concatType == 'Plus':
        self.parameters['First_Concat'] = tensorflow.add(
            self.parameters['First_FinalOutput'],
            self.attentionList['FinalResult'],
            name='First_Plus')
    if self.concatType == 'Multiply':
        self.parameters['First_Concat'] = tensorflow.multiply(
            self.parameters['First_FinalOutput'],
            self.attentionList['FinalResult'],
            name='First_Multiply')

    # ...then a second BLSTM over the per-sample fused vectors, treating the
    # whole batch as a single sequence (note the [newaxis] batch dim of 1).
    with tensorflow.variable_scope('SecondBLSTM'):
        self.parameters[
            'Second_Cell_Forward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters[
            'Second_Cell_Backward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters['Second_Output'], self.parameters['Second_FinalState'] = \
            tensorflow.nn.bidirectional_dynamic_rnn(
                cell_fw=self.parameters['Second_Cell_Forward'],
                cell_bw=self.parameters['Second_Cell_Backward'],
                inputs=self.parameters['First_Concat'][tensorflow.newaxis, :, :],
                dtype=tensorflow.float32)

    if self.secondAttention is None:
        self.parameters['Second_FinalOutput'] = tensorflow.concat([
            self.parameters['Second_FinalState'][self.rnnLayers - 1][0].h,
            self.parameters['Second_FinalState'][self.rnnLayers - 1][1].h
        ],
                                                                  axis=1)
    else:
        self.secondAttentionList = self.secondAttention(
            dataInput=self.parameters['Second_Output'],
            scopeName=self.secondAttentionName,
            hiddenNoduleNumber=2 * self.hiddenNodules,
            attentionScope=self.secondAttentionScope,
            blstmFlag=True)
        self.parameters['Second_FinalOutput'] = self.secondAttentionList[
            'FinalResult']

    # Scalar regression head.
    self.parameters['FinalPredict'] = tensorflow.reshape(
        tensor=tensorflow.layers.dense(
            inputs=self.parameters['Second_FinalOutput'],
            units=1,
            activation=None,
            name='FinalPredict'),
        shape=[1])

    # Regression loss selected by configuration.
    if self.lossType == 'MSE':
        self.parameters['Loss'] = tensorflow.losses.mean_squared_error(
            labels=self.labelInput,
            predictions=self.parameters['FinalPredict'])
    if self.lossType == 'RMSE':
        self.parameters['Loss'] = tensorflow.sqrt(
            tensorflow.losses.mean_squared_error(
                labels=self.labelInput,
                predictions=self.parameters['FinalPredict']))
    if self.lossType == 'MAE':
        self.parameters['Loss'] = tensorflow.losses.absolute_difference(
            labels=self.labelInput,
            predictions=self.parameters['FinalPredict'])

    # Only train variables created after the (frozen) attention prefix;
    # NETWORK_LENGTH maps attention name -> number of frozen variables.
    # NOTE(review): slicing global_variables() by index is fragile — it
    # depends on graph construction order; verify NETWORK_LENGTH is in sync.
    self.train = tensorflow.train.AdamOptimizer(
        learning_rate=learningRate).minimize(
            self.parameters['Loss'],
            var_list=tensorflow.global_variables()
            [NETWORK_LENGTH[self.attentionName]:])
def create_model_predict(self, input, mode='decode'):
    """Build the inference graph and return the predicted token ids.

    Args:
        input: int32 token-id tensor fed to the encoder
            (NOTE: shadows the `input` builtin; kept for interface
            compatibility).
        mode: unused here; kept for interface compatibility.

    Returns:
        `predicts` tensor: [batch, time, 1] for greedy decoding, or
        [batch, time, beam_width] for beam search.
    """
    # `beam_with` is the project's (misspelled) beam-width hyperparameter.
    use_beam_search = False
    if self.params.beam_with > 1:
        use_beam_search = True
    # Scope name 'attetnion_seq2seq' is misspelled but must stay as-is:
    # it is baked into existing checkpoints' variable names.
    with tf.variable_scope("attetnion_seq2seq", reuse=tf.AUTO_REUSE):
        embeddings_matrix = self._create_embedding()
        keep_prob = 1 - self.params.dropout_rate
        batch_size = tf.shape(input)[0]

        # encoder
        encoder_outputs, encoder_last_states, encoder_inputs_length = \
            self._create_encoder(embeddings_matrix, input, keep_prob)

        # decoder
        with tf.variable_scope('decoder'):
            # Output projection layer to convert cell outputs to logits.
            output_layer = Dense(self.params.vocab_size,
                                 name='output_project')
            # Input projection so embedded tokens match the (2x-wide) cell.
            input_layer = Dense(self.params.hidden_units * 2,
                                dtype=tf.float32,
                                name='input_projection')
            decoder_cell, decoder_initial_state = create_decoder_cell(
                enc_outputs=encoder_outputs,
                enc_states=encoder_last_states,
                enc_seq_len=encoder_inputs_length,
                num_layers=self.params.depth,
                num_units=self.params.hidden_units * 2,
                keep_prob=keep_prob,
                use_residual=self.params.use_residual,
                use_beam_search=use_beam_search,
                beam_size=self.params.beam_with,
                batch_size=batch_size,
                top_attention=self.params.top_attention)

            # Start_tokens: [batch_size,] `int32` vector.
            start_tokens = tf.ones([
                batch_size,
            ], tf.int32) * data_utils.GO_ID
            end_token = data_utils.EOS_ID

            def embed_and_input_proj(inputs):
                """Embedding lookup followed by the input projection."""
                return input_layer(
                    tf.nn.embedding_lookup(embeddings_matrix, inputs))

            if self.params.beam_with <= 1:
                # Greedy decoding.
                decode_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                inference_decoder = seq2seq.BasicDecoder(
                    cell=decoder_cell,
                    helper=decode_helper,
                    initial_state=decoder_initial_state,
                    output_layer=output_layer)
                decoder_output, _, _ = seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=self.params.max_seq_length)
            else:
                # Beam-search decoding; initial state was already tiled by
                # create_decoder_cell (use_beam_search=True above).
                inference_decoder = seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=decoder_initial_state,
                    beam_width=self.params.beam_with,
                    output_layer=output_layer)
                # impute_finished is unsupported with beam search, hence
                # omitted here.
                decoder_output, _, _ = seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=False,
                    maximum_iterations=self.params.max_seq_length)

            if self.params.beam_with <= 1:
                # Expand to [batch, time, 1] to match the beam-search shape.
                decoder_predict = tf.expand_dims(decoder_output.sample_id,
                                                 -1)
            else:
                decoder_predict = decoder_output.predicted_ids
            decoder_predict = tf.identity(decoder_predict, 'predicts')
            return decoder_predict
def build_decoder(self):
    """Build the decoder sub-graph: attention cell, embeddings, projections.

    In 'train' mode this also wires up the sequence loss, summaries and the
    optimizer; in 'decode' mode it builds a greedy or beam-search inference
    decoder and exposes predictions (and attention alignments when enabled).
    """
    print("building decoder and attention..")
    with tf.variable_scope('decoder'):
        # Attention-wrapped cell plus its initial state (beam-tiled when
        # beam search is enabled — built inside build_decoder_cell).
        self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell()

        # Fixed seed keeps the embedding initialization reproducible.
        initializer = tf.contrib.layers.xavier_initializer(seed=0, dtype=self.dtype)
        self.decoder_embeddings = tf.get_variable(
            name='embedding',
            shape=[self.num_decoder_symbols, self.decoder_embedding_size],
            initializer=initializer,
            dtype=self.dtype)

        # Input projection: embedding -> decoder hidden size.
        input_layer = Dense(self.decoder_hidden_units, dtype=self.dtype,
                            name='input_projection')
        # Output projection: cell output -> vocabulary logits.
        output_layer = Dense(self.num_decoder_symbols, name='output_projection')

        if self.mode == 'train':
            # Teacher forcing: feed the (shifted) ground-truth targets.
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings, ids=self.decoder_inputs_train)
            self.decoder_inputs_embedded = input_layer(self.decoder_inputs_embedded)

            training_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=self.decoder_inputs_length_train,
                time_major=False,
                name='training_helper')
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                output_layer=output_layer)
                # output_layer=None)

            # Decode only as far as the longest target in the batch.
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train)
            (self.decoder_outputs_train,
             self.decoder_last_state_train,
             self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=max_decoder_length))

            # tf.identity gives the logits a stable graph name for fetching.
            self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output)
            self.decoder_pred_train = tf.argmax(self.decoder_logits_train, axis=-1,
                                                name='decoder_pred_train')

            # Mask out padding positions so they do not contribute to the loss.
            masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train,
                                     maxlen=max_decoder_length,
                                     dtype=self.dtype, name='masks')
            self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train,
                                              targets=self.decoder_targets_train,
                                              weights=masks,
                                              average_across_timesteps=True,
                                              average_across_batch=True,)
            tf.summary.scalar('loss', self.loss)

            # Construct graphs for minimizing loss
            self.init_optimizer()

        elif self.mode == 'decode':
            # Start_tokens: [batch_size,] `int32` vector
            start_tokens = tf.ones([self.batch_size,], tf.int32) * data_utils.start_token
            end_token = data_utils.end_token

            def embed_and_input_proj(inputs):
                # Per-step embedding function: lookup then input projection.
                return input_layer(tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding: uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=start_tokens,
                                                                end_token=end_token,
                                                                embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step
                print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
                                                         helper=decoding_helper,
                                                         initial_state=self.decoder_initial_state,
                                                         output_layer=output_layer)
            else:
                # Beamsearch is used to approximately find the most likely translation
                print("building beamsearch decoder..")
                inference_decoder = beam_search_decoder.BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=output_layer,)

            # For GreedyDecoder, return
            # decoder_outputs_decode: BasicDecoderOutput instance
            #                         namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_decode.rnn_output: [batch_size, max_time_step, num_decoder_symbols] if output_time_major=False
            #                                    [max_time_step, batch_size, num_decoder_symbols] if output_time_major=True
            # decoder_outputs_decode.sample_id: [batch_size, max_time_step], tf.int32 if output_time_major=False
            #                                   [max_time_step, batch_size], tf.int32 if output_time_major=True
            # For BeamSearchDecoder, return
            # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
            #                         namedtuple(predicted_ids, beam_search_decoder_output)
            # decoder_outputs_decode.predicted_ids: [batch_size, max_time_step, beam_width] if output_time_major=False
            #                                       [max_time_step, batch_size, beam_width] if output_time_major=True
            # decoder_outputs_decode.beam_search_decoder_output: BeamSearchDecoderOutput instance
            #                                                    namedtuple(scores, predicted_ids, parent_ids)
            (self.decoder_outputs_decode,
             self.decoder_last_state_decode,
             self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=False,
                #impute_finished=True,      # error occurs
                maximum_iterations=self.max_decode_step))

            ### get alignment from decoder_last_state
            if self.use_attention:
                # Alignment history is a TensorArray; stack() materializes it
                # as [max_time_step, batch_size, ...] for visualization.
                self.alignment = self.decoder_last_state_decode[0].alignment_history.stack()
            else:
                self.alignment = []

            if not self.use_beamsearch_decode:
                # decoder_outputs_decode.sample_id: [batch_size, max_time_step]
                # Or use argmax to find decoder symbols to emit:
                # self.decoder_pred_decode = tf.argmax(self.decoder_outputs_decode.rnn_output,
                #                                      axis=-1, name='decoder_pred_decode')
                # Here, we use expand_dims to be compatible with the result of the beamsearch decoder
                # decoder_pred_decode: [batch_size, max_time_step, 1] (output_major=False)
                self.decoder_pred_decode = tf.expand_dims(self.decoder_outputs_decode.sample_id, -1)
            else:
                # Use beam search to approximately find the most likely translation
                # decoder_pred_decode: [batch_size, max_time_step, beam_width] (output_major=False)
                self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids