def _build_train(self, config):
    """Build the training-time logits for the model named by ``config.model_name``.

    * ``"fasttext_flat"`` / ``"RCNN_flat"``: a linear fully connected layer over
      ``self.first_attention`` / ``self.xx_final`` respectively, reshaped to
      ``[-1, config.fn_classes]``.
    * anything else: a teacher-forced ``BasicDecoder`` over ``self.lstm``
      (wrapped in Bahdanau attention when ``config.use_att`` is truthy).

    The result is stored in ``self.logits``; the sequence branch also stores
    the raw decoder outputs in ``self.decoder_outputs_train``.
    """
    flat_models = ("fasttext_flat", "RCNN_flat")
    if config.model_name in flat_models:
        # Both flat classifiers differ only in which feature tensor they read.
        if config.model_name == "fasttext_flat":
            features = self.first_attention
        else:
            features = self.xx_final
        dense_logits = tf.contrib.layers.fully_connected(
            features, config.fn_classes, activation_fn=None)
        print("logits:", dense_logits.get_shape())
        self.logits = tf.reshape(dense_logits, [-1, config.fn_classes])
        return

    # Sequence decoder: the same tensor seeds both halves of the LSTM state.
    initial_state = rnn.LSTMStateTuple(self.xx_final, self.xx_final)
    decoder_cell = self.lstm
    if config.use_att:
        attention = BahdanauAttention(
            config.decode_size,
            memory=self.xx_context,
            memory_sequence_length=self.x_seq_length)
        decoder_cell = AttentionWrapper(self.lstm, attention,
                                        output_attention=False)
        blank_state = decoder_cell.zero_state(dtype=tf.float32,
                                              batch_size=config.batch_size)
        # Replace the zero cell state / attention with the encoder-derived ones.
        initial_state = blank_state.clone(cell_state=initial_state,
                                          attention=self.first_attention)

    helper = TrainingHelper(self.yy, self.y_seq_length)
    decoder = BasicDecoder(decoder_cell, helper, initial_state,
                           output_layer=self.output_l)
    self.decoder_outputs_train, _, _ = dynamic_decode(decoder,
                                                      impute_finished=True)
    self.logits = self.decoder_outputs_train.rnn_output
    print("logits:", self.logits.get_shape())
def decoder_train(self, decoder_cell, decoder_initial_state, output_layer):
    """Build the training decoder and return its logits.

    The decoder input is ``self.decoder_targets`` with its last column
    removed and the ``'<GO>'`` id prepended to every row.

    :param decoder_cell: cell handed to ``BasicDecoder``
    :param decoder_initial_state: initial state handed to ``BasicDecoder``
    :param output_layer: layer handed to ``BasicDecoder`` as ``output_layer``
    :return: ``rnn_output`` of the decoded sequence, wrapped in ``tf.identity``
    """
    # Drop the last column of the targets ...
    targets_head = tf.strided_slice(self.decoder_targets,
                                    [0, 0],
                                    [self.batch_size, -1],
                                    [1, 1])
    # ... and put a <GO> column in front instead.
    go_column = tf.fill([self.batch_size, 1], self.word_to_idx['<GO>'])
    shifted_targets = tf.concat([go_column, targets_head], 1)
    embedded_inputs = tf.nn.embedding_lookup(self.embedding, shifted_targets)

    helper = TrainingHelper(inputs=embedded_inputs,
                            sequence_length=self.decoder_targets_length,
                            time_major=False,
                            name='training_helper')
    decoder = BasicDecoder(cell=decoder_cell,
                           helper=helper,
                           initial_state=decoder_initial_state,
                           output_layer=output_layer)
    decoded = dynamic_decode(
        decoder=decoder,
        impute_finished=True,
        maximum_iterations=self.max_target_sequence_length)
    final_outputs = decoded[0]
    return tf.identity(final_outputs.rnn_output)
def __build_decoder(self, n_decoder_layers, hidden_size, vocab_size, max_iter,
                    start_symbol_id, end_symbol_id):
    """Build the train and inference decoders over shared variables.

    Stores the embedded decoder inputs in ``self.ground_truth_embedded`` and
    the two decoding results in ``self.train_outputs`` (teacher forced) and
    ``self.infer_outputs`` (greedy). Both are built in variable scope
    ``'decode'``; the second call passes ``reuse=True``.

    NOTE(review): ``max_iter`` is accepted but not used; decoding length is
    capped by ``tf.reduce_max(self.ground_truth_lengths)`` in both modes.
    NOTE(review): the initial state is ``zero_state`` although the original
    comment says it should equal the encoder's final state -- confirm intent.
    """
    batch_size = tf.shape(self.input_batch)[0]

    # Every sequence starts from the start symbol at the first time step.
    start_tokens = tf.fill([batch_size], start_symbol_id)
    start_column = tf.expand_dims(start_tokens, 1)
    ground_truth_as_input = tf.concat([start_column, self.ground_truth], 1)
    self.ground_truth_embedded = tf.nn.embedding_lookup(
        self.embeddings, ground_truth_as_input)

    # Training feeds the ground truth; inference feeds back the argmax.
    train_helper = TrainingHelper(self.ground_truth_embedded,
                                  self.ground_truth_lengths)
    infer_helper = GreedyEmbeddingHelper(self.embeddings, start_tokens,
                                         end_symbol_id)

    def make_layer(reuse):
        """Return one GRU layer with input dropout."""
        gru = tf.nn.rnn_cell.GRUCell(hidden_size, reuse=reuse)
        return tf.nn.rnn_cell.DropoutWrapper(
            gru, input_keep_prob=self.dropout_ph)

    def decode(helper, scope, reuse=None):
        """Run ``dynamic_decode`` with the given helper inside ``scope``."""
        with tf.variable_scope(scope, reuse=reuse):
            stacked = MultiRNNCell(
                [make_layer(reuse) for _ in range(n_decoder_layers)])
            # Project the RNN output onto the vocabulary.
            projected = OutputProjectionWrapper(stacked, vocab_size,
                                                reuse=reuse)
            initial_state = projected.zero_state(batch_size=batch_size,
                                                 dtype=tf.float32)
            decoder = BasicDecoder(projected, helper,
                                   initial_state=initial_state)
            step_limit = tf.reduce_max(self.ground_truth_lengths)
            # First element of the result carries rnn_output and sample_id.
            decoded = dynamic_decode(decoder=decoder,
                                     maximum_iterations=step_limit,
                                     output_time_major=False,
                                     impute_finished=True)
        return decoded[0]

    self.train_outputs = decode(train_helper, 'decode')
    self.infer_outputs = decode(infer_helper, 'decode', reuse=True)
def build_decoder(self, encoder_outputs, encoder_final_state):
    """Build the complete decoder inside variable scope ``"decode"``.

    In ``'train'`` mode this sets ``self.masks`` and ``self.loss``; in any
    other mode it builds a beam-search decoder and sets
    ``self.decoder_pred_decode`` to the predicted ids transposed with
    ``perm=[0, 2, 1]``.

    :return: None
    """
    with tf.variable_scope("decode"):
        cell_and_state = self.build_decoder_cell(
            encoder_outputs, encoder_final_state,
            self.hidden_size, self.cell_type, self.layer_size)
        decoder_cell, decoder_initial_state = cell_and_state

        # Output projection layer.
        projection_init = tf.truncated_normal_initializer(mean=0.0, stddev=0.1)
        decoder_output_projection = layers.Dense(
            self.decoder_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            kernel_initializer=projection_init,
            name='decoder_output_projection')

        if self.mode != 'train':
            # Prediction mode: beam search from the start token.
            def embed_ids(ids):
                return tf.nn.embedding_lookup(self.decoder_embeddings, ids)

            beam_decoder = BeamSearchDecoder(
                cell=decoder_cell,
                embedding=embed_ids,
                start_tokens=[DataUnit.START_INDEX] * self.batch_size,
                end_token=DataUnit.END_INDEX,
                initial_state=decoder_initial_state,
                beam_width=self.beam_width,
                output_layer=decoder_output_projection)
            beam_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                beam_decoder, maximum_iterations=self.max_decode_step)
            self.decoder_pred_decode = tf.transpose(
                beam_output.predicted_ids, perm=[0, 2, 1])
            return

        # Training mode: teacher-forced decoding plus masked sequence loss.
        embedded_inputs = tf.nn.embedding_lookup(
            self.decoder_embeddings, self.decoder_inputs_train)
        helper = TrainingHelper(inputs=embedded_inputs,
                                sequence_length=self.decoder_inputs_length,
                                name='training_helper')
        train_decoder = BasicDecoder(decoder_cell, helper,
                                     decoder_initial_state,
                                     decoder_output_projection)
        longest = tf.reduce_max(self.decoder_inputs_length)
        train_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            train_decoder, maximum_iterations=longest)
        self.masks = tf.sequence_mask(self.decoder_inputs_length,
                                      maxlen=longest,
                                      dtype=tf.float32,
                                      name='masks')
        self.loss = tf.contrib.seq2seq.sequence_loss(
            logits=train_output.rnn_output,
            targets=self.decoder_inputs,
            weights=self.masks,
            average_across_timesteps=True,
            average_across_batch=True)
def build_train_decoder(self):
    """Build the training decoder, its loss, the summary op and the optimizer.

    Sets ``self.decoder_logits_train``, ``self.loss`` and ``self.summary_op``
    and finally calls ``self.build_optimizer()``.

    NOTE(review): ``ScheduledEmbeddingTrainingHelper.sampling_probability`` is
    documented as the probability of sampling from the model output instead of
    reading the inputs -- confirm ``self.teacher_forcing_probability`` is meant
    that way.
    """
    print('Building train decoder...')

    # Decoder input = targets without their last column, prefixed with <GO>.
    trimmed_targets = tf.strided_slice(
        self.decoder_targets, [0, 0], [self.batch_size, -1], [1, 1])
    go_column = tf.fill([self.batch_size, 1], self.word_to_id['<GO>'])
    decoder_input = tf.concat([go_column, trimmed_targets], 1)
    embedded_input = tf.nn.embedding_lookup(self.embedding, decoder_input)

    if not self.teacher_forcing:
        helper = TrainingHelper(
            inputs=embedded_input,
            sequence_length=self.decoder_targets_length,
            time_major=False,
            name='training_helper')
    else:
        helper = ScheduledEmbeddingTrainingHelper(
            inputs=embedded_input,
            sequence_length=self.decoder_targets_length,
            embedding=self.embedding,
            sampling_probability=self.teacher_forcing_probability,
            time_major=False,
            name='teacher_forcing_training_helper')

    decoder = BasicDecoder(cell=self.decoder_cell,
                           helper=helper,
                           initial_state=self.decoder_initial_state,
                           output_layer=self.output_layer)
    decoded = dynamic_decode(
        decoder=decoder,
        impute_finished=True,
        maximum_iterations=self.max_target_sequence_length)
    self.decoder_logits_train = tf.identity(decoded[0].rnn_output)

    # Weighted cross-entropy over the sequence:
    #   logits  -- per-step class scores,
    #   targets -- true class per step,
    #   weights -- per-step weighting of each prediction.
    self.loss = sequence_loss(logits=self.decoder_logits_train,
                              targets=self.decoder_targets,
                              weights=self.mask)

    # Scalar summary for the loss, then merge every summary in the graph.
    tf.summary.scalar('loss', self.loss)
    self.summary_op = tf.summary.merge_all()
    self.build_optimizer()
def decoder(x, decoder_inputs, keep_prob, sequence_length, memory, memory_length, first_attention):
    """Build a teacher-forced training decoder and a beam-search inference decoder.

    Both decoders are created inside variable scope ``"Decoder"`` and use the
    same ``lstm`` cell object, ``label_embeddings`` variable and ``output_l``
    projection layer. ``n_classes``, ``embedding_size``, ``n_hidden``,
    ``train_batch_size``, ``test_batch_size``, ``beam_width``, ``GO``, ``EOS``
    and ``test_len`` are not parameters; they are read from an enclosing scope.

    Args:
        x: Tensor used for both halves of the initial ``LSTMStateTuple``.
        decoder_inputs: Ids looked up in the label embedding matrix for training.
        keep_prob: Passed to the LSTM cell as ``dropout_keep_prob``.
        sequence_length: Sequence lengths given to ``TrainingHelper``.
        memory: Attention memory passed to ``BahdanauAttention``.
        memory_length: Passed as ``memory_sequence_length``.
        first_attention: Initial attention value cloned into the wrapper state.

    Returns:
        ``(decoder_outputs_train, decoder_outputs_infer, decoder_state_infer)``.
    """
    with tf.variable_scope("Decoder") as scope:
        label_embeddings = tf.get_variable(name="embeddings", shape=[n_classes, embedding_size], dtype=tf.float32)
        train_inputs_embedded = tf.nn.embedding_lookup(label_embeddings, decoder_inputs)
        lstm = rnn.LayerNormBasicLSTMCell(n_hidden, dropout_keep_prob=keep_prob)
        output_l = layers_core.Dense(n_classes, use_bias=True)
        # The same tensor seeds both entries of the LSTM state tuple.
        encoder_state = rnn.LSTMStateTuple(x, x)

        # ---- training decoder (teacher forcing) ----
        attention_mechanism = BahdanauAttention(
            embedding_size, memory=memory, memory_sequence_length=memory_length)
        cell = AttentionWrapper(lstm, attention_mechanism, output_attention=False)
        cell_state = cell.zero_state(dtype=tf.float32, batch_size=train_batch_size)
        # Start from the zero wrapper state, overriding cell state and attention.
        cell_state = cell_state.clone(cell_state=encoder_state, attention=first_attention)
        train_helper = TrainingHelper(train_inputs_embedded, sequence_length)
        train_decoder = BasicDecoder(cell, train_helper, cell_state, output_layer=output_l)
        decoder_outputs_train, decoder_state_train, decoder_seq_train = dynamic_decode(
            train_decoder, impute_finished=True)

        # ---- inference decoder (beam search) ----
        # Every per-example tensor is tiled beam_width times for beam search.
        tiled_inputs = tile_batch(memory, multiplier=beam_width)
        tiled_sequence_length = tile_batch(memory_length, multiplier=beam_width)
        tiled_first_attention = tile_batch(first_attention, multiplier=beam_width)
        attention_mechanism = BahdanauAttention(
            embedding_size, memory=tiled_inputs, memory_sequence_length=tiled_sequence_length)
        x2 = tile_batch(x, beam_width)
        encoder_state2 = rnn.LSTMStateTuple(x2, x2)
        # NOTE(review): a second AttentionWrapper/BahdanauAttention is created
        # here around the same lstm object; whether its attention variables are
        # shared with the training decoder is not visible from this block --
        # confirm.
        cell = AttentionWrapper(lstm, attention_mechanism, output_attention=False)
        cell_state = cell.zero_state(dtype=tf.float32, batch_size=test_batch_size * beam_width)
        cell_state = cell_state.clone(cell_state=encoder_state2, attention=tiled_first_attention)
        # NOTE(review): start_tokens has test_len entries while the state is
        # sized with test_batch_size -- presumably these are equal; verify.
        infer_decoder = BeamSearchDecoder(cell, embedding=label_embeddings, start_tokens=[GO] * test_len, end_token=EOS,
                                          initial_state=cell_state, beam_width=beam_width, output_layer=output_l)
        # Beam search is capped at 4 decoding steps.
        decoder_outputs_infer, decoder_state_infer, decoder_seq_infer = dynamic_decode(
            infer_decoder, maximum_iterations=4)
        return decoder_outputs_train, decoder_outputs_infer, decoder_state_infer
def training_decoding_layer(decoding_embed_input, en_len, decoding_cell,
                            initial_state, op_layer, v_size, max_en_len):
    """Run teacher-forced decoding and return the decoder's final outputs.

    Args:
        decoding_embed_input: Batch-major inputs for ``TrainingHelper``.
        en_len: Sequence lengths for ``TrainingHelper``.
        decoding_cell: Cell for ``BasicDecoder``.
        initial_state: Initial decoder state.
        op_layer: Output layer for ``BasicDecoder``.
        v_size: Unused.
        max_en_len: Upper bound on decoding steps.

    Returns:
        The first element returned by ``dynamic_decode`` (the final outputs
        structure, not a bare logits tensor).
    """
    teacher_helper = TrainingHelper(inputs=decoding_embed_input,
                                    sequence_length=en_len,
                                    time_major=False)
    basic_decoder = BasicDecoder(decoding_cell, teacher_helper,
                                 initial_state, op_layer)
    final_outputs, _, _ = dynamic_decode(basic_decoder,
                                         output_time_major=False,
                                         impute_finished=True,
                                         maximum_iterations=max_en_len)
    return final_outputs
def decoder_train(self, decoder_cell, decoder_initial_state, output_layer): ''' 创建train的decoder部分 :param encoder_outputs: encoder的输出 :param encoder_state: encoder的state :return: decoder_logits_train: decoder的predict ''' # tf.strided_slice(data,begin,end,stride):对数据进行跨步切片,起始位置,截止位置,步长,各个维度对应。 # 这里对真实的输出进行batch_size长的切片操作,-1:后面在每一行最前面加了一个<GO>。 ending = tf.strided_slice(self.decoder_targets, [0, 0], [self.batch_size, -1], [1, 1]) # 每一行最前面加一个<GO>,tf.fill(dim,value),dim:维度,value:值。 decoder_input = tf.concat( [tf.fill([self.batch_size, 1], self.word_to_idx['<GO>']), ending], 1) # 将每一行的句子embeding。 decoder_inputs_embedded = tf.nn.embedding_lookup( self.embedding, decoder_input) # TrainingHelper:封装好的训练帮助类。训练时最常用的Helper,下一时刻的输入就是上一时刻的真实值。 # time_major:是否调换维度,时间步(即max_input_length)是否为第一维。加速训练? # False:shape(batch_size,max_input_length,embedding_size), # True:shape(max_input_length,batch_size,embedding_size) , training_helper = TrainingHelper( inputs=decoder_inputs_embedded, sequence_length=self.decoder_targets_length, time_major=False, name='training_helper') # BasicDecoder # 参数: # cell: 一个 `RNNCell` 实例. # helper: 一个 `Helper` 实例. # initial_state: 一个 (可能组成一个tulpe)tensors 和 TensorArrays.RNNCell 的初始状态. # output_layer: (可选) 一个 `tf.layers.Layer` 实例, 例如:`tf.layers.Dense`. 应用于RNN 输出层之前的可选层,用于存储结果或者采样. # Raises:TypeError: 如果 `cell`, `helper` 或 `output_layer` 的类型不正确. training_decoder = BasicDecoder(cell=decoder_cell, helper=training_helper, initial_state=decoder_initial_state, output_layer=output_layer) # dynamic_decode # 参数: # decoder: BasicDecoder、BeamSearchDecoder或者自己定义的decoder类对象 # output_time_major: 见RNN,为真时step*batch_size*...,为假时batch_size*step*... 
# impute_finished: Boolean,为真时会拷贝最后一个时刻的状态并将输出置零,程序运行更稳定,使最终状态和输出具有正确的值,在反向传播时忽略最后一个完成步。但是会降低程序运行速度。 # maximum_iterations: 最大解码步数,一般训练设置为decoder_inputs_length,预测时设置一个想要的最大序列长度即可。程序会在产生<eos>或者到达最大步数处停止。 decoder_outputs, _, _ = dynamic_decode( decoder=training_decoder, impute_finished=True, maximum_iterations=self.max_target_sequence_length) # TODO:identity作用? decoder_logits_train = tf.identity(decoder_outputs.rnn_output) return decoder_logits_train
def build_train_decoder(self):
    """Build the training decoder, loss, summaries, writer and optimizer.

    Sets ``self.loss``, ``self.summary_op`` and ``self.writer`` (a
    ``FileWriter`` on ``'log/train'`` bound to ``self.sess.graph``), then
    calls ``self.build_optimizer()``.
    """
    print('Building train decoder...')

    # Shift the targets right: drop the last column, prepend <GO>.
    targets_head = tf.strided_slice(
        self.decoder_targets, [0, 0], [self.batch_size, -1], [1, 1])
    go_ids = tf.fill([self.batch_size, 1], self.word_to_id['<GO>'])
    shifted = tf.concat([go_ids, targets_head], 1)
    embedded = tf.nn.embedding_lookup(self.embedding, shifted)

    # Keyword arguments common to both helper flavours.
    helper_kwargs = dict(inputs=embedded,
                         sequence_length=self.decoder_targets_length,
                         time_major=False)
    if self.teacher_forcing:
        training_helper = ScheduledEmbeddingTrainingHelper(
            embedding=self.embedding,
            sampling_probability=self.teacher_forcing_probability,
            name='teacher_forcing_training_helper',
            **helper_kwargs)
    else:
        training_helper = TrainingHelper(name='training_helper',
                                         **helper_kwargs)

    training_decoder = BasicDecoder(
        cell=self.decoder_cell,
        helper=training_helper,
        initial_state=self.decoder_initial_state,
        output_layer=self.output_layer)
    final_outputs, _, _ = dynamic_decode(
        decoder=training_decoder,
        impute_finished=True,
        maximum_iterations=self.max_target_sequence_length)
    train_logits = tf.identity(final_outputs.rnn_output)

    # Loss.
    self.loss = sequence_loss(logits=train_logits,
                              targets=self.decoder_targets,
                              weights=self.mask)

    # Summaries.
    tf.summary.scalar('loss', self.loss)
    self.summary_op = tf.summary.merge_all()
    self.writer = tf.summary.FileWriter('log/train', self.sess.graph)

    self.build_optimizer()
def build_train_decoder(self, decoder_targets, decoder_targets_length,
                        max_target_sequence_length, mask, name):
    """Build a training decoder for the given targets and return its loss.

    Args:
        decoder_targets: Target ids; also used (shifted) as decoder input.
        decoder_targets_length: Sequence lengths for the helper.
        max_target_sequence_length: Upper bound on decoding steps.
        mask: Weights passed to ``sequence_loss``.
        name: Suffix appended to the helper's name.

    Returns:
        The ``sequence_loss`` of the decoder logits against ``decoder_targets``.
    """
    # Decoder input = targets without the last column, prefixed with <GO>.
    trimmed = tf.strided_slice(decoder_targets, [0, 0],
                               [self.batch_size, -1], [1, 1])
    go_column = tf.fill([self.batch_size, 1], self.word_to_id['<GO>'])
    decoder_input = tf.concat([go_column, trimmed], 1)
    embedded_input = tf.nn.embedding_lookup(self.embedding, decoder_input)

    decoder_cell, decoder_initial_state = self.build_decoder_cell()
    projection = tf.layers.Dense(
        self.vocab_size,
        kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                           stddev=0.1))

    if not self.teacher_forcing:
        helper = TrainingHelper(
            inputs=embedded_input,
            sequence_length=decoder_targets_length,
            time_major=False,
            name='training_helper_' + name)
    else:
        helper = ScheduledEmbeddingTrainingHelper(
            inputs=embedded_input,
            sequence_length=decoder_targets_length,
            embedding=self.embedding,
            sampling_probability=self.teacher_forcing_probility,
            time_major=False,
            name='teacher_forcing_training_helper_' + name)

    decoder = BasicDecoder(cell=decoder_cell,
                           helper=helper,
                           initial_state=decoder_initial_state,
                           output_layer=projection)
    final_outputs, _, _ = dynamic_decode(
        decoder=decoder,
        impute_finished=True,
        maximum_iterations=max_target_sequence_length)
    train_logits = tf.identity(final_outputs.rnn_output)

    return sequence_loss(logits=train_logits,
                         targets=decoder_targets,
                         weights=mask)
def _init(self, sequence, targets, authors):
    """Build the author-conditioned sequence model and its loss.

    Creates the ``char_embedding`` and ``ctx_embedding`` variables, a stack of
    ``self._cell_num`` device-placed cells wrapped with the author context,
    and decodes either with teacher forcing (``self._training``) or by
    sampling from the first token of ``sequence`` with end token ``2``.

    Returns:
        ``(targets, loss, out)`` where ``out`` is the decoder's ``sample_id``.

    NOTE(review): ``dynamic_decode`` is called without ``maximum_iterations``;
    in sampling mode decoding ends only when every sequence emits token 2.
    """
    batch_size = tf.shape(sequence)[0]
    # Length = number of non-zero ids per row.
    sequence_lengths = tf.cast(tf.count_nonzero(sequence, axis=1), tf.int32)

    embedding = tf.Variable(
        tf.random_normal((self._vocab_size, self._embed_size)),
        name='char_embedding')
    context = tf.Variable(
        tf.random_normal((self._author_size, self._ctx_size)),
        name='ctx_embedding')
    embedded_sequence = tf.nn.embedding_lookup(embedding, sequence)
    embedded_authors = tf.nn.embedding_lookup(context, authors)

    def device_for(index):
        """Spread layers round-robin over the available GPUs."""
        return '/gpu:{}'.format(index % self._num_gpu)

    if self._training:
        def with_dropout(cell):
            return DropoutWrapper(
                cell, 1.0-self._input_dropout, 1.0-self._output_dropout)

        helper = TrainingHelper(embedded_sequence, sequence_lengths)
    else:
        def with_dropout(cell):
            return cell

        helper = SampleEmbeddingHelper(embedding, sequence[:,0], 2)

    layers = []
    for index in range(self._cell_num):
        contextual = ContextWrapper(self._cell(self._cell_size),
                                    embedded_authors)
        layers.append(DeviceWrapper(with_dropout(contextual),
                                    device_for(index)))
    stacked_cell = MultiRNNCell(layers)
    init_state = stacked_cell.zero_state(batch_size, tf.float32)

    dense = tf.layers.Dense(self._vocab_size, self._activation,
                            name='fully_connected')
    decoder = BasicDecoder(stacked_cell, helper, init_state, dense)
    decoded = dynamic_decode(decoder, swap_memory=True)
    output = decoded[0]

    weights = tf.sequence_mask(sequence_lengths, dtype=tf.float32)
    loss = tf.contrib.seq2seq.sequence_loss(output.rnn_output, targets,
                                            weights)
    return targets, loss, output.sample_id
def decoder(self, encoder_outputs, encoder_states):
    """Build the decoder for the current ``self.mode``.

    ``'train'``: teacher-forced decoding; sets ``self.d_masks``, ``self.prob``
    (the raw ``rnn_output``) and ``self.loss``.
    Any other mode: beam-search decoding; sets ``self.decoder_output`` to the
    predicted ids transposed with ``perm=[0, 2, 1]``.
    """
    decoder_cell, decoder_init_state = self.add_decoder_cell(
        encoder_outputs, encoder_states,
        self.hidden_size, self.cell_type, self.num_layers)

    output_proj = tf.layers.Dense(
        self.tgt_vcb_size,
        dtype=tf.float32,
        use_bias=False,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
        name='output_proj')

    is_training = self.mode == 'train'
    if is_training:
        embedded_targets = tf.nn.embedding_lookup(self.decoder_embeddings,
                                                  self.decoder_input_train)
        helper = TrainingHelper(embedded_targets, self.target_len,
                                name='training_helper')
        train_decoder = BasicDecoder(decoder_cell, helper,
                                     decoder_init_state, output_proj)
        # Decode no further than the longest target in the batch.
        longest_target = tf.reduce_max(self.target_len)
        decoded = tf.contrib.seq2seq.dynamic_decode(
            train_decoder, maximum_iterations=longest_target)
        self.d_masks = tf.sequence_mask(self.target_len, longest_target,
                                        dtype=tf.float32, name='d_masks')
        self.prob = decoded[0].rnn_output
        self.loss = tf.contrib.seq2seq.sequence_loss(
            logits=self.prob,
            targets=self.target,
            weights=self.d_masks,
            average_across_timesteps=True,
            average_across_batch=True)
    else:
        def lookup(ids):
            return tf.nn.embedding_lookup(self.decoder_embeddings, ids)

        beam_decoder = BeamSearchDecoder(
            cell=decoder_cell,
            embedding=lookup,
            start_tokens=[DataUnit.START_INDEX] * self.batch_size,
            end_token=DataUnit.END_INDEX,
            initial_state=decoder_init_state,
            beam_width=self.beam_size,
            output_layer=output_proj)
        decoded = tf.contrib.seq2seq.dynamic_decode(
            beam_decoder, maximum_iterations=self.max_decode_step)
        self.decoder_output = tf.transpose(decoded[0].predicted_ids,
                                           perm=[0, 2, 1])
def attention_alignment(inputs, input_lengths, memory, memory_lengths, n_layers, n_units, dropout_prob, cell_type=GRUCell, attention_mechanism=BahdanauAttention, is_training=True): """Performs alignment over inputs, attending over memory Args: inputs (tensor): Input sequence, with the shape of [Batch x seq_length x dimension] input_lengths (tensor): The length of input sequences. Used for dynamic unrolling memory (tensor): Sequence to attend memory_lengths (tensor): The length of memory. Used for dynamic unrolling n_layers (int): Number of layers in RNN n_units (int): Number of units in RNN dropout_prob (float): Drop out rate for RNN cell cell_type (method): Type of RNN cell, GRU by default attention_mechanism (method): Type of attention mechanism, Bahdanau by default is_training (bool): Whether the model is training or testing returns: (tensor, tensor, tensor): """ # get tensor dimensions batch_size, seq_length, dim = inputs.get_shape().as_list() # create a attention over the memory attention = attention_mechanism(n_units, memory, memory_sequence_length=memory_lengths, dtype=tf.float32) # build an encoder RNN over the input sequence dropout_prob = 0 if not is_training else dropout_prob if n_layers > 1: attention_cell = MultiRNNCell([DropoutWrapper(cell_type(n_units), output_keep_prob=1-dropout_prob) for _ in range(n_layers)]) else: attention_cell = cell_type(n_units) attention_cell = DropoutWrapper(attention_cell, output_keep_prob=1-dropout_prob) # for each input to the next RNN cell, wire the attention mechanism a_cell = AttentionWrapper(attention_cell, attention, alignment_history=True) # define the initial state # TODO: Do we ever feed an init state? 
attention_state = a_cell.zero_state(batch_size, dtype=tf.float32) # read input while attending over memory helper = TrainingHelper(inputs=inputs, sequence_length=input_lengths) decoder = BasicDecoder(a_cell, helper, attention_state) # output of the decoder is a new representation of input sentence with attention over the question outputs, states, _ = tf.contrib.seq2seq.dynamic_decode(decoder, maximum_iterations=seq_length, impute_finished=True) outputs = tf.pad(outputs.rnn_output, [[0, 0], [0, seq_length - tf.reduce_max(input_lengths)], [0, 0]]) outputs = tf.reshape(outputs, [batch_size, seq_length, dim]) # attention matrix for visualizing heatmap aligned = tf.transpose(states.alignment_history.stack(), [1, 0, 2]) return outputs, states, aligned
def decode(self, cell_dec, enc_final_state, output_size, output_embed_matrix,
           training, grammar_helper=None):
    """Decode with teacher forcing (training) or greedily (inference).

    Args:
        cell_dec: Decoder cell.
        enc_final_state: Initial decoder state.
        output_size: Width of the bias-free output projection.
        output_embed_matrix: Embedding matrix for output ids.
        training: Truthy to feed ``self.output_placeholder`` (prefixed with the
            grammar start id) instead of the decoder's own predictions.
        grammar_helper: Forwarded to ``GrammarBasicDecoder`` when
            ``self.config.use_grammar_constraints`` is set.

    Returns:
        The final outputs returned by ``dynamic_decode``.
    """
    linear_layer = tf_core_layers.Dense(output_size, use_bias=False)
    go_vector = (tf.ones((self.batch_size,), dtype=tf.int32)
                 * self.config.grammar.start)

    if not training:
        helper = GreedyEmbeddingHelper(output_embed_matrix, go_vector,
                                       self.config.grammar.end)
    else:
        go_column = tf.expand_dims(go_vector, axis=1)
        output_ids_with_go = tf.concat([go_column, self.output_placeholder],
                                       axis=1)
        embedded = tf.nn.embedding_lookup([output_embed_matrix],
                                          output_ids_with_go)
        # One extra step accounts for the prepended start id.
        helper = TrainingHelper(embedded, self.output_length_placeholder+1)

    if not self.config.use_grammar_constraints:
        decoder = BasicDecoder(cell_dec, helper, enc_final_state,
                               output_layer=linear_layer)
    else:
        gold_output = self.output_placeholder if training else None
        decoder = GrammarBasicDecoder(self.config.grammar, cell_dec, helper,
                                      enc_final_state,
                                      output_layer=linear_layer,
                                      training_output=gold_output,
                                      grammar_helper=grammar_helper)

    decoded = tf.contrib.seq2seq.dynamic_decode(
        decoder, impute_finished=True, maximum_iterations=self.max_length)
    return decoded[0]
def training_decoding_layer(decoding_embed_input, en_len, decoding_cell,
                            encoding_op, encoding_st, op_layer, v_size,
                            fr_len, max_en_len):
    """Run teacher-forced decoding inside variable scope ``"decoder"``.

    Variables created in the scope default to a constant initializer of 0.1.
    When the module-level ``args.attention_architecture`` is not ``None`` the
    cell and initial state are first replaced by the result of
    ``create_attention``. ``v_size`` is unused.

    Returns:
        The first element returned by ``dynamic_decode``.
    """
    scope_init = init_ops.constant_initializer(0.1)
    with variable_scope.variable_scope("decoder", initializer=scope_init):
        print("args:", args)

        cell, start_state = decoding_cell, encoding_st
        if args.attention_architecture is not None:
            cell, start_state = create_attention(
                decoding_cell, encoding_op, encoding_st, fr_len)

        teacher_helper = TrainingHelper(inputs=decoding_embed_input,
                                        sequence_length=en_len,
                                        time_major=False)
        basic_decoder = BasicDecoder(cell, teacher_helper, start_state,
                                     op_layer)
        final_outputs, _, _ = dynamic_decode(basic_decoder,
                                             output_time_major=False,
                                             impute_finished=True,
                                             maximum_iterations=max_en_len)
        return final_outputs
def decoder_ops(self, decoder_emb_inp, encoder_outputs, encoder_state, hparams):
    """Build the teacher-forced training decoder and return its logits.

    :param decoder_emb_inp: embedded decoder inputs; the helper is created
        with ``time_major=True``
    :param encoder_outputs: unused
    :param encoder_state: initial state handed to ``BasicDecoder``
    :param hparams: unused
    :return: ``rnn_output`` of the decoded sequence
    """
    decoder_cell = self._build_cell(self.cell_type, self.num_units,
                                    self.num_layers)
    helper = TrainingHelper(decoder_emb_inp, self.target_seq_length,
                            time_major=True)
    # NOTE(review): project_layer is not defined in this function; presumably
    # it is a module-level projection layer -- confirm it exists.
    decoder = BasicDecoder(decoder_cell, helper, encoder_state,
                           output_layer=project_layer)
    # Dynamic decoding. dynamic_decode returns
    # (final_outputs, final_state, final_sequence_lengths); unpacking it into
    # two names raised ValueError.
    # NOTE(review): the helper is time-major but output_time_major is left at
    # its default here -- confirm the expected layout of the logits.
    outputs, _, _ = dynamic_decode(decoder)
    logits = outputs.rnn_output
    # The stray argument-less `core.Dense()` call that used to follow was
    # removed: Dense requires `units`, and its result was discarded anyway.
    return logits
def build_mmi_decoder(self):
    """Build the MMI scorer and store per-example scores in ``self.mmi_scores``.

    Runs ``MMIDecoder`` with teacher forcing over ``self.inputs_dense``, keeps
    the transposed decoder scores in ``self.scores_raw``, then gathers the
    score of each target token and sums over the time axis.
    """
    with tf.name_scope('mmi_scorer'):
        training_helper = TrainingHelper(
            inputs=self.inputs_dense,
            sequence_length=self.inputs_length,
            time_major=False,
            name='mmi_training_helper')
        with tf.name_scope('mmi_basic_decoder'):
            training_decoder = MMIDecoder(cell=self.cell,
                                          helper=training_helper,
                                          initial_state=self.initial_state,
                                          output_layer=self.output_layer)
        with tf.name_scope('mmi_dynamic_decoder'):
            (outputs, self.last_state, self.outputs_length) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=self.inputs_max_length)
        # Move the first axis of the scores to the end (perm [1, 2, 0]).
        # presumably scores is (batch, time, vocab), giving (time, vocab, batch)
        # -- TODO confirm against MMIDecoder.
        self.scores_raw = tf.identity(
            tf.transpose(outputs.scores, [1, 2, 0]))
        targets = self.features["targets"]
        targets = tf.cast(targets, dtype=tf.int32)
        # Length = number of entries along the last axis that differ from the
        # end token id.
        target_len = tf.cast(tf.count_nonzero(
            targets - self.vocab.end_token_id, -1), dtype=tf.int32)
        max_target_len = tf.reduce_max(target_len)
        # Keep only the first max_target_len columns of the targets.
        pruned_targets = tf.slice(targets, [0, 0], [-1, max_target_len])
        # index[b, t] = t  (range broadcast against a [batch_size, 1] column of ones)
        index = (tf.range(0, max_target_len, 1)) * \
            tf.ones(shape=[self.batch_size, 1], dtype=tf.int32)
        # row_no[b, t] = b  (built as [max_target_len, batch_size], then transposed)
        row_no = tf.transpose(
            tf.range(0, self.batch_size, 1) *
            tf.ones(shape=(max_target_len, 1), dtype=tf.int32))
        # Each index triple is (time step, target id, batch row), matching the
        # axis order of scores_raw.
        indices = tf.stack([index, pruned_targets, row_no], axis=2)
        # Retrieve scores corresponding to indices
        batch_scores = tf.gather_nd(self.scores_raw, indices)
        # Sum the gathered scores over axis 1 to get one score per row.
        self.mmi_scores = tf.reduce_sum(batch_scores, axis=1)
def build_train_decoder(self):
    """Build the teacher-forced training decoder under name scope ``train_decoder``.

    Sets ``self.last_state``, ``self.outputs_length``, ``self.logits``,
    ``self.log_probs`` (log-softmax of the logits) and ``self.gs_hypotheses``
    (argmax of the log-probabilities over the last axis).
    """
    with tf.name_scope('train_decoder'):
        helper = TrainingHelper(inputs=self.inputs_dense,
                                sequence_length=self.inputs_length,
                                time_major=False,
                                name='training_helper')

        with tf.name_scope('basic_decoder'):
            decoder = BasicDecoder(cell=self.cell,
                                   helper=helper,
                                   initial_state=self.initial_state,
                                   output_layer=self.output_layer)

        with tf.name_scope('dynamic_decode'):
            decoded = seq2seq.dynamic_decode(
                decoder=decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=self.inputs_max_length)
            outputs, self.last_state, self.outputs_length = decoded

            self.logits = tf.identity(outputs.rnn_output)
            self.log_probs = tf.nn.log_softmax(self.logits)
            self.gs_hypotheses = tf.argmax(self.log_probs, -1)
def model_fn(features, labels, mode, params):
    """Estimator model function for answer generation over a passage and a question.

    Encodes passage and question with ``biGRU``, initialises a multi-layer GRU
    decoder (wrapped with two Bahdanau attentions) from the encoders' backward
    states, decodes with ``SNetDecoder`` and returns an ``EstimatorSpec``.

    Args:
        features: Dict read with the keys ``question_words``, ``passage_words``,
            ``question_length``, ``passage_length``, ``answer_start``,
            ``answer_end`` and, in TRAIN/EVAL, ``answer_words`` /
            ``answer_length``.
        labels: Target ids; truncated to the longest decoded length for the loss.
        mode: A ``tf.estimator.ModeKeys`` value.
        params: Hyper-parameter object (``vocab_size``, ``emb_size``, ``units``,
            ``layers``, ``batch_size``, ``grad_clip``, ...).

    Returns:
        An ``EstimatorSpec`` for PREDICT, TRAIN or EVAL. For any other mode the
        function falls off the end and returns ``None``.
    """
    # One embedding matrix is shared by question, passage and answer tokens.
    embedding_encoder = tf.get_variable('embedding_encoder',
                                        shape=(params.vocab_size, params.emb_size))
    table = lookup_ops.index_to_string_table_from_file(params.word_vocab_file)
    question_emb = tf.nn.embedding_lookup(embedding_encoder, features['question_words'])
    passage_emb = tf.nn.embedding_lookup(embedding_encoder, features['passage_words'])
    question_words_length = features['question_length']
    passage_words_length = features['passage_length']
    answer_start, answer_end = features['answer_start'], features['answer_end']
    # Repeat each marker 50 times along a new last axis before it is
    # concatenated to the passage embeddings below.
    answer_start = tf.concat([tf.expand_dims(answer_start, -1)] * 50, -1)
    answer_end = tf.concat([tf.expand_dims(answer_end, -1)] * 50, -1)
    with tf.variable_scope('passage_encoding'):
        passage_enc, (_, passage_bw_state) = biGRU(tf.concat(
            [passage_emb, answer_start, answer_end], -1), passage_words_length, params, layers=params.layers)
    with tf.variable_scope('question_encoding'):
        question_enc, (_, question_bw_state) = biGRU(question_emb, question_words_length,
                                                     params, layers=params.layers)
    # output_enc = masked_concat(question_enc, passage_enc, question_words_length, passage_words_length)
    # Per layer: concatenate passage and question backward states and project
    # them through one shared tanh layer to get the decoder's initial state.
    decoder_state_layer = Dense(params.units, activation=tf.tanh, use_bias=True,
                                name='decoder_state_init')
    decoder_init_state = tuple(
        decoder_state_layer(
            tf.concat([passage_bw_state[i], question_bw_state[i]], -1))
        for i in range(params.layers))
    question_att = BahdanauAttention(
        params.units, question_enc, memory_sequence_length=question_words_length)
    passage_att = BahdanauAttention(
        params.units, passage_enc, memory_sequence_length=passage_words_length)
    decoder_cell = AttentionWrapper(MultiRNNCell(
        [GRUCell(params.units) for _ in range(params.layers)]),
        [question_att, passage_att],
        initial_cell_state=decoder_init_state)
    batch_size = params.batch_size  # if mode != tf.estimator.ModeKeys.PREDICT else 1
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Teacher forcing on the embedded gold answer.
        answer_emb = tf.nn.embedding_lookup(embedding_encoder, features['answer_words'])
        helper = TrainingHelper(answer_emb, features['answer_length'])
    else:
        # EVAL and PREDICT decode greedily from the start-of-sequence id.
        helper = GreedyEmbeddingHelper(
            embedding_encoder, tf.fill([batch_size], params.tgt_sos_id), params.tgt_eos_id)
    projection_layer = Dense(params.vocab_size, use_bias=False)
    decoder = SNetDecoder(decoder_cell, helper,
                          decoder_cell.zero_state(batch_size, tf.float32),
                          output_layer=projection_layer, params=params)
    outputs, _, outputs_length = dynamic_decode(
        decoder, maximum_iterations=params.answer_max_words)
    logits = outputs.rnn_output
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Map predicted ids back to strings through the vocabulary table.
        predictions = {
            'answer': table.lookup(tf.cast(outputs.sample_id, tf.int64))
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions,
                                          export_outputs=export_outputs)
    # logits = tf.Print(logits, [outputs.sample_id, labels], summarize=1000)
    # Cut the labels to the longest decoded length so they line up with logits.
    labels = tf.stop_gradient(labels[:, :tf.reduce_max(outputs_length)])
    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    target_weights = tf.sequence_mask(outputs_length, dtype=logits.dtype)
    # Masked cross-entropy summed over all steps, divided by the batch size.
    loss = tf.reduce_sum(crossent * target_weights) / params.batch_size
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=1)
        global_step = tf.train.get_or_create_global_step()
        grads = optimizer.compute_gradients(loss)
        gradients, variables = zip(*grads)
        # Clip by global norm before applying the gradients.
        capped_grads, _ = tf.clip_by_global_norm(gradients, params.grad_clip)
        train_op = optimizer.apply_gradients(zip(capped_grads, variables),
                                             global_step=global_step)
        # NOTE(review): this uses the bare name EstimatorSpec while the PREDICT
        # branch uses tf.estimator.EstimatorSpec -- presumably both are in
        # scope; verify the import.
        return EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op,
        )
    if mode == tf.estimator.ModeKeys.EVAL:
        return EstimatorSpec(mode, loss=loss, eval_metric_ops={
            'rouge-l': rouge_l(outputs.sample_id, labels, outputs_length,
                               features['answer_length'], params, table),
        })
def build_model(self):
    """Build the seq2seq graph with emotion-category and memory extensions.

    Creates the placeholders, the word and emotion embeddings, the encoder
    (via ``build_encoder``) and an ``ECMWrapper`` decoder cell. Depending on
    ``self.mode`` it then adds either

    * ``'train'``: a teacher-forced decoder, the generic / emotion losses,
      the alpha and internal-memory regularisers, a cross-entropy tensor for
      perplexity, and an Adam update with global-norm clipping, or
    * ``'infer'``: an ``ECMBeamSearchDecoder`` whose predicted ids are
      exposed as ``self.infer_outputs``.

    A ``tf.train.Saver`` over all global variables is always created last.
    """
    print('building model... ...')

    with tf.variable_scope('seq2seq_placeholder'):
        self.encoder_inputs = tf.placeholder(tf.int32, [None, None],
                                             name="encoder_inputs")
        self.decoder_inputs = tf.placeholder(tf.int32, [None, None],
                                             name="decoder_inputs")
        self.decoder_targets = tf.placeholder(tf.int32, [None, None],
                                              name="decoder_targets")
        self.decoder_targets_masks = tf.placeholder(tf.bool, [None, None],
                                                    name="mask")
        self.encoder_length = tf.placeholder(tf.int32, [None],
                                             name="encoder_length")
        self.decoder_length = tf.placeholder(tf.int32, [None],
                                             name="decoder_length")
        # ECM-specific inputs.
        self.choice_qs = tf.placeholder(tf.float32, [None, None],
                                        name="choice")
        self.emo_cat = tf.placeholder(tf.int32, [None],
                                      name="emotion_category")
        self.max_target_sequence_length = tf.reduce_max(
            self.decoder_length, name='max_target_len')

    with tf.variable_scope('seq2seq_embedding'):
        self.embedding = self.init_embedding(self.vocab_size,
                                             self.embedding_size)
        # Emotion category embeddings and the internal-memory embedding
        # share one Xavier initializer.
        xavier = tf.contrib.layers.xavier_initializer()
        category_table = tf.get_variable(
            "emo_cat_embeddings",
            [self.num_emotion, self.emo_cat_emb_size],
            initializer=xavier,
            dtype=tf.float32)
        self.emo_internal_memory_embedding = tf.get_variable(
            "emo_internal_memory_embedding",
            [self.num_emotion, self.emo_internal_memory_units],
            initializer=xavier,
            dtype=tf.float32)
        self.emo_cat_embs = tf.nn.embedding_lookup(category_table,
                                                   self.emo_cat)

    with tf.variable_scope('seq2seq_encoder'):
        encoder_outputs, encoder_states = build_encoder(
            self.embedding,
            self.encoder_inputs,
            self.encoder_length,
            self.enc_num_layers,
            self.enc_num_units,
            self.enc_cell_type,
            bidir=self.enc_bidir)

    with tf.variable_scope('seq2seq_decoder'):
        encoder_length = self.encoder_length
        emo_cat = self.emo_cat
        emo_cat_embs = self.emo_cat_embs

        if self.beam_search:
            # Beam search needs every encoder-side tensor repeated
            # beam_size times.
            print("use beamsearch decoding..")
            encoder_outputs = tile_batch(encoder_outputs,
                                         multiplier=self.beam_size)
            encoder_states = tile_batch(encoder_states,
                                        multiplier=self.beam_size)
            encoder_length = tile_batch(encoder_length,
                                        multiplier=self.beam_size)
            emo_cat = tile_batch(emo_cat, multiplier=self.beam_size)
            emo_cat_embs = tile_batch(emo_cat_embs,
                                      multiplier=self.beam_size)

        attention = BahdanauAttention(
            num_units=self.attn_num_units,
            memory=encoder_outputs,
            memory_sequence_length=encoder_length)

        base_cell = create_rnn_cell(self.dec_num_layers, self.dec_num_units,
                                    self.dec_cell_type)

        self.read_g = tf.layers.Dense(self.emo_internal_memory_units,
                                      use_bias=False,
                                      name="internal_read_gate")
        self.write_g = tf.layers.Dense(self.emo_internal_memory_units,
                                       use_bias=False,
                                       name="internal_write_gate")

        decoder_cell = ECMWrapper(
            cell=base_cell,
            attention_mechanism=attention,
            emo_cat_embs=emo_cat_embs,  # emotion category embedding
            emo_cat=emo_cat,  # emotion category
            emo_internal_memory_units=self.emo_internal_memory_units,  # emotion memory size
            emo_internal_memory_embedding=self.emo_internal_memory_embedding,  # num of emotions
            read_gate=self.read_g,
            write_gate=self.write_g,
            attention_layer_size=self.dec_num_units,
            name='ECMWrapper')

        if self.beam_search:
            effective_batch = self.batch_size * self.beam_size
        else:
            effective_batch = self.batch_size

        decoder_initial_state = decoder_cell.zero_state(
            batch_size=effective_batch,
            dtype=tf.float32).clone(cell_state=encoder_states)

        # Projection onto the generic vocabulary.
        output_layer = tf.layers.Dense(self.vocab_size,
                                       use_bias=False,
                                       name='output_projection')

        # ECM external memory module.
        # Projection onto the emotion vocabulary.
        emo_output_layer = tf.layers.Dense(self.vocab_size,
                                           use_bias=False,
                                           name="emo_output_projection")
        # Projection producing the probability of choosing an emotion word.
        emo_choice_layer = tf.layers.Dense(1,
                                           use_bias=False,
                                           name="emo_choice_alpha")

        if self.mode == 'train':
            decoder_inputs_embedded = tf.nn.embedding_lookup(
                self.embedding, self.decoder_inputs)

            # The training helper feeds the supplied decoder inputs at each
            # step instead of the previous step's output.
            teacher_helper = TrainingHelper(
                inputs=decoder_inputs_embedded,
                sequence_length=self.decoder_length,
                name='training_helper')
            teacher_decoder = BasicDecoder(
                cell=decoder_cell,
                helper=teacher_helper,
                initial_state=decoder_initial_state)

            (self.decoder_outputs, self.final_state,
             self.final_sequence_length) = dynamic_decode(
                 decoder=teacher_decoder,
                 impute_finished=True,
                 maximum_iterations=self.max_target_sequence_length)

            # No output_layer was given to the decoder, so despite its name
            # this tensor holds raw cell outputs; the projections follow.
            self.decoder_logits_train = tf.identity(
                self.decoder_outputs.rnn_output)

            with tf.variable_scope('decoder'):
                # Logits over generic words.
                self.generic_logits = output_layer(self.decoder_logits_train)
                # Logits over emotion words.
                self.emo_ext_logits = emo_output_layer(
                    self.decoder_logits_train)
                # Probability of choosing an emotion word.
                self.alphas = tf.nn.sigmoid(
                    emo_choice_layer(self.decoder_logits_train))

            # Final state of the internal memory.
            self.int_M_emo = self.final_state.internal_memory

            generic_probs = tf.nn.softmax(
                self.generic_logits) * (1 - self.alphas)
            emotion_probs = tf.nn.softmax(self.emo_ext_logits) * self.alphas
            train_log_probs = tf.log(generic_probs + emotion_probs)

            # Losses.
            self.alphas = tf.squeeze(self.alphas, axis=-1)
            self.g_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.generic_logits,
                labels=self.decoder_targets) - tf.log(1 - self.alphas)
            self.e_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.emo_ext_logits,
                labels=self.decoder_targets) - tf.log(self.alphas)
            step_losses = (self.g_losses * (1 - self.choice_qs) +
                           self.e_losses * self.choice_qs)

            # Alpha and internal-memory regularisation terms.
            self.alpha_reg = tf.reduce_mean(
                self.choice_qs * -tf.log(self.alphas))
            self.int_mem_reg = tf.reduce_mean(
                tf.norm(self.int_M_emo + 1e-7, axis=1))

            step_losses = tf.boolean_mask(step_losses,
                                          self.decoder_targets_masks)
            self.loss = (tf.reduce_mean(step_losses) + self.alpha_reg +
                         self.int_mem_reg)

            # Cross entropy of the mixed distribution, kept for perplexity.
            mixed_ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_log_probs, labels=self.decoder_targets)
            mixed_ce = tf.boolean_mask(mixed_ce, self.decoder_targets_masks)
            self.CE = tf.reduce_mean(mixed_ce)

            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            train_vars = tf.trainable_variables()
            raw_grads = tf.gradients(self.loss, train_vars)
            clipped_grads, _ = tf.clip_by_global_norm(
                raw_grads, self.max_gradient_norm)
            self.train_op = optimizer.apply_gradients(
                zip(clipped_grads, train_vars))

        elif self.mode == 'infer':
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * SOS_ID
            end_token = EOS_ID

            beam_decoder = ECMBeamSearchDecoder(
                cell=decoder_cell,
                embedding=self.embedding,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=decoder_initial_state,
                beam_width=self.beam_size,
                output_layer=output_layer,
                emo_output_layer=emo_output_layer,
                emo_choice_layer=emo_choice_layer)

            beam_result, _, _ = dynamic_decode(
                decoder=beam_decoder,
                maximum_iterations=self.infer_max_iter)

            # [batch_size, decoder_targets_length, beam_size]
            predicted_ids = beam_result.predicted_ids
            # [batch_size, beam_size, decoder_targets_length]
            self.infer_outputs = tf.transpose(predicted_ids, [0, 2, 1],
                                              name='infer_outputs')

    self.saver = tf.train.Saver(tf.global_variables(),
                                max_to_keep=self.max_to_keep)
def construct(self):
    """Create placeholders, the encoder and the training/inference decoders.

    Source and target lengths are extended by ``self.stop_pad_length`` so
    that a stop marker is part of every sequence. The training decoder and
    the inference decoder use the same cell, initial state and projection
    layer; the second one is built under ``variable_scope("decode",
    reuse=True)``.
    """
    self.saved_session_name = os.path.join(self.tmp_folder, self.uuid_code)

    # Graph inputs.
    self.input_data = tf.placeholder(tf.float32,
                                     [None, None, self.input_dim])
    self.output_data = tf.placeholder(tf.float32,
                                      [None, None, self.output_dim])
    self.start_tokens = tf.placeholder(tf.float32, [None, self.output_dim])
    self.go_tokens = tf.placeholder(tf.float32, [None, 1, self.output_dim])
    self.sequence_length = tf.placeholder(tf.int32, [None])
    self.mask = tf.placeholder(tf.float32, [None, None])
    self.target_sequence_length = tf.placeholder(
        tf.int32, (None, ), name='target_sequence_length')
    self.max_target_sequence_length = tf.reduce_max(
        self.target_sequence_length, name='max_target_len')
    self.source_sequence_length = tf.placeholder(
        tf.int32, (None, ), name='source_sequence_length')

    # Constant blocks filled with the stop-pad token, one per side.
    self.x_stopping = np.full((self.stop_pad_length, self.input_dim),
                              self.stop_pad_token,
                              dtype=np.float32)
    self.y_stopping = np.full((self.stop_pad_length, self.output_dim),
                              self.stop_pad_token,
                              dtype=np.float32)

    self.learning_rate = tf.placeholder(tf.float32)
    self.batch_size = tf.placeholder(tf.float32)

    encoder_cell = make_cell(self.layer_sizes, self.keep_prob)

    # The decoder should learn where to stop, so the sequence lengths are
    # extended on both sides:
    #   - the encoder sees the stopping token as the end-of-input signal,
    #   - the decoder is trained to emit the stopping token,
    #   - at inference the emitted stopping token can be recognised by the
    #     caller to stop reading the output.
    self.source_sequence_length_padded = (self.source_sequence_length +
                                          self.stop_pad_length)
    self.target_sequence_length_padded = (self.target_sequence_length +
                                          self.stop_pad_length)
    padded_max_target_len = (self.max_target_sequence_length +
                             self.stop_pad_length)

    _, self.enc_state = dynamic_rnn(
        encoder_cell,
        self.input_data,
        sequence_length=self.source_sequence_length_padded,
        dtype=tf.float32,
        time_major=False,
        swap_memory=True)

    # Last element of the encoder state, taken before any reversal below.
    self.enc_state_centre = self.enc_state[-1]

    if self.symmetric:
        # Mirror both the encoder states and the layer sizes.
        self.enc_state = self.enc_state[::-1]
        decoder_cell = make_cell(self.layer_sizes[::-1], self.keep_prob)
    else:
        decoder_cell = make_cell(self.layer_sizes, self.keep_prob)

    # 3. Dense layer mapping the decoder output at each step to output_dim.
    output_projection = tf.layers.Dense(
        units=self.output_dim,
        # kernel_initializer=tf.initializers.he_normal(),
        # kernel_regularizer=regularizer,
        kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                           stddev=0.1))

    # 4. Training decoder.
    with tf.variable_scope("decode"):
        # During PREDICT mode the output data is none, so there can be no
        # training model.
        # The helper feeds the go token followed by the target sequence.
        decoder_inputs = tf.concat([self.go_tokens, self.output_data], 1)
        teacher_helper = TrainingHelper(
            inputs=decoder_inputs,
            sequence_length=self.target_sequence_length_padded,
            time_major=False)

        teacher_decoder = BasicDecoder(decoder_cell, teacher_helper,
                                       self.enc_state, output_projection)

        # impute_finished=True because sequences have variable lengths.
        train_decode_result = dynamic_decode(
            teacher_decoder,
            impute_finished=True,
            maximum_iterations=padded_max_target_len)
        self.training_decoder_output = train_decode_result[0]

    # 5. Inference decoder, reusing the parameters of the training decoder.
    with tf.variable_scope("decode", reuse=True):

        def end_fn(time_step_value):
            # Ideally the inferer would emit the stopping token and this
            # would return
            #   tf.reduce_all(tf.equal(time_step_value, self.y_stopping))
            # but the produced stop token is never exactly equal to the
            # modelled one. With an embedding layer the stop token could be
            # learned; without one, always report "not finished" so there
            # is no early stop.
            return False

        def identity_sample(step_output):
            # The raw step output is used unchanged as the sample.
            return step_output

        free_running_helper = InferenceHelper(
            sample_fn=identity_sample,
            sample_shape=[self.output_dim],
            sample_dtype=dtypes.float32,
            start_inputs=self.start_tokens,
            end_fn=end_fn)

        free_running_decoder = BasicDecoder(decoder_cell,
                                            free_running_helper,
                                            self.enc_state,
                                            output_projection)

        # impute_finished=True because sequences have variable lengths.
        infer_decode_result = dynamic_decode(
            free_running_decoder,
            impute_finished=True,
            maximum_iterations=padded_max_target_len)
        self.inference_decoder_output = infer_decode_result[0]
def __init__(self,
             vocab_size,
             learning_rate,
             encoder_size,
             max_length,
             embedding_size,
             sos_token,
             eos_token,
             unk_token,
             beam_size=5):
    """Build the query/reply-conditioned decoder graph.

    Three decoding paths are created over one attention GRU cell and one
    output layer: a teacher-forced training decoder, a beam-search decoder
    and a greedy decoder. Every encoder-side tensor is tiled ``beam_size``
    times, and the training inputs, targets and mask are tiled the same
    way so they match the cell's tiled attention memory.

    Args:
        vocab_size: number of rows in the word embedding / output layer.
        learning_rate: learning rate given to the Adam optimizer.
        encoder_size: size passed to the GRU encoders; the decoder cell
            uses twice this many units.
        max_length: maximum number of steps for beam and greedy decoding.
        embedding_size: width of the word embedding.
        sos_token: id used as the start token at inference.
        eos_token: id used as the end token at inference.
        unk_token: id stored as ``self.UNK_token`` (not used here).
        beam_size: beam width and tiling multiplier.
    """
    self.vocab_size = vocab_size
    self.lr = learning_rate
    self.encoder_size = encoder_size
    self.max_length = max_length
    self.embedding_size = embedding_size
    self.SOS_token = sos_token
    self.EOS_token = eos_token
    self.UNK_token = unk_token
    self.beam_search_size = beam_size

    # NOTE(review): the variable scopes below are laid out as siblings;
    # confirm against the original indentation if checkpoints depend on it.
    with tf.variable_scope('placeholder_and_embedding'):
        self.query = tf.placeholder(shape=(None, None), dtype=tf.int32)
        self.query_length = tf.placeholder(shape=(None, ), dtype=tf.int32)
        self.reply = tf.placeholder(shape=(None, None), dtype=tf.int32)
        self.reply_length = tf.placeholder(shape=(None, ), dtype=tf.int32)
        self.decoder_inputs = tf.placeholder(shape=(None, None),
                                             dtype=tf.int32)
        self.decoder_target = tf.placeholder(shape=(None, None),
                                             dtype=tf.int32)
        self.decoder_length = tf.placeholder(shape=(None, ),
                                             dtype=tf.int32)
        self.batch_size = tf.placeholder(shape=(), dtype=tf.int32)

        # Externally supplied embedding values are copied into the
        # trainable variable by running `self.init_embedding`.
        self.embedding_pl = tf.placeholder(dtype=tf.float32,
                                           shape=(self.vocab_size,
                                                  embedding_size),
                                           name='embedding_source_pl')
        word_embedding = tf.get_variable(name='word_embedding',
                                         shape=(self.vocab_size,
                                                embedding_size),
                                         dtype=tf.float32,
                                         trainable=True)
        self.init_embedding = word_embedding.assign(self.embedding_pl)

        self.max_target_sequence_length = tf.reduce_max(
            self.decoder_length, name='max_target_len')
        self.mask = tf.sequence_mask(self.decoder_length,
                                     self.max_target_sequence_length,
                                     dtype=tf.float32,
                                     name='masks')

    with tf.variable_scope("query_encoder"):
        self.query_encoder = deep_components.gru_encoder(
            word_embedding, self.encoder_size)
        query_out, query_state = self.query_encoder(
            seq_index=self.query, seq_len=self.query_length)

    with tf.variable_scope("reply_encoder"):
        self.reply_encoder = deep_components.gru_encoder(
            word_embedding, self.encoder_size)
        reply_out, reply_state = self.reply_encoder(
            seq_index=self.reply, seq_len=self.reply_length)

    with tf.variable_scope("decoder"):
        # The decoder starts from both encoders' final states, side by side.
        combined_encoder_state = tf.concat([query_state, reply_state],
                                           axis=1)
        tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
            combined_encoder_state, multiplier=self.beam_search_size)
        tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
            query_out, multiplier=self.beam_search_size)
        tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
            self.query_length, multiplier=self.beam_search_size)

        decoder_cell = deep_components.AttentionGRUCell(
            memory=tiled_encoder_outputs,
            memory_size=self.encoder_size,
            attention_size=self.encoder_size,
            embedding_dims=self.embedding_size,
            rnn_units=self.encoder_size * 2)

        # A disabled alternative built the cell from GRUCell +
        # BahdanauAttention + AttentionWrapper (using tiled_sequence_length
        # as the memory length) and cloned tiled_encoder_final_state into
        # the wrapper's zero state. It is not part of the graph.

        decode_out_layer = tf.layers.Dense(self.vocab_size,
                                           name='output_layer',
                                           _reuse=tf.AUTO_REUSE)

    with tf.variable_scope("seq2seq-train"):
        # Teacher-forced decoding on beam-tiled inputs.
        self.tiled_d_in = tile_batch(self.decoder_inputs,
                                     multiplier=self.beam_search_size)
        self.tiled_d_tgt = tile_batch(self.decoder_target,
                                      multiplier=self.beam_search_size)

        embedded_inputs = tf.nn.embedding_lookup(word_embedding,
                                                 self.decoder_inputs)
        tiled_embedded_inputs = tf.contrib.seq2seq.tile_batch(
            embedded_inputs, multiplier=self.beam_search_size)
        tiled_decoder_length = tile_batch(self.decoder_length,
                                          multiplier=self.beam_search_size)
        train_helper = TrainingHelper(tiled_embedded_inputs,
                                      sequence_length=tiled_decoder_length,
                                      name="train_helper")

        train_decoder = BasicDecoder(
            decoder_cell,
            train_helper,
            initial_state=tiled_encoder_final_state,
            output_layer=decode_out_layer)
        self.dec_output, _, self.gen_len = dynamic_decode(
            train_decoder,
            impute_finished=True,
            maximum_iterations=self.max_target_sequence_length)

        #self.gen_max_len=tf.reduce_max(self.gen_len)
        #self.padding=tf.zeros(shape=(self.batch_size,self.max_length-self.gen_max_len,self.vocab_size),dtype=tf.float32)
        #self.padding=tile_batch(self.padding,multiplier=self.beam_search_size)
        self.dec_logits = tf.identity(self.dec_output.rnn_output)
        #self.dec_logits = tf.concat((self.dec_logits,self.padding),axis=1)

        self.decoder_target_mask = tile_batch(
            self.mask, multiplier=self.beam_search_size)
        tiled_targets = tile_batch(self.decoder_target,
                                   multiplier=self.beam_search_size)
        self.cost = sequence_loss(self.dec_logits, tiled_targets,
                                  self.decoder_target_mask)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.lr).minimize(self.cost)

    with tf.variable_scope("seq2seq_beam_search_generate"):
        start_tokens = tf.ones([
            self.batch_size,
        ], tf.int32) * self.SOS_token
        beam_infer_decoder = BeamSearchDecoder(
            decoder_cell,
            embedding=word_embedding,
            end_token=self.EOS_token,
            start_tokens=start_tokens,
            initial_state=tiled_encoder_final_state,
            beam_width=self.beam_search_size,
            output_layer=decode_out_layer)
        self.bs_outputs, _, _ = dynamic_decode(
            beam_infer_decoder, maximum_iterations=self.max_length)

    with tf.variable_scope("greedy_generate"):
        greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            embedding=word_embedding,
            start_tokens=start_tokens,
            end_token=self.EOS_token)
        greedy_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=decoder_cell,
            helper=greedy_helper,
            initial_state=tiled_encoder_final_state,
            output_layer=decode_out_layer)
        self.greedy_outputs, _, _ = dynamic_decode(
            greedy_decoder, maximum_iterations=self.max_length)
def build_decoder(self, phase):
    """Build the decoder sub-graph for training or greedy decoding.

    Everything is created under the ``'decoder'`` variable scope: the
    decoder cell and its initial state (from ``build_decoder_cell``), the
    decoder embedding matrix, and the input / output projection layers.

    Args:
        phase: ``'train'`` adds a teacher-forced decoder, the masked
            sequence loss, a loss summary and the optimizer (through
            ``build_optimizer``). ``'decode'`` adds a greedy decoder and
            stores its sample ids in ``self.decoder_pred_decode``. Any
            other value adds only the shared pieces.
    """
    print("building decoder and attention..")
    with tf.variable_scope('decoder'):
        decoder_cells, decoder_initial_state = self.build_decoder_cell()

        # Uniform(-sqrt(3), sqrt(3)) gives the embeddings unit variance.
        bound = sqrt(3)
        embedding_init = tf.random_uniform_initializer(-bound,
                                                       bound,
                                                       dtype=tf.float32)
        self.decoder_embeddings = tf.get_variable(
            name='embedding',
            shape=(self.config.decoder_symbols_num,
                   self.config.embedding_size),
            initializer=embedding_init,
            dtype=tf.float32)

        # Projects embedded inputs to the cell width before they reach the
        # cell. Essential when use_residual=True so that input and output
        # dimensions match.
        input_layer = Dense(self.config.hidden_units,
                            dtype=tf.float32,
                            name='input_projection')

        # Turns cell outputs into logits over the decoder symbols.
        output_layer = Dense(self.config.decoder_symbols_num,
                             name='output_projection')

        if phase == 'train':
            # Per the original comment, the embedded inputs are
            # [batch_size, max_time_step + 1, embedding_size].
            embedded = embedding_lookup(params=self.decoder_embeddings,
                                        ids=self.decoder_inputs_train)
            projected_inputs = input_layer(embedded)

            # Feed the ground-truth inputs at every step.
            teacher_helper = TrainingHelper(
                inputs=projected_inputs,
                sequence_length=self.decoder_inputs_length_train,
                time_major=False,
                name='training_helper')
            teacher_decoder = BasicDecoder(
                cell=decoder_cells,
                helper=teacher_helper,
                initial_state=decoder_initial_state,
                output_layer=output_layer)

            # Longest decoder sequence in the current batch.
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length_train)

            # dynamic_decode returns a BasicDecoderOutput
            # (rnn_output, sample_id), the final state, and the lengths.
            train_result = dynamic_decode(
                decoder=teacher_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=max_decoder_length)
            (self.decoder_outputs_train, self.decoder_last_state_train,
             self.decoder_outputs_length_train) = train_result

            # The decoder already applied output_layer, so rnn_output holds
            # the logits.
            # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output)
            self.decoder_logits_train = tf.identity(
                self.decoder_outputs_train.rnn_output)

            # Arg-max symbol at each step.
            self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')

            # 1.0 on valid time steps, 0.0 on padded ones.
            step_mask = tf.sequence_mask(
                lengths=self.decoder_inputs_length_train,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks')

            # Cross entropy averaged across time steps and batch.
            self.loss = sequence_loss(logits=self.decoder_logits_train,
                                      targets=self.decoder_targets_train,
                                      weights=step_mask,
                                      average_across_timesteps=True,
                                      average_across_batch=True)

            # Summary of the current batch loss.
            tf.summary.scalar('loss', self.loss)

            # Add the ops that minimise the loss.
            self.build_optimizer()

        elif phase == 'decode':
            # One start token per batch element.
            start_tokens = tf.ones(
                (self.batch_size, ), tf.int32) * self.config._GO
            end_token = self.config._EOS

            def embed_and_input_proj(inputs):
                # Same embedding + input projection path as in training.
                return input_layer(
                    tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

            # Greedy decoding: each step feeds back the arg-max symbol.
            greedy_helper = GreedyEmbeddingHelper(
                start_tokens=start_tokens,
                end_token=end_token,
                embedding=embed_and_input_proj)
            greedy_decoder = BasicDecoder(
                cell=decoder_cells,
                helper=greedy_helper,
                initial_state=decoder_initial_state,
                output_layer=output_layer)

            # With a BasicDecoder the result is a BasicDecoderOutput
            # (rnn_output, sample_id). A BeamSearchDecoder would return a
            # FinalBeamSearchDecoderOutput (predicted_ids,
            # beam_search_decoder_output) with a trailing beam_width axis
            # on predicted_ids.
            decode_result = dynamic_decode(
                decoder=greedy_decoder,
                output_time_major=False,
                # impute_finished=True,  # error occurs??
                maximum_iterations=self.config.max_decode_step)
            (self.decoder_outputs_decode, self.decoder_last_state_decode,
             self.decoder_outputs_length_decode) = decode_result

            # Alternatively, arg-max over rnn_output:
            # self.decoder_pred_decode = tf.argmax(self.decoder_outputs_decode.rnn_output,
            #                                      axis=-1, name='decoder_pred_decode')

            # A trailing axis of size 1 is added so the result has the same
            # rank as beam-search output.
            self.decoder_pred_decode = tf.expand_dims(
                self.decoder_outputs_decode.sample_id, -1)
def build_train_graph(self, train_examples):
    """Build the training graph plus a beam-search decoder over the same inputs.

    :param train_examples: 7-tuple of (subject, len_subject, content,
        len_content, target_input, target_output, len_target)
    :return: 8-tuple of (target_output trimmed to the decoded length,
        train decoder outputs, train decoder sequence lengths, generator
        parameters, decoder cell, attention zero state, ids of the first
        beam, lengths of the first beam)
    """
    (subject, len_subject, content, len_content, target_input,
     target_output, len_target) = train_examples

    # Only the answer at index 0 of each question is used.
    target_input = target_input[:, 0, :]
    target_output = target_output[:, 0, :]
    len_target = tf.to_int32(len_target[:, 0])

    # Word vectors fed to the decoder.
    embedded_targets = tf.nn.embedding_lookup(self.embeddings_english,
                                              target_input)

    # Subject and content are concatenated per example for the encoder.
    encoder_input = tf.map_fn(
        self.concat_seqs,
        [subject, len_subject, content, len_content])[0]
    encoder_len = len_subject + len_content

    # The initial graph is built twice: once for training and once, with
    # reuse, for the beam-search decoder.
    decoder_cell, attn_zero_state = self.build_initial_graph(
        encoder_input, encoder_len)
    beam_cell, beam_zero_state = self.build_initial_graph(
        encoder_input,
        encoder_len,
        reuse=True,
        beam_width=self.beam_width)

    # Teacher-forced training decoder.
    teacher_helper = TrainingHelper(embedded_targets,
                                    len_target,
                                    time_major=False)
    train_decoder = BasicDecoder(decoder_cell,
                                 teacher_helper,
                                 attn_zero_state,
                                 output_layer=self.projection_layer)

    # Beam-search decoder using the same projection layer.
    beam_search_decoder = BeamSearchDecoder(
        beam_cell,
        self.embeddings_english,
        self.start_token,
        self.end_token,
        beam_zero_state,
        self.beam_width,
        output_layer=self.projection_layer)

    # Both decoders are unrolled in the "train_decoder" scope; the second
    # pass opens it with reuse=True.
    with tf.variable_scope("train_decoder"):
        final_outputs, final_state, final_seq_len = dynamic_decode(
            train_decoder, output_time_major=False)
    with tf.variable_scope("train_decoder", reuse=True):
        beam_result, _, beam_out_len = dynamic_decode(
            beam_search_decoder,
            output_time_major=False,
            maximum_iterations=self.max_seq_len)

    # Trim the targets to the number of decoded steps.
    decoded_steps = tf.shape(final_outputs.sample_id)[1]
    target_output = target_output[:, :decoded_steps]

    # Move the beam axis to the front and keep the beam at index 0.
    beam_ids = tf.transpose(beam_result.predicted_ids, [2, 0, 1])
    beam_outputs = tf.reshape(beam_ids[0, :, :], [self.batch_size, -1])
    beam_out_len = tf.transpose(beam_out_len)
    beam_out_len = tf.reshape(beam_out_len[0, :], [-1])

    # Every trainable variable whose name does not mention the
    # discriminator counts as a generator parameter.
    generator_params = []
    for variable in tf.trainable_variables():
        if "discriminator" not in variable.name:
            generator_params.append(variable)

    return (target_output, final_outputs, final_seq_len, generator_params,
            decoder_cell, attn_zero_state, beam_outputs, beam_out_len)
def _build_model(self):
    """Build the embeddings, the encoder and the decoder(s).

    The encoder optionally runs a bi-directional RNN, projects its input to
    ``cfg.num_units`` and then runs the cells from
    ``_create_encoder_cell``. In ``"train"`` mode a teacher-forced decoder
    is added (``self.dec_output``). A greedy inference decoder that uses
    the same cells and projection layers provides ``self.dec_predicts``.
    """
    with tf.variable_scope("embeddings"):
        self.source_embs = tf.get_variable(
            name="source_embs",
            shape=[self.cfg.source_vocab_size, self.cfg.emb_dim],
            dtype=tf.float32,
            trainable=True)
        self.target_embs = tf.get_variable(
            name="embeddings",
            shape=[self.cfg.vocab_size, self.cfg.emb_dim],
            dtype=tf.float32,
            trainable=True)
        source_emb = tf.nn.embedding_lookup(self.source_embs,
                                            self.enc_source)
        target_emb = tf.nn.embedding_lookup(self.target_embs,
                                            self.dec_target_in)
        print("source embedding shape: {}".format(
            source_emb.get_shape().as_list()))
        print("target input embedding shape: {}".format(
            target_emb.get_shape().as_list()))

    with tf.variable_scope("encoder"):
        if self.cfg.use_bi_rnn:
            with tf.variable_scope("bi-directional_rnn"):
                if self.cfg.cell_type == "gru":
                    cell_fw = GRUCell(self.cfg.num_units)
                else:
                    cell_fw = LSTMCell(self.cfg.num_units)
                if self.cfg.cell_type == "gru":
                    cell_bw = GRUCell(self.cfg.num_units)
                else:
                    cell_bw = LSTMCell(self.cfg.num_units)
                bi_outputs, _ = bidirectional_dynamic_rnn(
                    cell_fw,
                    cell_bw,
                    source_emb,
                    dtype=tf.float32,
                    sequence_length=self.enc_seq_len)
                # Forward and backward outputs joined on the feature axis.
                source_emb = tf.concat(bi_outputs, axis=-1)
                print("bi-directional rnn output shape: {}".format(
                    source_emb.get_shape().as_list()))

        # NOTE(review): the projection is placed at encoder-scope level, so
        # it runs with or without the bi-directional RNN; confirm against
        # the original indentation.
        encoder_input_proj = tf.layers.Dense(units=self.cfg.num_units,
                                             dtype=tf.float32,
                                             name="input_projection")
        source_emb = encoder_input_proj(source_emb)
        print("encoder input projection shape: {}".format(
            source_emb.get_shape().as_list()))

        enc_cells = self._create_encoder_cell()
        self.enc_outputs, self.enc_states = dynamic_rnn(
            enc_cells,
            source_emb,
            sequence_length=self.enc_seq_len,
            dtype=tf.float32)
        print("encoder output shape: {}".format(
            self.enc_outputs.get_shape().as_list()))

    with tf.variable_scope("decoder"):
        self.max_dec_seq_len = tf.reduce_max(self.dec_seq_len,
                                             name="max_dec_seq_len")
        self.dec_cells, self.dec_init_states = self._create_decoder_cell()

        # Input and output projection layers used by both decoders.
        decoder_input_proj = tf.layers.Dense(units=self.cfg.num_units,
                                             name="input_projection")
        self.dense_layer = tf.layers.Dense(units=self.cfg.vocab_size,
                                           name="output_projection")

        if self.mode == "train":  # either "train" or "decode"
            # Teacher-forced decoding over the projected target embeddings.
            target_emb = decoder_input_proj(target_emb)
            train_helper = TrainingHelper(
                target_emb,
                sequence_length=self.dec_seq_len,
                name="train_helper")
            train_decoder = BasicDecoder(
                self.dec_cells,
                helper=train_helper,
                output_layer=self.dense_layer,
                initial_state=self.dec_init_states)
            self.dec_output, _, _ = dynamic_decode(
                train_decoder,
                impute_finished=True,
                maximum_iterations=self.max_dec_seq_len)
            print("decoder output shape: {} (vocab size)".format(
                self.dec_output.rnn_output.get_shape().as_list()))

        # Greedy inference decoder.
        # NOTE(review): no `else` separates this from the training branch
        # in the source, so it is built in every mode; confirm intent.
        start_token = tf.ones(shape=[
            self.batch_size,
        ], dtype=tf.int32) * self.cfg.target_dict[GO]
        end_token = self.cfg.target_dict[EOS]

        def inputs_project(inputs):
            # Embed the fed-back ids, then apply the input projection.
            return decoder_input_proj(
                tf.nn.embedding_lookup(self.target_embs, inputs))

        greedy_helper = GreedyEmbeddingHelper(embedding=inputs_project,
                                              start_tokens=start_token,
                                              end_token=end_token)
        greedy_decoder = BasicDecoder(
            self.dec_cells,
            helper=greedy_helper,
            initial_state=self.dec_init_states,
            output_layer=self.dense_layer)
        greedy_output, _, _ = dynamic_decode(
            greedy_decoder,
            maximum_iterations=self.cfg.maximum_iterations)
        self.dec_predicts = greedy_output.sample_id
def buildModel(self):
    """Build the BiLSTM encoder / attention decoder graph.

    The graph has fixed-width id placeholders, one embedding used on both
    the encoder and decoder side, a multi-layer bi-directional LSTM
    encoder, a Luong-attention training decoder, a beam-search decoder
    (beam width 3), a masked cross-entropy loss and an Adam update with
    global-norm gradient clipping.

    Returns:
        Tuple ``(x, y, x_len, y_len, logits, loss, prediction,
        beam_decoder_result_ids, global_step, train_op, summaries)``.
        ``logits`` holds the softmax of the training decoder output, so it
        contains probabilities rather than raw scores.
    """
    args = self.args
    T_in = args.T_in
    T_out = args.T_out
    D_in = args.D_in
    D_out = args.D_out
    E = args.embedding_dim
    H = args.hidden_dim
    SOS = args.SOS
    EOS = args.EOS
    PAD = args.PAD
    beam_width = 3

    def stacked_lstm():
        # One MultiRNNCell of `layer_size` BasicLSTMCells with H units each.
        return rnn.MultiRNNCell([
            rnn.BasicLSTMCell(num_units=H)
            for i in range(self.args.layer_size)
        ])

    # Inputs
    with tf.name_scope('input'):
        enc_ids = tf.placeholder(shape=(None, T_in),
                                 dtype=tf.int32,
                                 name='encoder_inputs')
        dec_ids = tf.placeholder(shape=(None, T_out),
                                 dtype=tf.int32,
                                 name='decoder_inputs')
        enc_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
        dec_len = tf.placeholder(shape=(None, ), dtype=tf.int32)

        # Number of examples, read from the data at run time.
        batch_size = tf.shape(enc_ids)[0]

        # One column of each special symbol.
        sos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * SOS
        eos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * EOS
        pad = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * PAD

        # Input masks (created but not referenced later in this method).
        x_mask = tf.sequence_mask(enc_len, T_in, dtype=tf.float32)
        y_with_sos_mask = tf.sequence_mask(dec_len,
                                           T_out + 1,
                                           dtype=tf.float32)

        # Targets: a PAD column is appended, then EOS is added at index
        # y_len of every row.
        y_with_pad = tf.concat([dec_ids, pad], axis=1)
        eos_mask = tf.one_hot(dec_len, depth=T_out + 1,
                              dtype=tf.int32) * EOS
        y_with_eos = y_with_pad + eos_mask

        # Decoder inputs: SOS is prepended.
        y_with_sos = tf.concat([sos, dec_ids], axis=1)

    # Embedding
    with tf.name_scope('embedding'):
        if self.args.use_pretrained:
            pretrained = np.fromfile(self.args.pretrained_file,
                                     dtype=np.float32).reshape((-1, E))
            embedding = tf.Variable(pretrained, trainable=False)
        else:
            embedding = tf.get_variable(name='embedding',
                                        shape=(D_in, E),
                                        dtype=tf.float32,
                                        initializer=xavier_initializer())
        e_x = tf.nn.embedding_lookup(embedding, enc_ids)
        e_y = tf.nn.embedding_lookup(embedding, y_with_sos)
        if self.args.mode == 'train':
            e_x = tf.nn.dropout(e_x, self.args.keep_prob)

    # Encoder
    with tf.name_scope('encoder'):
        # Multi-layer BiLSTM.
        fw_cell = stacked_lstm()
        bw_cell = stacked_lstm()
        bi_outputs, bi_states = tf.nn.bidirectional_dynamic_rnn(
            fw_cell,
            bw_cell,
            e_x,
            sequence_length=enc_len,
            dtype=tf.float32,
            time_major=False,
            scope=None)
        # The two directions are summed; only the forward final state is
        # passed to the decoder.
        encoder_output = bi_outputs[0] + bi_outputs[1]
        encoder_final_state = bi_states[0]

    # Decoder
    with tf.name_scope('decoder'):
        decoder_cell = stacked_lstm()
        # Every training sequence is decoded for the full T_out + 1 steps.
        decoder_lengths = tf.ones(shape=[batch_size],
                                  dtype=tf.int32) * (T_out + 1)

        # Training decoder
        with tf.variable_scope('attention'):
            attention_mechanism = LuongAttention(
                num_units=H,
                memory=encoder_output,
                memory_sequence_length=enc_len,
                name='attention_fn')

        projection_layer = Dense(units=D_out,
                                 kernel_initializer=xavier_initializer())

        train_decoder_cell = AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=attention_mechanism,
            attention_layer_size=H)
        train_decoder_init_state = train_decoder_cell.zero_state(
            batch_size=batch_size,
            dtype=tf.float32).clone(cell_state=encoder_final_state)

        training_helper = TrainingHelper(e_y,
                                         decoder_lengths,
                                         time_major=False)
        train_decoder = BasicDecoder(
            cell=train_decoder_cell,
            helper=training_helper,
            initial_state=train_decoder_init_state,
            output_layer=projection_layer)
        train_result, _, _ = dynamic_decode(train_decoder,
                                            impute_finished=True,
                                            maximum_iterations=T_out + 1)
        # Per the original comment: N, T_out+1, D_out
        train_decoder_outputs = ln(train_result.rnn_output)

        # Beam-search decoder: encoder tensors are tiled beam_width times.
        beam_memory = tile_batch(encoder_output, beam_width)
        beam_memory_state = tile_batch(encoder_final_state, beam_width)
        beam_memory_length = tile_batch(enc_len, beam_width)

        with tf.variable_scope('attention', reuse=True):
            beam_attention_mechanism = LuongAttention(
                num_units=H,
                memory=beam_memory,
                memory_sequence_length=beam_memory_length,
                name='attention_fn')

        # NOTE(review): the training wrapper uses attention_layer_size=H
        # while this one passes None - confirm the asymmetry is intended.
        beam_decoder_cell = AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=beam_attention_mechanism,
            attention_layer_size=None)
        beam_decoder_init_state = beam_decoder_cell.zero_state(
            batch_size=batch_size * beam_width,
            dtype=tf.float32).clone(cell_state=beam_memory_state)

        start_tokens = tf.ones((batch_size), dtype=tf.int32) * SOS
        beam_decoder = BeamSearchDecoder(
            cell=beam_decoder_cell,
            embedding=embedding,
            start_tokens=start_tokens,
            end_token=EOS,
            initial_state=beam_decoder_init_state,
            beam_width=beam_width,
            output_layer=projection_layer)
        beam_result, _, _ = dynamic_decode(beam_decoder,
                                           scope=tf.get_variable_scope(),
                                           maximum_iterations=T_out + 1)
        beam_decoder_result_ids = beam_result.predicted_ids

    with tf.name_scope('loss'):
        # Probabilities, not raw scores; the Keras loss below is called
        # with its default arguments.
        logits = tf.nn.softmax(train_decoder_outputs)
        cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
            y_with_eos, logits)
        # Steps 0 .. y_len count towards the loss, i.e. the tokens and EOS.
        loss_mask = tf.sequence_mask(dec_len + 1,
                                     T_out + 1,
                                     dtype=tf.float32)
        loss = tf.reduce_sum(cross_entropy * loss_mask) / tf.cast(
            batch_size, dtype=tf.float32)
        prediction = tf.argmax(logits, 2)

    # Training op
    with tf.name_scope('train'):
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.args.lr, global_step, self.args.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)

        # Clip gradients by global norm before applying them.
        train_vars = tf.trainable_variables()
        raw_grads = tf.gradients(loss, train_vars)
        clipped_grads, _ = tf.clip_by_global_norm(
            raw_grads, self.args.gradient_clip_num)
        train_op = optimizer.apply_gradients(zip(clipped_grads, train_vars),
                                             global_step=global_step)

    # Summaries
    with tf.name_scope('summary'):
        tf.summary.scalar('lr', lr)
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('global_step', global_step)
        summaries = tf.summary.merge_all()

    return (enc_ids, dec_ids, enc_len, dec_len, logits, loss, prediction,
            beam_decoder_result_ids, global_step, train_op, summaries)
def Tensor_Generate(self):
    """Build the speaker-embedding graph and, in training mode, its loss and optimizer.

    Reads the "Mel" and "Mel_Length" placeholders from
    ``self.pattern_Feeder.placeholder_Dict``, passes them through a dense
    layer and a stacked LSTM, and L2-normalizes the output taken at the last
    time index to obtain one embedding per pattern.

    Side effects:
        * Sets ``self.averaged_Embedding_Tensor`` (mean of the embeddings
          over axis 0).
        * When ``self.is_Training`` is true, also sets
          ``self.training_Tensor_List`` and ``self.test_Tensor_List`` and
          writes the session graph to ``<extract_Path>/Summary``.
        * Finishes by running ``tf.global_variables_initializer()`` in
          ``self.tf_Session`` (in both modes).

    Raises:
        ValueError: In training mode, if
            ``speaker_Embedding_Parameters.loss_Method`` is neither
            "Softmax" nor "Contrast" (compared case-insensitively).
    """
    placeholder_Dict = self.pattern_Feeder.placeholder_Dict

    with tf.variable_scope('speaker_Embedding'):
        batch_Size = tf.shape(placeholder_Dict["Mel"])[0]
        input_Activation = tf.layers.dense(
            inputs=placeholder_Dict["Mel"],
            units=speaker_Embedding_Parameters.embedding_Size)

        # Two residual LSTM layers wrapped in an outer residual connection,
        # followed by one plain LSTM layer. Every layer projects to
        # embedding_Size so the residual additions are shape-compatible.
        rnn_Cell = MultiRNNCell(cells=[
            ResidualWrapper(
                MultiRNNCell(cells=[
                    ResidualWrapper(
                        LSTMCell(
                            num_units=768,
                            num_proj=speaker_Embedding_Parameters.embedding_Size,
                            activation=tf.nn.tanh)),
                    ResidualWrapper(
                        LSTMCell(
                            num_units=768,
                            num_proj=speaker_Embedding_Parameters.embedding_Size,
                            activation=tf.nn.tanh)),
                ])),
            LSTMCell(
                num_units=768,
                num_proj=speaker_Embedding_Parameters.embedding_Size,
                activation=tf.nn.tanh),
        ])

        helper = TrainingHelper(
            inputs=input_Activation,
            sequence_length=placeholder_Dict["Mel_Length"],
            time_major=False)
        decoder_Initial_State = rnn_Cell.zero_state(
            batch_size=batch_Size, dtype=tf.float32)
        final_Outputs, _, _ = dynamic_decode(
            decoder=BasicDecoder(rnn_Cell, helper, decoder_Initial_State),
            maximum_iterations=speaker_Embedding_Parameters.pattern_Frame_Range[1],
        )

        # NOTE(review): index -1 of the decoded time axis is used for every
        # pattern, independent of "Mel_Length" -- confirm that all patterns
        # in a batch are expected to share one length.
        hidden_Activation = final_Outputs.rnn_output[:, -1, :]
        embedding_Activation = tf.nn.l2_normalize(hidden_Activation, axis=1)
        # Mean over the batch axis; intended for embedding a single wav.
        self.averaged_Embedding_Tensor = tf.reduce_mean(
            embedding_Activation, axis=0)

    if not self.is_Training:
        self.tf_Session.run(tf.global_variables_initializer())
        return

    # Fail fast on an unknown loss name; otherwise `loss` would stay unbound
    # and only surface as a NameError when gradients are computed.
    loss_Method = speaker_Embedding_Parameters.loss_Method.upper()
    if loss_Method not in ("SOFTMAX", "CONTRAST"):
        raise ValueError(
            "Unsupported loss_Method {!r}; expected 'Softmax' or 'Contrast'.".format(
                speaker_Embedding_Parameters.loss_Method))

    # Back-propagation part of the graph.
    with tf.variable_scope('training_Loss'):
        pattern_per_Speaker = speaker_Embedding_Parameters.batch_Pattern_per_Speaker

        speaker_Size = tf.cast(batch_Size / pattern_per_Speaker, tf.int32)
        reshaped_Embedding_Activation = tf.reshape(
            embedding_Activation,
            shape=(
                speaker_Size,
                pattern_per_Speaker,
                speaker_Embedding_Parameters.embedding_Size,
            ))  # [speaker, pattern_per_Speaker, embedding]

        # Per-pattern centroid of the *other* patterns of the same speaker:
        # (sum over the speaker's patterns - this pattern) / (count - 1).
        centroid_for_Within = (
            tf.tile(
                tf.reduce_sum(
                    reshaped_Embedding_Activation, axis=1,
                    keepdims=True),  # [speaker, 1, embedding]
                multiples=[1, pattern_per_Speaker, 1]
            )  # [speaker, pattern_per_Speaker, embedding]
            - reshaped_Embedding_Activation
        ) / (pattern_per_Speaker - 1)  # [speaker, pattern_per_Speaker, embedding]
        centroid_for_Between = tf.reduce_mean(
            reshaped_Embedding_Activation, axis=1)  # [speaker, embedding]

        cosine_Similarity_Weight = tf.Variable(
            10.0, name='cosine_Similarity_Weight', trainable=True)
        cosine_Similarity_Bias = tf.Variable(
            -5.0, name='cosine_Similarity_Bias', trainable=True)

        within_Cosine_Similarity = cosine_Similarity_Weight * Cosine_Similarity(
            reshaped_Embedding_Activation, centroid_for_Within
        ) - cosine_Similarity_Bias  # [speaker, pattern_per_Speaker]

        # 0 where the last axis indexes the pattern's own speaker, 1 elsewhere.
        between_Cosine_Similarity_Filter = 1 - tf.tile(
            tf.expand_dims(tf.eye(speaker_Size), axis=1),
            multiples=[1, pattern_per_Speaker, 1]
        )  # [speaker, pattern_per_Speaker, speaker]
        between_Cosine_Similarity = tf.reshape(
            cosine_Similarity_Weight * Cosine_Similarity2D(
                embedding_Activation, centroid_for_Between
            ) - cosine_Similarity_Bias,  # [speaker * pattern_per_Speaker, speaker]
            shape=(
                speaker_Size,
                pattern_per_Speaker,
                speaker_Size,
            ))  # [speaker, pattern_per_Speaker, speaker]
        # Drop each pattern's own speaker from the last axis.
        between_Cosine_Similarity = tf.reshape(
            tf.boolean_mask(between_Cosine_Similarity,
                            between_Cosine_Similarity_Filter),
            shape=(
                speaker_Size,
                pattern_per_Speaker,
                speaker_Size - 1,
            ))  # [speaker, pattern_per_Speaker, speaker - 1]

        # Class 0 is the within-speaker similarity, so every label is 0.
        softmax_Loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=tf.concat([
                tf.expand_dims(within_Cosine_Similarity, axis=2),
                between_Cosine_Similarity
            ], axis=2),
            labels=tf.zeros(
                shape=(speaker_Size, pattern_per_Speaker), dtype=tf.int32))
        # NOTE(review): if this is meant to follow the GE2E contrast loss,
        # the paper also applies a sigmoid to the max between-speaker term;
        # here the raw scaled similarity is used -- confirm intent.
        contrast_Loss = 1 - tf.nn.sigmoid(
            within_Cosine_Similarity) + tf.reduce_max(
                between_Cosine_Similarity, axis=2)

        if loss_Method == "SOFTMAX":
            loss = tf.reduce_mean(softmax_Loss)
        else:  # "CONTRAST" (validated above)
            loss = tf.reduce_mean(contrast_Loss)

        global_Step = tf.Variable(0, name='global_Step', trainable=False)

        # Noam decay of the learning rate.
        step = tf.cast(global_Step + 1, dtype=tf.float32)
        warmup_Steps = 4000.0
        learning_Rate = speaker_Embedding_Parameters.learning_Rate * warmup_Steps**0.5 * tf.minimum(
            step * warmup_Steps**-1.5, step**-0.5)

        # Weight update with the ADAM optimizer; gradients are clipped to a
        # global norm of 1.0 before being applied.
        optimizer = tf.train.AdamOptimizer(learning_Rate)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        clipped_Gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
        optimize = optimizer.apply_gradients(
            zip(clipped_Gradients, variables), global_step=global_Step)

    self.training_Tensor_List = [global_Step, learning_Rate, loss, optimize]
    self.test_Tensor_List = [global_Step, embedding_Activation]

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    summary_Path = speaker_Embedding_Parameters.extract_Path + "/Summary"
    os.makedirs(summary_Path, exist_ok=True)
    graph_Writer = tf.summary.FileWriter(summary_Path, self.tf_Session.graph)
    graph_Writer.close()

    self.tf_Session.run(tf.global_variables_initializer())
def __init__(self, inputs, targets, src_vocab_size, src_max_length,
             tgt_vocab_size, tgt_max_length, emb_dim, num_units, batch_size,
             eos_token, is_train, share_embeddings=False,
             teacher_forcing=False):
    """Build a bidirectional-LSTM encoder / attention decoder graph and its MLE loss.

    Args:
        inputs: Integer tensor of source token ids.
            Assumes shape (batch, time) -- TODO confirm against caller.
        targets: Integer tensor of target token ids, used as loss labels and,
            prefixed with the start token, as teacher-forcing inputs.
        src_vocab_size: Number of rows of the encoder embedding table.
        src_max_length: Not read by this method.
        tgt_vocab_size: Width of the output projection and number of rows of
            the decoder embedding table.
        tgt_max_length: Maximum number of decoding steps; logits are padded
            along axis 1 up to this length.
        emb_dim: Embedding width for both tables.
        num_units: Size of each encoder LSTM; the decoder LSTM uses
            ``num_units * 2``.
        batch_size: Batch size used for start tokens, the decoder zero state
            and loss normalization.
        eos_token: Id of the end-of-sequence token.
        is_train: Not read by this method.
        share_embeddings: If true, the decoder reuses the encoder embedding
            table (requires equal vocabulary sizes).
        teacher_forcing: If true, decode with ``TrainingHelper`` on the
            ground-truth targets; otherwise decode greedily.

    Sets ``self.preds`` (sampled ids), ``self.logits`` (padded logits) and
    ``self.mle_loss`` (masked cross-entropy summed and divided by batch_size).
    """
    xavier = tf.contrib.layers.xavier_initializer
    # Id 0 serves as the start-of-sequence token.
    start_tokens = tf.zeros([batch_size], dtype=tf.int32)

    # Length = index where |id - eos_token| is smallest, i.e. the position of
    # EOS when the row contains one. A row without EOS yields the position of
    # the numerically closest id instead.
    input_lengths = tf.argmin(tf.abs(inputs - eos_token),
                              axis=-1,
                              output_type=tf.int32)
    target_lengths = tf.argmin(tf.abs(targets - eos_token),
                               axis=-1,
                               output_type=tf.int32)

    input_embedding_table = tf.get_variable("encoder_embedding",
                                            [src_vocab_size, emb_dim],
                                            initializer=xavier(),
                                            dtype=tf.float32)
    input_embedding = tf.nn.embedding_lookup(input_embedding_table, inputs)

    encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units,
                                                state_is_tuple=False)
    # NOTE(review): input dropout is applied unconditionally; `is_train` is
    # not consulted -- confirm whether inference should disable it.
    encoder_cell = tf.nn.rnn_cell.DropoutWrapper(cell=encoder_cell,
                                                 input_keep_prob=0.8,
                                                 output_keep_prob=1.0)

    # With time_major=False each directional output is batch-major:
    # [batch_size, max_time, num_units].
    # NOTE(review): the same cell object is passed as both cell_fw and
    # cell_bw -- confirm whether both directions are meant to use one cell.
    (encoder_output,
     encoder_state) = tf.nn.bidirectional_dynamic_rnn(
         cell_fw=encoder_cell,
         cell_bw=encoder_cell,
         inputs=input_embedding,
         sequence_length=input_lengths,
         dtype=tf.float32,
         time_major=False)

    # Concatenate forward/backward outputs on the feature axis and the two
    # final states on axis 1.
    encoder_output = tf.concat(encoder_output, axis=2)
    encoder_state = tf.concat([encoder_state[0], encoder_state[1]], axis=1)

    if share_embeddings:
        assert src_vocab_size == tgt_vocab_size
        target_embedding_table = input_embedding_table
    else:
        # Sized by the *target* vocabulary: this table is indexed by target
        # ids and by ids sampled from the tgt_vocab_size-wide projection, so
        # a source-sized table goes out of range whenever
        # tgt_vocab_size > src_vocab_size.
        target_embedding_table = tf.get_variable("decoder_embedding",
                                                 [tgt_vocab_size, emb_dim],
                                                 initializer=xavier(),
                                                 dtype=tf.float32)

    prefixed_targets = tf.concat(
        [tf.expand_dims(start_tokens, 1), targets], axis=1)
    target_embedding = tf.nn.embedding_lookup(target_embedding_table,
                                              prefixed_targets)

    if teacher_forcing:
        # "+ 1" accounts for the prepended start token.
        helper = TrainingHelper(target_embedding,
                                target_lengths + 1,
                                time_major=False)
    else:
        helper = GreedyEmbeddingHelper(target_embedding_table, start_tokens,
                                       eos_token)

    # num_units * 2 so the decoder state matches the concatenated
    # forward + backward encoder state.
    decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units * 2,
                                                state_is_tuple=False)
    projection_layer = tf.layers.Dense(tgt_vocab_size, use_bias=False)

    attention_mechanism = BahdanauAttention(
        num_units, encoder_output, memory_sequence_length=input_lengths)
    decoder_cell = AttentionWrapper(decoder_cell,
                                    attention_mechanism,
                                    attention_layer_size=num_units)

    # Start from the wrapper's zero state, seeded with the encoder state.
    decoder_initial_state = decoder_cell.zero_state(
        batch_size, tf.float32).clone(cell_state=encoder_state)

    decoder = BasicDecoder(cell=decoder_cell,
                           helper=helper,
                           initial_state=decoder_initial_state,
                           output_layer=projection_layer)
    decoder_outputs, _, _ = dynamic_decode(
        decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=tgt_max_length)

    # Decoding may stop before tgt_max_length steps; pad the time axis with
    # zeros so the logits line up with `targets` in the loss below.
    unpadded_logits = decoder_outputs.rnn_output
    missing_elems = tgt_max_length - tf.shape(unpadded_logits)[1]
    padding = [[0, 0], [0, missing_elems], [0, 0]]
    logits = tf.pad(unpadded_logits,
                    padding,
                    'CONSTANT',
                    constant_values=0.)

    weights = tf.sequence_mask(
        target_lengths + 1,  # the "+1" is to include EOS
        maxlen=tgt_max_length,
        dtype=tf.float32)

    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=targets, logits=logits)
    mle_loss = (tf.reduce_sum(crossent * weights) / batch_size)

    self.preds = decoder_outputs.sample_id
    self.logits = logits
    self.mle_loss = mle_loss
def decode_train(self,
                 dec_input_tokens,
                 dec_lengths,
                 init_state,
                 *attention_args,
                 decoder_class=BasicDecoder,
                 decoder_kwoptions={}):
    """Build the teacher-forced training decoder and return its logits.

    Args:
        dec_input_tokens: Token ids looked up in ``self.embeddings`` to form
            the decoder inputs.
        dec_lengths: Per-example decoder input lengths; their maximum bounds
            the number of decoding steps.
        init_state: Initial state handed to ``self.setup_decoder_cell``.
        *attention_args: Extra positional arguments forwarded to
            ``self.setup_decoder_cell``.
        decoder_class: Decoder constructor, called as
            ``decoder_class(cell, helper, init_state, output_layer=..., **decoder_kwoptions)``.
        decoder_kwoptions: Extra keyword arguments for ``decoder_class``
            (only read, never modified here).

    Returns:
        Tuple ``(logits, final_state)``: the decoder's ``rnn_output`` clipped
        to [-20, 20], and the final state from ``dynamic_decode``.

    Also sets ``self.input_project``, ``self.output_project`` and
    ``self.train_decoder``.
    """
    with tf.variable_scope(self.scope or "Decoder") as decoder_scope:
        train_cell, init_state = self.setup_decoder_cell(
            self.config, self.keep_prob, False, init_state, *attention_args)

        self.input_project = tf.layers.Dense(
            units=self.config.hidden_size,
            name="input_projection",
            activation=self.activation)

        # Kept as an equality test against True (not plain truthiness) so
        # that a missing attribute or any non-True value disables tying.
        tie_output_to_embeddings = (
            getattr(self.config, 'use_emb_as_out_proj', False) == True)

        if tie_output_to_embeddings:
            # Project the embedding table to hidden_size and transpose it so
            # it can serve as the kernel of the output projection.
            emb_project = tf.layers.Dense(
                units=self.config.hidden_size,
                use_bias=False,
                activation=None,
                name='emb_projection')
            shared_kernel = tf.transpose(emb_project(self.embeddings))
            self.output_project = SharedKernelDense(
                units=shape(self.embeddings, 0),
                shared_kernel=shared_kernel,
                use_bias=False,
                activation=None,
                name='output_projection')
        else:
            self.output_project = tf.layers.Dense(
                units=shape(self.embeddings, 0),
                name='output_projection',
                use_bias=False,
                activation=None)

        with tf.name_scope('Train'):
            embedded = tf.nn.embedding_lookup(self.embeddings,
                                              dec_input_tokens)
            projected = self.input_project(embedded)
            decoder_inputs = tf.nn.dropout(projected, self.keep_prob)

            helper = TrainingHelper(decoder_inputs,
                                    sequence_length=dec_lengths,
                                    time_major=False)
            train_decoder = decoder_class(train_cell,
                                          helper,
                                          init_state,
                                          output_layer=self.output_project,
                                          **decoder_kwoptions)

            max_dec_len = tf.reduce_max(dec_lengths, name="max_dec_len")
            outputs, final_state, _ = dynamic_decode(
                train_decoder,
                impute_finished=True,
                maximum_iterations=max_dec_len,
                scope=decoder_scope)

            # Offset and clip the logits to keep the training loss from
            # becoming NaN.
            shifted_logits = outputs.rnn_output + 1e-9
            logits = tf.clip_by_value(shifted_logits, -20.0, 20.0,
                                      name='clip_logits')

        self.train_decoder = train_decoder
        return logits, final_state
def build_decoder(self, encoder_outputs, encoder_final_state):
    """Build the complete decoder sub-graph under the "decode" variable scope.

    In 'train' mode this sets ``self.masks`` and ``self.loss``. In any other
    mode it runs beam search and sets ``self.decoder_pred_decode``.

    :param encoder_outputs: encoder outputs, forwarded to ``self.build_decoder_cell``
    :param encoder_final_state: encoder final state, forwarded to ``self.build_decoder_cell``
    :return: None
    """
    with tf.variable_scope("decode"):
        cell, initial_state = self.build_decoder_cell(
            encoder_outputs, encoder_final_state, self.hidden_size,
            self.cell_type, self.layer_size)

        # Output-layer projection onto the decoder vocabulary.
        vocab_projection = layers.Dense(
            self.decoder_vocab_size,
            dtype=tf.float32,
            use_bias=False,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1),
            name='decoder_output_projection')

        if self.mode == 'train':
            # Training mode.
            embedded_inputs = tf.nn.embedding_lookup(
                self.decoder_embeddings, self.decoder_inputs_train)

            # TrainingHelper feeds the ground-truth inputs step by step
            # instead of the decoder's own samples. With the default
            # time_major=False, inputs are batch-major:
            # [batch_size, sequence_length, embedding_size].
            # sequence_length holds the length of each sequence in the batch.
            helper = TrainingHelper(
                inputs=embedded_inputs,
                sequence_length=self.decoder_inputs_length,
                name='training_helper')

            # BasicDecoder bundles cell, helper, initial state and optional
            # output layer; the helper decides whether outputs are fed back
            # as inputs (inference) or not (training). The initial state must
            # have the same structure as the cell state (e.g. a tuple for an
            # LSTM).
            decoder = BasicDecoder(
                cell=cell,
                helper=helper,
                initial_state=initial_state,
                output_layer=vocab_projection)

            longest_target = tf.reduce_max(self.decoder_inputs_length)

            # dynamic_decode repeatedly calls the decoder's step function to
            # generate the sequence, much like tf.nn.dynamic_rnn.
            train_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, maximum_iterations=longest_target)

            # 1.0 for positions inside each sequence's length, 0.0 for the
            # padding up to the longest sequence.
            self.masks = tf.sequence_mask(
                self.decoder_inputs_length,
                maxlen=longest_target,
                dtype=tf.float32,
                name='masks')

            # sequence_loss takes logits
            # [batch_size, sequence_length, num_decoder_symbols], integer
            # targets [batch_size, sequence_length] (no one-hot needed) and
            # weights [batch_size, sequence_length]. The mask keeps padding
            # out of the loss.
            self.loss = tf.contrib.seq2seq.sequence_loss(
                logits=train_output.rnn_output,
                targets=self.decoder_inputs,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True)
            return

        # Prediction mode.
        start_token = [DataUnit.START_INDEX] * self.batch_size
        end_token = DataUnit.END_INDEX

        def embed_ids(ids):
            # Embedding callable for the beam search: ids -> embeddings.
            return tf.nn.embedding_lookup(self.decoder_embeddings, ids)

        # BeamSearchDecoder takes the cell, an embedding callable (or
        # embedding params), int32 start tokens of shape [batch_size], the
        # scalar end token, the initial state, the beam width and the
        # optional output layer applied before sampling.
        beam_decoder = BeamSearchDecoder(
            cell=cell,
            embedding=embed_ids,
            start_tokens=start_token,
            end_token=end_token,
            initial_state=initial_state,
            beam_width=self.beam_width,
            output_layer=vocab_projection)

        beam_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            beam_decoder, maximum_iterations=self.max_decode_step)

        # Swap the last two axes of the predicted ids
        # (presumably [batch, time, beam] -> [batch, beam, time]; verify).
        self.decoder_pred_decode = beam_output.predicted_ids
        self.decoder_pred_decode = tf.transpose(self.decoder_pred_decode,
                                                perm=[0, 2, 1])