def one_iteration(self, source, source_length, target, target_length,
                  start_tokens, optimizer, gpu_index=0):
    with tf.device("/gpu:%d" % gpu_index):
        embedding = self.create_embedding()
        if self.output_layer:
            output_layer = self.create_output_layer()
        else:
            output_layer = None

        # Encoder
        (encoder_output, encoder_lengths, encoder_final_state) = self.encode(
            source, source_length, embedding)

        # Decoder cell & initial state
        (decoder_cell,
         decoder_initial_state) = self.get_decoder_cell_and_initial_state(
             encoder_output, encoder_lengths, encoder_final_state)

        # Decoder -- train (prepend the start tokens to the targets)
        (loss_ML, num_tokens, total_loss) = self.decode_train(
            tf.concat([tf.expand_dims(start_tokens, axis=1), target], axis=1),
            target_length + 1,
            embedding, decoder_cell, decoder_initial_state,
            output_layer=output_layer)

        # Get trainable variables
        # (up to now we already have all the seq2seq trainable vars)
        if self.trainable_variables == []:
            self.trainable_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="seq2seq")

        # Compute tower gradients
        grads = compute_grads(loss_ML, optimizer, self.trainable_variables)

        # Decoder -- beam (for inference)
        (sample_ids_beam, final_lengths_beam) = self.decode_beam(
            embedding, decoder_cell, decoder_initial_state, start_tokens,
            output_layer=output_layer)

    return (num_tokens, total_loss, grads,
            sample_ids_beam, final_lengths_beam)
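
# NOTE (hedged sketch): compute_grads and apply_grads are called throughout this
# section but defined elsewhere in the codebase. Minimal implementations that are
# consistent with the call sites here might look as follows; their exact
# signatures and behavior in the original code are assumptions.

def compute_grads(loss, optimizer, var_list):
    # One tower's list of (gradient, variable) pairs
    return optimizer.compute_gradients(loss, var_list=var_list)

def apply_grads(optimizer, tower_grads, clipping_threshold=5.0,
                global_step=None):
    # tower_grads: a list (one entry per tower) of lists of (grad, var) pairs.
    # Average the gradients across towers (assumes dense gradients), clip by
    # global norm, then apply.
    variables = [v for (_, v) in tower_grads[0]]
    avg_grads = []
    for (j, _) in enumerate(variables):
        avg_grads.append(
            tf.add_n([tower[j][0] for tower in tower_grads]) /
            float(len(tower_grads)))
    (clipped_grads, _) = tf.clip_by_global_norm(avg_grads, clipping_threshold)
    return optimizer.apply_gradients(
        list(zip(clipped_grads, variables)), global_step=global_step)
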
def build_seq2seq(input_seqs, target_seqs, filtered_target_seqs,
                  input_seq_lengths, target_seq_lengths, is_training):
    with tf.variable_scope("seq2seq"):
        with tf.device('/cpu:0'):
            reuse = False
            if get_PPL:
                keep_prob = tf.convert_to_tensor(1.0)
            else:
                keep_prob = get_keep_prob(dropout_rate, is_training)

            sequence_mask = get_sequence_mask(target_seq_lengths)
            unk_mask = get_mask(target_seqs, unk_indices)
            decoder_mask = tf.logical_and(sequence_mask,
                                          tf.logical_not(unk_mask))
            decoder_mask_float = tf.cast(decoder_mask, tf.float32)

            # Embed inputs
            with tf.variable_scope("embedding"):
                embedding = create_embedding(
                    embedding_word2vec_politeness, embedding_word2vec_movie,
                    shared_vocab_size_politeness, shared_vocab_size_movie,
                    new_vocab_size_politeness, new_vocab_size_movie,
                    "seq2seq")
            embedded_input_seqs = tf.nn.embedding_lookup(embedding, input_seqs)
            embedded_target_seqs = tf.nn.embedding_lookup(embedding, target_seqs)

            # Optimizer
            optimizer = tf.train.AdamOptimizer(learning_rate)

            tower_grads = []
            if credit_assignment:
                tower_grads_polite = []
            sample_ids_lst = []
            final_lengths_lst = []
            sampled_sample_ids_lst = []
            sampled_final_lengths_lst = []
            reuse = False
            trainable_variables = []
            num_tokens_lst = []
            total_losses = []

    for i in xrange(num_gpus):
        with tf.device("/gpu:%d" % (gpu_start_index + i)):
            with tf.variable_scope("seq2seq"):
                if i == 1:  # variables built on GPU 0 are reused by later towers
                    reuse = True

                start = i * batch_size_per_gpu
                end = start + batch_size_per_gpu

                input_max_seq_length = tf.reduce_max(
                    input_seq_lengths[start:end])
                target_max_seq_length = tf.reduce_max(
                    target_seq_lengths[start:end])

                with tf.variable_scope("encoder", reuse=reuse):
                    cell_fw = create_MultiRNNCell(
                        [hidden_size_encoder] * (num_layers_encoder // 2),
                        keep_prob, num_proj=None, reuse=reuse)
                    cell_bw = create_MultiRNNCell(
                        [hidden_size_encoder] * (num_layers_encoder // 2),
                        keep_prob, num_proj=None, reuse=reuse)
                    (encoder_outputs_original,
                     encoder_final_state_original) = bidirecitonal_dynamic_lstm(
                         cell_fw, cell_bw,
                         embedded_input_seqs[
                             start:end, :input_max_seq_length, :],
                         input_seq_lengths[start:end])

                    # At inference time, tile the encoder tensors for beam
                    # search. Note: tile_multi_cell_state only works for
                    # decoders that have more than one layer!
                    [encoder_outputs, encoder_seq_lengths,
                     encoder_final_state] = tf.cond(
                        is_training,
                        lambda: [encoder_outputs_original,
                                 input_seq_lengths[start:end],
                                 encoder_final_state_original],
                        lambda: [tf.contrib.seq2seq.tile_batch(
                                     encoder_outputs_original, beam_width),
                                 tf.contrib.seq2seq.tile_batch(
                                     input_seq_lengths[start:end], beam_width),
                                 tile_multi_cell_state(
                                     encoder_final_state_original)])
with tf.variable_scope("decoder", reuse=reuse): decoder_cell = create_MultiRNNCell( [hidden_size_decoder] * (num_layers_decoder), keep_prob, num_proj=vocab_size, memory=encoder_outputs, memory_seq_lengths=encoder_seq_lengths, reuse=reuse) decoder_zero_state = tf.cond( is_training, lambda: decoder_cell.zero_state( batch_size_per_gpu, tf.float32), lambda: decoder_cell.zero_state( batch_size_per_gpu * beam_width, tf.float32)) state_last = decoder_zero_state[-1].clone( cell_state=encoder_final_state[-1]) state_previous = encoder_final_state[:-1] decoder_initial_state = state_previous + ( state_last, ) # concat tuples # training helper (for teacher forcing) helper_train = tf.contrib.seq2seq.TrainingHelper( embedded_target_seqs[ start:end, :target_max_seq_length - 1, :], # get rid of end_token target_seq_lengths[start:end] - 1) # the length is thus decreased by 1 (decoder_outputs_train, _) = decode(decoder_cell, helper_train, initial_state=decoder_initial_state) (logits, _) = decoder_outputs_train # Get trainable_variables # (up to now we already have all the seq2seq trainable vars) if trainable_variables == []: trainable_variables = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope="seq2seq") loss_ML = tf.contrib.seq2seq.sequence_loss( logits, target_seqs[ start:end, 1:target_max_seq_length], # get rid of start_token decoder_mask_float[start:end, 1:target_max_seq_length]) num_tokens = tf.reduce_sum( decoder_mask_float[start:end, 1:target_max_seq_length]) num_tokens_lst.append(num_tokens) total_loss = loss_ML * num_tokens total_losses.append(total_loss) if polite_training: helper_sample = tf.contrib.seq2seq.SampleEmbeddingHelper( embedding, start_tokens[start:end], end_token) (decoder_outputs_sample, final_lengths_sample) = decode( decoder_cell, helper_sample, decoder_initial_state) (logits_sample, sample_ids_sample) = decoder_outputs_sample max_final_lengths_sample = tf.reduce_max( final_lengths_sample) sampled_sample_ids_lst.append( pad_and_truncate(sample_ids_sample, final_lengths_sample)) sampled_final_lengths_lst.append(final_lengths_sample) # Compute sampled sequence loss WITHOUT averaging (will do that later) decoder_mask_sample = get_sequence_mask( final_lengths_sample, dtype=tf.float32) seq_losses_sample = tf.contrib.seq2seq.sequence_loss( logits_sample, sample_ids_sample, decoder_mask_sample, average_across_timesteps=False, average_across_batch=False) if polite_training: with tf.variable_scope( "classifier"): # jump back to the classifier scope # Filter out tokens that the classifier doesn't know vocab_mask = tf.cast( sample_ids_sample < vocab_size_politeness, tf.int32) sample_ids_sample_classifier = sample_ids_sample * vocab_mask # Feed sampled ids to classifier (scores_RL, credit_weights_RL) = build_classifier( sample_ids_sample_classifier, final_lengths_sample, reuse) # Stop gradients from propagating back scores_RL_stop = tf.stop_gradient(scores_RL) credit_weights_RL_stop = tf.stop_gradient( credit_weights_RL) if thresholding: # Filter scores that are >= threshold and <= 1 - threshold filtered_scores_RL = tf.map_fn(filter_with_threshold, scores_RL_stop) else: filtered_scores_RL = scores_RL_stop with tf.variable_scope("seq2seq"): with tf.variable_scope("decoder", reuse=reuse): # Get valid mask for sampled sequence decoder_mask_classifier = tf.cast( tf.not_equal(sample_ids_sample, 0), tf.float32 ) # propagate back the whole sentence (including <end>) tiled_scores = tf.tile( # tile scores to 2D tf.expand_dims(filtered_scores_RL - baseline, axis=1), [1, 
max_final_lengths_sample]) if flip_polite: # if we actually want a rude dialogue system tiled_scores = -1.0 * tiled_scores # Compute seq losses for polite-RL seq_losses_classifier = ( beta * seq_losses_sample * decoder_mask_classifier / tf.reduce_sum(decoder_mask_classifier) * tiled_scores) if credit_assignment: grads_polite = tf.gradients( seq_losses_classifier, trainable_variables, grad_ys=credit_weights_RL_stop ) # credit weights as initial gradients grads_polite = zip_lsts( [grads_polite, trainable_variables]) tower_grads_polite.append(grads_polite) else: loss_polite = tf.reduce_sum(seq_losses_classifier) else: credit_weights_RL_stop = None with tf.variable_scope("seq2seq"): with tf.variable_scope("decoder", reuse=reuse): # Infer branch (beam search!) beam_search_decoder = tf.contrib.seq2seq.BeamSearchDecoder( decoder_cell, embedding, start_tokens[start:end], end_token, decoder_initial_state, beam_width, length_penalty_weight=length_penalty_weight) output_beam = tf.contrib.seq2seq.dynamic_decode( beam_search_decoder, # impute_finished=True, # cannot be used with Beamsearch maximum_iterations=max_iterations, swap_memory=True) sample_ids = output_beam[0].predicted_ids[:, :, 0] final_lengths = output_beam[2][:, 0] sample_ids_lst.append( pad_and_truncate(sample_ids, final_lengths)) final_lengths_lst.append(final_lengths) with tf.device("/gpu:%d" % (gpu_start_index + i)): with tf.variable_scope("seq2seq", reuse=reuse): # Compute loss loss = loss_ML if polite_training and not credit_assignment: loss = loss + loss_polite # Compute tower gradients grads = compute_grads(loss, optimizer, trainable_variables) tower_grads.append(grads) with tf.device('/cpu:0'): with tf.variable_scope("seq2seq"): # Concat sample ids and their respective lengths batch_sample_ids = tf.concat(sample_ids_lst, axis=0) batch_final_lengths = tf.concat(final_lengths_lst, axis=0) if polite_training: batch_sampled_sample_ids = tf.concat(sampled_sample_ids_lst, axis=0) batch_total_loss = tf.add_n(total_losses) batch_num_tokens = tf.add_n(num_tokens_lst) # Thus, the effective batch size is actually batch_size_per_gpu if polite_training and credit_assignment: apply_gradients_op = apply_multiple_grads( optimizer, [tower_grads, tower_grads_polite]) else: apply_gradients_op = apply_grads(optimizer, tower_grads) return (batch_sample_ids, batch_final_lengths, batch_total_loss, batch_num_tokens, apply_gradients_op, credit_weights_RL_stop, embedding)
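
# NOTE (hedged sketches): pad_and_truncate and tile_multi_cell_state are used
# above but not defined in this section. Plausible implementations consistent
# with their call sites (padding to the module-level max_iterations so per-tower
# results concat along axis 0, and tiling a per-layer tuple of LSTM states for
# beam search) might look like this; both are assumptions about the original
# helpers.

def pad_and_truncate(sample_ids, lengths):
    # Cut decoded ids to at most max_iterations steps, then right-pad to
    # exactly max_iterations so batches of different widths line up
    truncated = sample_ids[:, :max_iterations]
    width = tf.shape(truncated)[1]
    return tf.pad(truncated,
                  [[0, 0], [0, max_iterations - width]],
                  constant_values=end_token)  # assumption: pad with end_token

def tile_multi_cell_state(state):
    # state: tuple (one per layer) of LSTMStateTuple(c, h); tile each tensor
    # beam_width times along the batch axis for beam search
    return tuple(
        tf.contrib.rnn.LSTMStateTuple(
            tf.contrib.seq2seq.tile_batch(layer_state.c, beam_width),
            tf.contrib.seq2seq.tile_batch(layer_state.h, beam_width))
        for layer_state in state)
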
def __init__(
        self,
        batch_size, vocab_size, embedding_size,
        hidden_size_encoder, hidden_size_decoder,
        max_iterations,
        start_token, end_token, unk_indices,
        num_layers_encoder=1, num_layers_decoder=1,
        attention_size=512, attention_layer_size=256,
        beam_width=10, length_penalty_weight=1.0,
        gpu_start_index=0,
        num_gpus=1,  # set to 1 when testing
        learning_rate=0.001,
        clipping_threshold=5.0,
        feed_both_examples=False,
        use_max_margin=False, max_margin_weight=1.0, margin=0.1,
        reward_clipping_threshold=1.0,
        backward=False,  # whether we are training a backward model
        feed_tensors=[],  # when provided, placeholders are not used
        use_MMI_reward=False, MMI_weight=0.00,
        use_reranking_reward=False, reranking_weight=0.00,
        num_samples_reranking=2,  # how many samples to use for baseline (RL training)
        use_gleu_reward=False, gleu_weight=0.00,
        softmax_temperature=1.0,
        beam_search=True):
    self.batch_size = batch_size
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size_encoder = hidden_size_encoder
    self.hidden_size_decoder = hidden_size_decoder
    assert self.hidden_size_encoder * 2 == self.hidden_size_decoder
    self.max_iterations = max_iterations
    self.start_token = start_token
    self.end_token = end_token
    self.unk_indices = unk_indices
    self.num_layers_encoder = num_layers_encoder
    self.num_layers_decoder = num_layers_decoder
    self.attention_size = attention_size
    self.attention_layer_size = attention_layer_size
    self.beam_width = beam_width
    self.length_penalty_weight = length_penalty_weight
    self.gpu_start_index = gpu_start_index
    self.num_gpus = num_gpus
    self.learning_rate = learning_rate
    self.clipping_threshold = clipping_threshold

    self.feed_both_examples = feed_both_examples
    if self.feed_both_examples:
        assert self.batch_size % 2 == 0
        self.half_batch_size = self.batch_size // 2

    self.use_max_margin = use_max_margin
    if self.use_max_margin:
        # If Should-Change, then feed_both_examples must be True
        assert self.feed_both_examples
    self.max_margin_weight = max_margin_weight
    self.margin = margin

    self.beam_search = beam_search

    assert self.batch_size % self.num_gpus == 0
    self.batch_size_per_gpu = self.batch_size // self.num_gpus

    self.feed_tensors = feed_tensors

    self.use_MMI_reward = use_MMI_reward
    self.MMI_weight = MMI_weight
    self.use_reranking_reward = use_reranking_reward
    self.reranking_weight = reranking_weight
    self.num_samples_reranking = num_samples_reranking
    self.use_gleu_reward = use_gleu_reward
    self.gleu_weight = gleu_weight
    self.RL_training = (self.use_MMI_reward or self.use_reranking_reward
                        or self.use_gleu_reward)
    self.softmax_temperature = softmax_temperature

    if self.feed_both_examples:
        print("Feeding both examples...")
        assert self.batch_size % 2 == 0
        # When feeding both examples, only the first half are norm inputs
        self.norm_batch_size = self.batch_size // 2
    else:
        self.norm_batch_size = self.batch_size

    if self.use_max_margin:
        print("Max margin weight: {}, margin: {}".format(
            self.max_margin_weight, self.margin))

    # We only perform RL training on the norm_batch_size part,
    # which may or may not be the whole batch
    if self.use_MMI_reward:
        print("MMI weight:", self.MMI_weight)
        # self.softmax_temperature = 0.5
        # print("softmax_temperature changed to {}".format(self.softmax_temperature))
    else:
        self.MMI_weight = 0.0
    if self.use_reranking_reward:
        print("Neural Reranking reward weight:", self.reranking_weight)
    else:
        self.reranking_weight = 0.0
    if self.use_gleu_reward:
        print("GLEU reward weight:", self.gleu_weight)
    else:
        self.gleu_weight = 0.0
    self.ML_weight = 1.0 - (self.MMI_weight + self.reranking_weight
                            + self.gleu_weight)

    self.trainable_variables = []
    if self.use_MMI_reward:
        self.trainable_variables_backward = []

    # For a backward model, the namespace will be "seq2seq_backward"
    extra_str = "_backward" if backward else ""
    self.main_scope = "seq2seq" + extra_str + "/"

    with tf.device("/gpu:%d" % self.gpu_start_index):
        self.create_placeholders()

        # Tile only if we are not training and we use beam search
        self.tile = tf.logical_and(tf.logical_not(self.is_training),
                                   self.beam_search)

        self.global_step = tf.get_variable(
            self.main_scope + "global_step",
            initializer=0, dtype=tf.int32, trainable=False)

        # Note: if feeding both examples, the first dimension of total_loss
        # is twice that of loss_RL
        (self.total_loss, max_margin_loss, num_tokens,
         loss_MMI, loss_gleu, loss_reranking, num_tokens_RL,
         self.batch_sample_ids_beam,
         self.batch_final_lengths_beam) = self.one_iteration(
             self.source, self.source_length,
             self.target, self.target_length,
             self.start_tokens)

        # This part is for monitoring PPL, not for training.
        if self.feed_both_examples:
            self.batch_num_tokens = num_tokens / 2.0
            # We monitor ML losses for both norm- and adv-data
            self.batch_total_loss = (
                tf.reduce_sum(self.norm(self.total_loss)),
                tf.reduce_sum(self.adv(self.total_loss)))
        else:
            self.batch_num_tokens = num_tokens
            self.batch_total_loss = tf.reduce_sum(self.total_loss)

        loss_terms = []
        # When using max margin, it must be a Should-Change strategy
        if self.use_max_margin:
            # In this case we don't want to train on (adv-S, T) pairs
            loss_ML = self.norm(self.total_loss)
            num_tokens_ML = num_tokens / 2.0
        else:
            # if self.feed_both_examples:
            #     # This is just for code readability
            #     # We could have just written loss_ML = self.total_loss / 2.0
            #     loss_ML = (self.norm(self.total_loss) + self.adv(self.total_loss)) / 2.0
            # else:
            #     loss_ML = self.total_loss
            loss_ML = self.total_loss
            num_tokens_ML = num_tokens

        loss_terms.append(
            self.ML_weight * tf.reduce_sum(loss_ML) / num_tokens_ML)

        if self.use_max_margin:
            # Need to scale max_margin_weight by ML_weight to keep training
            # stable (instead of scaling with loss_ML + loss_RL)
            loss_terms.append(
                self.max_margin_weight * self.ML_weight *
                # tf.reduce_sum(max_margin_loss) / num_tokens_ML)
                tf.reduce_mean(max_margin_loss))

        if self.RL_training and not self.use_max_margin:
            num_tokens_RL = num_tokens_RL / 2.0  # effectively doubles the RL loss
            if self.use_MMI_reward:
                loss_terms.append(
                    self.MMI_weight * tf.reduce_sum(loss_MMI) / num_tokens_RL)
            if self.use_reranking_reward:
                loss_terms.append(
                    self.reranking_weight *
                    tf.reduce_sum(loss_reranking) / num_tokens_RL)
            if self.use_gleu_reward:
                loss_terms.append(
                    self.gleu_weight * tf.reduce_sum(loss_gleu) / num_tokens_RL)

        assert loss_terms != []
        loss = tf.add_n(loss_terms)

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        grads = compute_grads(loss, optimizer, self.trainable_variables)
        self.apply_gradients_op = apply_grads(
            optimizer, [grads],
            clipping_threshold=self.clipping_threshold,
            global_step=self.global_step)
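
# NOTE (hedged usage sketch): the class enclosing this __init__ is not named in
# this listing; "Seq2Seq" below is an assumed name, and the fed attributes
# (source, source_length, target, target_length, start_tokens, is_training) are
# inferred from the one_iteration call above and from create_placeholders().

import numpy as np

model = Seq2Seq(
    batch_size=64, vocab_size=20000, embedding_size=300,
    hidden_size_encoder=256, hidden_size_decoder=512,  # encoder * 2 == decoder
    max_iterations=32, start_token=1, end_token=2, unk_indices=[3])

src = np.random.randint(4, 20000, size=(64, 15)).astype(np.int32)
tgt = np.random.randint(4, 20000, size=(64, 18)).astype(np.int32)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    (_, total_loss, num_tokens) = sess.run(
        [model.apply_gradients_op, model.batch_total_loss,
         model.batch_num_tokens],
        feed_dict={model.source: src,
                   model.source_length: [15] * 64,
                   model.target: tgt,
                   model.target_length: [18] * 64,
                   model.start_tokens: [1] * 64,
                   model.is_training: True})
    # batch_total_loss / batch_num_tokens gives per-token NLL, hence PPL:
    print("Batch PPL: %.2f" % np.exp(total_loss / num_tokens))
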
def __init__(self,
             batch_size, vocab_size, embedding_size,
             hidden_size_encoder, hidden_size_context, hidden_size_decoder,
             max_iterations, max_dialogue_length,
             start_token, end_token, unk_indices,
             num_layers_encoder=1, num_layers_context=1, num_layers_decoder=1,
             attention_size=512, attention_layer_size=256,
             beam_search=False, beam_width=10, length_penalty_weight=1.0,
             gpu_start_index=0,
             learning_rate=0.001, clipping_threshold=5.0,
             feed_both_examples=False,
             use_max_margin=False, max_margin_weight=1.0, margin=0.5):
    self.batch_size = batch_size
    self.half_batch_size = self.batch_size // 2
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size_encoder = hidden_size_encoder
    self.hidden_size_context = hidden_size_context
    self.hidden_size_decoder = hidden_size_decoder
    self.dim_z = self.hidden_size_context  # this decision is arbitrary
    self.create_context_initial_state_var = functools.partial(
        tf.get_variable,
        initializer=tf.zeros([self.batch_size, self.hidden_size_context]),
        dtype=tf.float32, trainable=False)
    self.dense = functools.partial(
        tf.layers.dense, units=self.dim_z, use_bias=True)
    self.max_iterations = max_iterations
    self.max_dialogue_length = max_dialogue_length
    assert self.max_dialogue_length > 0
    self.start_token = start_token
    self.end_token = end_token
    self.unk_indices = unk_indices
    self.num_layers_encoder = num_layers_encoder
    self.num_layers_context = num_layers_context
    self.num_layers_decoder = num_layers_decoder
    self.attention_size = attention_size
    self.attention_layer_size = attention_layer_size
    self.beam_search = beam_search
    self.beam_width = beam_width
    self.length_penalty_weight = length_penalty_weight
    self.clipping_threshold = clipping_threshold

    self.feed_both_examples = feed_both_examples
    if self.feed_both_examples:
        assert self.batch_size % 2 == 0
    self.use_max_margin = use_max_margin
    if self.use_max_margin:
        assert self.feed_both_examples
    self.max_margin_weight = max_margin_weight
    self.margin = margin

    self.prior_str = "prior"
    self.posterior_str = "posterior"

    """
    context_input_acc stores context inputs in reverse order:
    • When adding inputs, we just concat from the left.
    • When using them, we do not need to reverse back, because the encoder
      is a bidirectional LSTM.
    Note: without a bidirectional LSTM we would need tf.reverse_sequence().
    """

    self.trainable_variables = []

    with tf.variable_scope("seq2seq"):
        self.create_placeholders()

        # Tile only if we are not training and we use beam search
        self.tile = tf.logical_and(tf.logical_not(self.is_training),
                                   self.beam_search)

        context_input_acc = tf.get_variable(
            "context_input_acc",
            initializer=tf.zeros(
                [self.batch_size, 2, self.hidden_size_encoder * 2],
                dtype=tf.float32),
            trainable=False)
        self.global_step = tf.get_variable(
            "global_step", initializer=0, dtype=tf.int32, trainable=False)

        # max_num_turns had better not be less than 2; otherwise we may
        # lose a whole dimension (i.e., axis=1)
        max_num_turns = tf.maximum(tf.reduce_max(self.start_turn_index), 2)
        context_input_mask = tf.tile(
            tf.reshape(tf.greater(self.start_turn_index, 0),
                       [self.batch_size, 1, 1]),  # expand two dims
            [1, max_num_turns, self.hidden_size_encoder * 2])
        # This multiplication resets context inputs that have start_turn_index == 0
        previous_context_input = (
            context_input_acc[:, :max_num_turns, :] *
            tf.cast(context_input_mask, tf.float32))

        optimizer = tf.train.AdamOptimizer(learning_rate)

        with tf.device("/gpu:%d" % gpu_start_index):
            (kl_loss, total_loss, num_tokens, max_margin_loss,
             context_input, sample_ids, final_lengths) = self.one_iteration(
                 self.dialogue, self.turn_length, previous_context_input,
                 self.start_turn_index, self.start_tokens)

            # KL weight: either a linear ramp min(step / 75000, 1.0) or a
            # constant 1.0, selected by self.tile
            kl_loss_weight = tf.cond(
                self.tile,
                lambda: tf.minimum(
                    1.0 / 75000.0 * tf.cast(self.global_step, tf.float32),
                    1.0),
                lambda: 1.0)

            if self.use_max_margin:
                kl_loss = kl_loss[:self.half_batch_size]
            weighted_kl_loss = kl_loss_weight * tf.reduce_mean(kl_loss)

            if self.use_max_margin:
                num_tokens = num_tokens / 2.0
                # Note: here total_loss is a 1-D vector (already summed over axis=1)
                cross_ent_loss = tf.reduce_sum(
                    total_loss[:self.half_batch_size])
                adv_cross_ent_loss = tf.reduce_sum(
                    total_loss[self.half_batch_size:])
                self.batch_total_loss = (cross_ent_loss, adv_cross_ent_loss)
                # max_margin_loss has shape [self.half_batch_size]
                loss = (weighted_kl_loss +
                        cross_ent_loss / num_tokens +
                        self.max_margin_weight *
                        tf.reduce_mean(max_margin_loss))
            else:
                cross_ent_loss = tf.reduce_sum(total_loss)
                self.batch_total_loss = cross_ent_loss
                loss = weighted_kl_loss + cross_ent_loss / num_tokens

            self.batch_num_tokens = num_tokens

            grads = compute_grads(loss, optimizer, self.trainable_variables)

            # The first context input will be repeated in the next batch,
            # so we ignore it.
            assign_context_input_op = tf.assign(
                context_input_acc, context_input[:, 1:, :],
                validate_shape=False)  # shape will be different on axis=1

            # Make sure we update context_input_acc
            with tf.control_dependencies([assign_context_input_op]):
                self.apply_gradients_op = apply_grads(
                    optimizer, [grads],
                    clipping_threshold=self.clipping_threshold,
                    global_step=self.global_step)

                # Just for control dependencies
                self.batch_sample_ids_beam = tf.identity(sample_ids)
                self.batch_final_lengths_beam = tf.identity(final_lengths)
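
# NOTE (hedged sketch): the tf.cond above selects between a linear KL ramp,
# min(global_step / 75000, 1.0), and a constant weight of 1.0. The ramp is the
# standard KL-annealing schedule for VAE-style training; a plain-Python
# equivalent for quick inspection:

def kl_weight(step, ramp_steps=75000):
    # Linearly ramps from 0 at step 0 to 1.0 at step ramp_steps, then stays at 1.0
    return min(step / float(ramp_steps), 1.0)

assert kl_weight(0) == 0.0
assert kl_weight(15000) == 0.2    # one fifth of the way through the ramp
assert kl_weight(100000) == 1.0   # saturated
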
def __init__(self,
             batch_size, vocab_size, embedding_size,
             hidden_size_encoder, hidden_size_context, hidden_size_decoder,
             dim_z,
             max_iterations, max_dialogue_length,
             start_token, end_token, unk_indices,
             num_layers_encoder=1, num_layers_context=1, num_layers_decoder=1,
             attention_size=512, attention_layer_size=256,
             beam_width=10, length_penalty_weight=1.0,
             gpu_start_index=0,
             num_gpus=1,  # set to 1 when testing
             learning_rate=0.001, clipping_threshold=5.0,
             truncated=True):
    self.batch_size = batch_size
    self.batch_size_per_gpu = batch_size // num_gpus
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size_encoder = hidden_size_encoder
    self.hidden_size_context = hidden_size_context
    self.hidden_size_decoder = hidden_size_decoder
    assert self.hidden_size_context * 2 == self.hidden_size_decoder
    self.dim_z = dim_z
    self.dense = functools.partial(
        tf.layers.dense, units=self.dim_z, use_bias=True)
    self.max_iterations = max_iterations
    self.max_dialogue_length = max_dialogue_length
    assert self.max_dialogue_length > 0
    self.start_tokens = [start_token] * self.batch_size_per_gpu
    self.end_token = end_token
    self.unk_indices = unk_indices
    self.num_layers_encoder = num_layers_encoder
    self.num_layers_context = num_layers_context
    self.num_layers_decoder = num_layers_decoder
    self.attention_size = attention_size
    self.attention_layer_size = attention_layer_size
    self.beam_width = beam_width
    self.length_penalty_weight = length_penalty_weight
    self.num_gpus = num_gpus
    self.clipping_threshold = clipping_threshold
    self.truncated = truncated

    """
    context_input_acc stores context inputs in reverse order:
    • When adding inputs, we just concat from the left.
    • When using them, we do not need to reverse back, because the encoder
      is a bidirectional LSTM.
    Note: without a bidirectional LSTM we would need tf.reverse_sequence().
    """

    self.trainable_variables = []

    with tf.variable_scope("seq2seq"):
        with tf.device('/cpu:0'):
            self.create_placeholders()

            context_input_acc = tf.get_variable(
                "context_input_acc",
                initializer=tf.zeros(
                    [self.batch_size, 2, self.hidden_size_encoder * 2],
                    dtype=tf.float32),
                trainable=False)

            # max_num_turns had better not be less than 2: otherwise we may
            # lose a whole dimension (i.e., axis=1), and tf.map_fn will give
            # an error.
            max_num_turns = tf.maximum(
                tf.reduce_max(self.start_turn_index), 2)
            context_input_mask = tf.tile(
                tf.reshape(tf.greater(self.start_turn_index, 0),
                           [self.batch_size, 1, 1]),  # expand two dims
                [1, max_num_turns, self.hidden_size_encoder * 2])
            # This multiplication resets context inputs that have start_turn_index == 0
            previous_context_input = (
                context_input_acc[:, :max_num_turns, :] *
                tf.cast(context_input_mask, tf.float32))

            # Note: make sure batch_size can be evenly divided by num_gpus
            [dialogue_lst, turn_length_lst,
             previous_context_input_lst, start_turn_index_lst] = [
                tf.split(tensor, self.num_gpus, axis=0)
                for tensor in [self.dialogue, self.turn_length,
                               previous_context_input,
                               self.start_turn_index]]

            optimizer = tf.train.AdamOptimizer(learning_rate)

            context_input_lst = []
            sample_ids_beam_lst = []
            final_lengths_beam_lst = []
            num_tokens_lst = []
            total_losses = []
            tower_grads = []

        for i in xrange(num_gpus):
            with tf.device("/gpu:%d" % (gpu_start_index + i)):
                (total_loss, num_tokens, context_input,
                 sample_ids_beam, final_lengths_beam) = self.one_iteration(
                     dialogue_lst[i], turn_length_lst[i],
                     previous_context_input_lst[i], start_turn_index_lst[i],
                     optimizer)

                # The first turn will be repeated in the next batch, so we skip it
                context_input_lst.append(context_input[:, 1:, :])
                sample_ids_beam_lst.append(sample_ids_beam)
                final_lengths_beam_lst.append(final_lengths_beam)

                grads = compute_grads(total_loss / num_tokens,
                                      optimizer, self.trainable_variables)
                tower_grads.append(grads)
                total_losses.append(total_loss)
                num_tokens_lst.append(num_tokens)

        with tf.device('/cpu:0'):
            context_input_concat = tf.concat(context_input_lst, axis=0)
            assign_context_input_op = tf.assign(
                context_input_acc, context_input_concat,
                validate_shape=False)  # shape will be different on axis=1

            # Make sure we update context_input_acc
            with tf.control_dependencies([assign_context_input_op]):
                # Concat sample ids and their respective lengths
                self.batch_sample_ids_beam = tf.concat(
                    sample_ids_beam_lst, axis=0)
                self.batch_final_lengths_beam = tf.concat(
                    final_lengths_beam_lst, axis=0)

                self.batch_total_loss = tf.add_n(total_losses)
                self.batch_num_tokens = tf.add_n(num_tokens_lst)

                self.apply_gradients_op = apply_grads(
                    optimizer, tower_grads,
                    clipping_threshold=self.clipping_threshold)
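
# NOTE (hedged usage sketch): with truncated back-propagation (truncated=True),
# consecutive batches continue the same dialogues. context_input_acc carries the
# context encodings across session.run calls: assign_context_input_op is wired
# into apply_gradients_op via control dependencies, and start_turn_index == 0
# resets a dialogue's accumulated context. The driver below assumes the model
# object and the placeholder names (dialogue, turn_length, start_turn_index)
# inferred from the attributes above; dialogue_chunks is a hypothetical iterator
# over consecutive truncated chunks.

for (dialogue_chunk, turn_lengths, start_turn_indices) in dialogue_chunks:
    (_, total_loss, num_tokens) = sess.run(
        [model.apply_gradients_op, model.batch_total_loss,
         model.batch_num_tokens],
        feed_dict={model.dialogue: dialogue_chunk,
                   model.turn_length: turn_lengths,
                   # 0 entries reset the accumulated context at dialogue starts
                   model.start_turn_index: start_turn_indices})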