def build(self):
  """Build the bow - sequence to sequence model"""
  print("Building the bow - sequence to sequence model ... ")

  vocab_size = self.vocab_size
  state_size = self.state_size
  enc_layers = self.enc_layers
  max_enc_bow = self.max_enc_bow
  num_paraphrase = self.num_paraphrase

  # Placeholders
  with tf.name_scope("placeholders"):
    enc_inputs = tf.placeholder(tf.int32, [None, None], "enc_inputs")
    enc_lens = tf.placeholder(tf.int32, [None], "enc_lens")
    self.drop_out = tf.placeholder(tf.float32, (), "drop_out")
    self.max_len = tf.placeholder(tf.int32, (), "max_len")

    dec_bow = tf.placeholder(tf.int32, [None, None], "dec_bow")
    dec_bow_len = tf.placeholder(tf.int32, [None], "dec_bow_len")

    self.enc_inputs = enc_inputs
    self.enc_lens = enc_lens
    self.dec_bow = dec_bow
    self.dec_bow_len = dec_bow_len

    if(self.mode == "train"):
      enc_targets = tf.placeholder(tf.int32, [None, None], "enc_targets")
      enc_seq2seq_inputs = tf.placeholder(
        tf.int32, [None, num_paraphrase, None], "enc_seq2seq_inputs")
      enc_seq2seq_targets = tf.placeholder(
        tf.int32, [None, num_paraphrase, None], "enc_seq2seq_targets")
      enc_seq2seq_lens = tf.placeholder(
        tf.int32, [None, num_paraphrase], "enc_seq2seq_lens")
      dec_inputs = tf.placeholder(tf.int32, [None, None], "dec_inputs")
      dec_targets = tf.placeholder(tf.int32, [None, None], "dec_targets")
      dec_lens = tf.placeholder(tf.int32, [None], "dec_lens")

      self.enc_targets = enc_targets
      self.enc_seq2seq_inputs = enc_seq2seq_inputs
      self.enc_seq2seq_targets = enc_seq2seq_targets
      self.enc_seq2seq_lens = enc_seq2seq_lens
      self.dec_inputs = dec_inputs
      self.dec_targets = dec_targets
      self.dec_lens = dec_lens

  enc_batch_size = tf.shape(enc_inputs)[0]
  max_len = self.max_len
  dec_batch_size = tf.shape(dec_bow)[0]
  max_dec_bow = tf.shape(dec_bow)[1]

  # Embedding
  with tf.variable_scope("embeddings"):
    embedding_matrix = tf.get_variable(
      name="embedding_matrix",
      shape=[vocab_size, state_size],
      dtype=tf.float32,
      initializer=tf.random_normal_initializer(stddev=0.05))
    enc_inputs = tf.nn.embedding_lookup(embedding_matrix, enc_inputs)
    if(self.mode == "train"):
      dec_inputs = tf.nn.embedding_lookup(embedding_matrix, dec_inputs)
    dec_bow = tf.nn.embedding_lookup(embedding_matrix, dec_bow)

  # Encoder
  with tf.variable_scope("encoder"):
    # TODO: residual LSTM, layer normalization
    enc_cell = [create_cell("enc-%d" % i, state_size, self.drop_out)
                for i in range(enc_layers)]
    enc_cell = tf.nn.rnn_cell.MultiRNNCell(enc_cell)
    enc_outputs, enc_state = tf.nn.dynamic_rnn(enc_cell, enc_inputs,
      sequence_length=enc_lens, dtype=tf.float32)

  # Encoder bow prediction
  with tf.variable_scope("bow_output"):
    if(self.bow_pred_method == "mix_softmax"):
      bow_topk_prob = bow_predict_mix_softmax(
        enc_batch_size, vocab_size, max_enc_bow, enc_state)
    elif(self.bow_pred_method == "seq_tag"):
      bow_topk_prob, _, _, _ = bow_predict_seq_tag(
        vocab_size, enc_batch_size, enc_outputs, enc_lens, max_len)
    elif(self.bow_pred_method == "seq2seq"):
      bow_topk_prob, enc_seq2seq_loss, enc_infer_pred = bow_predict_seq2seq(
        enc_seq2seq_inputs, enc_seq2seq_targets, enc_seq2seq_lens,
        embedding_matrix, enc_outputs, enc_state, enc_layers, num_paraphrase,
        max_len, enc_lens, enc_batch_size, vocab_size, state_size,
        self.drop_out, self.dec_start_id)

  with tf.variable_scope("enc_optimizer"):
    enc_optimizer = tf.train.AdamOptimizer(self.learning_rate_enc)

  with tf.name_scope("enc_output"):
    # top k prediction, renormalized to a distribution over the predicted bow
    pred_prob, pred_ind = tf.nn.top_k(bow_topk_prob, max_enc_bow)
    pred_prob_unnorm = pred_prob
    pred_prob /= tf.expand_dims(tf.reduce_sum(pred_prob, axis=1), 1)
    pred_prob_dec, pred_ind_dec = tf.nn.top_k(bow_topk_prob, self.sample_size)
    pred_prob_dec /= tf.expand_dims(tf.reduce_sum(pred_prob_dec, axis=1), 1)

  if(self.mode == "train"):
    with tf.name_scope("enc_loss"):
      # loss function
      enc_targets = _enc_target_list_to_khot(
        enc_targets, vocab_size, self.pad_id)
      enc_bow_loss = enc_loss_fn(
        self.bow_loss_fn, enc_targets, bow_topk_prob, max_enc_bow)
      if(self.bow_pred_method == "seq2seq"):
        # pure sequence to sequence for now
        enc_loss = enc_seq2seq_loss + 0.0 * enc_bow_loss
      else:
        enc_loss = enc_bow_loss
      enc_train_op = enc_optimizer.minimize(enc_loss)

    # prediction performance monitor during training
    # TODO: move this into a function; add top 10 recall
    with tf.name_scope("train_output"):
      # encoder training output
      self.enc_train_output = {"enc_train_op": enc_train_op,
                               "enc_bow_loss": enc_bow_loss,
                               "enc_loss": enc_loss}
      bow_metrics_dict = bow_train_monitor(
        bow_topk_prob, pred_ind, vocab_size, enc_batch_size, enc_targets)
      self.enc_train_output.update(bow_metrics_dict)
      if(self.bow_pred_method == "seq2seq"):
        self.enc_train_output["enc_seq2seq_loss"] = enc_seq2seq_loss

  # encoder inference output
  with tf.name_scope("infer_output"):
    if(self.bow_pred_method == "seq2seq"):
      (infer_overlap, infer_pred_support, infer_target_support, infer_prec,
       infer_recl) = bow_seq2seq_metrics(
         enc_targets, enc_infer_pred, vocab_size, self.pad_id)
      self.enc_infer_output = {
        "enc_infer_overlap": infer_overlap,
        "enc_infer_pred_support": infer_pred_support,
        "enc_infer_target_support": infer_target_support,
        "enc_infer_precision": infer_prec,
        "enc_infer_recall": infer_recl,
        "enc_infer_pred": enc_infer_pred}
    else:
      self.enc_infer_output = {"pred_prob": pred_prob,
                               "pred_ind": pred_ind,
                               "pred_prob_dec": pred_prob_dec,
                               "pred_ind_dec": pred_ind_dec}

  # Decoder bow encoding
  # TODO: sampling from encoder topk prediction
  with tf.variable_scope("dec_bow_encoding"):
    dec_bow_mask = tf.expand_dims(
      tf.sequence_mask(dec_bow_len, max_dec_bow, dtype=tf.float32), 2)
    # TODO: transformer based encoding, but our primary goal is to test the
    # effectiveness of sampling, so we skip it for now
    # NOTE: the mean is taken over the padded bow length
    dec_bow_enc = tf.reduce_mean(dec_bow_mask * dec_bow, axis=1)  # [B, S]

  with tf.variable_scope("decoder"):
    dec_cell = [create_cell("dec-%d" % i, state_size, self.drop_out)
                for i in range(enc_layers)]
    dec_cell = tf.nn.rnn_cell.MultiRNNCell(dec_cell)
    # NOTE: the 2-tuple initial state assumes a 2-layer decoder
    dec_init_state = (LSTMStateTuple(dec_bow_enc, dec_bow_enc),
                      LSTMStateTuple(dec_bow_enc, dec_bow_enc))
    dec_proj = tf.layers.Dense(vocab_size, name="dec_proj",
      kernel_initializer=tf.random_normal_initializer(stddev=0.05),
      bias_initializer=tf.constant_initializer(0.))
    dec_memory = dec_bow
    dec_mem_len = dec_bow_len
    dec_max_mem_len = max_dec_bow

    # Previous two-pass implementation, superseded by decode() below:
    # _, dec_outputs_predict = decoding_infer(self.dec_start_id, dec_cell,
    #   dec_proj, embedding_matrix, dec_init_state, dec_bow, dec_batch_size,
    #   max_len, dec_bow_len, max_dec_bow, self.is_attn)
    # if(self.mode == "train"):
    #   # training decoding
    #   dec_outputs_train = decoding_train(dec_inputs, dec_cell,
    #     dec_init_state, dec_bow, max_len, dec_bow_len, max_dec_bow,
    #     self.is_attn)
    #   dec_logits_train = dec_proj(dec_outputs_train)

    dec_outputs_predict, dec_logits_train = decode(
      self.dec_start_id, dec_inputs,
      dec_cell, dec_proj, embedding_matrix, dec_init_state, dec_memory,
      dec_mem_len, dec_max_mem_len, dec_batch_size, max_len,
      self.sampling_method, self.topk_sampling_size, state_size,
      multi_source=False)

  # model saver, before the optimizer
  all_variables = slim.get_variables_to_restore()
  model_variables = [var for var in all_variables
                     if var.name.split("/")[0] == self.model_name]
  print("%s model, variable list:" % self.model_name)
  for v in model_variables: print("  %s" % v.name)
  self.model_saver = tf.train.Saver(model_variables, max_to_keep=3)

  with tf.variable_scope("dec_optimizer"):
    dec_optimizer = tf.train.AdamOptimizer(self.learning_rate_dec)

  with tf.name_scope("dec_output"):
    if(self.mode == "train"):
      dec_mask = tf.sequence_mask(dec_lens, max_len, dtype=tf.float32)
      dec_loss = tf.contrib.seq2seq.sequence_loss(
        dec_logits_train, dec_targets, dec_mask)
      dec_train_op = dec_optimizer.minimize(dec_loss)

      self.dec_train_output = {"dec_train_op": dec_train_op,
                               "dec_loss": dec_loss}

    self.dec_infer_output = {"dec_predict": dec_outputs_predict}
  return
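# Usage sketch (illustrative, not part of the original file): one training
# step for the model above under TF1 semantics, assuming build() ran with
# mode == "train" and bow_pred_method != "seq2seq" (otherwise the
# enc_seq2seq_* placeholders must be fed too), with a hypothetical `batch`
# dict from the data pipeline. The bow predictor and the decoder are
# trained by separate train ops:
#
#   feed = {model.enc_inputs: batch["enc_inputs"],    # [B, T] token ids
#           model.enc_lens: batch["enc_lens"],        # [B]
#           model.enc_targets: batch["enc_targets"],  # [B, T] bow targets
#           model.dec_bow: batch["dec_bow"],          # [B, T'] bow word ids
#           model.dec_bow_len: batch["dec_bow_len"],  # [B]
#           model.dec_inputs: batch["dec_inputs"],    # [B, T'']
#           model.dec_targets: batch["dec_targets"],  # [B, T'']
#           model.dec_lens: batch["dec_lens"],        # [B]
#           model.drop_out: 0.3,                      # example dropout rate
#           model.max_len: 35}                        # example length cap
#   enc_out = sess.run(model.enc_train_output, feed)  # bow prediction step
#   dec_out = sess.run(model.dec_train_output, feed)  # seq2seq step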
def build(self):
  """Build the language model"""
  print("Building the language model ... ")

  vocab_size = self.vocab_size
  state_size = self.state_size
  enc_layers = self.enc_layers

  with tf.name_scope("placeholders"):
    enc_inputs = tf.placeholder(tf.int32, [None, None], "enc_inputs")
    targets = tf.placeholder(tf.int32, [None, None], "targets")
    inp_lens = tf.placeholder(tf.int32, [None], "inp_lens")
    self.drop_out = tf.placeholder(tf.float32, (), "drop_out")

    self.enc_inputs = enc_inputs
    self.inp_lens = inp_lens
    self.targets = targets

  batch_size = tf.shape(enc_inputs)[0]
  max_len = tf.shape(enc_inputs)[1]

  with tf.variable_scope("embeddings"):
    embedding_matrix = tf.get_variable(
      "embedding_matrix", [vocab_size, state_size])
    enc_inputs = tf.nn.embedding_lookup(embedding_matrix, enc_inputs)

  with tf.variable_scope("encoder"):
    # TODO: residual LSTM, layer normalization
    enc_cell = [create_cell("enc-%d" % i, state_size, self.drop_out)
                for i in range(enc_layers)]
    enc_cell = tf.nn.rnn_cell.MultiRNNCell(enc_cell)
    enc_outputs, enc_state = tf.nn.dynamic_rnn(
      enc_cell, enc_inputs, sequence_length=inp_lens, dtype=tf.float32)

  enc_proj = tf.layers.Dense(vocab_size, name="enc_proj")
  enc_logits = enc_proj(enc_outputs)

  mask = tf.sequence_mask(inp_lens, max_len, dtype=tf.float32)
  loss = tf.contrib.seq2seq.sequence_loss(enc_logits, targets, mask)

  # get the model variables before building the optimizer
  all_variables = slim.get_variables_to_restore()
  lm_variables = [var for var in all_variables if var.name[:2] == "lm"]
  print("lm model, variable list:")
  for v in lm_variables: print("  %s" % v.name)
  self.model_saver = tf.train.Saver(lm_variables, max_to_keep=10)

  optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
  train_op = optimizer.minimize(loss)

  self.train_output = {"train_op": train_op,
                       "loss": loss,
                       "ppl": tf.exp(loss)}
  self.eval_output = {"loss": loss, "ppl": tf.exp(loss)}
  return
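# Usage sketch (illustrative, not part of the original file): a train step
# for the language model with a hypothetical `batch` dict. sequence_loss
# averages the masked per-token cross entropy, so exp(loss) is the
# per-token perplexity reported in train_output["ppl"]:
#
#   feed = {lm.enc_inputs: batch["inputs"],  # [B, T] token ids
#           lm.targets: batch["targets"],    # [B, T] inputs shifted by one
#           lm.inp_lens: batch["lens"],      # [B]
#           lm.drop_out: 0.3}                # example dropout rate
#   out = sess.run(lm.train_output, feed)
#   print(out["loss"], out["ppl"])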
def bow_predict_seq2seq(enc_seq2seq_inputs, enc_seq2seq_targets,
                        enc_seq2seq_lens, embedding_matrix, enc_outputs,
                        enc_state, enc_layers, num_paraphrase, max_len,
                        enc_lens, batch_size, vocab_size, state_size,
                        drop_out, dec_start_id):
  """Bow prediction as a sequence to sequence model"""
  enc_seq2seq_inputs = tf.nn.embedding_lookup(
    embedding_matrix, enc_seq2seq_inputs)
  # [B, P, T, S] -> [P, B, T, S]
  enc_seq2seq_inputs = tf.transpose(enc_seq2seq_inputs, [1, 0, 2, 3])
  # [B, P, T] -> [P, B, T]
  enc_seq2seq_targets = tf.transpose(enc_seq2seq_targets, [1, 0, 2])
  # [B, P] -> [P, B]
  enc_seq2seq_lens = tf.transpose(enc_seq2seq_lens, [1, 0])

  init_state = enc_state
  enc_pred_loss = 0.0
  bow_topk_prob = tf.zeros([batch_size, vocab_size])
  enc_infer_pred = []

  for i in range(num_paraphrase):
    # encoder prediction cell
    enc_pred_cell = [create_cell("enc_pred_p_%d_l_%d" % (i, j),
                                 state_size, drop_out)
                     for j in range(enc_layers)]
    enc_pred_cell = tf.nn.rnn_cell.MultiRNNCell(enc_pred_cell)

    # projection, one per paraphrase (unique name keeps variables distinct)
    enc_pred_proj = tf.layers.Dense(vocab_size, name="enc_pred_proj_%d" % i,
      kernel_initializer=tf.random_normal_initializer(stddev=0.05),
      bias_initializer=tf.constant_initializer(0.))

    # greedy decoding
    _, enc_seq_predict = decoding_infer(dec_start_id,
                                        enc_pred_cell,
                                        enc_pred_proj,
                                        embedding_matrix,
                                        init_state,
                                        enc_outputs,
                                        batch_size,
                                        max_len,
                                        enc_lens,
                                        max_len,
                                        is_attn=True)
    enc_infer_pred.append(enc_seq_predict)

    # training decoding
    enc_pred_inputs = enc_seq2seq_inputs[i]
    enc_seq_train = decoding_train(enc_pred_inputs,
                                   enc_pred_cell,
                                   init_state,
                                   enc_outputs,
                                   max_len,
                                   enc_lens,
                                   max_len,
                                   is_attn=True)
    enc_seq_train_logits = enc_pred_proj(enc_seq_train)

    # sequence to sequence loss
    enc_seq_mask = tf.sequence_mask(
      enc_seq2seq_lens[i], max_len, dtype=tf.float32)
    enc_seq_loss = tf.contrib.seq2seq.sequence_loss(
      enc_seq_train_logits, enc_seq2seq_targets[i], enc_seq_mask)
    enc_pred_loss += enc_seq_loss

    # prediction probability
    enc_pred_prob = tf.nn.softmax(enc_seq_train_logits)   # [B, T, V]
    enc_pred_prob *= tf.expand_dims(enc_seq_mask, 2)      # [B, T, 1]
    enc_pred_prob = tf.reduce_sum(enc_pred_prob, axis=1)  # [B, V]
    # NOTE: prob of certain words will be repeatedly calculated
    bow_topk_prob += enc_pred_prob

  enc_pred_loss /= num_paraphrase
  enc_infer_pred = tf.stack(enc_infer_pred)                 # [P, B, T]
  enc_infer_pred = tf.transpose(enc_infer_pred, [1, 0, 2])  # [B, P, T]
  return bow_topk_prob, enc_pred_loss, enc_infer_pred
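# Worked example for the bow accumulation above (illustrative): with B=1,
# T=2, V=3, suppose the masked per-step softmax enc_pred_prob is
#   [[0.7, 0.2, 0.1],
#    [0.6, 0.3, 0.1]]
# then reduce_sum over the time axis gives [1.3, 0.5, 0.2], i.e. the
# expected count of each word in the paraphrase. A word emitted at several
# steps (or by several paraphrase decoders) accumulates mass repeatedly,
# which is what the NOTE above refers to; bow_topk_prob is therefore an
# unnormalized bag-of-words score rather than a proper distribution.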
def build(self):
  """Build the model"""
  print("Building the Latent BOW - sequence to sequence model ... ")

  vocab_size = self.vocab_size
  key_size = self.key_size
  state_size = self.state_size
  enc_layers = self.enc_layers
  max_enc_bow = self.max_enc_bow
  lambda_enc_loss = self.lambda_enc_loss

  # Placeholders
  with tf.name_scope("placeholders"):
    enc_keys = tf.placeholder(tf.int32, [None, None], "enc_keys")
    enc_locs = tf.placeholder(tf.int32, [None, None], "enc_locs")
    enc_vals = tf.placeholder(tf.int32, [None, None], "enc_vals")
    enc_lens = tf.placeholder(tf.int32, [None], "enc_lens")
    self.drop_out = tf.placeholder(tf.float32, (), "drop_out")
    self.gumbel_tau = tf.placeholder(tf.float32, (), "gumbel_tau")

    self.enc_keys = enc_keys
    self.enc_locs = enc_locs
    self.enc_vals = enc_vals
    self.enc_lens = enc_lens

    enc_targets = tf.placeholder(tf.int32, [None, None], "enc_targets")
    dec_inputs = tf.placeholder(tf.int32, [None, None], "dec_inputs")
    dec_targets = tf.placeholder(tf.int32, [None, None], "dec_targets")
    dec_lens = tf.placeholder(tf.int32, [None], "dec_lens")

    self.enc_targets = enc_targets
    self.dec_inputs = dec_inputs
    self.dec_targets = dec_targets
    self.dec_lens = dec_lens

  batch_size = tf.shape(enc_keys)[0]
  max_enc_len = tf.shape(enc_keys)[1]
  max_dec_len = tf.shape(dec_targets)[1]

  # Embedding
  with tf.variable_scope("embeddings"):
    embedding_matrix_vals = tf.get_variable(
      name="embedding_matrix_vals",
      shape=[vocab_size, state_size],
      dtype=tf.float32,
      initializer=tf.random_normal_initializer(stddev=0.05))
    embedding_matrix_keys = tf.get_variable(
      name="embedding_matrix_keys",
      shape=[key_size, state_size],
      dtype=tf.float32,
      initializer=tf.random_normal_initializer(stddev=0.05))
    # NOTE: the location vocabulary is capped at 100 positions
    embedding_matrix_locs = tf.get_variable(
      name="embedding_matrix_locs",
      shape=[100, state_size],
      dtype=tf.float32,
      initializer=tf.random_normal_initializer(stddev=0.05))

    enc_keys = tf.nn.embedding_lookup(embedding_matrix_keys, enc_keys)
    enc_vals = tf.nn.embedding_lookup(embedding_matrix_vals, enc_vals)
    enc_locs = tf.nn.embedding_lookup(embedding_matrix_locs, enc_locs)
    enc_inputs = (enc_keys + enc_vals + enc_locs) / 3.

    dec_inputs = tf.nn.embedding_lookup(embedding_matrix_vals, dec_inputs)

  # Encoder
  with tf.variable_scope("encoder"):
    # TODO: residual LSTM, layer normalization
    enc_cell = [create_cell(
      "enc-%d" % i, state_size, self.drop_out, self.no_residual)
      for i in range(enc_layers)]
    enc_cell = tf.nn.rnn_cell.MultiRNNCell(enc_cell)
    enc_outputs, enc_state = tf.nn.dynamic_rnn(enc_cell, enc_inputs,
      sequence_length=enc_lens, dtype=tf.float32)

  # Encoder bow prediction
  with tf.variable_scope("bow_output"):
    bow_topk_prob, gumbel_topk_prob, seq_neighbor_ind, seq_neighbor_prob = \
      bow_predict_seq_tag(vocab_size, batch_size, enc_outputs, enc_lens,
        max_enc_len, self.is_gumbel, self.gumbel_tau)
    seq_neighbor_output = {"seq_neighbor_ind": seq_neighbor_ind,
                           "seq_neighbor_prob": seq_neighbor_prob}

  # Encoder output, loss and metrics
  with tf.name_scope("enc_output"):
    # top k prediction
    bow_pred_prob, pred_ind = tf.nn.top_k(bow_topk_prob, max_enc_bow)

    # loss function
    enc_targets = _enc_target_list_to_khot(
      enc_targets, vocab_size, self.pad_id)
    enc_loss = enc_loss_fn(
      self.bow_loss_fn, enc_targets, bow_topk_prob, max_enc_bow)
    self.train_output = {"enc_loss": enc_loss}

    # performance monitor
    bow_metrics_dict = bow_train_monitor(
      bow_topk_prob, pred_ind, vocab_size, batch_size, enc_targets)
    self.train_output.update(bow_metrics_dict)

  # Encoder soft sampling
  with tf.name_scope("gumbel_topk_sampling"):
    sample_ind, sample_prob, sample_memory = bow_gumbel_topk_sampling(
      gumbel_topk_prob, embedding_matrix_vals, self.sample_size, vocab_size)
    sample_memory_lens = tf.ones([batch_size], tf.int32) * self.sample_size
    sample_memory_avg = tf.reduce_mean(sample_memory, 1)  # [B, S]

    sample_memory_output = {"bow_pred_ind": pred_ind,
                            "bow_pred_prob": bow_pred_prob,
                            "sample_memory_ind": sample_ind,
                            "sample_memory_prob": sample_prob}

  # Decoder
  # The initial state of the decoder =
  #   encoder meaning vector z + encoder bow vector b
  with tf.variable_scope("decoder"):
    dec_cell = [create_cell(
      "dec-%d" % i, state_size, self.drop_out, self.no_residual)
      for i in range(enc_layers)]
    dec_cell = tf.nn.rnn_cell.MultiRNNCell(dec_cell)
    dec_proj = tf.layers.Dense(vocab_size, name="dec_proj",
      kernel_initializer=tf.random_normal_initializer(stddev=0.05),
      bias_initializer=tf.constant_initializer(0.))
    dec_ptr_k_proj = [
      tf.layers.Dense(state_size, name="dec_ptr_k_proj_%d" % pi,
        kernel_initializer=tf.random_normal_initializer(stddev=0.05),
        bias_initializer=tf.constant_initializer(0.))
      for pi in range(self.num_pointers)]
    dec_ptr_g_proj = tf.layers.Dense(1, name="dec_ptr_g_proj",
      kernel_initializer=tf.random_normal_initializer(stddev=0.05),
      bias_initializer=tf.constant_initializer(0.),
      activation=tf.nn.sigmoid)
    bow_cond_gate_proj = tf.layers.Dense(1, name="bow_cond_gate_proj",
      kernel_initializer=tf.random_normal_initializer(stddev=0.05),
      bias_initializer=tf.constant_initializer(0.),
      activation=tf.nn.sigmoid)

    # NOTE: every decoder layer is initialized from encoder layer 0, with
    # the averaged bow sample memory added to h. This loop replaces the
    # previous hard-coded construction:
    # if(enc_layers == 2):
    #   dec_init_state = (LSTMStateTuple(c=enc_state[0].c,
    #                       h=enc_state[0].h + sample_memory_avg),
    #                     LSTMStateTuple(c=enc_state[1].c,
    #                       h=enc_state[1].h + sample_memory_avg))
    # elif(enc_layers == 4): ...
    # else: raise Exception("enc_layers not in [2, 4]")
    dec_init_state = []
    for l in range(enc_layers):
      dec_init_state.append(LSTMStateTuple(c=enc_state[0].c,
        h=enc_state[0].h + sample_memory_avg))
    dec_init_state = tuple(dec_init_state)

    if(self.source_attn):
      # attend to both the sampled bow memory and the source, [B, M + T, S]
      dec_memory = [sample_memory, enc_outputs]
      dec_mem_len = [sample_memory_lens, enc_lens]
      dec_max_mem_len = [self.sample_size, max_enc_len]
    else:
      dec_memory = sample_memory
      dec_mem_len = sample_memory_lens
      dec_max_mem_len = tf.shape(dec_memory)[1]

    if(self.bow_cond): bow_cond = sample_memory_avg
    else: bow_cond = None

    if(not self.bow_cond_gate): bow_cond_gate_proj = None

    (dec_outputs_predict, dec_logits_train, dec_prob_train, pointer_ent,
     avg_max_ptr, avg_num_copy) = decode(
       self.dec_start_id, dec_inputs,
       dec_cell, dec_proj, embedding_matrix_vals, dec_init_state,
       dec_memory, dec_mem_len, dec_max_mem_len, batch_size, max_dec_len,
       self.sampling_method, self.topk_sampling_size, state_size,
       multi_source=True, copy=self.copy, copy_ind=sample_ind,
       dec_ptr_g_proj=dec_ptr_g_proj, dec_ptr_k_proj=dec_ptr_k_proj,
       bow_cond=bow_cond, bow_cond_gate_proj=bow_cond_gate_proj)

  # model saver, before the optimizer
  all_variables = slim.get_variables_to_restore()
  model_variables = [var for var in all_variables
                     if var.name.split("/")[0] == self.model_name]
  print("%s model, variable list:" % self.model_name)
  for v in model_variables: print("  %s" % v.name)
  self.model_saver = tf.train.Saver(model_variables, max_to_keep=3)

  with tf.variable_scope("optimizer"):
    optimizer = tf.train.AdamOptimizer(self.learning_rate)

  # decoder output, training and inference, combined with encoder loss
  with tf.name_scope("dec_output"):
    dec_mask = tf.sequence_mask(dec_lens, max_dec_len, dtype=tf.float32)
    if(not self.copy):
      dec_loss = tf.contrib.seq2seq.sequence_loss(
        dec_logits_train, dec_targets, dec_mask)
    else:
      dec_loss = _copy_loss(dec_prob_train, dec_targets, dec_mask)

    loss = dec_loss + lambda_enc_loss * enc_loss
    train_op = optimizer.minimize(loss)

    dec_output = {"train_op": train_op, "dec_loss": dec_loss, "loss": loss}
    self.train_output.update(dec_output)
    if(self.copy):
      pointer_ent = \
        tf.reduce_sum(pointer_ent * dec_mask) / tf.reduce_sum(dec_mask)
      self.train_output["pointer_ent"] = pointer_ent
      avg_max_ptr = \
        tf.reduce_sum(avg_max_ptr * dec_mask) / tf.reduce_sum(dec_mask)
      self.train_output["avg_max_ptr"] = avg_max_ptr
      avg_num_copy = tf.reduce_sum(avg_num_copy * dec_mask, 1)
      avg_num_copy = tf.reduce_mean(avg_num_copy)
      self.train_output["avg_num_copy"] = avg_num_copy

    self.infer_output = {"dec_predict": dec_outputs_predict}
    dec_out_mem_ratio = _calculate_dec_out_mem_ratio(dec_outputs_predict,
      sample_ind, vocab_size, self.pad_id, self.dec_start_id,
      self.dec_end_id)
    self.infer_output.update(dec_out_mem_ratio)
    self.infer_output.update(sample_memory_output)
    self.infer_output.update(seq_neighbor_output)
  return
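# Usage sketch (illustrative, not part of the original file): one joint
# training step for the latent BOW model with a hypothetical `batch` dict;
# a single train_op optimizes loss = dec_loss + lambda_enc_loss * enc_loss:
#
#   feed = {model.enc_keys: batch["enc_keys"],        # [B, T] key ids
#           model.enc_locs: batch["enc_locs"],        # [B, T] loc ids, < 100
#           model.enc_vals: batch["enc_vals"],        # [B, T] value ids
#           model.enc_lens: batch["enc_lens"],        # [B]
#           model.enc_targets: batch["enc_targets"],  # [B, T] bow targets
#           model.dec_inputs: batch["dec_inputs"],    # [B, T']
#           model.dec_targets: batch["dec_targets"],  # [B, T']
#           model.dec_lens: batch["dec_lens"],        # [B]
#           model.drop_out: 0.3,                      # example dropout rate
#           model.gumbel_tau: 0.5}                    # example temperature
#   out = sess.run(model.train_output, feed)  # enc/dec losses + bow metrics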