# -*- coding: utf-8 -*-
from __future__ import print_function

import codecs
import os

import numpy as np
import tensorflow as tf
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

# NOTE: the module paths below are assumptions about the project layout
# (a Kyubyong-transformer-style repo); the excerpt does not show them.
from hyperparams import Hyperparams as hp
from data_load import (get_batch_data, load_src_vocab, load_des_vocab,
                       load_test_data, load_dev_data)
from modules import (embedding, positional_encoding, multihead_attention,
                     feedforward, attention_matrix, label_smoothing_mask)

# NOTE: `result_dir` is used by eval() but not defined in this excerpt;
# a module-level default is assumed here.
result_dir = 'results'


class Graph:
    def __init__(self, is_training=True, clue_level=1):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.xloc, self.yloc, self.m, self.num_batch = get_batch_data()  # (N, T)
            else:  # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.x_maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.y_maxlen))
                self.xloc = tf.placeholder(tf.int32, shape=(None, hp.x_maxlen))
                self.yloc = tf.placeholder(tf.int32, shape=(None, hp.y_maxlen))
                self.m = tf.placeholder(tf.int32, shape=(None, hp.x_maxlen))

            # Define decoder inputs: shift the target right, prepend <S> (id 2).
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2: <S>

            # Load vocabulary
            src2idx, idx2src = load_src_vocab()
            des2idx, idx2des = load_des_vocab()

            self.hidden_units = hp.hidden_units

            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                self.enc = embedding(self.x,
                                     vocab_size=len(src2idx),
                                     num_units=self.hidden_units,
                                     scale=True,
                                     scope="enc_embed")

                # During training, sample the clue level from a Poisson so the
                # model sees a varying number of clue words; at inference, use
                # the fixed level requested by the caller (eval() passes
                # clue_level explicitly).
                if is_training:
                    clue_level = tf.random_poisson(shape=[1], lam=1,
                                                   dtype=tf.int32)
                else:
                    clue_level = tf.constant([clue_level], dtype=tf.int32)

                # Mark source positions whose clue rank m falls in
                # [1, clue_level] and append the mask as an extra feature
                # channel on top of the word embeddings.
                self.enc_mask = tf.expand_dims(
                    tf.cast(
                        tf.logical_and(tf.greater_equal(self.m, 1),
                                       tf.less_equal(self.m, clue_level)),
                        tf.float32), 2)
                self.enc = tf.concat([self.enc, self.enc_mask], axis=2)
                self.hidden_units += 1

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=self.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="enc_pe")
                else:
                    self.enc += embedding(
                        tf.tile(
                            tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                            [tf.shape(self.x)[0], 1]),
                        vocab_size=hp.x_maxlen,
                        num_units=self.hidden_units,
                        zero_pad=False,
                        scale=False,
                        scope="enc_pe")
                tf.add_to_collection('explain_input', self.enc)

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=self.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)

                        ### Feed Forward
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * self.hidden_units,
                                       self.hidden_units])

            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(des2idx),
                                     num_units=self.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(
                        self.decoder_inputs,
                        num_units=self.hidden_units,
                        zero_pad=False,
                        scale=False,
                        scope="dec_pe")
                else:
                    self.dec += embedding(
                        tf.tile(
                            tf.expand_dims(
                                tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                            [tf.shape(self.decoder_inputs)[0], 1]),
                        vocab_size=hp.y_maxlen,
                        num_units=self.hidden_units,
                        zero_pad=False,
                        scale=False,
                        scope="dec_pe")
                tf.add_to_collection('explain_input', self.dec)

                ## Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention (self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=self.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention (vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=self.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        with tf.variable_scope(
                                "num_blocks_fc_dec_{}".format(i)):
                            self.dec = feedforward(
                                self.dec,
                                num_units=[4 * self.hidden_units,
                                           self.hidden_units])

            # Copy mechanism: score every source position against every
            # decoder step.
            self.loc_enc = self.enc
            self.loc_logits = attention_matrix(queries=self.loc_enc,
                                               keys=self.dec,
                                               num_units=self.hidden_units,
                                               dropout_rate=hp.dropout_rate,
                                               is_training=is_training,
                                               causality=False,
                                               scope="copy_matrix")

            # loc_label[b, t_y, t_x] = 1 iff target step t_y copies source
            # position t_x, i.e. both carry the same shared-word location id.
            # Ids of -1 ("no shared word") one-hot to all-zero rows and so
            # never match.
            xloc_vec = tf.one_hot(self.xloc, depth=hp.y_maxlen, dtype=tf.float32)
            yloc_vec = tf.one_hot(self.yloc, depth=hp.y_maxlen, dtype=tf.float32)
            loc_label = tf.matmul(yloc_vec, tf.transpose(xloc_vec, [0, 2, 1]))
            # How often each (t_y, t_x) pair has already been copied before
            # step t_y.
            self.loc_label_history = tf.cumsum(loc_label, axis=1,
                                               exclusive=True)

            # Final linear projection: combine the raw copy score with the
            # copy history through a learned 2 -> 1 linear layer.
            self.loc_logits = tf.transpose(self.loc_logits, [0, 2, 1])
            self.loc_logits = tf.stack(
                [self.loc_logits, self.loc_label_history], axis=3)
            self.loc_logits = tf.squeeze(tf.layers.dense(self.loc_logits, 1),
                                         axis=[3])
            # Mask out padded source positions.
            x_masks = tf.tile(tf.expand_dims(tf.equal(self.x, 0), 1),
                              [1, hp.y_maxlen, 1])
            paddings = tf.ones_like(self.loc_logits) * (-1e6)
            self.loc_logits = tf.where(x_masks, paddings,
                                       self.loc_logits)  # (N, T_q, T_k)

            # Concatenate vocabulary logits with copy logits: the output
            # distribution covers len(des2idx) vocabulary ids followed by
            # x_maxlen copy positions.
            self.logits = tf.layers.dense(self.dec, len(des2idx))
            self.final_logits = tf.concat([self.logits, self.loc_logits],
                                          axis=2)
            tf.add_to_collection('explain_output', self.final_logits)

            self.preds = tf.to_int32(tf.argmax(self.final_logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))

            if is_training:
                label = tf.one_hot(self.y, depth=len(des2idx),
                                   dtype=tf.float32)

                # A special case: when a word can be copied, it should not
                # also be labeled <UNK> (id 1), so remove the <UNK> label at
                # copyable positions.
                unk_pos = label[:, :, 1]
                copy_pos = tf.sign(tf.reduce_sum(loc_label, axis=2))
                fix_pos = unk_pos * copy_pos
                fix_label = tf.expand_dims(label[:, :, 1] - fix_pos, axis=2)
                label = tf.concat(
                    [label[:, :, :1], fix_label, label[:, :, 2:]], axis=-1)
                self.final_label = tf.concat([label, loc_label], axis=2)

                # Loss: among all valid labels for a step (the vocabulary word
                # or any copyable source position), pick the one the model
                # currently scores highest and use it as the single training
                # target.
                self.min_logit_loc = min_logit_loc = tf.argmax(
                    self.final_logits + (-1e6) * (1.0 - self.final_label),
                    axis=-1)
                self.min_label = tf.one_hot(min_logit_loc,
                                            depth=len(des2idx) + hp.x_maxlen,
                                            dtype=tf.float32)
                # Effective class count per example: total classes minus the
                # padded source positions whose copy logits were masked out.
                vocab_count = len(des2idx) + hp.x_maxlen - tf.reduce_sum(
                    tf.cast(tf.equal(self.x, 0), dtype=tf.int32), axis=-1)
                self.y_smoothed = label_smoothing_mask(self.min_label,
                                                       vocab_count)
                self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=self.final_logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                # Training Scheme
                self.global_step = tf.Variable(0, name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
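
# Illustration only (not part of the model): a minimal numpy sketch of how the
# one-hot matmul above turns shared-word location ids into the copy label
# matrix. All names here are hypothetical; `depth` plays the role of
# hp.y_maxlen.
def _demo_loc_label():
    import numpy as np
    depth = 4
    xloc = np.array([0, -1, 1])  # source location ids (-1 = no shared word)
    yloc = np.array([1, 0, -1])  # target location ids
    # One-hot of -1 is an all-zero row, so unshared positions never match
    # (tf.one_hot behaves the same way for out-of-range indices).
    onehot = lambda v: (v[:, None] == np.arange(depth)).astype(np.float32)
    loc_label = onehot(yloc) @ onehot(xloc).T  # shape (3, 3)
    # loc_label[t_y, t_x] = 1 iff target step t_y copies source position t_x:
    # here loc_label[0, 2] == 1 (shared id 1) and loc_label[1, 0] == 1 (id 0).
    return loc_label
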
def eval(stage='test', checkpoint_file=None, is_dedup=False, clue_level=1):
    # Load graph
    g = Graph(is_training=False, clue_level=clue_level)
    print("Graph loaded")

    # Load data
    if stage == 'test':
        X, XLoc, M, Sources, Targets = load_test_data()
    else:
        X, XLoc, M, Sources, Targets = load_dev_data()
    src2idx, idx2src = load_src_vocab()
    des2idx, idx2des = load_des_vocab()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    num_gen = 0
    num_copy = 0
    num_unk_copy = 0
    max_batch = 10  # evaluate at most this many mini-batches

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=config) as sess:
            if not checkpoint_file:
                checkpoint_file = tf.train.latest_checkpoint(hp.logdir)
            ## Restore parameters
            sv.saver.restore(sess, checkpoint_file)
            print("Restored! {}".format(checkpoint_file))

            ## Get model name
            mname = checkpoint_file.split('/')[1]

            ## Inference
            if not os.path.exists(result_dir):
                os.mkdir(result_dir)
            with codecs.open(
                    result_dir + "/" + mname +
                    '.level{}'.format(clue_level) + '.' + stage,
                    "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in tqdm(range(min(max_batch, len(X) // hp.batch_size))):
                    ### Get mini-batches
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    xloc = XLoc[i * hp.batch_size:(i + 1) * hp.batch_size]
                    m = M[i * hp.batch_size:(i + 1) * hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size]

                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.y_maxlen), np.int32)
                    preds_unk = np.zeros((hp.batch_size, hp.y_maxlen), np.int32)
                    preds_xloc = np.zeros(
                        (hp.batch_size, hp.x_maxlen), np.int32) - 1
                    preds_yloc = np.zeros(
                        (hp.batch_size, hp.y_maxlen), np.int32) - 1
                    for j in range(hp.y_maxlen):
                        _preds, loc_logits = sess.run(
                            [g.preds, g.loc_logits], {
                                g.x: x,
                                g.y: preds_unk,
                                g.m: m,
                                g.xloc: preds_xloc,
                                g.yloc: preds_yloc
                            })
                        preds[:, j] = _preds[:, j]
                        # Copied ids (>= vocab size) are fed back as <UNK>
                        # (id 1), since the decoder embedding only covers the
                        # target vocabulary.
                        preds_unk[:, j] = _preds[:, j]
                        preds_unk[preds_unk >= len(idx2des)] = 1
                        # Rebuild the shared-word location features from the
                        # decoded prefix so the next step sees the copy
                        # history. (Loop variables must not shadow the batch
                        # index `i` or the `xloc` slice above.)
                        for k in range(hp.batch_size):
                            xloc_k = np.zeros(hp.x_maxlen, dtype=np.int32) - 1
                            yloc_k = np.zeros(hp.y_maxlen, dtype=np.int32) - 1
                            source_words = sources[k].split()
                            target_words = []
                            for idx in preds[k]:
                                if idx in idx2des:
                                    target_words.append(idx2des[idx])
                                elif idx - len(idx2des) == len(source_words):
                                    # A copy pointer just past the last source
                                    # word marks end-of-sentence.
                                    target_words.append('</S>')
                                else:
                                    cp_word_idx = idx - len(idx2des)
                                    cp_word = source_words[cp_word_idx]
                                    target_words.append(cp_word)
                            source_sent_np = np.array(source_words)
                            target_sent_np = np.array(target_words)
                            source_wset = set(source_words)
                            target_wset = set(target_words)
                            for loc_id, w in enumerate(
                                    target_wset & source_wset):
                                xloc_k[np.where(source_sent_np == w)] = loc_id
                                yloc_k[np.where(target_sent_np == w)] = loc_id
                            preds_xloc[k] = xloc_k
                            preds_yloc[k] = yloc_k

                    ### Write to file
                    for source, target, m_, pred in zip(
                            sources, targets, m, preds):  # sentence-wise
                        got_display = []
                        got = []
                        source_words = np.array(source.split())
                        for idx in pred:
                            if idx in idx2des:
                                num_gen += 1
                                got.append(idx2des[idx])
                                got_display.append(
                                    idx2des[idx] + '[{}]'.format(idx))
                            elif idx - len(idx2des) >= len(source_words):
                                # Guard mirrors the </S> case in the
                                # autoregressive loop above and avoids an
                                # out-of-range source index.
                                got.append('</S>')
                                got_display.append('</S>')
                            else:
                                num_copy += 1
                                cp_word_idx = idx - len(idx2des)
                                cp_word = source_words[cp_word_idx]
                                got.append(cp_word)
                                got_display.append(
                                    cp_word + '[{},{}]'.format(
                                        cp_word_idx, m_[cp_word_idx]))
                                if cp_word not in des2idx:
                                    num_unk_copy += 1
                        if is_dedup:
                            got = remove_dup(got)
                            got_display = remove_dup(got_display)
                        got = " ".join(got).split("</S>")[0].strip()
                        got_display = " ".join(got_display).split(
                            "</S>")[0].strip()

                        fout.write("- source: " + source + "\n")
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n")
                        fout.write("- analyse: " + got_display + "\n\n")
                        fout.flush()

                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)

                ## Calculate bleu score
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100 * score) + "\n")
                score = corpus_bleu(list_of_refs, hypotheses,
                                    weights=(1, 0, 0, 0))
                fout.write("Bleu@1 Score = " + str(100 * score) + "\n")
                score = corpus_bleu(list_of_refs, hypotheses,
                                    weights=(0, 1, 0, 0))
                fout.write("Bleu@2 Score = " + str(100 * score) + "\n")
                score = corpus_bleu(list_of_refs, hypotheses,
                                    weights=(0, 0, 1, 0))
                fout.write("Bleu@3 Score = " + str(100 * score) + "\n")
                score = corpus_bleu(list_of_refs, hypotheses,
                                    weights=(0, 0, 0, 1))
                fout.write("Bleu@4 Score = " + str(100 * score) + "\n")
                fout.write("Generate / Copy / UNK Copy = {} / {} / {}".format(
                    num_gen, num_copy, num_unk_copy))
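
# `remove_dup` is used by eval() above but defined elsewhere in the project.
# A minimal sketch, assuming it collapses immediate token repeats in the
# decoded output (the project's actual helper may differ):
def remove_dup(tokens):
    out = []
    for tok in tokens:
        if not out or out[-1] != tok:
            out.append(tok)
    return out
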
if __name__ == '__main__':
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Load vocabulary
    src2idx, idx2src = load_src_vocab()
    des2idx, idx2des = load_des_vocab()

    # Construct graph
    g = Graph(is_training=True)
    print("Graph loaded")

    # Start session
    sv = tf.train.Supervisor(graph=g.graph, logdir=hp.logdir,
                             save_model_secs=0)
    with sv.managed_session(config=config) as sess:
        for epoch in range(1, hp.num_epochs + 1):
            if sv.should_stop():
                break
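            # NOTE: assumed completion of the epoch body, following the usual
            # step-and-save pattern for this style of repo and using only the
            # fields defined on Graph above (num_batch, train_op, global_step);
            # adapt to the project's real training loop.
            for step in range(g.num_batch):
                sess.run(g.train_op)
            gs = sess.run(g.global_step)
            sv.saver.save(sess, hp.logdir +
                          '/model_epoch_{:02d}_gs_{}'.format(epoch, gs))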