def train_seq2seq(input_filename, output_filename, model_dir):
    def input_fn(input_filename, output_filename, batch_size, shuffle_buffer=1):
        encoder_input_data_gen = lambda: data_loader.data_generator_3(input_filename, is_encoder_input=True)
        decoder_output_data_gen = lambda: data_loader.data_generator_3(output_filename)
        decoder_input_data_gen = lambda: data_loader.data_generator_3(output_filename, is_decoder_input=True)
        encoder_input_data = tf.data.Dataset.from_generator(encoder_input_data_gen, output_types=tf.int32, output_shapes=(None,))
        decoder_output_data = tf.data.Dataset.from_generator(decoder_output_data_gen, output_types=tf.int32, output_shapes=(None,))
        decoder_input_data = tf.data.Dataset.from_generator(decoder_input_data_gen, output_types=tf.int32, output_shapes=(None,))
        dataset = (tf.data.Dataset
                   .zip((encoder_input_data, decoder_output_data, decoder_input_data))
                   .shuffle(shuffle_buffer)
                   .repeat(1)
                   .padded_batch(batch_size, padded_shapes=([None], [None], [None])))
        iterator = dataset.make_one_shot_iterator()
        encoder_inputs, decoder_outputs, decoder_inputs = iterator.get_next()
        return {'encoder_inputs': encoder_inputs,
                'decoder_outputs': decoder_outputs,
                'decoder_inputs': decoder_inputs}

    est = tf.estimator.Estimator(model_fn=seq2seq, model_dir=model_dir, params=params)

    train_input_func = lambda: input_fn(cfg.source_data_train, cfg.target_data_train, cfg.batch_size, shuffle_buffer=1000)
    eval_input_func = lambda: input_fn(cfg.source_data_dev, cfg.target_data_dev, cfg.batch_size)
    test_input_func = lambda: input_fn(cfg.source_data_test, cfg.target_data_test, cfg.batch_size)

    # first train for 20000 steps
    est.train(input_fn=train_input_func, steps=20000)

    for r in range(cfg.num_rounds):
        # train for num_steps steps
        print('\nRound', r + 1)
        est.train(input_fn=train_input_func, steps=cfg.num_steps)

        # evaluation
        print('\nEvaluation:')
        predictions = est.predict(input_fn=eval_input_func)

        # write the predictions to a file
        print('\n\nWriting Predictions...')
        for i, pred in enumerate(predictions):
            with open('./predictions/' + str(i), 'w+') as pred_file:
                for keyph in np.array(pred).T:
                    pred_file.write(data_loader.index_to_sent(keyph)
                                    .replace('<EOS>', '')
                                    .replace('<UNK>', '')
                                    .replace('<SOS>', '') + '\n')

        # run the evaluation metrics: precision, recall, F1, and ROUGE
        precision_and_recall(r)
        rouge(5)
        rouge(10)
def choosebest_ind(sentences):
    # score each sentence independently with ROUGE
    scores = []
    for i in range(len(sentences)):
        with open(source_sen_file, "w") as w:
            w.write(sentences[i])
        rouge(MAX_WORDS, config_file, rouge_out)
        score = parse_rouge(rouge_out, ver)
        scores.append(score)
    # take the highest-scoring sentences until the word budget is used up
    count = 0
    best = []
    for sen, scr in sorted(zip(sentences, scores), key=lambda pair: -pair[1]):
        best.append(sen)
        count += len(sen.split(" "))
        if count > MAX_WORDS:
            break
    return best
def compute_rouge(sent_list, refs):
    sentext = "\n".join(sent_list)
    code = abs(hash(sentext))
    name0 = "%s/mds/%d.txt" % (TMP, code)
    utils.write2file(sentext, name0)
    cfgline = name0
    for i, sens in enumerate(refs):
        name1 = "%s/mds/%d.txt%d" % (TMP, code, i)
        utils.write2file("\n".join(sens), name1)
        cfgline = cfgline + " " + name1
    cfgfile = "%s/mds/%d.cfg" % (TMP, code)
    utils.write2file(cfgline, cfgfile)
    rouge_out = "%s/mds/%d.rge" % (TMP, code)
    rge.rouge(1000, cfgfile, rouge_out)
    # weight ROUGE-2 heavily, with ROUGE-1 as a tie-breaker
    score = rge.parse_rouge(rouge_out, 2) + 0.0001 * rge.parse_rouge(rouge_out, 1)
    return score
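# Hedged usage sketch (not from the original source; assumes the same TMP /
# rge / utils environment as compute_rouge above): scoring one candidate
# summary, given as a list of sentences, against two reference summaries.
cand = ["the market rallied on tuesday", "tech stocks led the gains"]
refs = [["stocks rallied tuesday", "gains were led by tech"],
        ["the market rose sharply"]]
print(compute_rouge(cand, refs))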
def moses_bl_rouge(p, l):
    bl = bleu.moses_multi_bleu(p, l)
    x = rouge.rouge(p, l)
    print('Moses BLEU: %f\n'
          'ROUGE1-F: %f\nROUGE1-P: %f\nROUGE1-R: %f\n'
          'ROUGE2-F: %f\nROUGE2-P: %f\nROUGE2-R: %f\n'
          'ROUGEL-F: %f\nROUGEL-P: %f\nROUGEL-R: %f'
          % (bl,
             x['rouge_1/f_score'], x['rouge_1/p_score'], x['rouge_1/r_score'],
             x['rouge_2/f_score'], x['rouge_2/p_score'], x['rouge_2/r_score'],
             x['rouge_l/f_score'], x['rouge_l/p_score'], x['rouge_l/r_score']))
def choosebest_seq(sentences):
    # greedy sequential selection: repeatedly add the sentence that gives the
    # highest ROUGE score for the summary built so far
    best = []
    count = 0
    while True:
        mscore = -1
        msen = None
        for sen in sentences:
            if sen in best:
                continue
            with open(source_sen_file, "w") as w:
                w.write("\n".join(best + [sen]))
            rouge(MAX_WORDS, config_file, rouge_out)
            score = parse_rouge(rouge_out, ver)
            if score > mscore:
                mscore = score
                msen = sen
        if msen is None:
            break
        best.append(msen)
        count += len(msen.split(" "))
        if count > MAX_WORDS:
            break
    return best
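# A self-contained sketch (not from the original source) of the contrast
# between the two strategies above, with a stand-in score_fn in place of the
# external ROUGE round trip through source_sen_file / config_file / rouge_out:
# choosebest_ind ranks sentences by their individual scores, while
# choosebest_seq rescores the whole draft on every step, so it can prefer a
# sentence that complements what has already been chosen.
def greedy_select(sentences, score_fn, max_words=100):
    best, count = [], 0
    while True:
        scored = [(score_fn("\n".join(best + [s])), s)
                  for s in sentences if s not in best]
        if not scored:
            break
        _, msen = max(scored)  # sentence whose addition scores highest
        best.append(msen)
        count += len(msen.split(" "))
        if count > max_words:
            break
    return best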
def reward_function(self, reference, summary, measure='rouge_l/f_score'):
    """Calculate the reward between the reference and summary.

    Args:
      reference: A list of ids representing the ground-truth data
      summary: A list of ids representing the model generated data
      measure: The metric to use; a ROUGE score key, otherwise sentence BLEU

    Returns:
      A single value representing the evaluation value for reference and summary
    """
    if 'rouge' in measure:
        return rouge([summary], [reference])[measure]
    else:
        return sentence_bleu([reference.split()], summary.split(),
                             weights=(0.25, 0.25, 0.25, 0.25))
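# Hedged usage sketch (not from the original source): in self-critical
# sequence training, a reward function like this is typically called twice
# per example, once on a sampled decode and once on a greedy baseline decode;
# the difference scales the policy-gradient loss. agent, reference_text,
# sampled_summary, and greedy_summary are all assumed names.
sampled_reward = agent.reward_function(reference_text, sampled_summary)
baseline_reward = agent.reward_function(reference_text, greedy_summary)
advantage = sampled_reward - baseline_reward  # positive => reinforce the sample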
def _rouge(ref_file, summarization_file, subword_option=None):
    """Compute ROUGE scores and handling BPE."""
    references = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(ref_file, "rb")) as fh:
        for line in fh:
            references.append(_clean(line, subword_option))

    hypotheses = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(summarization_file, "rb")) as fh:
        for line in fh:
            hypotheses.append(_clean(line, subword_option=None))

    rouge_score_map = rouge.rouge(hypotheses, references)
    return 100 * rouge_score_map["rouge_l/f_score"]
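# Hedged usage sketch (file names assumed): _rouge expects one summary per
# line in each file, references and hypotheses aligned line by line, and
# returns the corpus-level ROUGE-L F-score scaled to 0-100. Passing
# subword_option="bpe" undoes BPE segmentation in the references.
score = _rouge("data/test.ref.txt", "out/test.hyp.txt", subword_option="bpe")
print("ROUGE-L F1: %.2f" % score)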
def rouge_score(references, generated):
    """Both arguments are lists of strings; returns ROUGE scores scaled to 0-100."""
    score = rouge(generated, references)
    rouge_s = {k: (v * 100) for (k, v) in score.items()}
    # keys in the returned dict:
    #   rouge_1/f_score, rouge_1/r_score, rouge_1/p_score,
    #   rouge_2/f_score, rouge_2/r_score, rouge_2/p_score,
    #   rouge_l/f_score, rouge_l/r_score, rouge_l/p_score
    return rouge_s
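# Hedged toy example (data invented for illustration): the returned dict uses
# the 'rouge_*/{f,p,r}_score' keys listed above, scaled to 0-100.
refs = ["the cat sat on the mat"]
hyps = ["the cat was on the mat"]
print("ROUGE-L F1: %.2f" % rouge_score(refs, hyps)["rouge_l/f_score"])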
def test_scratch(xtt, ytt, int_to_vocab, vocab_to_int, encoder_model, decoder_model, max_sl, max_rl):
    st = time.time()
    predictions = []
    real_og = []
    pred_op = []
    c = 0
    b = 50
    for i in range(len(xtt)):
        # review
        review = seq_to_text(xtt[i], int_to_vocab).replace("<PAD>", '')
        # original summary
        og_summary = seq_to_summary(ytt[i], vocab_to_int, int_to_vocab).replace("<PAD>", '')
        real_og.append(str(og_summary))
        # predicted summary
        predict_summary = decode_sequence(xtt[i].reshape(1, max_rl), encoder_model, decoder_model,
                                          vocab_to_int, int_to_vocab, max_sl).replace("<PAD>", '')
        pred_op.append(str(predict_summary))
        # collect lines for the review_og_pred text file
        predictions.append("review:" + review + "\t" + "original:" + og_summary
                           + "\t" + "predicted:" + predict_summary + "\n")
        # print only occasionally (doubling the interval each time): Colab
        # truncates output at ~5000 lines, so the full output goes to the file
        if c > b:
            print("Review: {}".format(review))
            print("Original Summary: {}".format(og_summary))
            print("Predicted Summary: {}".format(predict_summary))
            b += b
        c += 1
    print("total time to complete {}".format(time.time() - st))
    with open("/content/drive/MyDrive/LSTMscore.txt", "w") as out_file:
        out_file.writelines(predictions)
    bleu_result = compute_bleu(real_og, pred_op, max_order=4, smooth=False)
    # corpus_bleu expects tokenized references/hypotheses
    bscore = nltk.translate.bleu_score.corpus_bleu([[r.split()] for r in real_og],
                                                   [p.split() for p in pred_op])
    rougen = rouge_n(pred_op, real_og, n=2)
    ro = rouge(pred_op, real_og)
    print("bleu, precisions, bp, ratio, translation_length, reference_length", bleu_result)
    print("bleu score", bscore)
    print("rouge2", rougen)
    print("rouge", ro)
def testT5(model, tokenizer, test_loader):
    # initialize the empty lists
    predictions = []
    real_og = []
    pred_op = []
    c = 0
    b = 1000
    for i, (input_ids, attention_mask, y) in enumerate(test_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        y = y.to(device)
        # generate summaries, then store the real and predicted summaries
        # in lists and write them to a text file
        summaries = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=10)
        pred = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                for g in summaries]
        real = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                for g in y]
        # print only occasionally (doubling the interval each time): Colab
        # truncates output at ~5000 lines, so the full output goes to the file
        for pred_sent, real_sent in zip(pred, real):
            if c > b:
                print("Original: {}".format(real_sent))
                print("Predicted: {}".format(pred_sent))
                print("\n")
                b += b
            real_og.append(real_sent)
            pred_op.append(pred_sent)
            predictions.append(str("pred sentence: " + pred_sent
                                   + "\t\t real sentence: " + real_sent + "\n"))
            c += 1
    with open("/content/drive/MyDrive/TFIVE.txt", "w") as file1:
        file1.writelines(predictions)
    # calculate scores
    bleu_result = compute_bleu(real_og, pred_op, max_order=4, smooth=False)
    # corpus_bleu expects tokenized references/hypotheses
    bscore = nltk.translate.bleu_score.corpus_bleu([[r.split()] for r in real_og],
                                                   [p.split() for p in pred_op])
    rougen = rouge_n(pred_op, real_og, n=2)
    ro = rouge(pred_op, real_og)
    print("bleu, precisions, bp, ratio, translation_length, reference_length", bleu_result)
    print("bleu score", bscore)
    print("rouge2", rougen)
    print("rouge", ro)
def get_metrics(f1, f2):
    ref = []
    decoded = []
    count = 0
    for i, j in zip(sorted(glob.glob(f1)), sorted(glob.glob(f2))):
        ref_tex = ''
        dec_tex = ''
        for k in open(i).readlines():
            dec_tex = dec_tex + k.strip()
        for l in open(j).readlines():
            ref_tex = ref_tex + l.strip()
        ref.append(ref_tex)
        decoded.append(dec_tex)
        count = count + 1
    bl = bleu.moses_multi_bleu(decoded, ref)
    x = rouge.rouge(decoded, ref)
    s = "\t%.2f\t%.2f\t%.2f\t%.2f" % (bl,
                                      x['rouge_1/f_score'] * 100,
                                      x['rouge_2/f_score'] * 100,
                                      x['rouge_l/f_score'] * 100)
    print(count)
    return s
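# Hedged usage sketch: the two arguments are glob patterns over aligned,
# identically sorted decoded/reference files (the paths below are assumed
# for illustration, not taken from the original source).
print(get_metrics("log/decoded/*_decoded.txt", "log/reference/*_reference.txt"))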
def view_lstm():
    with open("/content/drive/MyDrive/LSTMscore.txt", "r") as f:
        text = f.readlines()
    text = pd.DataFrame(text, columns=["value"])
    text = text["value"].str.split("\t", expand=True)
    text.columns = ["value", "original", "predicted"]
    text["original"] = text["original"].str.split(":").str[1]
    text["predicted"] = text["predicted"].str.split(":").str[1]
    text["predicted"] = text["predicted"].replace('\n', '', regex=True)
    bleu_result = compute_bleu(text["original"], text["predicted"], max_order=4, smooth=False)
    # corpus_bleu expects tokenized references/hypotheses
    bscore = nltk.translate.bleu_score.corpus_bleu([[r.split()] for r in text["original"]],
                                                   [p.split() for p in text["predicted"]])
    rougen = rouge_n(text["predicted"], text["original"], n=2)
    ro = rouge(text["predicted"], text["original"])
    print("bleu, precisions, bp, ratio, translation_length, reference_length", bleu_result)
    print("bleu score", bscore)
    print("rouge2", rougen)
    print("rouge", ro)
    return text
def get_metrics(f1, f2):
    ref = []
    decoded = []
    count = 0
    print(f1)
    print(f2)
    for i, j in zip(sorted(glob.glob(f1)), sorted(glob.glob(f2))):
        ref_tex = ''
        dec_tex = ''
        for k in open(i).readlines():
            dec_tex = dec_tex + k.strip()
        for l in open(j).readlines():
            ref_tex = ref_tex + l.strip()
        ref.append(ref_tex)
        decoded.append(dec_tex)
        count = count + 1
    print(len(decoded))
    print(len(ref))
    x = rouge.rouge(decoded, ref)  # computed for inspection; not returned
    bl = bleu.moses_multi_bleu(decoded, ref)  # replace by pycoco bleu
    return 0, 0, 0, bl
def view_t5_op():
    # get the final cleaned data
    df = pd.read_csv('/content/drive/MyDrive/product_reviews.csv')[:147799]
    print("The length of dataset is ", len(df))
    # set the thresholds
    threshold = 20
    max_rl = 80  # maximum review length
    max_sl = 10  # maximum summary length
    # truncate reviewText to the maximum review length
    df['reviewText'] = df['reviewText'].str.slice(0, max_rl)
    # truncate summary to the maximum summary length
    df['summary'] = df['summary'].str.slice(0, max_sl)
    with open("/content/drive/MyDrive/TFIVE.txt", "r") as f:
        text = f.readlines()
    text = pd.DataFrame(text, columns=["value"])
    text = text["value"].str.split("\t", expand=True)
    text.columns = ["predicted", "value", "original"]
    text.drop(columns=["value"], inplace=True)
    text["predicted"] = text["predicted"].str.split(":").str[1]
    text["original"] = text["original"].str.split(":").str[1]
    text["original"] = text["original"].replace('\n', '', regex=True)
    bleu_result = compute_bleu(text["original"], text["predicted"], max_order=4, smooth=False)
    # corpus_bleu expects tokenized references/hypotheses
    bscore = nltk.translate.bleu_score.corpus_bleu([[r.split()] for r in text["original"]],
                                                   [p.split() for p in text["predicted"]])
    rougen = rouge_n(text["predicted"], text["original"], n=2)
    ro = rouge(text["predicted"], text["original"])
    print("bleu, precisions, bp, ratio, translation_length, reference_length", bleu_result)
    print("bleu score", bscore)
    print("rouge2", rougen)
    print("rouge", ro)
    return df, text
    short_summary['short'] = short_summary['short'].apply(lambda x: data_preprocessing(x))
    actual_news['long'] = actual_news['long'].apply(lambda x: data_preprocessing(x))
    human_summaries = short_summary['short'].astype(str).values.tolist()
    complete_news = actual_news['long'].astype(str).values.tolist()
    return human_summaries, complete_news


if __name__ == '__main__':
    # Read the excel file
    human_summaries, complete_news = read_excel()
    # This list will hold the generated summaries
    output_summaries = []
    for news in complete_news[:10]:
        text, summary = summarization(news, 'english', 1)
        output_summaries.append(summary[0].text)
    # Evaluate the first N summaries with ROUGE
    output_summaries = output_summaries[:10]
    human_summaries = human_summaries[:10]
    # Run the ROUGE evaluation metric
    print(rouge(output_summaries, human_summaries))
target = []
for batch_idx_list in test_batch_sampler:
    user_list, item_list, review_input_list, _, real_review_list = \
        test_dataset.get_batch(batch_idx_list)
    sample_idx_list, _ = model._sample_text_by_top_one(user_list, item_list, review_input_list)
    #convertWord(sample_idx_list, word_dict)
    ref = tensorToScalar(real_review_list).tolist()
    try:
        bleu_score = compute_bleu([sample_idx_list], [ref])
        bleu_list_1.append(bleu_score[1])
        bleu_list_2.append(bleu_score[2])
        bleu_list_3.append(bleu_score[3])
        bleu_list_4.append(bleu_score[4])
        rouge_score = rouge([sample_idx_list], ref)
        rouge_1_list.append(rouge_score[0])
        rouge_2_list.append(rouge_score[1])
        rouge_L_list.append(rouge_score[2])
    except Exception:
        pass
    count += 1
    if count % 50 == 0:
        import sys
        sys.exit(0)  # debugging cutoff: stop after the first 50 samples
t1 = time()
print('Generating %d lines, test samples cost:%.4fs' % (count, (t1 - t0)))
print('bleu_1:%.4f' % np.mean(bleu_list_1))
print('bleu_2:%.4f' % np.mean(bleu_list_2))
content_file = "/tmp/rouge-model%s.txt" % d
config_file = "/tmp/rouge-config%s.txt" % d
if d:
    content = []
    count = 0
    with open(path.join(dataset, d, "content.txt.nrm")) as f:
        lines = f.readlines()
    for line in lines:
        line = line.strip()
        n = len(line.split(" "))
        if n > MIN_WORDS:
            content.append(line)
            count += n
        if count > MAX_WORDS:
            break
    with open(content_file, "w") as w:
        w.write("\n".join(content))
    for s in listdir(path.join(dataset, d, "sources")):
        if s.endswith(".nrm"):
            with open(config_file, "w") as w:
                w.write("%s %s" % (path.join(dataset, d, "sources", s), content_file))
            rouge(MAX_WORDS, config_file,
                  path.join(dataset, d, "sources", ".".join(s.split(".")[:-1]) + ".rge"))
    os.system("rm %s" % content_file)
    os.system("rm %s" % config_file)
def valid_model(in_path, da_path, sum_path, sess):
    # returns accuracy for dialogue act and ROUGE-1/2/3/L for the summary;
    # some useful intermediates are also calculated:
    # da_outputs / correct_das hold predicted / ground-truth dialogue acts
    rouge_1 = []
    rouge_2 = []
    rouge_3 = []
    rouge_L = []
    da_outputs = []
    correct_das = []
    data_processor_valid = DataProcessor(in_path, da_path, sum_path, in_vocab, da_vocab)
    while True:
        # get a batch of data
        in_data, da_data, da_weight, length, sums, sum_weight, sum_lengths, in_seq, da_seq, sum_seq = \
            data_processor_valid.get_batch(batch_size)
        feed_dict = {input_data.name: in_data,
                     sequence_length.name: length,
                     sum_length.name: sum_lengths}
        if data_processor_valid.end != 1 or in_data:
            ret = sess.run(inference_outputs, feed_dict)
            # summary part
            pred_sums = []
            correct_sums = []
            for batch in ret[1]:
                pred_sums.append([np.argmax(time_i) for time_i in batch])
            for i in sums:
                correct_sums.append(i.tolist())
            for pred, corr in zip(pred_sums, correct_sums):
                rouge_score_map = rouge.rouge(pred, corr)
                rouge_1.append(100 * rouge_score_map['rouge_1/f_score'])
                rouge_2.append(100 * rouge_score_map['rouge_2/f_score'])
                rouge_3.append(100 * rouge_score_map['rouge_3/f_score'])
                rouge_L.append(100 * rouge_score_map['rouge_l/f_score'])
            # dialogue act part
            pred_das = ret[0].reshape((da_data.shape[0], da_data.shape[1], -1))
            for p, t, i, l in zip(pred_das, da_data, in_data, length):
                p = np.argmax(p, 1)
                tmp_pred = []
                tmp_correct = []
                for j in range(l):
                    tmp_pred.append(da_vocab['rev'][p[j]])
                    tmp_correct.append(da_vocab['rev'][t[j]])
                da_outputs.append(tmp_pred)
                correct_das.append(tmp_correct)
        if data_processor_valid.end == 1:
            break
    precision = computeAccuracy(correct_das, da_outputs)
    logging.info('da precision: ' + str(precision))
    logging.info('sum rouge1: ' + str(np.mean(rouge_1)))
    logging.info('sum rouge2: ' + str(np.mean(rouge_2)))
    logging.info('sum rouge3: ' + str(np.mean(rouge_3)))
    logging.info('sum rougeL: ' + str(np.mean(rouge_L)))
    data_processor_valid.close()
    return np.mean(rouge_1), np.mean(rouge_2), np.mean(rouge_3), np.mean(rouge_L), precision
def evaluate(test_dataset, test_batch_sampler, model, review_aspect_mask):
    model.eval()
    bleu_list_1, bleu_list_2, bleu_list_3, bleu_list_4 = [], [], [], []
    rouge_1_list, rouge_2_list, rouge_L_list = [], [], []
    hyp_ref_list = []
    for batch_idx_list in test_batch_sampler:
        user, item, review_input, summary, real_review = test_dataset.get_batch(batch_idx_list)
        sample_idx_list = model._sample_text_by_top_one(user, item, review_input, review_aspect_mask)
        for record_idx, hyp in enumerate(sample_idx_list):
            # clip the hypothesis at the first run of PAD tokens
            hyp = tensorToScalar(hyp).tolist()
            for clip_idx, word in enumerate(hyp):
                if word == PAD:
                    # if the current word is the last word of hyp
                    if clip_idx + 1 == len(hyp):
                        clip_idx = clip_idx - 1
                        break
                    # if the next word is also PAD
                    elif hyp[clip_idx + 1] == PAD:
                        clip_idx = clip_idx - 1
                        break
            hyp = hyp[:clip_idx + 1]
            # clip the reference the same way
            ref = tensorToScalar(real_review[record_idx]).tolist()
            for clip_idx, word in enumerate(ref):
                if word == PAD:
                    if clip_idx + 1 == len(ref):
                        clip_idx = clip_idx - 1
                        break
                    elif ref[clip_idx + 1] == PAD:
                        clip_idx = clip_idx - 1
                        break
            ref = ref[:clip_idx + 1]
            if len(ref) != 0:
                hyp_ref_list.append([hyp, [ref]])
    for record_idx, [hyp, ref] in enumerate(hyp_ref_list):
        try:
            bleu_score = compute_bleu([hyp], [ref])
            bleu_list_1.append(bleu_score[1])
            bleu_list_2.append(bleu_score[2])
            bleu_list_3.append(bleu_score[3])
            bleu_list_4.append(bleu_score[4])
            rouge_score = rouge([hyp], ref)
            rouge_1_list.append(rouge_score[0])
            rouge_2_list.append(rouge_score[1])
            rouge_L_list.append(rouge_score[2])
        except Exception:
            pass
    print('bleu_1:%.4f' % np.mean(bleu_list_1))
    print('bleu_2:%.4f' % np.mean(bleu_list_2))
    print('bleu_3:%.4f' % np.mean(bleu_list_3))
    print('bleu_4:%.4f' % np.mean(bleu_list_4))
    print('rouge_1_f:%.4f' % np.mean(rouge_1_list))
    print('rouge_2_f:%.4f' % np.mean(rouge_2_list))
    print('rouge_L_f:%.4f' % np.mean(rouge_L_list))
    return np.mean(bleu_list_4), np.mean(rouge_L_list)
            max_score = score
            best = s
    if best:
        base = ".".join(best.split(".")[:-1])
        s = base + ".best" + ver
        text_path = os.path.join(sources, base + ".nrm")
        cont_path = os.path.join(dataset, dp, "content.txt.nrm")
        if os.path.exists(os.path.join(sources, s)) and os.path.exists(text_path):
            if rge.parse_rouge(os.path.join(sources, base + ".rge"), 2) < min_rge:
                continue
            can_text = candidate(text_path)
            ref_text = "\n".join([x for x in utils.fileaslist(cont_path)
                                  if len(x.split(" ")) > 3][:FIRST_N_LINES])
            can_path = "/tmp/mds/%s.can.txt" % base
            ref_path = "/tmp/mds/%s.ref.txt" % base
            utils.write2file(can_text, can_path)
            utils.write2file(ref_text, ref_path)
            eval_writer.write("%s %s\n" % (can_path, ref_path))
eval_writer.close()
print("created the evaluation file, running rouge...")
os.chdir(rouge_dir)
rge.rouge(1000, eval_path, eval_out)
print("done.")
import sys
import glob

import pandas as pd

import rouge
import bleu

f1 = sys.argv[1]  # decoded
f2 = sys.argv[2]  # reference

ref = []
decoded = []
for i, j in zip(sorted(glob.glob(f1 + '*.txt')), sorted(glob.glob(f2 + '*.txt'))):
    ref_tex = ''
    dec_tex = ''
    for k in open(i).readlines():
        dec_tex = dec_tex + k.strip()
    for l in open(j).readlines():
        ref_tex = ref_tex + l.strip()
    ref.append(ref_tex)
    decoded.append(dec_tex)

data = {'decoded': decoded, 'reference': ref}
df = pd.DataFrame(data)
df.to_csv('analysis.csv', index=False)

bl = bleu.moses_multi_bleu(decoded, ref)
x = rouge.rouge(decoded, ref)
print('%.2f\t%.2f\t%.2f\t%.2f' % (bl,
                                  x['rouge_1/f_score'] * 100,
                                  x['rouge_2/f_score'] * 100,
                                  x['rouge_l/f_score'] * 100))
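# Hedged usage sketch (script name assumed): the script takes two path
# prefixes and globs '*.txt' under each, so it would be invoked roughly as
#   python compute_metrics.py log/decoded/ log/reference/
# printing BLEU and ROUGE-1/2/L F1 (x100) as one tab-separated line, and
# writing the paired texts to analysis.csv for inspection.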
def reward_function(self, reference, summary, measure='rouge_l/f_score'):
    if 'rouge' in measure:
        return rouge([summary], [reference])[measure]
    else:
        return sentence_bleu([reference.split()], summary.split(),
                             weights=(0.25, 0.25, 0.25, 0.25))
            + 'Bleu_2: ' + str(bleu_2) + '\t' \
            + 'Bleu_3: ' + str(bleu_3) + '\t' \
            + 'Bleu_4: ' + str(bleu_4)
print(log_str)
if args.save:
    # save evaluation results in a log file
    with open(os.path.join(args.save_path, 'log.txt'), 'a') as f:
        f.write(log_str + '\n')

if args.rouge:
    reference_corpus = [" ".join(reference) for reference in reference_corpus]
    translation_corpus = [" ".join(hypothesis) for hypothesis in translation_corpus]
    score = rouge(translation_corpus, reference_corpus)
    print(score["rouge_l/f_score"])
    log_str = 'Rouge Evaluation: ' + '\t' + str(score["rouge_l/f_score"])
    print(log_str)
    if args.save:
        # save evaluation results in a log file
        with open(os.path.join(args.save_path, 'log.txt'), 'a') as f:
            f.write(log_str + '\n')
def main(_):
    vocab = load_vocabulary(FLAGS.data_dir)
    if FLAGS.generating:
        data_reader = DataReader(FLAGS.data_dir, n_reviews=5, generating=True)
    else:
        data_reader = DataReader(FLAGS.data_dir)

    model = Model(total_users=data_reader.total_users,
                  total_items=data_reader.total_items,
                  global_rating=data_reader.global_rating,
                  num_factors=FLAGS.num_factors,
                  img_dims=[196, 512],
                  vocab_size=len(vocab),
                  word_dim=FLAGS.word_dim,
                  lstm_dim=FLAGS.lstm_dim,
                  max_length=FLAGS.max_length,
                  dropout_rate=FLAGS.dropout_rate)

    saver = tf.compat.v1.train.Saver(max_to_keep=10)
    log_file = open('log.txt', 'w')
    test_step = 0

    config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        saver.restore(sess, FLAGS.ckpt_dir)
        print('Model successfully restored')

        # Testing
        review_gen_corpus = defaultdict(list)
        review_ref_corpus = defaultdict(list)
        photo_bleu_scores = defaultdict(list)
        photo_rouge_scores = defaultdict(list)
        review_bleu_scores = defaultdict(list)
        review_rouge_scores = defaultdict(list)

        sess.run(model.init_metrics)
        for users, items, ratings in data_reader.read_real_test_set(FLAGS.batch_size, rating_only=True):
            test_step += 1
            fd = model.feed_dict(users, items, ratings)
            sess.run(model.update_metrics, feed_dict=fd)

            review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(
                users, items, ratings, data_reader.real_test_review)
            img_idx = [data_reader.real_test_id2idx[photo_id] for photo_id in photo_ids]
            images = data_reader.real_test_img_features[img_idx]

            fd = model.feed_dict(users=review_users, items=review_items, images=images)
            _reviews, _alphas, _betas = sess.run([model.sampled_reviews, model.alphas, model.betas],
                                                 feed_dict=fd)
            gen_reviews = decode_reviews(_reviews, vocab)
            ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews]

            if FLAGS.generating:
                for gen, ref in zip(gen_reviews, ref_reviews):
                    gen_str = "GENERATED:\n" + " ".join(gen)
                    ref_str = "REFERENCE:\n" + " ".join([" ".join(sentence) for sentence in ref]) + "\n"
                    log_info(log_file, gen_str)
                    log_info(log_file, ref_str)

            for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews):
                review_gen_corpus[(user, item)].append(gen)
                review_ref_corpus[(user, item)] += refs
                bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True)
                for order, score in bleu_scores.items():
                    photo_bleu_scores[order].append(score)
                rouge_scores = rouge([gen], refs)
                for metric, score in rouge_scores.items():
                    photo_rouge_scores[metric].append(score)

        _mae, _rmse = sess.run([model.mae, model.rmse])
        log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse))

        log_info(log_file, '\nReview generation results:')
        log_info(log_file, '- Photo level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
            np.array(photo_bleu_scores[1]).mean() * 100,
            np.array(photo_bleu_scores[2]).mean() * 100,
            np.array(photo_bleu_scores[3]).mean() * 100,
            np.array(photo_bleu_scores[4]).mean() * 100))

        for user_item, gen_reviews in review_gen_corpus.items():
            references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])]
            user_item_bleu_scores = defaultdict(list)
            for gen in gen_reviews:
                bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True)
                for order, score in bleu_scores.items():
                    user_item_bleu_scores[order].append(score)
            for order, scores in user_item_bleu_scores.items():
                review_bleu_scores[order].append(np.array(scores).mean())
            user_item_rouge_scores = defaultdict(list)
            for gen in gen_reviews:
                rouge_scores = rouge([gen], references)
                for metric, score in rouge_scores.items():
                    user_item_rouge_scores[metric].append(score)
            for metric, scores in user_item_rouge_scores.items():
                review_rouge_scores[metric].append(np.array(scores).mean())

        log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
            np.array(review_bleu_scores[1]).mean() * 100,
            np.array(review_bleu_scores[2]).mean() * 100,
            np.array(review_bleu_scores[3]).mean() * 100,
            np.array(review_bleu_scores[4]).mean() * 100))

        for metric in ['rouge_1', 'rouge_2', 'rouge_l']:
            log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
                metric,
                np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
                np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
                np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
            log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
                metric,
                np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
                np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
                np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
def reward_function(reference, summary, measure='rouge_l/f_score'):
    if 'rouge' in measure:
        return rouge([summary], [reference])[measure]
    else:
        return sentence_bleu([reference.split()], summary.split(),
                             weights=(0.25, 0.25, 0.25, 0.25))
def main(_):
    vocab = load_vocabulary(FLAGS.data_dir)
    data_reader = DataReader(FLAGS.data_dir)

    model = Model(total_users=data_reader.total_users,
                  total_items=data_reader.total_items,
                  global_rating=data_reader.global_rating,
                  num_factors=FLAGS.num_factors,
                  img_dims=[196, 512],
                  vocab_size=len(vocab),
                  word_dim=FLAGS.word_dim,
                  lstm_dim=FLAGS.lstm_dim,
                  max_length=FLAGS.max_length,
                  dropout_rate=FLAGS.dropout_rate)
    update_rating, update_review, global_step = train_fn(model)

    saver = tf.compat.v1.train.Saver(max_to_keep=10)
    log_file = open('log.txt', 'w')
    test_step = 0

    config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, FLAGS.num_epochs + 1):
            log_info(log_file, "\nEpoch: {}/{}".format(epoch, FLAGS.num_epochs))
            count = 0
            sum_rating_loss = 0
            sum_review_loss = 0

            # Training
            for users, items, ratings in data_reader.read_train_set(FLAGS.batch_size, rating_only=True):
                count += 1
                fd = model.feed_dict(users=users, items=items, ratings=ratings, is_training=True)
                _step, _, _rating_loss = sess.run([global_step, update_rating, model.rating_loss],
                                                  feed_dict=fd)
                sum_rating_loss += _rating_loss

                review_users, review_items, _, photo_ids, reviews = get_review_data(
                    users, items, ratings, data_reader.train_review)
                img_idx = [data_reader.train_id2idx[photo_id] for photo_id in photo_ids]
                images = data_reader.train_img_features[img_idx]

                fd = model.feed_dict(users=review_users, items=review_items, images=images,
                                     reviews=reviews, is_training=True)
                _, _review_loss = sess.run([update_review, model.review_loss], feed_dict=fd)
                sum_review_loss += _review_loss

                if _step % FLAGS.display_step == 0:
                    data_reader.iter.set_postfix(rating_loss=(sum_rating_loss / count),
                                                 review_loss=(sum_review_loss / count))

            # Testing
            review_gen_corpus = defaultdict(list)
            review_ref_corpus = defaultdict(list)
            photo_bleu_scores = defaultdict(list)
            photo_rouge_scores = defaultdict(list)
            review_bleu_scores = defaultdict(list)
            review_rouge_scores = defaultdict(list)

            sess.run(model.init_metrics)
            for users, items, ratings in data_reader.read_test_set(FLAGS.batch_size, rating_only=True):
                test_step += 1
                fd = model.feed_dict(users, items, ratings)
                sess.run(model.update_metrics, feed_dict=fd)

                review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(
                    users, items, ratings, data_reader.test_review)
                img_idx = [data_reader.test_id2idx[photo_id] for photo_id in photo_ids]
                images = data_reader.test_img_features[img_idx]

                fd = model.feed_dict(users=review_users, items=review_items, images=images)
                _reviews, _alphas, _betas = sess.run([model.sampled_reviews, model.alphas, model.betas],
                                                     feed_dict=fd)
                gen_reviews = decode_reviews(_reviews, vocab)
                ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews]

                for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews):
                    review_gen_corpus[(user, item)].append(gen)
                    review_ref_corpus[(user, item)] += refs
                    bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True)
                    for order, score in bleu_scores.items():
                        photo_bleu_scores[order].append(score)
                    rouge_scores = rouge([gen], refs)
                    for metric, score in rouge_scores.items():
                        photo_rouge_scores[metric].append(score)

            _mae, _rmse = sess.run([model.mae, model.rmse])
            log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse))

            log_info(log_file, '\nReview generation results:')
            log_info(log_file, '- Photo level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
                np.array(photo_bleu_scores[1]).mean() * 100,
                np.array(photo_bleu_scores[2]).mean() * 100,
                np.array(photo_bleu_scores[3]).mean() * 100,
                np.array(photo_bleu_scores[4]).mean() * 100))

            for user_item, gen_reviews in review_gen_corpus.items():
                references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])]
                user_item_bleu_scores = defaultdict(list)
                for gen in gen_reviews:
                    bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True)
                    for order, score in bleu_scores.items():
                        user_item_bleu_scores[order].append(score)
                for order, scores in user_item_bleu_scores.items():
                    review_bleu_scores[order].append(np.array(scores).mean())
                user_item_rouge_scores = defaultdict(list)
                for gen in gen_reviews:
                    rouge_scores = rouge([gen], references)
                    for metric, score in rouge_scores.items():
                        user_item_rouge_scores[metric].append(score)
                for metric, scores in user_item_rouge_scores.items():
                    review_rouge_scores[metric].append(np.array(scores).mean())

            log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
                np.array(review_bleu_scores[1]).mean() * 100,
                np.array(review_bleu_scores[2]).mean() * 100,
                np.array(review_bleu_scores[3]).mean() * 100,
                np.array(review_bleu_scores[4]).mean() * 100))

            for metric in ['rouge_1', 'rouge_2', 'rouge_l']:
                log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
                    metric,
                    np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
                    np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
                    np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
                log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
                    metric,
                    np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
                    np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
                    np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))

            save_path = saver.save(sess, f"tmp/model{epoch}.ckpt")
            log_info(log_file, '')