import pandas as pd
from nlgeval import NLGEval


class EvaluateNL:
    def __init__(self):
        self.eval = NLGEval(
            no_skipthoughts=True,
            no_glove=True,
            metrics_to_omit=[
                'EmbeddingAverageCosineSimilairty',
                'VectorExtremaCosineSimilarity',
                'GreedyMatchingScore',
            ])

    def compute(self, refs, hyps):
        data = []
        for i, ref in enumerate(refs):
            ref = ref.replace('\n', '')
            hyp = hyps[i].replace('\n', '')
            if not ref:
                continue
            scores = self.eval.compute_individual_metrics(ref=[ref], hyp=hyp)
            scores = sorted(scores.items())
            self._metrics = [s[0] for s in scores]
            # Format each score with six decimals and a comma as the decimal separator.
            data.append([
                ref, hyp,
                *[str(float('%0.6f' % s[1])).replace('.', ',') for s in scores]
            ])
        return pd.DataFrame(data, columns=['Reference', 'Hypothesis', *self._metrics])
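# A minimal usage sketch of EvaluateNL (hypothetical sentences and output path, not part
# of the original class): build parallel reference/hypothesis lists, compute the
# per-sentence score table, and write it out; sep=';' keeps the comma decimal separators intact.
refs = ["the cat sat on the mat", "a quick brown fox jumps over the lazy dog"]
hyps = ["the cat is on the mat", "a fast brown fox jumps over the dog"]

evaluator = EvaluateNL()
df = evaluator.compute(refs, hyps)
df.to_csv("nlg_scores.csv", index=False, sep=';')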
from nlgeval import NLGEval


def get_all_nlgeval_metrics(complex_sentence, simple_sentence):
    if 'NLGEVAL' not in globals():
        global NLGEVAL
        print('Loading NLGEval models...')
        # Change True to False if you want to use the skip-thought or GloVe metrics.
        NLGEVAL = NLGEval(no_skipthoughts=True, no_glove=True)
        print('Done.')
    return NLGEVAL.compute_individual_metrics([complex_sentence], simple_sentence)
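# A minimal usage sketch (hypothetical sentences, not from the original code): the first
# call loads the NLGEval models and caches them in the NLGEVAL global; later calls reuse it.
metrics = get_all_nlgeval_metrics("The committee postponed the decision until next week.",
                                  "The committee delayed the decision.")
print(metrics['Bleu_4'], metrics['ROUGE_L'])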
def test_compute_metrics_empty(self):
    n = NLGEval()

    # One of the refs is empty.
    scores = n.compute_individual_metrics(ref=["this is a test", ""],
                                          hyp="this is a good test")
    self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
    self.assertAlmostEqual(0.5108729, scores['Bleu_3'], places=5)
    self.assertAlmostEqual(0.0000903602, scores['Bleu_4'], places=5)
    self.assertAlmostEqual(0.44434387, scores['METEOR'], places=5)
    self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
    self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
    self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
    self.assertAlmostEqual(0.980075, scores['EmbeddingAverageCosineSimilarity'], places=5)
    self.assertEqual(scores['EmbeddingAverageCosineSimilarity'],
                     scores['EmbeddingAverageCosineSimilairty'])
    self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5)
    self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5)
    self.assertEqual(12, len(scores))

    # Empty hyp.
    scores = n.compute_individual_metrics(ref=["this is a good test"], hyp="")
    self.assertAlmostEqual(0, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(0, scores['Bleu_2'], places=5)
    self.assertAlmostEqual(0, scores['Bleu_3'], places=5)
    self.assertAlmostEqual(0, scores['Bleu_4'], places=5)
    self.assertAlmostEqual(0, scores['METEOR'], places=5)
    self.assertAlmostEqual(0, scores['ROUGE_L'], places=5)
    self.assertAlmostEqual(0, scores['CIDEr'], places=5)
    self.assertAlmostEqual(0, scores['SkipThoughtCS'], places=5)
    self.assertAlmostEqual(0, scores['EmbeddingAverageCosineSimilarity'], places=5)
    self.assertEqual(scores['EmbeddingAverageCosineSimilarity'],
                     scores['EmbeddingAverageCosineSimilairty'])
    self.assertAlmostEqual(0, scores['VectorExtremaCosineSimilarity'], places=5)
    self.assertAlmostEqual(0, scores['GreedyMatchingScore'], places=5)
    self.assertEqual(12, len(scores))
def test_compute_metrics_omit(self):
    n = NLGEval(metrics_to_omit=['Bleu_3', 'METEOR', 'EmbeddingAverageCosineSimilarity'])

    # Individual Metrics
    scores = n.compute_individual_metrics(ref=["this is a test", "this is also a test"],
                                          hyp="this is a good test")
    self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
    self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
    self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
    self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
    self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5)
    self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5)
    self.assertEqual(7, len(scores))
def main(args):
    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True)
    samples = {}
    with open(args.gen_file) as f:
        for line in tqdm(f):
            hypo, refs = line.rstrip().split('\t')
            metrics_dict = nlgeval.compute_individual_metrics(refs.split('*#'), hypo)
            samples[(hypo, refs)] = metrics_dict['Bleu_4']

    for hypo, refs in sorted(samples.keys(), key=samples.__getitem__)[:args.num_samples]:
        print('BLEU:', samples[(hypo, refs)])
        print('H:', hypo)
        for r in refs.split('*#'):
            print('R:', r)
        print('---')
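# Illustration of the input format main() expects for args.gen_file (the example line is
# made up): each line holds a hypothesis, a tab, and the references joined by '*#'.
line = "the cat is on the mat\tthe cat sat on the mat*#a cat lies on the mat"
hypo, refs = line.rstrip().split('\t')
print(hypo)              # "the cat is on the mat"
print(refs.split('*#'))  # ['the cat sat on the mat', 'a cat lies on the mat']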
def test_oo_api():
    with open("examples/hyp.txt") as f:
        hyp = f.readlines()
    hyp = [x.strip() for x in hyp]
    with open("examples/ref1.txt") as f:
        ref1 = f.readlines()
    ref1 = [x.strip() for x in ref1]
    with open("examples/ref2.txt") as f:
        ref2 = f.readlines()
    ref2 = [x.strip() for x in ref2]

    nlge = NLGEval()

    res = nlge.compute_individual_metrics([ref1[0]] + [ref2[0]], hyp[0])
    res = nlge.compute_individual_metrics([ref1[1]] + [ref2[1]], hyp[1])

    hyp_list = hyp
    ref_list = [ref1, ref2]
    res = nlge.compute_metrics(ref_list, hyp_list)
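# Note on the two APIs exercised above (a minimal, self-contained sketch with made-up
# sentences): compute_individual_metrics takes all references for a single hypothesis,
# while compute_metrics takes ref_list as a list of reference streams, where
# ref_list[i][j] is the i-th reference for the j-th hypothesis.
from nlgeval import NLGEval

nlge = NLGEval(no_skipthoughts=True, no_glove=True)
hyp_list = ["the cat is on the mat", "a fast brown fox"]
ref_stream_1 = ["the cat sat on the mat", "a quick brown fox"]
ref_stream_2 = ["there is a cat on the mat", "the brown fox is quick"]
corpus_scores = nlge.compute_metrics(ref_list=[ref_stream_1, ref_stream_2],
                                     hyp_list=hyp_list)
print(corpus_scores['Bleu_4'], corpus_scores['ROUGE_L'])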
def bleu(model_name, mode, num_samples):
    sampled_file_dst = []
    for i in range(num_samples):
        sampled_file_dst.append(f'logs/{model_name}/samples/sampled_{mode}_{i}.txt')
    source_file_dst = f'logs/{model_name}/samples/source_{mode}.txt'

    sampled = [
        list(np.loadtxt(sampled_file_dst[i], dtype='U', delimiter='\n'))
        for i in range(num_samples)
    ]
    source = list(np.loadtxt(source_file_dst, dtype='U', delimiter='\n'))

    nlgeval = NLGEval(metrics_to_omit=['METEOR', 'ROUGE_L', 'CIDEr', 'SkipThoughtCS'])

    best_bleu = []
    print(len(source), len(sampled), [len(sampled[i]) for i in range(len(sampled))])
    for i in range(len(source)):
        curr_best = ('', 0.0)
        for j in range(num_samples):
            score = nlgeval.compute_individual_metrics([source[i]], sampled[j][i])['Bleu_4']
            if score > curr_best[1]:
                curr_best = (sampled[j][i], score)
            if i % 100 == 0:
                print(f'Sentence pair {i}.{j}:')
                print('source  : ', source[i])
                print('sampled : ', curr_best[0])
                print('score   : ', curr_best[1])
                print('\n')
        best_bleu.append(curr_best[0])

    np.savetxt(f'logs/{model_name}/samples/best_bleu_{mode}',
               np.array(best_bleu), delimiter='\n', fmt='%s')
countGenUsed = 0
countRefUsed = 0
for id in srcSen:
    countRefUsed += 1
    if id not in gen:
        continue
    hypo = gen[id]
    r = ref[id]
    countGenUsed += len(hypo)
    if verbose:
        print('Computing sentence ' + str(countRefUsed))
    for h in hypo:
        metrics_dict = nlgeval.compute_individual_metrics(r, h)
        for m in metrics:
            results[m] += metrics_dict[m]
            resultsSolo[m].append(metrics_dict[m])

if verbose:
    print('Parsed questions: ' + str(countGenUsed))
if verbose:
    print('Writing files to ' + name + '...')
for m in metrics:
    with open(name + '/' + m, 'w') as f:
        f.write(m + '\t\t\t\n')
        f.write(str(results[m] / countGenUsed) + '\t\t\t\n')
        f.write(m + '\t\t\t\n')
def normalize(target_folder, remove_sim, too_sim_threshold, result_folder):
    data_dict = defaultdict(lambda: defaultdict(dict))
    output_dict = defaultdict(list)
    FILE_TEST = [i for i in os.listdir(target_folder) if ".jsonl" in i]
    # {'answers': ['a'], 'options': [['a','b']], 'questions': ['q1'], 'article': "", 'id': 'middle2572.txt'}
    print(FILE_TEST)

    print("====size====")
    # count total size of each prediction
    for FILE in FILE_TEST:
        with open(os.path.join(target_folder, FILE), 'r', encoding='utf8') as jsonlfile:
            for jlines in jsonlfile.readlines():
                jfile = json.loads(jlines)
                for q in jfile['questions']:
                    dict_id = jfile['article'].strip() + q.strip()
                    dict_id = dict_id.replace(" ", "").lower()
                    data_dict[dict_id][FILE] = jfile
    for _, testfiles in data_dict.items():
        if len(testfiles) == len(FILE_TEST):
            for fname, fcontent in testfiles.items():
                output_dict[fname].append(fcontent)
    print("Total", len(data_dict), len(output_dict))

    print("====similarity====")
    from nlgeval import NLGEval
    n = NLGEval(metrics_to_omit=[
        'METEOR', 'EmbeddingAverageCosineSimilairty', 'SkipThoughtCS',
        'VectorExtremaCosineSimilarity', 'GreedyMatchingScore', 'CIDEr'
    ])
    for task, datas in output_dict.items():
        overall_dict = defaultdict(list)
        toosim_dict = defaultdict(list)
        overall_result = dict()
        toosim_result = dict()
        for v in datas:
            if len(v['options'][0]) <= 4:
                # Example metrics_dict returned by compute_individual_metrics:
                # {'Bleu_1': 0.19999999996000023, 'Bleu_2': 7.071067810274489e-09,
                #  'Bleu_3': 2.5543647739782087e-11, 'Bleu_4': 1.699044244302013e-12,
                #  'METEOR': 0.0547945205479452, 'ROUGE_L': 0.26180257510729615,
                #  'CIDEr': 0.0, 'SkipThoughtCS': 0.41264296,
                #  'EmbeddingAverageCosineSimilairty': 0.804388,
                #  'VectorExtremaCosineSimilarity': 0.650115, 'GreedyMatchingScore': 0.655746}
                example_dict = defaultdict(list)
                if "two" in target_folder:
                    # answer with one option - check answer copying problem
                    opt = v['options'][0]
                    # [opt[1]] - ground truth/answer
                    metrics_dict = n.compute_individual_metrics([opt[1]], opt[0])
                    for mk, mv in metrics_dict.items():
                        if np.max(mv) > too_sim_threshold:
                            toosim_dict[mk].append(1)
                            if remove_sim:
                                if v in datas:
                                    del output_dict[task][datas.index(v)]
                                break
                        overall_dict[mk].append(mv)
                else:
                    for i in set(it.combinations(v['options'][0], 2)):
                        if len(i[0]) == 0 or len(i[1]) == 0:
                            continue
                        metrics_dict = n.compute_individual_metrics([i[0]], i[1])
                        for mk, mv in metrics_dict.items():
                            example_dict[mk].append(mv)
                    for mk, mv in example_dict.items():
                        if np.max(mv) > too_sim_threshold:
                            toosim_dict[mk].append(1)
                            if remove_sim:
                                if v in datas:
                                    del output_dict[task][datas.index(v)]
                                break
                        overall_dict[mk].append(np.mean(mv))
        for mk, mv in overall_dict.items():
            overall_result[mk] = np.mean(mv)
        for mk, mv in toosim_dict.items():
            toosim_result[mk] = np.sum(mv)
        print(task, overall_result, "\nToo Sim: ", toosim_result)

    data_dict = defaultdict(lambda: defaultdict(dict))
    for FILE, datas in output_dict.items():
        for data in datas:
            for q in data['questions']:
                dict_id = data['article'].strip() + q.strip()
                dict_id = dict_id.replace(" ", "").lower()
                data_dict[dict_id][FILE] = data
    normalized_dict = defaultdict(list)
    print("Total", len(data_dict))
    for _, testfiles in data_dict.items():
        if len(testfiles) == len(FILE_TEST):
            for fname, fcontent in testfiles.items():
                normalized_dict[fname].append(fcontent)

    print("====output====")
    for f, clist in normalized_dict.items():
        print("Normalized", f, len(clist))
        with jsonlines.open(os.path.join(result_folder, f), mode='w') as writer:
            writer.write_all(clist)
def main(args):
    good_hypos = load_texts(args.good_gen, False)
    bad_hypos = load_texts(args.bad_gen, False)
    refs = load_texts(args.ref_file, True)
    with open(args.dataset_file) as f:
        data_in = json.load(f)

    if args.webnlg:
        triples = []
        for entry in tqdm(data_in['entries']):
            for e in entry.values():
                triples.append(generate_triples_webnlg(e["modifiedtripleset"]))
    else:
        triples = []
        for article in tqdm(data_in):
            triples.append(generate_triples_agenda(article["relations"]))
            if not args.ignore_isolated:
                for entity in article["entities"]:
                    triples[-1].append((entity, "", entity))

    nlgeval = NLGEval(
        metrics_to_omit=['METEOR', 'ROUGE_L', 'CIDEr'],
        no_skipthoughts=True,
        no_glove=True)

    interesting_samples = []
    for i, (gh, bh, ref_list, tripset) in tqdm(list(
            enumerate(zip(good_hypos, bad_hypos, refs, triples)))):
        metrics_dict = nlgeval.compute_individual_metrics(ref_list, gh)
        good_bleu = metrics_dict['Bleu_4']
        metrics_dict = nlgeval.compute_individual_metrics(ref_list, bh)
        bad_bleu = metrics_dict['Bleu_4']

        graph = generate_graph(tripset)
        num_ccs = nx.number_connected_components(graph)
        num_nodes = len(graph)
        mean_cc_size = num_nodes / num_ccs
        diameters = []
        for subgraph in [graph.subgraph(cc).copy() for cc in nx.connected_components(graph)]:
            diameters.append(nx.diameter(subgraph))
        max_diameter = max(diameters)

        good_rep_rate = compute_repetition_rate(gh)[-1]
        bad_rep_rate = compute_repetition_rate(bh)[-1]

        if (good_bleu > bad_bleu) and \
                (args.min_mean_cc_size <= mean_cc_size < args.max_mean_cc_size) and \
                (args.min_max_diameter <= max_diameter < args.max_max_diameter) and \
                (good_rep_rate < bad_rep_rate):
            interesting_samples.append(
                (i, good_bleu, bad_bleu, good_rep_rate, bad_rep_rate,
                 mean_cc_size, max_diameter))

    for i, gb, bb, grep, brep, mean_cc, max_dia in sorted(
            interesting_samples, key=lambda x: x[3])[:args.num_samples]:
        print('Sample #', i)
        print('Good BLEU:', gb, 'Bad BLEU:', bb, 'Difference:', gb - bb)
        print('Good RepRate:', grep, 'Bad RepRate:', brep)
        print('Mean CC size:', mean_cc, 'Max diameter:', max_dia)
        print('Good Hypo:', good_hypos[i])
        print('Bad Hypo:', bad_hypos[i])
        for r in refs[i]:
            print('R:', r)
        print('---')
class Metrics_Calculator(object):
    def __init__(self, hparams, glove_comparer):
        super(Metrics_Calculator, self).__init__()
        self.nlg_eval = NLGEval(metrics_to_omit=[
            'EmbeddingAverageCosineSimilairty', 'EmbeddingAverageCosineSimilarity',
            'GreedyMatchingScore', 'SkipThoughtCS', 'VectorExtremaCosineSimilarity'
        ])
        self.list_dict_track = {"data": []}
        self.hparams = hparams
        self.glove_comparer = glove_comparer

    def build_json_results(self, context, generated_question_list,
                           target_question_list, row_mean_metrics):
        """
        Builds a JSON entry for each row, stored in self.list_dict_track to monitor the metrics.
        """
        new_info = {}
        new_info["context"] = context
        new_info["generated_question_list"] = generated_question_list
        new_info["target_question_list"] = target_question_list
        new_info["row_mean_metrics"] = row_mean_metrics
        return new_info

    def track_metrics_row(self, original_target, gen_target_options_list):
        """
        Computes the metrics for each question-context pair.
        """
        bleu_1_list = []
        bleu_2_list = []
        bleu_3_list = []
        bleu_4_list = []
        CIDEr_list = []
        ROUGE_L_list = []
        cossine_similarity_list = []
        for gen_target_option in gen_target_options_list:
            # ref: List[str], hyp: str
            metrics_dict = self.nlg_eval.compute_individual_metrics(
                ref=[original_target], hyp=gen_target_option)
            bleu_1_list.append(metrics_dict['Bleu_1'])
            bleu_2_list.append(metrics_dict['Bleu_2'])
            bleu_3_list.append(metrics_dict['Bleu_3'])
            bleu_4_list.append(metrics_dict['Bleu_4'])
            CIDEr_list.append(metrics_dict['CIDEr'])
            ROUGE_L_list.append(metrics_dict['ROUGE_L'])
            cs = self.glove_comparer.compare_sentences_with_cossine_similarity(
                original_target, gen_target_option)
            cossine_similarity_list.append(cs)

        row_metrics_dict = {"Bleu_1": np.mean(bleu_1_list),
                            "Bleu_2": np.mean(bleu_2_list),
                            "Bleu_3": np.mean(bleu_3_list),
                            "Bleu_4": np.mean(bleu_4_list),
                            "CIDEr": np.mean(CIDEr_list),
                            "ROUGE_L": np.mean(ROUGE_L_list),
                            "Glove_Cossine_Similarity": np.mean(cossine_similarity_list)}
        return row_metrics_dict

    def generate_sentences_and_track_metrics_batch(self, logits, original_targets_batch,
                                                   original_sources_batch, save_track_dict=False):
        """
        Computes the metrics for the whole batch.
        """
        batch_bleu_1_list = []
        batch_bleu_2_list = []
        batch_bleu_3_list = []
        batch_bleu_4_list = []
        batch_CIDEr_list = []
        batch_ROUGE_L_list = []
        batch_Glove_Cossine_Similarity_list = []
        # Iterate over the batch, one row at a time.
        for i, (original_target, original_source) in enumerate(
                zip(original_targets_batch, original_sources_batch)):
            relevant_logits = logits[i * self.hparams.num_gen_sentences:
                                     self.hparams.num_gen_sentences + i * self.hparams.num_gen_sentences]
            gen_target_options_list = [
                self.hparams.tokenizer.decode(l, skip_special_tokens=True)
                for l in relevant_logits
            ]
            row_metrics_dict = self.track_metrics_row(
                original_target=original_target,
                gen_target_options_list=gen_target_options_list)
            if save_track_dict:
                self.list_dict_track["data"].append(
                    self.build_json_results(context=original_source,
                                            generated_question_list=gen_target_options_list,
                                            target_question_list=original_target,
                                            row_mean_metrics=row_metrics_dict))
            batch_bleu_1_list.append(row_metrics_dict['Bleu_1'])
            batch_bleu_2_list.append(row_metrics_dict['Bleu_2'])
            batch_bleu_3_list.append(row_metrics_dict['Bleu_3'])
            batch_bleu_4_list.append(row_metrics_dict['Bleu_4'])
            batch_CIDEr_list.append(row_metrics_dict['CIDEr'])
            batch_ROUGE_L_list.append(row_metrics_dict['ROUGE_L'])
            batch_Glove_Cossine_Similarity_list.append(row_metrics_dict['Glove_Cossine_Similarity'])

        batch_metrics_dict = {
            "Batch_Bleu_1": np.mean(batch_bleu_1_list),
            "Batch_Bleu_2": np.mean(batch_bleu_2_list),
            "Batch_Bleu_3": np.mean(batch_bleu_3_list),
            "Batch_Bleu_4": np.mean(batch_bleu_4_list),
            "Batch_CIDEr": np.mean(batch_CIDEr_list),
            "Batch_ROUGE_L": np.mean(batch_ROUGE_L_list),
            "Batch_Glove_Cossine_Similarity": np.mean(batch_Glove_Cossine_Similarity_list)
        }
        return batch_metrics_dict
def gen_compare(cider_in, table_in, summary_ours_in, summary_with_unk, summary_gold_in, file_out):
    '''Generate a file for error-case comparison.'''
    with open(table_in) as f:
        table = f.readlines()
    with open(summary_ours_in) as f:
        res_ours = f.readlines()
    with open(summary_with_unk) as f:
        res_ours_unk = f.readlines()
    with open(summary_gold_in) as f:
        res_gold = f.readlines()
    with open(cider_in) as f:
        res_cider = json.load(f)
    score_cider = res_cider["CIDEr"]
    score_cider_d = res_cider["CIDErD"]
    print(len(score_cider_d))

    nlgeval = NLGEval()

    out = open(file_out, "w")
    i = 0
    num_write = 0
    avg_len_right = 0.0
    avg_len_wrong = 0.0
    for this_cider, this_box, this_ours, this_unk, this_gold in zip(
            score_cider_d, table, res_ours, res_ours_unk, res_gold):
        references = [this_gold.strip().split()]
        hypothesis = this_ours.strip().split()
        this_bleu = sentence_bleu(references, hypothesis)
        this_cider = float(this_cider)
        metrics_dict = nlgeval.compute_individual_metrics([this_gold.strip()], this_ours.strip())
        this_meteor = metrics_dict["METEOR"]
        i += 1
        # if this_bleu < 0.1:
        # if this_bleu > 0.1 and this_bleu < 0.2:
        # if this_bleu > 0.2 and this_bleu < 0.3:
        if this_cider < 1.0:
            avg_len_wrong += len(this_box.strip().split("\t"))
            num_write += 1
            out.write("########## Test " + str(i) + " ##########\n")
            for each_token in this_box.strip().split("\t"):
                out.write(each_token + "\n")
            out.write("########## Gold ##########\n")
            out.write(this_gold.strip() + "\n")
            out.write("########## Ours ##########\n")
            out.write(this_ours.strip() + "\n")
            out.write("########## Ours with unk ##########\n")
            out.write(this_unk.strip() + "\n")
            out.write("########## bleu ##########\n")
            out.write(str(this_bleu) + "\n")
            out.write("########## cider-d ##########\n")
            out.write(str(this_cider) + "\n")
            out.write("########## meteor ##########\n")
            out.write(str(this_meteor) + "\n")
            out.write("\n")
        else:
            avg_len_right += len(this_box.strip().split("\t"))
    out.close()
    print("All: ", i)
    print("Write: ", num_write)
    print("avg_len_wrong: ", float(avg_len_wrong) / num_write)
    print("avg_len_right: ", float(avg_len_right) / (i - num_write))
def finetune_model(num_epochs=20, k_fold=10, predict_test=True, calculate_scores=False):
    model = seq2seq(mode='finetune')
    saver = tf.train.Saver()
    tf.logging.set_verbosity(logging.INFO)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Read the data from the csv file.
    data = pd.read_csv(model.active_passive_csv_file, usecols=[2, 3])
    print(data.head())
    data_clean = data.applymap(clean_str)

    if calculate_scores is True:
        nlgeval = NLGEval(no_skipthoughts=True, no_glove=False)
        bleu_1_scores_top_beam, bleu_2_scores_top_beam, bleu_3_scores_top_beam, bleu_4_scores_top_beam = [], [], [], []
        meteor_scores_top_beam = []
        bleu_4_best_of_10_result_scores, meteor_best_of_10_result_scores = [], []
        rouge_scores_top_beam, rouge_scores_best_of_10_result_scores = [], []
        greedy_embedding_top_beam, greedy_embedding_best_of_10_result_scores = [], []
        ter_top_beam, ter_best_of_10_result_scores_with_best_bleu, ter_best_of_10_result_scores_with_best_meteor = [], [], []

    for i in range(k_fold):
        iter_no = 0
        train, test = train_test_split(data_clean, test_size=0.2)
        print('\nTrain head: ', train.head())
        print('\n\nTest head: ', test.head())
        next_batch = model.train_batchify_pandas(train)
        print('\n\nRetraining again for #{} validations----------------------------------\n\n'.format(i))

        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(
                os.path.dirname(os.path.join(model.model_dir, model.model_dir)))
            if ckpt and ckpt.model_checkpoint_path:
                print("Found checkpoints. Restoring variables..")
                saver.restore(sess, ckpt.model_checkpoint_path)
                print("Checkpoints restored...")

            for epoch in range(num_epochs):
                train_cost = 0
                train_op = model.get_train_op(epoch)
                print('Training op --- ', train_op)
                for _ in range(model.samples // model.batch_size):
                    iter_no = iter_no + 1
                    batch = next_batch()
                    feed = batch
                    feed[model.iter_no] = iter_no
                    v = sess.run([train_op, model.loss, model.global_step], batch)
                    train_cost += v[1]
                    print('Global Step: ', v[2], ', Step Loss: ', v[1])
                train_cost /= model.samples / model.batch_size
                print("Epoch", (epoch + 1), train_cost)

            print("Saving model at " + str(v[2]) + " iterations...")
            saver.save(sess, os.path.join(model.new_dir, 'model.ckpt'),
                       global_step=model.global_step)
            print("Model saved..")

            # Run on the test split and compute BLEU and the related scores.
            test_next_batch = model.train_batchify_pandas(test)
            hyp_top_beam_result, hyp_best_of_10_beam_result, ref = [], [], []
            hyp_best_of_10_meteor_result, hyp_best_of_10_rouge_result = [], []
            hyp_best_of_10_greedy_emb_result = []
            print("Predictions of test batch: \n")
            for _ in range(len(test) // model.batch_size):
                test_batch = test_next_batch()
                temp_ref = [
                    model.to_str(string).replace('</S>', '').replace('<S>', '').strip()
                    for string in test_batch[model.outputs]
                ]
                ref.extend(temp_ref)
                v = sess.run([model.global_step, model.predictions], test_batch)
                for i, pred in enumerate(v[1]['preds']):
                    max_bleu = 0.0
                    max_meteor = 0.0
                    max_rouge = 0.0
                    max_greedy_emb = 0.0
                    best_bleu_str = ''
                    best_meteor_str = ''
                    best_rouge_str = ''
                    best_greedy_emb_str = ''
                    for j in range(model.beam_width):
                        predicted = model.to_str(pred[:, j]).replace('</S>', '').replace('<S>', '').strip()
                        if j == 0:
                            hyp_top_beam_result.append(predicted)
                        metrics_dict = nlgeval.compute_individual_metrics([temp_ref[i]], predicted)
                        if metrics_dict['Bleu_4'] >= max_bleu:
                            max_bleu = metrics_dict['Bleu_4']
                            best_bleu_str = predicted
                        if metrics_dict['METEOR'] >= max_meteor:
                            max_meteor = metrics_dict['METEOR']
                            best_meteor_str = predicted
                        if metrics_dict['ROUGE_L'] >= max_rouge:
                            max_rouge = metrics_dict['ROUGE_L']
                            best_rouge_str = predicted
                        if metrics_dict['GreedyMatchingScore'] >= max_greedy_emb:
                            max_greedy_emb = metrics_dict['GreedyMatchingScore']
                            best_greedy_emb_str = predicted
                    hyp_best_of_10_beam_result.append(best_bleu_str)
                    hyp_best_of_10_meteor_result.append(best_meteor_str)
                    hyp_best_of_10_rouge_result.append(best_rouge_str)
                    hyp_best_of_10_greedy_emb_result.append(best_greedy_emb_str)

            print('Ref len: ', len(ref), ', Hyp len: ', len(hyp_top_beam_result))
            metrics_dict_top_beam_result = nlgeval.compute_metrics([ref], hyp_top_beam_result)
            metrics_dict_best_of_5_beam = nlgeval.compute_metrics([ref], hyp_best_of_10_beam_result)
            metrics_dict_best_of_5_meteor = nlgeval.compute_metrics([ref], hyp_best_of_10_meteor_result)
            metrics_dict_best_of_5_rouge = nlgeval.compute_metrics([ref], hyp_best_of_10_rouge_result)
            metrics_dict_best_of_5_greedy_emb = nlgeval.compute_metrics([ref], hyp_best_of_10_greedy_emb_result)
            ter_top = calculate_ter(ref, hyp_top_beam_result)
            ter_bleu = calculate_ter(ref, hyp_best_of_10_beam_result)
            ter_meteor = calculate_ter(ref, hyp_best_of_10_meteor_result)

            print("BLEU 1-4 : ",
                  metrics_dict_top_beam_result['Bleu_1'],
                  metrics_dict_top_beam_result['Bleu_2'],
                  metrics_dict_top_beam_result['Bleu_3'],
                  metrics_dict_top_beam_result['Bleu_4'])
            print("METEOR, ROUGE_L, GreedyEmbedding : ",
                  metrics_dict_top_beam_result['METEOR'],
                  metrics_dict_top_beam_result['ROUGE_L'],
                  metrics_dict_top_beam_result['GreedyMatchingScore'])
            print("CIDER, EmbeddingAverageCosineSimilarity, VectorExtremaCosineSimilarity : ",
                  metrics_dict_top_beam_result['CIDEr'],
                  metrics_dict_top_beam_result['EmbeddingAverageCosineSimilairty'],
                  metrics_dict_top_beam_result['VectorExtremaCosineSimilarity'])
            print("TER top beam, TER Bleu, TER METEOR", ter_top, ter_bleu, ter_meteor)

            # Top beam search results.
            bleu_1_scores_top_beam.append(metrics_dict_top_beam_result['Bleu_1'])
            bleu_2_scores_top_beam.append(metrics_dict_top_beam_result['Bleu_2'])
            bleu_3_scores_top_beam.append(metrics_dict_top_beam_result['Bleu_3'])
            bleu_4_scores_top_beam.append(metrics_dict_top_beam_result['Bleu_4'])
            meteor_scores_top_beam.append(metrics_dict_top_beam_result['METEOR'])
            rouge_scores_top_beam.append(metrics_dict_top_beam_result['ROUGE_L'])
            greedy_embedding_top_beam.append(metrics_dict_top_beam_result['GreedyMatchingScore'])
            ter_top_beam.append(ter_top)

            # Best-of-beam search results.
            ter_best_of_10_result_scores_with_best_bleu.append(ter_bleu)
            ter_best_of_10_result_scores_with_best_meteor.append(ter_meteor)
            bleu_4_best_of_10_result_scores.append(metrics_dict_best_of_5_beam['Bleu_4'])
            meteor_best_of_10_result_scores.append(metrics_dict_best_of_5_meteor['METEOR'])
            rouge_scores_best_of_10_result_scores.append(metrics_dict_best_of_5_rouge['ROUGE_L'])
            greedy_embedding_best_of_10_result_scores.append(metrics_dict_best_of_5_greedy_emb['GreedyMatchingScore'])

            print('Bleu 1 scores top: ', bleu_1_scores_top_beam)
            print('Bleu 2 scores top: ', bleu_2_scores_top_beam)
            print('Bleu 3 scores top: ', bleu_3_scores_top_beam)
            print('Bleu 4 scores top: ', bleu_4_scores_top_beam)
            print('METEOR scores top: ', meteor_scores_top_beam)
            print('ROUGE scores top: ', rouge_scores_top_beam)
            print('Greedy Embedding top: ', greedy_embedding_top_beam)
            print('Translation Edit Rate top: ', ter_top_beam)
            print('\n\nBest beam results:---------\n')
            print('Bleu 4 scores best beam : ', bleu_4_best_of_10_result_scores)
            print('METEOR scores best beam : ', meteor_best_of_10_result_scores)
            print('ROUGE scores best beam: ', rouge_scores_best_of_10_result_scores)
            print('Greedy Embedding best beam: ', greedy_embedding_best_of_10_result_scores)
            print('Translation Edit Rate Bleu: ', ter_best_of_10_result_scores_with_best_bleu)
            print('Translation Edit Rate METEOR: ', ter_best_of_10_result_scores_with_best_meteor)
print(len(data_dict_beam))
print(len(data_dict_ori))
print(len(data_dict_random))

n = NLGEval(metrics_to_omit=[
    'METEOR', 'EmbeddingAverageCosineSimilairty', 'SkipThoughtCS',
    'VectorExtremaCosineSimilarity', 'GreedyMatchingScore'
])
# n = NLGEval()

print("====similarity between gold and generated distractor====")
overall_dict_jaccard = defaultdict(list)
overall_result_jaccard = dict()
for k, v in data_dict_jaccard.items():
    example_dict = defaultdict(list)
    for i in v['options']:
        metrics_dict = n.compute_individual_metrics(v["gold_distractor"], i)
        for mk, mv in metrics_dict.items():
            example_dict[mk].append(mv)
    for mk, mv in example_dict.items():
        # if not 0 in mv and not any(n < 0 for n in mv):
        #     overall_dict_jaccard[mk].append(statistics.harmonic_mean(mv))
        # else:
        #     overall_dict_jaccard[mk].append(np.mean(mv))
        overall_dict_jaccard[mk].append(np.mean(mv))
for mk, mv in overall_dict_jaccard.items():
    overall_result_jaccard[mk] = np.mean(mv)
print("jaccard", overall_result_jaccard)

overall_dict_ori = defaultdict(list)
overall_result_ori = dict()
for k, v in data_dict_ori.items():
def evaluate(gen, oracle, output, no_score=False):
    if not no_score:
        n = NLGEval()
    print('Start evaluation...')
    total_samples = oracle.test_size
    with open(output, 'w') as fout:
        writer = csv.writer(fout, delimiter='\t')
        if not no_score:
            writer.writerow([
                'original (cond)', 'sample (pos)', 'generated (neg)', 'BLEU', 'METEOR'
            ])
            total_bleu = 0
            total_meteor = 0
        else:
            writer.writerow(['original (cond)', 'sample (pos)', 'generated (neg)'])

        i = 0
        end_of_dataset = False
        while not end_of_dataset:
            # Retrieve test sample from test set
            pos, pos_len, cond_id, end_of_dataset = oracle.sample(1, gpu=False)
            cond, cond_len = oracle.fetch_cond_samples(cond_id, gpu=False)
            pos, pos_len, cond, cond_len = pos[0], pos_len[0], cond[0], cond_len[0]

            # Generate paraphrase
            generated = gen.sample_until_end(cond, max_len=100)

            # Turn to string
            pos_str = tensor_to_sent(pos, oracle)
            cond_str = tensor_to_sent(cond, oracle)
            generated_str = tensor_to_sent(generated, oracle)

            # Calculate BLEU score
            if not no_score:
                if len(generated_str) > 0:
                    scores = n.compute_individual_metrics([cond_str, pos_str], generated_str)
                    bleu = scores['Bleu_2']
                    meteor = scores['METEOR']
                else:
                    bleu = 0
                    meteor = 0
                total_bleu += bleu
                total_meteor += meteor

            # Output to tsv file
            if not no_score:
                writer.writerow([cond_str, pos_str, generated_str, bleu, meteor])
            else:
                writer.writerow([cond_str, pos_str, generated_str])

            i += 1
            if i % int(total_samples / 5) == 0:
                # Print progress every 5%
                print('.', end='')
                sys.stdout.flush()

    if not no_score:
        avg_bleu = total_bleu / i
        avg_meteor = total_meteor / i
        print(f'Average BLEU score: {avg_bleu}\tAverage METEOR score: {avg_meteor}')
with open(hyp_file, "r") as f:
    hyp_dict = {
        line.strip().split(" ")[0]: " ".join(line.strip().split(" ")[1:])
        for line in f.readlines()
    }
keys = [k for k, v in hyp_dict.items()]
labels = [ref_dict[k] for k, _ in hyp_dict.items()]
decoded_preds = [v for k, v in hyp_dict.items()]

metric = load_metric("bertscore")
result_bert = metric.compute(
    predictions=decoded_preds,
    references=labels,
    lang="en",
)

nlg = NLGEval()  # loads the models
print("Key", "\t", "METEOR", "\t", "ROUGE-L")
for (key, ref, hyp) in zip(keys, labels, decoded_preds):
    metrics_dict = nlg.compute_individual_metrics([ref], hyp)
    print(key, "\t", metrics_dict["METEOR"], "\t", metrics_dict["ROUGE_L"])

refs = [[x] for x in labels]
metrics_dict = nlg.compute_metrics(ref_list=[labels], hyp_list=decoded_preds)

metric = load_metric("rouge")
result = metric.compute(predictions=decoded_preds, references=labels)
result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
print(f"RESULT {result['rouge1']} {result['rouge2']} {result['rougeL']} \
{metrics_dict['METEOR']*100.0} {100*np.mean(result_bert['precision'])}")
def test_compute_metrics_oo(self):
    # Create the object in the test so that it can be garbage collected once the test is done.
    n = NLGEval()

    # Individual Metrics
    scores = n.compute_individual_metrics(ref=["this is a test", "this is also a test"],
                                          hyp="this is a good test")
    self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
    self.assertAlmostEqual(0.5108729, scores['Bleu_3'], places=5)
    self.assertAlmostEqual(0.0000903602, scores['Bleu_4'], places=5)
    self.assertAlmostEqual(0.44434387, scores['METEOR'], places=5)
    self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
    self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
    self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
    self.assertAlmostEqual(0.980075, scores['EmbeddingAverageCosineSimilairty'], places=5)
    self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5)
    self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5)
    self.assertEqual(11, len(scores))

    scores = n.compute_metrics(
        ref_list=[
            [
                "this is one reference sentence for sentence1",
                "this is a reference sentence for sentence2 which was generated by your model"
            ],
            [
                "this is one more reference sentence for sentence1",
                "this is the second reference sentence for sentence2"
            ],
        ],
        hyp_list=[
            "this is the model generated sentence1 which seems good enough",
            "this is sentence2 which has been generated by your model"
        ])
    self.assertAlmostEqual(0.55, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(0.428174, scores['Bleu_2'], places=5)
    self.assertAlmostEqual(0.284043, scores['Bleu_3'], places=5)
    self.assertAlmostEqual(0.201143, scores['Bleu_4'], places=5)
    self.assertAlmostEqual(0.295797, scores['METEOR'], places=5)
    self.assertAlmostEqual(0.522104, scores['ROUGE_L'], places=5)
    self.assertAlmostEqual(1.242192, scores['CIDEr'], places=5)
    self.assertAlmostEqual(0.626149, scores['SkipThoughtCS'], places=5)
    self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilairty'], places=5)
    self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5)
    self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5)
    self.assertEqual(11, len(scores))

    # Non-ASCII tests.
    scores = n.compute_individual_metrics(ref=["Test en français.", "Le test en français."],
                                          hyp="Le test est en français.")
    self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
    self.assertAlmostEqual(0.0000051, scores['Bleu_3'], places=5)
    self.assertAlmostEqual(0, scores['Bleu_4'], places=5)
    self.assertAlmostEqual(0.48372379050300296, scores['METEOR'], places=5)
    self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
    self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
    self.assertAlmostEqual(0.9192341566085815, scores['SkipThoughtCS'], places=5)
    self.assertAlmostEqual(0.906562, scores['EmbeddingAverageCosineSimilairty'], places=5)
    self.assertAlmostEqual(0.815158, scores['VectorExtremaCosineSimilarity'], places=5)
    self.assertAlmostEqual(0.940959, scores['GreedyMatchingScore'], places=5)
    self.assertEqual(11, len(scores))

    scores = n.compute_individual_metrics(ref=["テスト"], hyp="テスト")
    self.assertAlmostEqual(0.99999999, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(1.0, scores['METEOR'], places=3)
    self.assertAlmostEqual(1.0, scores['ROUGE_L'], places=3)
    self.assertAlmostEqual(0.0, scores['CIDEr'], places=3)
    self.assertAlmostEqual(1.0, scores['SkipThoughtCS'], places=3)
    self.assertAlmostEqual(1.0, scores['GreedyMatchingScore'], places=3)
    self.assertEqual(11, len(scores))
def eval(eval_filename, vocab_filename, alias2scientific_filename):
    def remove_stopwords(sent, stop_word_set):
        items = sent.split()
        items = [ite for ite in items if ite not in stop_word_set]
        return " ".join(items)

    with open("data/stopwords.txt") as f:
        stopwords = f.read().strip().split()
    stopwords = set(stopwords)

    bleu_nlgeval = NLGEval(metrics_to_omit=[
        "METEOR", "CIDEr", "ROUGE_L", "SkipThoughtCS",
        "EmbeddingAverageCosineSimilairty", "VectorExtremaCosineSimilarity",
        "GreedyMatchingScore"
    ])
    rouge_eval = RougeEval()
    disease2x = pandas.read_csv(vocab_filename)
    disease2x = disease2x[disease2x["Is_know"] > 0]
    disease2x = dict(zip(list(disease2x["Word"]), list(disease2x["Is_know"])))
    distinct_eval = DistinctEval(grams=[1, 2])

    with open(eval_filename) as f:
        sessions = json.load(f)
    gths = [[episode["gth"] for episode in session["session"]] for session in sessions]
    hyps = [[episode["hyp"] for episode in session["session"]] for session in sessions]
    entity_gths = [[" ".join([i for i in x.split(" ") if i in disease2x]) for x in y]
                   for y in gths]
    entity_hyps = [[" ".join([i for i in x.split(" ") if i in disease2x]) for x in y]
                   for y in hyps]

    def flat(lists):
        tmp = []
        for items in lists:
            tmp += items
        return tmp

    gths = flat(gths)
    hyps = flat(hyps)
    entity_gths = flat(entity_gths)
    entity_hyps = flat(entity_hyps)
    gths = [remove_stopwords(gth, stopwords) for gth in gths]
    hyps = [remove_stopwords(hyp, stopwords) for hyp in hyps]

    ret_metrics = OrderedDict()
    ret_metric = OrderedDict()

    bleu_score_matrix = [
        bleu_nlgeval.compute_individual_metrics([gth], hyp)
        for gth, hyp in zip(gths, hyps)
    ]
    b2s = [b["Bleu_2"] for b in bleu_score_matrix]
    ret_metrics["B@2"] = b2s
    bleu_score = bleu_nlgeval.compute_metrics([gths], hyps)
    b2 = bleu_score["Bleu_2"]
    ret_metric["B@2"] = b2

    rouge1, rouge2, r1s, r2s = rouge_eval.rouge_score(hyps, gths, ret_matrix=True)
    ret_metrics["R@2"] = r2s
    ret_metric["R@2"] = rouge2

    dist_scores = distinct_eval.distinct_score(hyps)
    ret_metric["D@1"] = dist_scores[0]
    ret_metric["D@2"] = dist_scores[1]
    ret_metrics["D@1"] = float("nan")
    ret_metrics["D@2"] = float("nan")

    eps = 1e-24

    def compute_f1(p, r):
        return 2 * p * r / (p + r + eps)

    overlapped_entity = [[i for i in x.split() if i in y.split()]
                         for x, y in zip(entity_hyps, entity_gths)]
    overlapped_entity = [list(set(x)) for x in overlapped_entity]
    hyp_entity = [set(y.split()) for y in entity_hyps]
    gth_entity = [set(y.split()) for y in entity_gths]

    entity2prf = OrderedDict()
    for oe, he, ge in zip(overlapped_entity, hyp_entity, gth_entity):
        for e in oe:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            entity2prf[e]["TP"] += 1
        for e in he:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            if e not in oe:
                entity2prf[e]["FP"] += 1
        for e in ge:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            if e not in oe:
                entity2prf[e]["FN"] += 1

    counter = Counter()
    for gth in gth_entity:
        counter.update(gth)
    need_entity_ind = [x[0] for x in counter.most_common() if x[1] > 5]
    print("len(need_entity_ind) = {}".format(len(need_entity_ind)))

    ret_metrics["ma-P"] = [
        entity2prf[e]["TP"] / (entity2prf[e]["TP"] + entity2prf[e]["FP"] + eps)
        for e in need_entity_ind
    ]
    ret_metrics["ma-R"] = [
        entity2prf[e]["TP"] / (entity2prf[e]["TP"] + entity2prf[e]["FN"] + eps)
        for e in need_entity_ind
    ]
    ret_metrics["ma-F1"] = [
        compute_f1(p, r) for (p, r) in zip(ret_metrics["ma-P"], ret_metrics["ma-R"])
    ]
    ret_metric["ma-P"] = float(np.mean(ret_metrics["ma-P"]))
    ret_metric["ma-R"] = float(np.mean(ret_metrics["ma-R"]))
    ret_metric["ma-F1"] = compute_f1(ret_metric["ma-P"], ret_metric["ma-R"])

    mi_precision = [
        len(x) / (len(y) + 1e-14) for x, y in zip(
            overlapped_entity, [set(y.split()) for y in entity_hyps])
    ]
    mi_recall = [
        len(x) / (len(y) + 1e-14) for x, y in zip(
            overlapped_entity, [set(y.split()) for y in entity_gths])
    ]
    gth_n = [len(set(ws.split())) for ws in entity_gths]
    hyp_n = [len(set(ws.split())) for ws in entity_hyps]
    ret_metric["mi-P"] = np.sum([p * w for (p, w) in zip(mi_precision, hyp_n)]) / np.sum(hyp_n)
    ret_metric["mi-R"] = np.sum([r * w for (r, w) in zip(mi_recall, gth_n)]) / np.sum(gth_n)
    ret_metric["mi-F1"] = compute_f1(ret_metric["mi-P"], ret_metric["mi-R"])
    ret_metrics["mi-P"] = mi_precision
    ret_metrics["mi-R"] = mi_recall
    ret_metrics["mi-F1"] = [compute_f1(p, r) for (p, r) in zip(mi_precision, mi_recall)]

    with open("data/word2embedding.txt") as f:
        content = f.read().strip()
    single_word2embedding = {}
    for line in content.split("\n"):
        item = line.split()
        word = item[0]
        embedding = np.asarray([float(x) for x in item[1:]])
        single_word2embedding[word] = embedding
    alias2scientific = json.load(open(alias2scientific_filename))

    padding_embed = np.zeros(768)
    hyp_emb_avg = [
        np.asarray([
            np.asarray([
                single_word2embedding.get(w, padding_embed)
                for w in alias2scientific.get(e, e)
            ]).mean(0) for e in entity_hyp.split()
        ]).mean(0) if len(entity_hyp.split()) > 0 else padding_embed
        for entity_hyp in entity_hyps
    ]
    gth_emb_avg = [
        np.asarray([
            np.asarray([
                single_word2embedding.get(w, padding_embed)
                for w in alias2scientific.get(e, e)
            ]).mean(0) for e in entity_gth.split()
        ]).mean(0) if len(entity_gth.split()) > 0 else padding_embed
        for entity_gth in entity_gths
    ]
    eas = [cosine_sim(h, g) for h, g in zip(hyp_emb_avg, gth_emb_avg)]
    ea = float(np.mean(eas))
    ret_metrics["EA"] = eas
    ret_metric["EA"] = ea

    hyp_emb_means = [[
        np.asarray([
            single_word2embedding.get(w, padding_embed)
            for w in alias2scientific.get(e, e)
        ]).mean(0) for e in entity_hyp.split()
    ] if len(entity_hyp.split()) > 0 else [padding_embed]
                     for entity_hyp in entity_hyps]
    gth_emb_means = [[
        np.asarray([
            single_word2embedding.get(w, padding_embed)
            for w in alias2scientific.get(e, e)
        ]).mean(0) for e in entity_gth.split()
    ] if len(entity_gth.split()) > 0 else [padding_embed]
                     for entity_gth in entity_gths]

    def eval_embed_greedy(a, b):
        scores = []
        for j in b:
            score = []
            for i in a:
                s = cosine_sim(i, j)
                score.append(s)
            scores.append(score)
        if len(b) == 1 and b[0].sum() == 0.0:
            return None
        else:
            scores = np.asarray(scores)
            score1 = scores.max(0).mean()
            score2 = scores.max(1).mean()
            return (float(score1) + float(score2)) / 2.0

    eg_scores = [
        x for x in [
            eval_embed_greedy(a, b)
            for (a, b) in zip(hyp_emb_means, gth_emb_means)
        ] if x is not None
    ]
    eg_score = np.asarray(eg_scores).mean()
    ret_metrics["EG"] = eg_scores
    ret_metric["EG"] = eg_score
    return ret_metrics, ret_metric