def betterAnswer(baseline, new, questionBody):
    """Return whichever candidate answer has the higher ROUGE-L F-score.

    The question whose ``body`` equals ``questionBody`` (ignoring surrounding
    whitespace) is looked up in the BioASQ training file, and both candidates
    are scored against the first entry of that question's ``ideal_answer``.
    The scores and all three texts are printed for inspection.

    Args:
        baseline: Baseline answer text.
        new: Fused (new) answer text.
        questionBody: Body text of the question to look up.

    Returns:
        ``new`` when its F-score is strictly higher than the baseline's,
        ``baseline`` otherwise (ties included), or ``None`` when no question
        in the file matches ``questionBody``.
    """
    # `with` guarantees the handle is closed even if json.load raises.
    with open("./input/BioASQ-trainingDataset5b.json", 'r') as infile_true:
        data_true = json.load(infile_true)

    wanted_body = questionBody.strip()
    separator = "============================================"

    for question_i in data_true['questions']:
        if question_i['body'].strip() != wanted_body:
            continue

        r = Rouge()
        manual_summary = question_i['ideal_answer'][0]
        [precision_base, recall_base, f_score_base] = r.rouge_l(
            [baseline], [manual_summary])
        [precision_new, recall_new, f_score_new] = r.rouge_l(
            [new], [manual_summary])

        # Single-argument print(...) produces identical output on Python 2
        # and Python 3, unlike the print statement.
        print(separator)
        print("Ideal_answer \n")
        print(manual_summary)
        print("Fused_answer %f %f \n" % (precision_new, recall_new))
        print(new)
        print("Baseline_answer %f %f \n" % (precision_base, recall_base))
        print(baseline)
        print(separator)

        # Only the first matching question is considered.
        if f_score_base < f_score_new:
            print("11111")
            return new
        print("22222")
        return baseline

    return None
def read_dataset(limit=2000, threshold=.70):
    """Label every story sentence by whether it matches any highlight.

    A sentence is labelled ``'yes'`` when its ROUGE-L F-score against at
    least one of the story's highlights exceeds ``threshold``, and ``'no'``
    otherwise. Each sentence therefore receives exactly one label; adding a
    label per (sentence, highlight) pair would give most sentences both
    ``'yes'`` and ``'no'``.

    Args:
        limit: Maximum number of stories to process, starting at index 0.
            Capped at the number of stories actually available.
        threshold: F-score a sentence must exceed to be labelled ``'yes'``.

    Returns:
        A dict mapping the story index to a set of ``(sentence, label)``
        tuples.
    """
    dataset = {}
    r = Rouge()
    stories = get_dataset()

    # Never index past the end when fewer than `limit` stories exist.
    for i in range(min(limit, len(stories))):
        story = stories[i]['story']
        highlights = stories[i]['highlights']

        labeled_articles = set()
        for sent in story:
            matched = False
            for highlight in highlights:
                # NOTE(review): other callers of rouge_l in this code base
                # wrap each text in a list; here the raw values are passed.
                # Confirm this is what Rouge.rouge_l expects for these inputs.
                [precision, recall, f_score] = r.rouge_l(sent, highlight)
                if f_score > threshold:
                    matched = True
                    break
            labeled_articles.add((sent, 'yes' if matched else 'no'))

        dataset[i] = labeled_articles

    return dataset
def score_model(test_pairs, model, model_id, nb_examples, output_type): scores = [0, 0, 0, 0] rouge_calc = RougeCalculator(stopwords=True, lang="en") pyRouge = Rouge() if output_type == 'greedy': results = predict_greedy(test_pairs, _range=(0, nb_examples), model=model) else: results = predict_from_data(test_pairs, _range=(0, nb_examples), model=model) summaries = [] novelty_dist = [] for d in range(11): novelty_dist.append([]) for k in results: el = results[k] ref = " ".join([t for t in el['ref'].split('EOS')[0].split(" ")]) summary = " ".join( [t for t in el[output_type].split('EOS')[0].split(" ")]) scores[0] += rouge_calc.rouge_1(summary, ref) scores[1] += rouge_calc.rouge_2(summary, ref) rouge_l = rouge_calc.rouge_l(summary, ref) '''''' n = novelty.compute_novelty(ref, el['text'], 3) novelty_dist[int(n * 10)].append(rouge_l) ''' print(round(rouge_calc.rouge_2(summary , ref), 3), round(rouge_l, 3), len(summary.split(" ")), summary) print(ref) print(rouge_calc.rouge_2(summary , ref), rouge_l) print(summary) print(ref) print() ''' scores[2] += rouge_l if rouge_l < 0.20 or True: summaries.append((rouge_l, el[output_type].split('EOS')[0])) for i in range(10): print((i + 1) * 10, sum(novelty_dist[i]) / len(novelty_dist[i]), len(novelty_dist[i])) '''
from PyRouge.pyrouge import Rouge

# Score one machine-written summary against one human-written summary with
# ROUGE-L and print the three resulting numbers.
r = Rouge()

system_generated_summary = "The Kyrgyz President pushed through the law requiring the use of ink during the upcoming Parliamentary and Presidential elections In an effort to live up to its reputation in the 1990s as an island of democracy. The use of ink is one part of a general effort to show commitment towards more open elections. improper use of this type of ink can cause additional problems as the elections in Afghanistan showed. The use of ink and readers by itself is not a panacea for election ills."

manual_summmary = "The use of invisible ink and ultraviolet readers in the elections of the Kyrgyz Republic which is a small, mountainous state of the former Soviet republic, causing both worries and guarded optimism among different sectors of the population. Though the actual technology behind the ink is not complicated, the presence of ultraviolet light (of the kind used to verify money) causes the ink to glow with a neon yellow light. But, this use of the new technology has caused a lot of problems. "

# Each text is passed as a one-element list.
precision, recall, f_score = r.rouge_l(
    [system_generated_summary],
    [manual_summmary],
)

report_parts = [
    "Precision is :", str(precision),
    "\nRecall is :", str(recall),
    "\nF Score is :", str(f_score),
]
print("".join(report_parts))
from PyRouge.pyrouge import Rouge

r = Rouge()

# A simple example of how rouge can be calculated
#print r.rouge_l([[1, 7, 6, 7, 5], [0, 2, 8, 3, 5]], [[1, 2, 3, 4, 5], [3, 9, 5]])

# A more practical example of how it can be used for summary evaluation
system_generated_summary = " The Kyrgyz President pushed through the law requiring the use of ink during the upcoming Parliamentary and Presidential elections In an effort to live up to its reputation in the 1990s as an island of democracy. The use of ink is one part of a general effort to show commitment towards more open elections. improper use of this type of ink can cause additional problems as the elections in Afghanistan showed. The use of ink and readers by itself is not a panacea for election ills."

manual_summmary = " The use of invisible ink and ultraviolet readers in the elections of the Kyrgyz Republic which is a small, mountainous state of the former Soviet republic, causing both worries and guarded optimism among different sectors of the population. Though the actual technology behind the ink is not complicated, the presence of ultraviolet light (of the kind used to verify money) causes the ink to glow with a neon yellow light. But, this use of the new technology has caused a lot of problems. "

#print r.rouge_l([system_generated_summary], [manual_summmary])

ACTUALABSTRACT = '<s> a native american from a tribe not recognized by the feds wins the return of his eagle feathers . </s> <s> an irs accountant is fired for insisting on carrying a symbolic sikh knife to work . </s> <s> a group of chicago pastors takes on city hall over its permits for new churches and loses . </s>'

# Double-quoted on purpose: the text contains the closing-quote token '' and,
# inside a single-quoted literal, that token ends one string and starts the
# next, so it silently disappears from the value. The literal is also kept on
# one physical line; a quoted string cannot span a raw line break.
GENERATEDABSTRACT = "<go> <s> united states have been growing since the u.s. religious freedom restoration act . </s> <s> the united states have been growing since the u.s. religious freedom restoration act . </s> <s> new : `` there is reason to doubt whether these state-level religious protections '' </s> <end>"

# NOTE(review): the commented-out call above passes the system summary first
# and the manual summary second, while this call passes the actual abstract
# first. If rouge_l is not symmetric in its arguments, the printed precision
# and recall are swapped here - confirm the intended order.
[precision, recall, f_score] = r.rouge_l([ACTUALABSTRACT], [GENERATEDABSTRACT])

print("Precision is :" + str(precision) + "\nRecall is :" + str(recall) +
      "\nF Score is :" + str(f_score))
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed from tensorflow.keras.models import Model from tensorflow.keras.callbacks import EarlyStopping import warnings pd.set_option("display.max_colwidth", 200) warnings.filterwarnings("ignore") import tensorflow as tf import os from tensorflow.python.keras.layers import Layer from tensorflow.python.keras import backend as K from PyRouge.pyrouge import Rouge r = Rouge() class AttentionLayer(Layer): """ This class implements Bahdanau attention (https://arxiv.org/pdf/1409.0473.pdf). There are three sets of weights introduced W_a, U_a, and V_a """ def __init__(self, **kwargs): super(AttentionLayer, self).__init__(**kwargs) def build(self, input_shape): assert isinstance(input_shape, list) # Create a trainable weight variable for this layer. self.W_a = self.add_weight(name='W_a',
import nltk
from itertools import zip_longest
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from PyRouge.pyrouge import Rouge

r = Rouge()

# Default locations of the predicted and ground-truth caption files.
_PRED_PATH = "C:/BigDataAnalyticsAppns/Tutorial 6 Source Code/medium-show-and-tell-caption-generator-master/etc/pred.txt"
_TRUE_PATH = "C:/BigDataAnalyticsAppns/Tutorial 6 Source Code/medium-show-and-tell-caption-generator-master/etc/true_text.txt"


def accuracy(pred_path=_PRED_PATH, true_path=_TRUE_PATH):
    """Score each predicted caption against the reference on the same line.

    The two files are read in lockstep, so line ``n`` of the prediction file
    is compared with line ``n`` of the reference file. Pairing stops at the
    end of the shorter file. For every pair the ROUGE-L precision, recall
    and F-score are printed.

    Args:
        pred_path: Text file with one predicted caption per line.
        true_path: Text file with one reference caption per line.

    Returns:
        A list with one ``(bleu, precision, recall, f_score)`` tuple per
        line pair, in file order. ``bleu`` is the unigram sentence BLEU.
    """
    results = []
    with open(pred_path) as pred_file, open(true_path) as true_file:
        for prediction, reference in zip(pred_file, true_file):
            # sentence_bleu takes a list of tokenised references first and
            # the tokenised hypothesis second.
            bleu = sentence_bleu([word_tokenize(reference)],
                                 word_tokenize(prediction),
                                 weights=(1, 0, 0, 0))
            # Argument order (prediction first) is unchanged from before.
            [precision, recall, f_score] = r.rouge_l([prediction], [reference])
            print("Precision is :" + str(precision) + "\nRecall is :" +
                  str(recall) + "\nF Score is :" + str(f_score))
            results.append((bleu, precision, recall, f_score))
    return results
from PyRouge.pyrouge import Rouge

r = Rouge()

# Read both files completely up front; the `with` block closes the handles
# even if reading fails.
with open('test.eval_titles.txt') as fptr1, \
        open('eval_articles.1_300000.txt') as fptr2:
    system_summaries = fptr1.readlines()  #.split()
    model_summaries = fptr2.readlines()  #.split()

# Running sums of the per-line scores, averaged after the loop.
avg_p = avg_r = avg_f1 = 0

for i, system_summary in enumerate(system_summaries):
    # Indexing (rather than zip) keeps the IndexError when the second file
    # has fewer lines than the first.
    model_summary = model_summaries[i]

    [precision, recall, f_score] = r.rouge_l([system_summary], [model_summary])
    avg_p += precision
    avg_r += recall
    avg_f1 += f_score

    print("Sentence:", i)
    print("Human:", system_summary)
    print("Model:", model_summary)
    print("Precision is :" + str(precision) + "\nRecall is :" + str(recall) +
          "\nF Score is :" + str(f_score))
    print()

print("----------------------Final eval-------------------")
total = len(system_summaries)
if total:
    print("Precision:", float(avg_p / total))
    print("Recall:", float(avg_r / total))
    print("F1-score:", float(avg_f1 / total))
else:
    # An empty input file would otherwise raise ZeroDivisionError here.
    print("No summaries to evaluate.")
def main(args):
    """Run the trained seq2seq model on the test set and report metrics.

    Loads the last checkpoint (in sorted filename order) for the given
    configuration, decodes the model output and the references with
    SentencePiece, pickles both lists next to the checkpoint, and prints
    corpus BLEU-4, mean METEOR and mean ROUGE-L precision/recall/F-score.

    Args:
        args: Object with the attributes ``hj_method``, ``kr_method``,
            ``batch_size``, ``beam_size``, ``hidden_size``, ``embed_size``,
            ``vocab_size``, ``max_len``, ``pad_id``, ``n_layers`` and
            ``stop_ix``.

    Raises:
        FileNotFoundError: If no checkpoint file exists for the configuration.
    """
    # Setting
    warnings.simplefilter("ignore", UserWarning)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Args Parser
    hj_method = args.hj_method
    kr_method = args.kr_method
    batch_size = args.batch_size
    beam_size = args.beam_size
    hidden_size = args.hidden_size
    embed_size = args.embed_size
    vocab_size = args.vocab_size
    max_len = args.max_len
    padding_index = args.pad_id
    n_layers = args.n_layers
    stop_ix = args.stop_ix

    # Load saved model & Word2vec
    save_path = 'save_{}_{}_{}_maxlen_{}'.format(vocab_size, hj_method,
                                                 kr_method, max_len)
    # NOTE(review): this glob also matches test_result.pkl written further
    # down, so a second run may pick that file as the "checkpoint" - confirm
    # the checkpoint naming makes it sort last.
    save_list = sorted(glob.glob(f'./save/{save_path}/*.*'))
    if not save_list:
        raise FileNotFoundError(
            'No saved model found in ./save/{}/'.format(save_path))
    save_pt = save_list[-1]
    print('Will load {} pt file...'.format(save_pt))
    word2vec_hj = Word2Vec.load('./w2v/word2vec_hj_{}_{}.model'.format(
        vocab_size, hj_method))

    # SentencePiece model load
    spm_kr = spm.SentencePieceProcessor()
    spm_kr.Load("./spm/m_korean_{}.model".format(vocab_size))

    # Test data load
    # NOTE: pickle.load executes arbitrary code from the file; only use it on
    # a test_dat.pkl that was produced locally.
    with open('./test_dat.pkl', 'rb') as f:
        test_dat = pickle.load(f)

    test_dataset = CustomDataset(test_dat['test_hanja'],
                                 test_dat['test_korean'])
    test_loader = getDataLoader(test_dataset,
                                pad_index=padding_index,
                                shuffle=False,
                                batch_size=batch_size)

    # Model load
    print('Model loading...')
    encoder = Encoder(vocab_size,
                      embed_size,
                      hidden_size,
                      word2vec_hj,
                      n_layers=n_layers,
                      padding_index=padding_index)
    decoder = Decoder(embed_size,
                      hidden_size,
                      vocab_size,
                      n_layers=n_layers,
                      padding_index=padding_index)
    # Honour the device selected above instead of requiring CUDA.
    seq2seq = Seq2Seq(encoder, decoder, beam_size).to(device)
    #optimizer = optim.Adam(seq2seq.parameters(), lr=lr, weight_decay=w_decay)
    #scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=scheduler_step_size, gamma=lr_decay)
    print(seq2seq)

    print('Testing...')
    start_time = time.time()
    results = test(seq2seq,
                   test_loader,
                   vocab_size,
                   load_pt=save_pt,
                   stop_ix=stop_ix)
    print(time.time() - start_time)
    print('Done!')

    print("Decoding...")
    pred_list = list()
    for result_text in tqdm(results):
        text = torch.Tensor(result_text).squeeze().tolist()
        text = [int(x) for x in text]
        prediction_sentence = spm_kr.decode_ids(
            text).strip()  # Decode with strip
        pred_list.append(prediction_sentence)

    ref_list = list()
    for ref_text in tqdm(test_dat['test_korean'][:stop_ix]):
        ref_list.append(spm_kr.decode_ids(ref_text).strip())
    print('Done!')

    with open(f'./save/{save_path}/test_result.pkl', 'wb') as f:
        pickle.dump({
            'pred': pred_list,
            'reference': ref_list,
        }, f)
    print('Save file; /test_result.pkl')

    # The metrics use the lists decoded above; test_dat only holds the raw
    # 'test_hanja' / 'test_korean' entries. Pairing with zip keeps references
    # and predictions aligned if the two lists differ in length.
    pairs = list(zip(ref_list, pred_list))
    total_test_length = len(pairs)
    if total_test_length == 0:
        print('No test sentences to score.')
        return

    # Calculate BLEU Score
    print('Calculate BLEU4, METEOR, Rogue-L...')
    chencherry = SmoothingFunction()
    # corpus_bleu wants, per hypothesis, a list of tokenised references, plus
    # the tokenised hypotheses; whitespace tokenisation is used here.
    bleu4 = corpus_bleu([[reference.split()] for reference, _ in pairs],
                        [prediction.split() for _, prediction in pairs],
                        smoothing_function=chencherry.method4)
    print('BLEU Score is {}'.format(bleu4))

    # Calculate METEOR Score
    # meteor_score scores one hypothesis at a time, so average over sentences.
    # NOTE(review): recent NLTK releases expect pre-tokenised input as passed
    # here; older releases expect raw strings - confirm the installed version.
    meteor = sum(
        meteor_score([reference.split()], prediction.split())
        for reference, prediction in pairs) / total_test_length
    print('METEOR Score is {}'.format(meteor))

    # Calculate Rouge-L Score
    r = Rouge()
    precision_all = 0
    recall_all = 0
    f_score_all = 0
    for reference, prediction in pairs:
        [precision, recall, f_score] = r.rouge_l([reference], [prediction])
        precision_all += precision
        recall_all += recall
        f_score_all += f_score
    print('Precision : {}'.format(
        round(precision_all / total_test_length, 4)))
    print('Recall : {}'.format(round(recall_all / total_test_length, 4)))
    print('F Score : {}'.format(round(f_score_all / total_test_length, 4)))
from PyRouge.pyrouge import Rouge

r = Rouge()

# A simple example of how rouge can be calculated.
# print(...) with a single argument prints the same text on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print(r.rouge_l([[1, 7, 6, 7, 5], [0, 2, 8, 3, 5]], [[1, 2, 3, 4, 5], [3, 9, 5]]))

# A more practical example of how it can be used for summary evaluation
system_generated_summary = " The Kyrgyz President pushed through the law requiring the use of ink during the upcoming Parliamentary and Presidential elections In an effort to live up to its reputation in the 1990s as an island of democracy. The use of ink is one part of a general effort to show commitment towards more open elections. improper use of this type of ink can cause additional problems as the elections in Afghanistan showed. The use of ink and readers by itself is not a panacea for election ills."

manual_summmary = " The use of invisible ink and ultraviolet readers in the elections of the Kyrgyz Republic which is a small, mountainous state of the former Soviet republic, causing both worries and guarded optimism among different sectors of the population. Though the actual technology behind the ink is not complicated, the presence of ultraviolet light (of the kind used to verify money) causes the ink to glow with a neon yellow light. But, this use of the new technology has caused a lot of problems. "

print(r.rouge_l([system_generated_summary], [manual_summmary]))
# Script header: imports, the shared ROUGE-L scorer, and the path of the CSV
# file to evaluate.
#
# Here, we have to first process the 3 sentences. They are not adjacent, so
# form unigrams and bigrams, then compute ROUGE-N precision, ROUGE-N recall
# and the ROUGE-N F1 score.
# Also compute the ROUGE-L score.
# Run this code using Python 2.7, otherwise the string.translate() does not work
from PyRouge.pyrouge import Rouge
import pandas as pd
from nltk import word_tokenize
from nltk.util import ngrams
import string
import math

# Computing ROUGE-L precision, recall and F1 score
r_summ_evaluate = Rouge()

# Trying with a sample summary
# system_generated_summary = " The Kyrgyz President pushed through the law requiring the use of ink during the upcoming Parliamentary and Presidential elections In an effort to live up to its reputation in the 1990s as an island of democracy. The use of ink is one part of a general effort to show commitment towards more open elections. improper use of this type of ink can cause additional problems as the elections in Afghanistan showed. The use of ink and readers by itself is not a panacea for election ills."
# manual_summmary = " The use of invisible ink and ultraviolet readers in the elections of the Kyrgyz Republic which is a small, mountainous state of the former Soviet republic, causing both worries and guarded optimism among different sectors of the population. Though the actual technology behind the ink is not complicated, the presence of ultraviolet light (of the kind used to verify money) causes the ink to glow with a neon yellow light. But, this use of the new technology has caused a lot of problems. "
# print(r_summ_evaluate.rouge_l([system_generated_summary], [manual_summmary]))

# Here we read the file which mentions the 3 sentences for each technique, in three adjacent columns.
# Earlier input files are kept below, commented out, each followed by its column layout.
#summ_data_readpath = '/home/soumyadeep/PycharmProjects/MSThesisWorkMay2019/JournalCodes/data/GoldDataOnly36_12thSept.csv'
# 1 : X.AUTHID, 2 : spec_domain, 3: site.content 4-6: GOLD, 7-9 : LEAD, 10-12 : RAND, 13-15 : INCONST, 16-18 : INCONST_NEG, 19-21 : HYBRID1
#summ_data_readpath = '/home/soumyadeep/PycharmProjects/MSThesisWorkMay2019/JournalCodes/data/GoldDataAllAspectsWithHybrid_13thSept.csv'
# 1 : X.AUTHID, 2 : spec_domain, 3: site.content 4-6: GOLD, 7-9: LEAD, 10-12: Hybrid, 13-15: Incons, 16-18: Neg, 19-21: Ctr, 22-24: InconsNeg
# NOTE(review): absolute path into one user's home directory; it must be
# edited before the script can run anywhere else.
summ_data_readpath = '/home/soumyadeep/PycharmProjects/MSThesisWorkMay2019/JournalCodes/data/GoldDataAllAspects_Annotate_Partial_17thSept.csv'
def rouge_l(S, I):
    """Return the F-score that ``Rouge.rouge_l`` reports for ``S`` and ``I``.

    Each argument is wrapped in a one-element list before scoring; the
    precision and recall that come back with the F-score are discarded.
    """
    scorer = Rouge()
    _precision, _recall, f_measure = scorer.rouge_l([S], [I])
    return f_measure