class NLGMetrics(BaseMetric):
    def __init__(self, *args, **kwargs):
        self.nlgeval = NLGEval(no_glove=True, no_skipthoughts=True)

    @staticmethod
    def prepare_sent(tokens: List[str]) -> str:
        return recover_desc(tokens)

    def eval(self, hypos: Iterable[List[List[str]]], references: Iterable[List[str]],
             src_references: Iterable[List[str]], *args, **kwargs) -> dict:
        # List[str]: keep only the top hypothesis for each example
        first_hypos = [self.prepare_sent(hypo_list[0]) for hypo_list in hypos]
        src_ref_strs = [self.prepare_sent(src_ref) for src_ref in src_references]
        # List[List[str]]: nlg-eval expects one inner list per reference set
        references_lists = [[self.prepare_sent(ref) for ref in references]]
        # distinct metrics
        metrics_dict = self.nlgeval.compute_metrics(references_lists, first_hypos)
        # relative improvement over the source references
        # (computed here but not included in the returned dict)
        src_metrics_dict = self.nlgeval.compute_metrics(references_lists, src_ref_strs)
        relative_metrics_dict = OrderedDict({})
        for key in metrics_dict:
            relative_metrics_dict[key] = (
                (metrics_dict[key] - src_metrics_dict[key]) / src_metrics_dict[key])
        return {
            'Bleu_4': metrics_dict['Bleu_4'],
            'METEOR': metrics_dict['METEOR']
        }
def main():
    references = [[]]
    hypotheses = []
    # Create NLG metrics evaluator
    nlgeval = NLGEval(metrics_to_omit=['SkipThoughtCS', 'GreedyMatchingScore',
                                       'VectorExtremaCosineSimilarity',
                                       'EmbeddingAverageCosineSimilarity'])
    with open('/home/jcardoso/MIMIC/encodedTestCaptionsF.json') as json_file:
        referenceCaptionsDict = json.load(json_file)
    with open('/home/jcardoso/MIMIC/encodedTrainCaptionsF.json') as json_file:
        KBCaptionsDict = json.load(json_file)
    reference_ids = list(referenceCaptionsDict.keys())
    KB_ids = list(KBCaptionsDict.keys())
    for i in tqdm(range(len(reference_ids))):
        references[0].append(unifyCaption(referenceCaptionsDict[reference_ids[i]]))
        hypotheses.append(get_random_report(KB_ids, KBCaptionsDict))
    metrics_dict = nlgeval.compute_metrics(references, hypotheses)
    print(metrics_dict)
    with open("RandomRefs.txt", 'w+') as file:
        for reference in references[0]:
            file.write(reference.strip() + '\n')
    with open("RandomPreds.txt", 'w+') as file:
        for hypothesis in hypotheses:
            file.write(hypothesis.strip() + '\n')
    with open("random_TestResults.txt", "w+") as file:
        for metric in metrics_dict:
            file.write(metric + ":" + str(metrics_dict[metric]) + "\n")
def run_metrics(self, output, refer_dataset):
    refer = refer_dataset.refer
    hypothesis = []
    references = []
    mp1 = 0.0
    mp2 = 0.0
    mean_objects = 0.0
    total = 0.0
    for row in output:
        ref_id = int(row['refID'])
        hypothesis.append(row['gen_sentence'])
        references.append([s['sent'] for s in refer.Refs[ref_id]['sentences']])
        total += 1.0
        mean_objects += row['n_objects']
        mp1 += row['p@1']
        mp2 += row['p@2']
    references = list(zip(*references))
    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True,
                      metrics_to_omit=['METEOR'])  # loads the models
    metrics_dict = nlgeval.compute_metrics(references, hypothesis)
    metrics_dict['p@1'] = mp1 / total
    metrics_dict['p@2'] = mp2 / total
    return metrics_dict
def evaluate_trans(thenet, references, vali_data, vali_raw_data):
    hypothesis = []
    score_total = 0.
    num_word_total = 0
    for batch in vali_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src = thenet.translate(
            batch, vali_raw_data)
        score_total += sum([score[0] for score in pred_scores])
        num_word_total += sum(len(x) for x in batch.tgt[1:])
        hypothesis.extend([' '.join(x[0]) for x in pred_batch])
    ppl = math.exp(-score_total / num_word_total)
    # corpus_bleu returns ([final, n-gram1, n-gram2, ...], [bp, ...])
    bleu_score = bleu.corpus_bleu(hypothesis, references)[0][0]
    nlg_ref = [[x[0] for x in references if x is not None]]
    nlg_eval = NLGEval()
    save_txt('/fl/txtfile/rnn_h1.txt', hypothesis)
    metrics_eval = nlg_eval.compute_metrics(nlg_ref, hypothesis)
    print(metrics_eval)
    print('BLEU: {}'.format(bleu_score))
    # During training/validation, PPL is computed in Statistics() in
    # onmt/Trainer.py; during translation it is computed in the report_score
    # function in translate.py.
    print('PPL: {}'.format(ppl))
    return torch.FloatTensor([ppl, bleu_score, 0.0])  # the last slot is reserved for the rank number
class NLGMetric(Metric):
    def __init__(self,
                 config,
                 metric_names=[
                     "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4",
                     "METEOR", "ROUGE_L", "CIDEr"
                 ]):
        super().__init__(config, metric_names)
        # please install NLGEval from `https://github.com/Maluuba/nlg-eval`
        from nlgeval import NLGEval
        self.nlg = NLGEval()

    def compute_metrics(self, outputs, targets, **kwargs):
        return self.nlg.compute_metrics(hyp_list=outputs, ref_list=targets)

    def print_computed_metrics(self, metrics):
        Bleu_1 = metrics["Bleu_1"]
        Bleu_2 = metrics["Bleu_2"]
        Bleu_3 = metrics["Bleu_3"]
        Bleu_4 = metrics["Bleu_4"]
        METEOR = metrics["METEOR"]
        ROUGE_L = metrics["ROUGE_L"]
        CIDEr = metrics["CIDEr"]
        print(
            "Bleu_1: {:.4f} - Bleu_2: {:.4f} - Bleu_3: {:.4f} - Bleu_4: {:.4f}"
            " - METEOR: {:.4f} - ROUGE_L: {:.4f} - CIDEr: {:.4f}"
            .format(Bleu_1, Bleu_2, Bleu_3, Bleu_4, METEOR, ROUGE_L, CIDEr))
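# A minimal usage sketch for NLGMetric above. `config` is hypothetical here,
# being whatever the surrounding framework's Metric base class expects, so the
# calls are left as comments. `targets` follows the nlg-eval layout: one inner
# list per reference set, aligned index-by-index with `outputs`.
#
#     metric = NLGMetric(config)
#     scores = metric.compute_metrics(outputs=["a man rides a horse"],
#                                     targets=[["a person is riding a horse"]])
#     metric.print_computed_metrics(scores)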
def get_evaluation_scores(hypothesis, references, testing_mode=False):
    # NLTK's corpus_gleu/corpus_bleu take tokenized hypotheses and, per
    # hypothesis, a list of tokenized references.
    gleu_scores = {
        "Gleu_1": gleu.corpus_gleu(references, hypothesis, min_len=1, max_len=1),
        "Gleu_2": gleu.corpus_gleu(references, hypothesis, min_len=1, max_len=2),
        "Gleu_3": gleu.corpus_gleu(references, hypothesis, min_len=1, max_len=3),
        "Gleu_4": gleu.corpus_gleu(references, hypothesis, min_len=1, max_len=4)
    }
    if testing_mode:
        # nlg-eval wants plain strings, so join the tokens back together
        for i in range(len(hypothesis)):
            hypothesis[i] = ' '.join(hypothesis[i])
        refs = [[]]
        for i in range(len(references)):
            refs[0].append(' '.join(references[i][0]))
            if refs[0][-1] == "":
                refs[0][-1] = "no"  # nlg-eval rejects empty references
        references = refs
        n = NLGEval()
        scores = n.compute_metrics(ref_list=references, hyp_list=hypothesis)
    else:
        scores = {
            "Bleu_1": bleu_score.corpus_bleu(references, hypothesis, weights=[1.0]),
            "Bleu_2": bleu_score.corpus_bleu(references, hypothesis, weights=[1. / 2, 1. / 2]),
            "Bleu_3": bleu_score.corpus_bleu(references, hypothesis, weights=[1. / 3, 1. / 3, 1. / 3]),
            "Bleu_4": bleu_score.corpus_bleu(references, hypothesis, weights=[1. / 4, 1. / 4, 1. / 4, 1. / 4])
        }
    for key, val in gleu_scores.items():
        scores[key] = val
    return scores
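# Hedged sketch (made-up tokens) of the two reference layouts the function
# above juggles: NLTK's corpus_bleu/corpus_gleu take tokenized sentences with
# references grouped per hypothesis, while nlg-eval takes plain strings grouped
# per reference set.
from nltk.translate import bleu_score, gleu_score

hyps_tok = [['a', 'cat', 'sat']]               # NLTK: one token list per hypothesis
refs_tok = [[['the', 'cat', 'sat', 'down']]]   # NLTK: reference token lists per hypothesis
print(bleu_score.corpus_bleu(refs_tok, hyps_tok, weights=[1.0]))
print(gleu_score.corpus_gleu(refs_tok, hyps_tok, min_len=1, max_len=1))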
def evaluate(hypothesis, references, no_skipthoughts=True, no_glove=True,
             metrics_to_omit=['METEOR']):
    nlgeval = NLGEval(no_skipthoughts=no_skipthoughts, no_glove=no_glove,
                      metrics_to_omit=metrics_to_omit)
    return nlgeval.compute_metrics(references, hypothesis)
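# A minimal, hypothetical call to the `evaluate` wrapper above; the sentences
# are placeholders. nlg-eval's compute_metrics expects ref_list shaped
# (num_reference_sets, num_hypotheses), so `refs` holds one inner list with one
# reference string per hypothesis.
hyps = ['a dog runs in the park', 'a man rides a bike']
refs = [['the dog is running in a park', 'a man is riding a bicycle']]
# print(evaluate(hyps, refs))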
def eval_using_nlgeval(ref_list, pred_list, multiple):
    if VERBOSE:
        print('Loading the NLG eval model...')
    nlge = NLGEval(metrics_to_omit=['METEOR', 'CIDEr'],
                   no_skipthoughts=True, no_glove=True)
    # nlge = NLGEval(metrics_to_omit=['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'CIDEr', 'ROUGE_L'],
    #                no_skipthoughts=True, no_glove=True)
    if VERBOSE:
        print('\nComputing Scores...')
    return nlge.compute_metrics(ref_list, pred_list, multiple=multiple)
def com_score(self, ref, pre):
    # for gold, hype in zip(ref, pre):
    #     temp = []
    #     temp.append(gold)
    #     metrics_dict = compute_individual_metrics(temp, hype)
    #     break
    r_list = [ref]
    nlgeval = NLGEval()
    metrics_dict = nlgeval.compute_metrics(r_list, pre)
    return metrics_dict
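# Hedged sketch contrasting the two nlg-eval entry points that com_score's
# commented-out draft alludes to: compute_individual_metrics scores a single
# hypothesis against its references, while compute_metrics averages over a
# whole corpus. The sentences are made up.
from nlgeval import NLGEval

nlgeval = NLGEval(no_skipthoughts=True, no_glove=True)
single = nlgeval.compute_individual_metrics(ref=['the cat sat on the mat'],
                                            hyp='a cat sits on the mat')
corpus = nlgeval.compute_metrics(ref_list=[['the cat sat on the mat']],
                                 hyp_list=['a cat sits on the mat'])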
def meteor(self):
    """Computes METEOR using the NLGEval library.
    Link: https://github.com/Maluuba/nlg-eval"""
    metrics_to_omit = {
        "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "ROUGE_L", "CIDEr"
    }
    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True,
                      metrics_to_omit=metrics_to_omit)
    self.metrics.update(
        nlgeval.compute_metrics([self.target], self.hypothesis))
def calculate_rouge(prediction, ground_truth, tokenizer):
    nlgeval = NLGEval()
    references = []
    hypotheses = []
    for x, y in zip(ground_truth, prediction):
        x = tokenizer.decode(x, skip_special_tokens=True)
        y = tokenizer.decode(y, skip_special_tokens=True)
        references.append([x])
        hypotheses.append(y)
    # transpose references from (num_hyps, 1) to the (num_refs, num_hyps)
    # layout nlg-eval expects
    metrics_dict = nlgeval.compute_metrics(list(map(list, zip(*references))),
                                           hypotheses)
    return metrics_dict['ROUGE_L'], references, hypotheses
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted; nlg-eval scores plain strings, so the
        # descriptions are kept unsplit
        actual.append(desc_list)
        predicted.append(yhat)
    # calculate all metrics; transpose actual from (num_samples, num_refs) to
    # the (num_refs, num_samples) layout nlg-eval expects
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(list(map(list, zip(*actual))), predicted)
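# Sketch (made-up data) of the transposition used above: per-sample reference
# lists are (num_hyps, num_refs), but nlg-eval wants (num_refs, num_hyps),
# i.e. the i-th inner list holds the i-th reference for every hypothesis.
# Note zip truncates, so every sample needs the same number of references.
per_sample_refs = [['ref a1', 'ref a2'],   # references for hypothesis 0
                   ['ref b1', 'ref b2']]   # references for hypothesis 1
ref_list = list(map(list, zip(*per_sample_refs)))
# ref_list == [['ref a1', 'ref b1'], ['ref a2', 'ref b2']]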
def evaluate(loader, lstmDec, linNet, VocabData):
    # dictionary from index to word
    Index2Word = dict([val, key] for key, val in VocabData['word_dict'].items())
    # if torch.cuda.is_available():
    lstmDec = lstmDec.to(device).eval()
    linNet = linNet.to(device).eval()  # nn.DataParallel(linNet, device_ids=[0, 1]).to(device)
    nlgeval = NLGEval()
    ld = iter(loader)
    numiters = len(ld)
    qdar = tqdm.tqdm(range(numiters), total=numiters, ascii=True)
    loss_itr_list = []

    def linOut2DecIn(global_hidden, box_feat):  # box_feat [8, 4, 4096, 3, 3]
        global_hidden = global_hidden.unsqueeze(0)
        encoder_hidden = (global_hidden, torch.zeros_like(global_hidden).to(device))
        B, M, D, H, W = box_feat.size()
        encoder_outputs = box_feat.permute(0, 1, 3, 4, 2).contiguous().view(B, -1, D)
        return encoder_hidden, encoder_outputs

    def lstr(ts, pres=3):
        return str(np.round(ts.data.cpu().numpy(), 3))

    with torch.no_grad():  # evaluate mode
        references = [[]]
        hypothesis = []
        for i in qdar:
            # step 1: load data
            # box_feats: (numImage, numBoxes, 512, 7, 7)
            # box_global_feats: list, numImage [(512, 34, 56)]
            batchdata = next(ld)
            box_feats, box_global_feats, numBoxes, box_captions_gt = makeInp(*batchdata)
            # create batch of references based on indices
            references[0] += [" ".join([Index2Word[i] for i in s if Index2Word[i] != '<PAD>'])
                              for s in box_captions_gt.data.cpu().numpy()]
            # step 2: data transform by linNet
            box_feat, box_feat_dec, global_hidden = linNet(box_feats, box_global_feats)
            # step 3: decode to captions by lstmDec
            encoder_hidden, encoder_outputs = linOut2DecIn(global_hidden, box_feat_dec)
            decoder_outputs, decoder_hidden, ret_dict = lstmDec(
                encoder_hidden=encoder_hidden,
                encoder_outputs=encoder_outputs,
                max_len=int(5 * numBoxes))  # box_feat [8, 4, 4096, 3, 3]
            # step 4: calculate loss
            # Loss 1: Similarity loss
            lengths = torch.LongTensor(ret_dict['length']).to(device)
            # decoder_outputs [8, 15, 10878]
            decoder_outputs = torch.stack(
                [decoder_outputs[i] for i in range(len(decoder_outputs))], 1)
            word_indices = decoder_outputs.argmax(2).data.cpu().numpy()  # batch_size x seq_len
            # create batch of hypotheses based on indices
            hypothesis += [" ".join([Index2Word[i] for i in s if Index2Word[i] != '<PAD>'])
                           for s in word_indices]
            if i == 10:
                break
    print(nlgeval.compute_metrics(references, hypothesis))
def evaluateNLG(gen_dials, ref_dialogues):
    hyp_list, ref_list = [], []
    for fname in gen_dials:
        hyp_list.extend(gen_dials[fname])  # list of sentence strings
        # list of reference sentence strings, one per hypothesis
        ref_list.extend([s.strip() for s in ref_dialogues[fname]['sys']])
    ref_lists = [ref_list]  # only one reference set
    from nlgeval import NLGEval
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(ref_list=ref_lists, hyp_list=hyp_list)
    print(metrics_dict)
    return metrics_dict
def NLGE_evaluation(encoder, decoder, search_method, word2ix, ix2word,
                    input_seqs, target_seqs, templates=None):
    """
    Function that computes several metrics using the NLG-eval python package
    (https://github.com/Maluuba/nlg-eval).

    :param encoder: PyTorch model that serves as encoder.
    :param decoder: PyTorch model that serves as decoder.
    :param search_method: PyTorch model used for making searches during inference (e.g. GreedySearch).
    :param word2ix: Python dictionary with tokens as keys and indexes as values.
    :param ix2word: Python dictionary with indexes as keys and tokens as values.
    :param input_seqs: List containing the vectorized questions that will be used for testing the model.
    :param target_seqs: List containing the vectorized ground-truth answers that will be used for testing the model.
    :param templates: Optional list of template answers used to replace a generated answer when retrieval is confident.
    """
    nlg_eval = NLGEval(metrics_to_omit=[
        'Bleu_3', 'Bleu_4', 'METEOR', 'CIDEr', 'SkipThoughtCS'
    ])
    hypothesis = []
    references = []
    if templates:
        vectorizer = TfidfVectorizer(stop_words='english', lowercase=True,
                                     strip_accents='ascii')
        template2vec = vectorizer.fit_transform(templates)
    for input_seq, target_seq in tqdm(zip(input_seqs, target_seqs),
                                      total=input_seqs.shape[0]):
        input_seq, input_length, _, _, _ = prepare_data([input_seq], [target_seq])
        tokens = search_method(input_seq, input_length, 300, word2ix['_BOS_'])
        if search_method.__class__.__name__ == "GreedySearchDecoder":
            tokens = tokens.view(1, -1)[0]
        answer = ' '.join([ix2word[token] for token in tokens.cpu().numpy()
                           if token != word2ix['_PAD_']])
        if templates:
            template, score = template_retrieval(answer, templates,
                                                 template2vec, vectorizer)
            if score > 0.75:
                answer = template
        hypothesis.append(answer)
        references.append(' '.join([ix2word[token] for token in target_seq]))
    return nlg_eval.compute_metrics(ref_list=[references], hyp_list=hypothesis)
def test_oo_api():
    with open("examples/hyp.txt") as f:
        hyp = f.readlines()
    hyp = [x.strip() for x in hyp]
    with open("examples/ref1.txt") as f:
        ref1 = f.readlines()
    ref1 = [x.strip() for x in ref1]
    with open("examples/ref2.txt") as f:
        ref2 = f.readlines()
    ref2 = [x.strip() for x in ref2]

    nlge = NLGEval()

    res = nlge.compute_individual_metrics([ref1[0]] + [ref2[0]], hyp[0])
    res = nlge.compute_individual_metrics([ref1[1]] + [ref2[1]], hyp[1])

    hyp_list = hyp
    ref_list = [ref1, ref2]
    res = nlge.compute_metrics(ref_list, hyp_list)
def __calculate_scores(result_file, ref_file, block_print=True):
    reference_file = json.load(open(ref_file))
    ref_video_keys = sorted(list(reference_file.keys()))
    ref_text_list = sum(
        [reference_file[item]['sentences'] for item in ref_video_keys], [])
    file_data = json.load(open(result_file))
    hyp_text_list = sum(
        [[i['sentence'].lower() for i in file_data['results'][item]]
         for item in ref_video_keys], [])
    hyp_text_list = [
        '<NONE>' if len(item) == 0 else item for item in hyp_text_list
    ]  # for empty generated results
    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True)
    # score the generated sentences against the ground-truth references
    result = nlgeval.compute_metrics(hyp_list=hyp_text_list,
                                     ref_list=[ref_text_list])
    metrics = {'Average across tIoUs': result}
    return metrics
def evaluateNLGFile(gen_dials_fpath, ref_dialogues_fpath):
    with open(gen_dials_fpath, 'r') as gen, open(ref_dialogues_fpath, 'r') as ref:
        gen_dials = json.load(gen)
        ref_dialogues = json.load(ref)
    hyp_list, ref_list = [], []
    for fname in gen_dials:
        hyp_list.extend(gen_dials[fname])  # list of sentence strings
        # list of reference sentence strings, one per hypothesis
        ref_list.extend([s.strip() for s in ref_dialogues[fname]['sys']])
    ref_lists = [ref_list]  # only one reference set
    from nlgeval import NLGEval
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(ref_list=ref_lists, hyp_list=hyp_list)
    print(metrics_dict)
    return metrics_dict
def main():
    argParser = get_args()
    print(argParser)
    print(argParser.checkpoint)

    modelInfo = None
    if (argParser.checkpoint is not None):
        modelInfo = torch.load(argParser.checkpoint)

    # Load model
    encoder, decoder = setupEncoderDecoder(argParser, modelInfo)

    # Create data loaders
    testLoader, _ = setupDataLoaders(argParser)

    # Load word <-> embeddings matrix index correspondence dictionaries
    idx2word, word2idx = loadWordIndexDicts(argParser)

    # Create NLG metrics evaluator
    nlgeval = NLGEval(metrics_to_omit=[
        'SkipThoughtCS', 'GreedyMatchingScore',
        'VectorExtremaCosineSimilarity', 'EmbeddingAverageCosineSimilarity'
    ])

    vocab_size = decoder.vocab_size

    references, hypotheses = evaluate_beam(argParser, BEAM_SIZE, encoder,
                                           decoder, testLoader, word2idx,
                                           idx2word)
    metrics_dict = nlgeval.compute_metrics(references, hypotheses)
    refs_path, preds_path = save_references_and_predictions(
        references, hypotheses, argParser.model_name, "Beam")
    with open('../Experiments/' + argParser.model_name + "/BeamTestResults.txt",
              "w+") as file:
        for metric in metrics_dict:
            file.write(metric + ":" + str(metrics_dict[metric]) + "\n")
def NLGE_evaluation(model, test_questions, test_answers, train_answers):
    """
    Function that computes several metrics using the NLG-eval python package
    (https://github.com/Maluuba/nlg-eval).

    :param model: sklearn tf-idf model to be tested.
    :param test_questions: List containing several vectorized questions.
    :param test_answers: List containing the vectorized ground-truth answers.
    :param train_answers: The pool of answers the model searches over
        (typically all the train answers the model has seen).
    """
    # Creation of the pool of unique answers.
    unique_ans = np.unique(train_answers)
    possible_ans = [ans for ans in unique_ans]
    # We will not use all the metrics available in the package.
    nlg_eval = NLGEval(metrics_to_omit=['Bleu_3', 'Bleu_4', 'METEOR',
                                        'CIDEr', 'SkipThoughtCS'])
    print("Evaluating ranking among {} possible answers".format(len(possible_ans)))
    hypothesis = []  # List that will store our answer hypotheses.
    references = []  # List that will contain the reference answers.
    vector_doc = model.vectorizer.transform(possible_ans)
    for i in tqdm(range(len(test_questions))):
        vector_q = model.vectorizer.transform([test_questions[i]])
        result = cosine_similarity(vector_q, vector_doc)[0]
        hypothesis_idx = np.argsort(result, axis=0)[::-1][0]
        hypothesis.append(possible_ans[hypothesis_idx])
        references.append(test_answers[i])
    return nlg_eval.compute_metrics(ref_list=[references], hyp_list=hypothesis)
with open(hyp_file, "r") as f:
    hyp_dict = {
        line.strip().split(" ")[0]: " ".join(line.strip().split(" ")[1:])
        for line in f.readlines()
    }
keys = [k for k, v in hyp_dict.items()]
labels = [ref_dict[k] for k, _ in hyp_dict.items()]
decoded_preds = [v for k, v in hyp_dict.items()]

metric = load_metric("bertscore")
result_bert = metric.compute(
    predictions=decoded_preds,
    references=labels,
    lang="en",
)

nlg = NLGEval()  # loads the models
print("Key", "\t", "METEOR", "\t", "ROUGE-L")
for (key, ref, hyp) in zip(keys, labels, decoded_preds):
    metrics_dict = nlg.compute_individual_metrics([ref], hyp)
    print(key, "\t", metrics_dict["METEOR"], "\t", metrics_dict["ROUGE_L"])

refs = [[x] for x in labels]
metrics_dict = nlg.compute_metrics(ref_list=[labels], hyp_list=decoded_preds)

metric = load_metric("rouge")
result = metric.compute(predictions=decoded_preds, references=labels)
result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
print(f"RESULT {result['rouge1']} {result['rouge2']} {result['rougeL']} "
      f"{metrics_dict['METEOR']*100.0} {100*np.mean(result_bert['precision'])}")
def cal_score(self, metric):
    data_score = []
    for task_name, task in self.tasks.items():
        print("Task : " + task_name + " report ")
        if "emf1" in metric:
            em = 0
            total = 0
            f1 = 0
            for pos, predict in enumerate(task['predicted']):
                em_list = []
                f1_list = []
                for target in task['targets'][pos]:
                    if (_normalize_answer(str(predict)) == _normalize_answer(str(target))
                            and len(_normalize_answer(str(predict))) > 0
                            or len(str(predict)) == len(str(target)) == 0):
                        em_score = 1
                        f1_score = 1
                    else:
                        em_score = 0
                        f1_score = _f1_score(str(predict), str(target))
                    em_list.append(em_score)
                    f1_list.append(f1_score)
                em += max(em_list)
                f1 += max(f1_list)
                data_score.append([
                    predict,
                    task['targets'][pos][em_list.index(max(em_list))],
                    {'em': max(em_list), 'f1': max(f1_list)}
                ])
                total += 1
            # `total or not total` guards against division by zero
            result = {
                "EM": em / (total or not total),
                "F1": f1 / (total or not total)
            }
            data_score = sorted(data_score, key=lambda i: i[2]['em'], reverse=True)
        if "nlg" in metric:
            try:
                from nlgeval import NLGEval
            except ImportError:
                print("nlg-eval package not installed; please install it: "
                      "pip install git+https://github.com/voidful/nlg-eval.git ; "
                      "nlg-eval --setup ./nlg-eval-data/")
                raise
            nlgeval = NLGEval(no_skipthoughts=True, no_glove=True,
                              metrics_to_omit=["METEOR"])
            targets = task['targets']
            predicted = task['predicted']
            for t, p in zip(targets, predicted):
                data_score.append([
                    p, t,
                    nlgeval.compute_metrics(ref_list=list(map(list, zip(t))),
                                            hyp_list=[p])
                ])
            result = nlgeval.compute_metrics(
                ref_list=list(map(list, zip(*task['targets']))),  # transpose
                hyp_list=predicted)
            data_score = sorted(data_score, key=lambda i: i[2]['ROUGE_L'])
        if "clas" in metric:
            from sklearn.metrics import classification_report
            from sklearn.preprocessing import MultiLabelBinarizer
            from sklearn.metrics import precision_recall_fscore_support
            target_key = [t for t in self.target_list[task_name].keys() if len(t) > 0]
            mlb = MultiLabelBinarizer().fit([target_key])
            # remove all blank targets
            task['targets'] = [[j for j in sub if len(j) > 0]
                               for sub in task['targets']]
            # modify for tagging results
            if isinstance(task['predicteds'][0][0], list):
                task['targets'] = sum([[[j] for j in sub]
                                       for sub in task['targets']], [])
                task['predicteds'] = sum([[[j] for j in sub]
                                          for sub in task['predicted']], [])
            if len(task['targets']) != len(task['predicteds']):
                diff = len(task['targets']) - len(task['predicteds'])
                task['predicteds'].extend([['']] * diff)
            targets = task['targets']
            predicted = task['predicteds']
            for p, t in zip(predicted, targets):
                score = dict(
                    zip(["precision", "recall", "fbeta_score", "support"],
                        precision_recall_fscore_support(mlb.transform([t]),
                                                        mlb.transform([p]),
                                                        average='weighted')))
                data_score.append([p, t, score])
            print(mlb.classes_)
            result = classification_report(mlb.transform(targets),
                                           mlb.transform(predicted),
                                           target_names=list(mlb.classes_))
            data_score = sorted(data_score, key=lambda i: i[2]['fbeta_score'])
        yield (task_name, result, data_score)
def main():
    argParser = get_args()
    print(argParser)

    modelInfo = None
    classifierInfo = None

    if (argParser.checkpoint is not None):
        modelInfo = torch.load(argParser.checkpoint)

    if (argParser.use_classifier_encoder) and modelInfo is None:
        classifierInfo = torch.load(argParser.classifier_checkpoint)

    if not os.path.isdir('../Experiments/' + argParser.model_name):
        os.mkdir('../Experiments/' + argParser.model_name)

    trainingEnvironment = TrainingEnvironment(argParser)

    cudnn.benchmark = True

    encoder, decoder = setupEncoderDecoder(argParser, modelInfo, classifierInfo)

    encoder_optimizer, decoder_optimizer = setupOptimizers(encoder, decoder,
                                                           argParser, modelInfo)

    decoder_scheduler, encoder_scheduler = setupSchedulers(encoder_optimizer,
                                                           decoder_optimizer,
                                                           argParser)

    criterion = setupCriterion(argParser.loss)
    binary_criterion = nn.BCEWithLogitsLoss()

    trainLoader, valLoader = setupDataLoaders(argParser)

    # Load word <-> embeddings matrix index correspondence dictionaries
    idx2word, word2idx = loadWordIndexDicts(argParser)

    # Create NLG metrics evaluator
    nlgeval = NLGEval(metrics_to_omit=[
        'SkipThoughtCS', 'GreedyMatchingScore',
        'VectorExtremaCosineSimilarity', 'EmbeddingAverageCosineSimilarity'
    ])

    scheduled_sampling_prob = decoder.scheduled_sampling_prob

    for epoch in range(trainingEnvironment.start_epoch, trainingEnvironment.epochs):
        if (epoch > 1 and argParser.use_scheduled_sampling
                and epoch % argParser.scheduled_sampling_decay_epochs == 0):
            scheduled_sampling_prob += argParser.rate_change_scheduled_sampling_prob
            decoder.scheduled_sampling_prob = scheduled_sampling_prob

        if trainingEnvironment.epochs_since_improvement == argParser.early_stop_epoch_threshold:
            break

        train(argParser, encoder, decoder, trainLoader, word2idx, idx2word,
              criterion, encoder_optimizer, decoder_optimizer,
              binary_criterion, epoch)

        # references, hypotheses = hierarchical_evaluate_beam(argParser, BEAM_SIZE, encoder, decoder, valLoader, word2idx, idx2word)
        references, hypotheses = evaluate_greedy(argParser, encoder, decoder,
                                                 valLoader, word2idx, idx2word)

        encoder_scheduler.step()
        decoder_scheduler.step()

        metrics_dict = nlgeval.compute_metrics(references, hypotheses)
        print(metrics_dict)

        with open('../Experiments/' + argParser.model_name + "/metrics.txt", "a+") as file:
            file.write("Epoch " + str(epoch) + " results:\n")
            for metric in metrics_dict:
                file.write(metric + ":" + str(metrics_dict[metric]) + "\n")
            file.write("------------------------------------------\n")

        # despite the variable name, CIDEr is used as the early-stopping metric
        recent_bleu4 = metrics_dict['CIDEr']

        # Check if there was an improvement
        is_best = recent_bleu4 > trainingEnvironment.best_bleu4
        trainingEnvironment.best_bleu4 = max(recent_bleu4,
                                             trainingEnvironment.best_bleu4)
        print("Best BLEU: ", trainingEnvironment.best_bleu4)
        if not is_best:
            trainingEnvironment.epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (trainingEnvironment.epochs_since_improvement,))
        else:
            trainingEnvironment.epochs_since_improvement = 0

        # recent_bleu4 = 0
        # is_best = True
        # metrics_dict = {}
        # Save checkpoint
        save_checkpoint(argParser.model_name, epoch,
                        trainingEnvironment.epochs_since_improvement,
                        encoder.state_dict(), decoder.state_dict(),
                        encoder_optimizer.state_dict(),
                        decoder_optimizer.state_dict(), recent_bleu4, is_best,
                        metrics_dict, trainingEnvironment.best_loss)
def eval(eval_filename, vocab_filename, alias2scientific_filename):
    def remove_stopwords(sent, stop_word_set):
        items = sent.split()
        items = [ite for ite in items if ite not in stop_word_set]
        return " ".join(items)

    with open("data/stopwords.txt") as f:
        stopwords = f.read().strip().split()
    stopwords = set(stopwords)

    # 'EmbeddingAverageCosineSimilairty' is the key as (mis)spelled in nlg-eval itself
    bleu_nlgeval = NLGEval(metrics_to_omit=[
        "METEOR", "CIDEr", "ROUGE_L", "SkipThoughtCS",
        "EmbeddingAverageCosineSimilairty", "VectorExtremaCosineSimilarity",
        "GreedyMatchingScore"
    ])
    rouge_eval = RougeEval()
    disease2x = pandas.read_csv(vocab_filename)
    disease2x = disease2x[disease2x["Is_know"] > 0]
    disease2x = dict(zip(list(disease2x["Word"]), list(disease2x["Is_know"])))
    distinct_eval = DistinctEval(grams=[1, 2])

    with open(eval_filename) as f:
        sessions = json.load(f)
    gths = [[episode["gth"] for episode in session["session"]]
            for session in sessions]
    hyps = [[episode["hyp"] for episode in session["session"]]
            for session in sessions]
    entity_gths = [[" ".join([i for i in x.split(" ") if i in disease2x])
                    for x in y] for y in gths]
    entity_hyps = [[" ".join([i for i in x.split(" ") if i in disease2x])
                    for x in y] for y in hyps]

    def flat(lists):
        tmp = []
        for items in lists:
            tmp += items
        return tmp

    gths = flat(gths)
    hyps = flat(hyps)
    entity_gths = flat(entity_gths)
    entity_hyps = flat(entity_hyps)
    gths = [remove_stopwords(gth, stopwords) for gth in gths]
    hyps = [remove_stopwords(hyp, stopwords) for hyp in hyps]

    ret_metrics = OrderedDict()
    ret_metric = OrderedDict()

    bleu_score_matrix = [
        bleu_nlgeval.compute_individual_metrics([gth], hyp)
        for gth, hyp in zip(gths, hyps)
    ]
    b2s = [b["Bleu_2"] for b in bleu_score_matrix]
    ret_metrics["B@2"] = b2s
    bleu_score = bleu_nlgeval.compute_metrics([gths], hyps)
    b2 = bleu_score["Bleu_2"]
    ret_metric["B@2"] = b2

    rouge1, rouge2, r1s, r2s = rouge_eval.rouge_score(hyps, gths, ret_matrix=True)
    ret_metrics["R@2"] = r2s
    ret_metric["R@2"] = rouge2

    dist_scores = distinct_eval.distinct_score(hyps)
    ret_metric["D@1"] = dist_scores[0]
    ret_metric["D@2"] = dist_scores[1]
    ret_metrics["D@1"] = float("nan")
    ret_metrics["D@2"] = float("nan")

    eps = 1e-24

    def compute_f1(p, r):
        return 2 * p * r / (p + r + eps)

    overlapped_entity = [[i for i in x.split() if i in y.split()]
                         for x, y in zip(entity_hyps, entity_gths)]
    overlapped_entity = [list(set(x)) for x in overlapped_entity]
    hyp_entity = [set(y.split()) for y in entity_hyps]
    gth_entity = [set(y.split()) for y in entity_gths]
    entity2prf = OrderedDict()
    for oe, he, ge in zip(overlapped_entity, hyp_entity, gth_entity):
        for e in oe:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            entity2prf[e]["TP"] += 1
        for e in he:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            if e not in oe:
                entity2prf[e]["FP"] += 1
        for e in ge:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            if e not in oe:
                entity2prf[e]["FN"] += 1
    counter = Counter()
    for gth in gth_entity:
        counter.update(gth)
    need_entity_ind = [x[0] for x in counter.most_common() if x[1] > 5]
    print("len(need_entity_ind) = {}".format(len(need_entity_ind)))
    ret_metrics["ma-P"] = [
        entity2prf[e]["TP"] / (entity2prf[e]["TP"] + entity2prf[e]["FP"] + eps)
        for e in need_entity_ind
    ]
    ret_metrics["ma-R"] = [
        entity2prf[e]["TP"] / (entity2prf[e]["TP"] + entity2prf[e]["FN"] + eps)
        for e in need_entity_ind
    ]
    ret_metrics["ma-F1"] = [
        compute_f1(p, r)
        for (p, r) in zip(ret_metrics["ma-P"], ret_metrics["ma-R"])
    ]
    ret_metric["ma-P"] = float(np.mean(ret_metrics["ma-P"]))
    ret_metric["ma-R"] = float(np.mean(ret_metrics["ma-R"]))
    ret_metric["ma-F1"] = compute_f1(ret_metric["ma-P"], ret_metric["ma-R"])

    mi_precision = [
        len(x) / (len(y) + 1e-14) for x, y in zip(
            overlapped_entity, [set(y.split()) for y in entity_hyps])
    ]
    mi_recall = [
        len(x) / (len(y) + 1e-14) for x, y in zip(
            overlapped_entity, [set(y.split()) for y in entity_gths])
    ]
    gth_n = [len(set(ws.split())) for ws in entity_gths]
    hyp_n = [len(set(ws.split())) for ws in entity_hyps]
    ret_metric["mi-P"] = np.sum([p * w for (p, w) in zip(mi_precision, hyp_n)]) / np.sum(hyp_n)
    ret_metric["mi-R"] = np.sum([r * w for (r, w) in zip(mi_recall, gth_n)]) / np.sum(gth_n)
    ret_metric["mi-F1"] = compute_f1(ret_metric["mi-P"], ret_metric["mi-R"])
    ret_metrics["mi-P"] = mi_precision
    ret_metrics["mi-R"] = mi_recall
    ret_metrics["mi-F1"] = [
        compute_f1(p, r) for (p, r) in zip(mi_precision, mi_recall)
    ]

    with open("data/word2embedding.txt") as f:
        content = f.read().strip()
    single_word2embedding = {}
    for line in content.split("\n"):
        item = line.split()
        word = item[0]
        embedding = np.asarray([float(x) for x in item[1:]])
        single_word2embedding[word] = embedding
    alias2scientific = json.load(open(alias2scientific_filename))
    padding_embed = np.zeros(768)
    hyp_emb_avg = [
        np.asarray([
            np.asarray([
                single_word2embedding.get(w, padding_embed)
                for w in alias2scientific.get(e, e)
            ]).mean(0) for e in entity_hyp.split()
        ]).mean(0) if len(entity_hyp.split()) > 0 else padding_embed
        for entity_hyp in entity_hyps
    ]
    gth_emb_avg = [
        np.asarray([
            np.asarray([
                single_word2embedding.get(w, padding_embed)
                for w in alias2scientific.get(e, e)
            ]).mean(0) for e in entity_gth.split()
        ]).mean(0) if len(entity_gth.split()) > 0 else padding_embed
        for entity_gth in entity_gths
    ]
    eas = [cosine_sim(h, g) for h, g in zip(hyp_emb_avg, gth_emb_avg)]
    ea = float(np.mean(eas))
    ret_metrics["EA"] = eas
    ret_metric["EA"] = ea

    hyp_emb_means = [[
        np.asarray([
            single_word2embedding.get(w, padding_embed)
            for w in alias2scientific.get(e, e)
        ]).mean(0) for e in entity_hyp.split()
    ] if len(entity_hyp.split()) > 0 else [padding_embed]
                     for entity_hyp in entity_hyps]
    gth_emb_means = [[
        np.asarray([
            single_word2embedding.get(w, padding_embed)
            for w in alias2scientific.get(e, e)
        ]).mean(0) for e in entity_gth.split()
    ] if len(entity_gth.split()) > 0 else [padding_embed]
                     for entity_gth in entity_gths]

    def eval_embed_greedy(a, b):
        scores = []
        for j in b:
            score = []
            for i in a:
                s = cosine_sim(i, j)
                score.append(s)
            scores.append(score)
        if len(b) == 1 and b[0].sum() == 0.0:
            return None
        else:
            scores = np.asarray(scores)
            score1 = scores.max(0).mean()
            score2 = scores.max(1).mean()
            return (float(score1) + float(score2)) / 2.0

    eg_scores = [
        x for x in [
            eval_embed_greedy(a, b)
            for (a, b) in zip(hyp_emb_means, gth_emb_means)
        ] if x is not None
    ]
    eg_score = np.asarray(eg_scores).mean()
    ret_metrics["EG"] = eg_scores
    ret_metric["EG"] = eg_score
    return ret_metrics, ret_metric
def main(_argv):
    if FLAGS.num_gpus > 0:  # only supports 1 GPU
        ctx = mx.gpu()
    else:
        ctx = mx.cpu()

    key_flags = FLAGS.get_key_flags_for_module(sys.argv[0])
    print('\n'.join(f.serialize() for f in key_flags))

    # are we using features or do we include the CNN?
    if FLAGS.feats_model is None:
        backbone_net = get_model(FLAGS.backbone, pretrained=True, ctx=ctx).features
        cnn_model = FrameModel(backbone_net, 11)  # hardcoded the number of classes
        if FLAGS.backbone_from_id:
            if os.path.exists(os.path.join('models', 'vision', 'experiments', FLAGS.backbone_from_id)):
                files = os.listdir(os.path.join('models', 'vision', 'experiments', FLAGS.backbone_from_id))
                files = [f for f in files if f[-7:] == '.params']
                if len(files) > 0:
                    files = sorted(files, reverse=True)  # put latest model first
                    model_name = files[0]
                    cnn_model.load_parameters(
                        os.path.join('models', 'vision', 'experiments', FLAGS.backbone_from_id, model_name),
                        ctx=ctx)
                    print('Loaded backbone params: {}'.format(
                        os.path.join('models', 'vision', 'experiments', FLAGS.backbone_from_id, model_name)))
            else:
                raise FileNotFoundError('{}'.format(
                    os.path.join('models', 'vision', 'experiments', FLAGS.backbone_from_id)))

        if FLAGS.freeze_backbone:
            for param in cnn_model.collect_params().values():
                param.grad_req = 'null'

        cnn_model = TimeDistributed(cnn_model.backbone)
        src_embed = cnn_model

        transform_test = transforms.Compose([
            transforms.Resize(FLAGS.data_shape + 32),
            transforms.CenterCrop(FLAGS.data_shape),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])
    else:
        from mxnet.gluon import nn
        # need to do this to force no use of Embedding on src
        src_embed = nn.HybridSequential(prefix='src_embed_')
        with src_embed.name_scope():
            src_embed.add(nn.Dropout(rate=0.0))

        transform_train = None
        transform_test = None

    # setup the data
    data_train = TennisSet(split='train', transform=transform_train, captions=True,
                           max_cap_len=FLAGS.tgt_max_len, every=FLAGS.every,
                           feats_model=FLAGS.feats_model)
    data_val = TennisSet(split='val', transform=transform_test, captions=True,
                         vocab=data_train.vocab, every=FLAGS.every, inference=True,
                         feats_model=FLAGS.feats_model)
    data_test = TennisSet(split='test', transform=transform_test, captions=True,
                          vocab=data_train.vocab, every=FLAGS.every, inference=True,
                          feats_model=FLAGS.feats_model)

    test_tgt_sentences = data_test.get_captions(split=True)
    write_sentences(test_tgt_sentences,
                    os.path.join('models', 'captioning', 'experiments', FLAGS.model_id, 'test_gt.txt'))

    # load embeddings for tgt_embed
    if FLAGS.emb_file:
        word_embs = nlp.embedding.TokenEmbedding.from_file(
            file_path=os.path.join('data', FLAGS.emb_file))
        data_test.vocab.set_embedding(word_embs)
        input_dim, output_dim = data_test.vocab.embedding.idx_to_vec.shape
        tgt_embed = gluon.nn.Embedding(input_dim, output_dim)
        tgt_embed.initialize(ctx=ctx)
        tgt_embed.weight.set_data(data_test.vocab.embedding.idx_to_vec)
    else:
        tgt_embed = None

    # setup the model
    encoder, decoder = get_gnmt_encoder_decoder(cell_type=FLAGS.cell_type,
                                                hidden_size=FLAGS.num_hidden,
                                                dropout=FLAGS.dropout,
                                                num_layers=FLAGS.num_layers,
                                                num_bi_layers=FLAGS.num_bi_layers)
    model = NMTModel(src_vocab=None, tgt_vocab=data_test.vocab, encoder=encoder,
                     decoder=decoder, embed_size=FLAGS.emb_size, prefix='gnmt_',
                     src_embed=src_embed, tgt_embed=tgt_embed)
    model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
    static_alloc = True
    model.hybridize(static_alloc=static_alloc)
    print(model)

    if os.path.exists(os.path.join('models', 'captioning', 'experiments', FLAGS.model_id)):
        files = os.listdir(os.path.join('models', 'captioning', 'experiments', FLAGS.model_id))
        files = [f for f in files if f[-7:] == '.params']
        if len(files) > 0:
            files = sorted(files, reverse=True)  # put latest model first
            model_name = files[0]
            if model_name == 'valid_best.params':
                model_name = files[1]
            model.load_parameters(
                os.path.join('models', 'captioning', 'experiments', FLAGS.model_id, model_name),
                ctx=ctx)
            print('Loaded model params: {}'.format(
                os.path.join('models', 'captioning', 'experiments', FLAGS.model_id, model_name)))

    # setup the beam search
    translator = BeamSearchTranslator(model=model, beam_size=FLAGS.beam_size,
                                      scorer=nlp.model.BeamSearchScorer(alpha=FLAGS.lp_alpha,
                                                                        K=FLAGS.lp_k),
                                      max_length=FLAGS.tgt_max_len + 100)
    print('Use beam_size={}, alpha={}, K={}'.format(FLAGS.beam_size, FLAGS.lp_alpha, FLAGS.lp_k))

    # run the training
    train_data_loader, val_data_loader, test_data_loader = get_dataloaders(data_train, data_val, data_test)

    # load and evaluate the best model
    if os.path.exists(os.path.join('models', 'captioning', 'experiments', FLAGS.model_id, 'valid_best.params')):
        model.load_parameters(os.path.join('models', 'captioning', 'experiments',
                                           FLAGS.model_id, 'valid_best.params'))

    preds_path = os.path.join('models', 'captioning', 'experiments', FLAGS.model_id, 'best_test_out.txt')
    if not os.path.exists(preds_path):
        _, test_translation_out = evaluate(test_data_loader, model, translator, data_train, ctx)
    else:
        test_translation_out = read_sentences(preds_path)

    str_ = ''
    nlgeval = NLGEval()
    metrics_dict = nlgeval.compute_metrics(
        [[' '.join(sent) for sent in test_tgt_sentences]],
        [' '.join(sent) for sent in test_translation_out])
    for k, v in metrics_dict.items():
        str_ += ', test ' + k + '={:.4f}'.format(float(v))
    print(str_)

    write_sentences(test_translation_out, preds_path)
    sentences=sentences_eval[idx_examples].values,
    labels=labels_eval[idx_examples].values,
    tokenizer=tokenizer,
    max_length_seq=max_length_seq,
    max_length_label=max_length_label,
)
results = generate_questions(
    model=model,
    dataset=metric_dataset,
    tokenizer=tokenizer,
    device=device,
    batch_size=args.batch_size,
    generation_hyperparameters=generation_hyperparameters,
)
references, hypothesis = [], []
for elem in results:
    for i in range(len(elem[0])):
        references.append(elem[1][i])
        hypothesis.append(elem[0][i])

nlgeval = NLGEval()  # loads the models
metrics_dict = nlgeval.compute_metrics([references], hypothesis)
print("Done.")

str_ = ""
with open(args.output_dir + '/logs.txt', "a") as writer:
    for metric in metrics_dict:
        str_ += metric + ": {:.3f}, ".format(metrics_dict[metric])
    str_ += "Retrieval score: {:.3f}".format(
        retrieval_score(hypothesis, references))
    writer.write(str_)
print(str_)
def BLEU(candidate, references):
    precisions = []
    for i in range(4):
        pr, bp = count_ngram(candidate, references, i + 1)
        precisions.append(pr)
    bleu = geometric_mean(precisions) * bp
    return bleu


if __name__ == "__main__":
    if len(sys.argv) == 2:
        candidate, references = fetch_data_from_one(sys.argv[1])
    else:
        candidate, references = fetch_data(sys.argv[1], sys.argv[2])
    print(len(candidate))
    print(len(references[0]))
    # candidate, references = fetch_data('bleu_data/tst.txt', 'bleu_data/ref.txt')
    # bleu1 = BLEU_n(candidate, references, 1)
    # bleu2 = BLEU_n(candidate, references, 2)
    # print(bleu1)
    # print(bleu2)
    # out = open('data/bleu_out.txt', 'a', encoding='utf8')
    # out.write(sys.argv[1] + ' ' + str(bleu1) + ' ' + str(bleu2) + '\n')
    # out.close()
    from nlgeval import NLGEval
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(references, candidate)
    print(metrics_dict)
def test_compute_metrics_oo(self):
    # Create the object in the test so that it can be garbage collected once the test is done.
    n = NLGEval()

    # Individual Metrics
    scores = n.compute_individual_metrics(
        ref=["this is a test", "this is also a test"],
        hyp="this is a good test")
    self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
    self.assertAlmostEqual(0.5108729, scores['Bleu_3'], places=5)
    self.assertAlmostEqual(0.0000903602, scores['Bleu_4'], places=5)
    self.assertAlmostEqual(0.44434387, scores['METEOR'], places=5)
    self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
    self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
    self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
    # 'EmbeddingAverageCosineSimilairty' is the key as (mis)spelled by nlg-eval
    self.assertAlmostEqual(0.980075, scores['EmbeddingAverageCosineSimilairty'], places=5)
    self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5)
    self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5)
    self.assertEqual(11, len(scores))

    scores = n.compute_metrics(
        ref_list=[
            [
                "this is one reference sentence for sentence1",
                "this is a reference sentence for sentence2 which was generated by your model"
            ],
            [
                "this is one more reference sentence for sentence1",
                "this is the second reference sentence for sentence2"
            ],
        ],
        hyp_list=[
            "this is the model generated sentence1 which seems good enough",
            "this is sentence2 which has been generated by your model"
        ])
    self.assertAlmostEqual(0.55, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(0.428174, scores['Bleu_2'], places=5)
    self.assertAlmostEqual(0.284043, scores['Bleu_3'], places=5)
    self.assertAlmostEqual(0.201143, scores['Bleu_4'], places=5)
    self.assertAlmostEqual(0.295797, scores['METEOR'], places=5)
    self.assertAlmostEqual(0.522104, scores['ROUGE_L'], places=5)
    self.assertAlmostEqual(1.242192, scores['CIDEr'], places=5)
    self.assertAlmostEqual(0.626149, scores['SkipThoughtCS'], places=5)
    self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilairty'], places=5)
    self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5)
    self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5)
    self.assertEqual(11, len(scores))

    # Non-ASCII tests.
    scores = n.compute_individual_metrics(
        ref=["Test en français.", "Le test en français."],
        hyp="Le test est en français.")
    self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
    self.assertAlmostEqual(0.0000051, scores['Bleu_3'], places=5)
    self.assertAlmostEqual(0, scores['Bleu_4'], places=5)
    self.assertAlmostEqual(0.48372379050300296, scores['METEOR'], places=5)
    self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
    self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
    self.assertAlmostEqual(0.9192341566085815, scores['SkipThoughtCS'], places=5)
    self.assertAlmostEqual(0.906562, scores['EmbeddingAverageCosineSimilairty'], places=5)
    self.assertAlmostEqual(0.815158, scores['VectorExtremaCosineSimilarity'], places=5)
    self.assertAlmostEqual(0.940959, scores['GreedyMatchingScore'], places=5)
    self.assertEqual(11, len(scores))

    scores = n.compute_individual_metrics(ref=["テスト"], hyp="テスト")
    self.assertAlmostEqual(0.99999999, scores['Bleu_1'], places=5)
    self.assertAlmostEqual(1.0, scores['METEOR'], places=3)
    self.assertAlmostEqual(1.0, scores['ROUGE_L'], places=3)
    self.assertAlmostEqual(0.0, scores['CIDEr'], places=3)
    self.assertAlmostEqual(1.0, scores['SkipThoughtCS'], places=3)
    self.assertAlmostEqual(1.0, scores['GreedyMatchingScore'], places=3)
    self.assertEqual(11, len(scores))
            # model1_answers.append(id2word(outputs[0][i]))
            model_answers_id.append([m for m in outputs[0][i] if m > 2])
            true_answers.append(batch_answer_str[i])
            model_answers.append(id2word(outputs[0][i]).replace("<EOS>", ""))

    # answers_save_path = os.path.join(args.savePath, "answer_save_beam2_23.json")
    # with open(answers_save_path, "w", encoding='UTF-8') as file:
    #     data = {"true_answers": true_answers,
    #             "model1_answers": model_answers}
    #     json.dump(data, file, ensure_ascii=False)
    #     print("save in ", answers_save_path)

    print("num_batch:", bbb, "beam_wide=", args.num_BeamSearch)
    model_n_b_beam2_metrics_dict = nlgeval.compute_metrics([true_answers], model_answers)
    print("model_n_b_beam2:\n", model_n_b_beam2_metrics_dict)
    print("Bleu_total:", np.mean(Bleu_total))
    print("Bleu_total_1:", np.mean(Bleu_total_1))
    print("Bleu_total_2:", np.mean(Bleu_total_2))
    print("Bleu_total_3:", np.mean(Bleu_total_3))
    print("Bleu_total_4:", np.mean(Bleu_total_4))
    Bleu_total_all.append(np.mean(Bleu_total))
    Bleu_total_1_all.append(np.mean(Bleu_total_1))
    Bleu_total_2_all.append(np.mean(Bleu_total_2))
    Bleu_total_3_all.append(np.mean(Bleu_total_3))
    Bleu_total_4_all.append(np.mean(Bleu_total_4))
    print("F1_score:", np.mean(F1_score))
    F1_score_all.append(np.mean(F1_score))
    # Perform nearest neighbour search using dot product.
    # Given that the vectors are normalized, this is equivalent to using cosine similarity.
    D, I = index.search(encoder_out.to("cpu").detach().numpy(), k)

    for caption in caps:
        encodedCaption = [
            w for w in caption.tolist()
            if w not in {word2idx['<sos>'], word2idx['<eoc>'], word2idx['<pad>']}
        ]
        references[0].append(decodeCaption(encodedCaption, idx2word))

    for i in range(batch_size):
        hypotheses.append(train_captions[I[i][0]])

    return references, hypotheses


generate_train_images_matrix()
references, hypotheses = calculate_NN()
metrics_dict = nlgeval.compute_metrics(references, hypotheses)
print(metrics_dict)

with open("1NNRefs.txt", 'w+') as file:
    for reference in references[0]:
        file.write(reference.strip() + '\n')

with open("1NNPreds.txt", 'w+') as file:
    for hypothesis in hypotheses:
        file.write(hypothesis.strip() + '\n')