Example #1
import pandas as pd

from nlgeval import NLGEval


class EvaluateNL:
    def __init__(self):
        self.eval = NLGEval(no_skipthoughts=True,
                            no_glove=True,
                            metrics_to_omit=[
                                'EmbeddingAverageCosineSimilairty',
                                'VectorExtremaCosineSimilarity',
                                'GreedyMatchingScore'
                            ])

    def compute(self, refs, hyps):
        data = []
        for i, ref in enumerate(refs):
            ref = ref.replace('\n', '')
            hyp = hyps[i].replace('\n', '')

            if not ref:
                continue

            scores = self.eval.compute_individual_metrics(ref=[ref], hyp=hyp)
            scores = sorted(scores.items())
            self._metrics = [s[0] for s in scores]

            #data.append([ref, hyp])
            data.append([
                ref, hyp, *[
                    str(float('%0.6f' % (s[1]))).replace('.', ',')
                    for s in scores
                ]
            ])

        return pd.DataFrame(data,
                            columns=['Reference', 'Hypothesis', *self._metrics])
Example #2
from nlgeval import NLGEval


def get_all_nlgeval_metrics(complex_sentence, simple_sentence):
    if 'NLGEVAL' not in globals():
        global NLGEVAL
        print('Loading NLGEval models...')
        # Set no_skipthoughts / no_glove to False if you want the SkipThought or GloVe metrics
        NLGEVAL = NLGEval(no_skipthoughts=True, no_glove=True)
        print('Done.')
    return NLGEVAL.compute_individual_metrics([complex_sentence], simple_sentence)
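
Examples #1 and #2 rely on the same small API surface: construct NLGEval once (optionally disabling the slow embedding-based metrics), then call compute_individual_metrics for one hypothesis against a list of reference strings, or compute_metrics for a whole corpus. A minimal sketch of that usage, assuming nlgeval is installed and its data files have been set up:

from nlgeval import NLGEval

# Loading is slow, so build the evaluator once and reuse it.
# With no_skipthoughts and no_glove set to True only the non-embedding
# metrics (Bleu_1..Bleu_4, METEOR, ROUGE_L, CIDEr) are computed.
nlgeval = NLGEval(no_skipthoughts=True, no_glove=True)

# One hypothesis against a list of reference strings.
scores = nlgeval.compute_individual_metrics(
    ref=["this is a test", "this is also a test"],
    hyp="this is a good test")
print(scores["Bleu_4"], scores["ROUGE_L"])

# Corpus level: each inner list of ref_list holds one reference per
# hypothesis, aligned with hyp_list by index.
scores = nlgeval.compute_metrics(
    ref_list=[["a reference for the first hypothesis",
               "a reference for the second hypothesis"]],
    hyp_list=["the first hypothesis", "the second hypothesis"])
print(scores["Bleu_4"])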
Example #3
    def test_compute_metrics_empty(self):
        n = NLGEval()

        # One of the refs is empty
        scores = n.compute_individual_metrics(ref=["this is a test",
                                                   ""],
                                              hyp="this is a good test")
        self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
        self.assertAlmostEqual(0.5108729, scores['Bleu_3'], places=5)
        self.assertAlmostEqual(0.0000903602, scores['Bleu_4'], places=5)
        self.assertAlmostEqual(0.44434387, scores['METEOR'], places=5)
        self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
        self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
        self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
        self.assertAlmostEqual(0.980075, scores['EmbeddingAverageCosineSimilarity'], places=5)
        self.assertEqual(scores['EmbeddingAverageCosineSimilarity'], scores['EmbeddingAverageCosineSimilairty'])
        self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5)
        self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5)
        self.assertEqual(12, len(scores))

        # Empty hyp
        scores = n.compute_individual_metrics(ref=["this is a good test"],
                                              hyp="")
        self.assertAlmostEqual(0, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(0, scores['Bleu_2'], places=5)
        self.assertAlmostEqual(0, scores['Bleu_3'], places=5)
        self.assertAlmostEqual(0, scores['Bleu_4'], places=5)
        self.assertAlmostEqual(0, scores['METEOR'], places=5)
        self.assertAlmostEqual(0, scores['ROUGE_L'], places=5)
        self.assertAlmostEqual(0, scores['CIDEr'], places=5)
        self.assertAlmostEqual(0, scores['SkipThoughtCS'], places=5)
        self.assertAlmostEqual(0, scores['EmbeddingAverageCosineSimilarity'], places=5)
        self.assertEqual(scores['EmbeddingAverageCosineSimilarity'], scores['EmbeddingAverageCosineSimilairty'])
        self.assertAlmostEqual(0, scores['VectorExtremaCosineSimilarity'], places=5)
        self.assertAlmostEqual(0, scores['GreedyMatchingScore'], places=5)
        self.assertEqual(12, len(scores))
Example #4
    def test_compute_metrics_omit(self):
        n = NLGEval(metrics_to_omit=['Bleu_3', 'METEOR', 'EmbeddingAverageCosineSimilarity'])

        # Individual Metrics
        scores = n.compute_individual_metrics(ref=["this is a test",
                                                   "this is also a test"],
                                              hyp="this is a good test")
        self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
        self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
        self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
        self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
        self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5)
        self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5)
        self.assertEqual(7, len(scores))
Example #5
def main(args):
    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True)

    samples = {}
    with open(args.gen_file) as f:
        for line in tqdm(f):
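            # Each line is "hypothesis<TAB>references", with multiple references joined by '*#'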
            hypo, refs = line.rstrip().split('\t')
            metrics_dict = nlgeval.compute_individual_metrics(
                refs.split('*#'), hypo)
            samples[(hypo, refs)] = metrics_dict['Bleu_4']

    for hypo, refs in sorted(samples.keys(), key=samples.__getitem__)[:args.num_samples]:
        print('BLEU:', samples[(hypo, refs)])
        print('H:', hypo)
        for r in refs.split('*#'):
            print('R:', r)
        print('---')
Example #6
def test_oo_api():
    with open("examples/hyp.txt") as f:
        hyp = f.readlines()
        hyp = [x.strip() for x in hyp]
    with open("examples/ref1.txt") as f:
        ref1 = f.readlines()
        ref1 = [x.strip() for x in ref1]
    with open("examples/ref2.txt") as f:
        ref2 = f.readlines()
        ref2 = [x.strip() for x in ref2]

    nlge = NLGEval()

    res = nlge.compute_individual_metrics([ref1[0]] + [ref2[0]], hyp[0])
    res = nlge.compute_individual_metrics([ref1[1]] + [ref2[1]], hyp[1])

    hyp_list = hyp
    ref_list = [ref1, ref2]
    res = nlge.compute_metrics(ref_list, hyp_list)
Example #7
def bleu(model_name, mode, num_samples):
    sampled_file_dst = []
    for i in range(num_samples):
        sampled_file_dst.append(
            f'logs/{model_name}/samples/sampled_{mode}_{i}.txt')

    source_file_dst = f'logs/{model_name}/samples/source_{mode}.txt'

    sampled = [
        list(np.loadtxt(sampled_file_dst[i], dtype='U', delimiter='\n'))
        for i in range(num_samples)
    ]
    source = list(np.loadtxt(source_file_dst, dtype='U', delimiter='\n'))

    nlgeval = NLGEval(
        metrics_to_omit=['METEOR', 'ROUGE_L', 'CIDEr', 'SkipThoughtCS'])
    best_bleu = []
    print(len(source), len(sampled),
          [len(sampled[i]) for i in range(len(sampled))])
    for i in range(len(source)):
        curr_best = ('', 0.0)
        for j in range(num_samples):
            score = nlgeval.compute_individual_metrics([source[i]],
                                                       sampled[j][i])['Bleu_4']
            if score > curr_best[1]:
                curr_best = (sampled[j][i], score)
        if i % 100 == 0:
            print(f'Sentence pair {i}:')
            print('source : ', source[i])
            print('sampled : ', curr_best[0])
            print('score : ', curr_best[1])
            print('\n')
        best_bleu.append(curr_best[0])
    np.savetxt(f'logs/{model_name}/samples/best_bleu_{mode}',
               np.array(best_bleu),
               delimiter='\n',
               fmt='%s')
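
Example #7 (and several of the later examples) follow the same best-of-N pattern: score every sampled candidate against the reference and keep the argmax. A small helper capturing just that selection step, assuming an already-constructed NLGEval instance; the function name is illustrative:

def pick_best_by_metric(nlgeval, reference, candidates, metric="Bleu_4"):
    """Return the candidate with the highest score for `metric` against `reference`."""
    best_text, best_score = "", float("-inf")
    for cand in candidates:
        score = nlgeval.compute_individual_metrics([reference], cand)[metric]
        if score > best_score:
            best_text, best_score = cand, score
    return best_text, best_score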
Example #8
    countGenUsed = 0
    countRefUsed = 0
    for id in srcSen:
        countRefUsed += 1
        if id not in gen:
            continue
        hypo = gen[id]
        r = ref[id]
        countGenUsed += len(hypo)

        if verbose:
            print('Computing sentence ' + str(countRefUsed))

        for h in hypo:
            metrics_dict = nlgeval.compute_individual_metrics(r, h)
            for m in metrics:
                results[m] += metrics_dict[m]
                resultsSolo[m].append(metrics_dict[m])
        if verbose:
            print('Parsed questions: ' + str(countGenUsed))


    if verbose:
        print('Writing files to ' + name + '...')

    for m in metrics:
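        # One file per metric: the metric name and its score averaged over all generated questions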
        with open(name+'/'+m, 'w') as f:
            f.write(m+'\t\t\t\n')
            f.write(str(results[m]/countGenUsed)+'\t\t\t\n')
            f.write(m+'\t\t\t\n')
Example #9
def normalize(target_folder, remove_sim, too_sim_threshold, result_folder):
    data_dict = defaultdict(lambda: defaultdict(dict))
    output_dict = defaultdict(list)
    FILE_TEST = [i for i in os.listdir(target_folder) if ".jsonl" in i]
    # {'answers': ['a'], 'options': [['a','b']], 'questions': ['q1'], 'article': "", 'id': 'middle2572.txt'}
    print(FILE_TEST)
    print("====size====")
    # count total size of each prediction
    for FILE in FILE_TEST:
        with open(os.path.join(target_folder, FILE), 'r',
                  encoding='utf8') as jsonlfile:
            for jlines in jsonlfile.readlines():
                jfile = json.loads(jlines)
                for q in jfile['questions']:
                    dict_id = jfile['article'].strip() + q.strip()
                    dict_id = dict_id.replace(" ", "").lower()
                    data_dict[dict_id][FILE] = jfile

    for _, testfiles in data_dict.items():
        if len(testfiles) == len(FILE_TEST):
            for fname, fcontent in testfiles.items():
                output_dict[fname].append(fcontent)
    print("Total", len(data_dict), len(output_dict))

    print("====similarity====")
    from nlgeval import NLGEval

    n = NLGEval(metrics_to_omit=[
        'METEOR', 'EmbeddingAverageCosineSimilairty', 'SkipThoughtCS',
        'VectorExtremaCosineSimilarity', 'GreedyMatchingScore', 'CIDEr'
    ])
    for task, datas in output_dict.items():
        overall_dict = defaultdict(list)
        toosim_dict = defaultdict(list)
        overall_result = dict()
        toosim_result = dict()
        for v in datas:
            if len(v['options'][0]) <= 4:
                # {'Bleu_1': 0.19999999996000023, 'Bleu_2': 7.071067810274489e-09, 'Bleu_3': 2.5543647739782087e-11, 'Bleu_4': 1.699044244302013e-12, 'METEOR': 0.0547945205479452, 'ROUGE_L': 0.26180257510729615, 'CIDEr': 0.0, 'SkipThoughtCS': 0.41264296, 'EmbeddingAverageCosineSimilairty': 0.804388, 'VectorExtremaCosineSimilarity': 0.650115, 'GreedyMatchingScore': 0.655746}
                example_dict = defaultdict(list)
                if "two" in target_folder:  # answer with one option - check answer copying problem
                    opt = v['options'][0]
                    # [opt[1]] - ground truth/answer
                    metrics_dict = n.compute_individual_metrics([opt[1]],
                                                                opt[0])
                    for mk, mv in metrics_dict.items():
                        if np.max(mv) > too_sim_threshold:
                            toosim_dict[mk].append(1)
                            if remove_sim:
                                if v in datas:
                                    del output_dict[task][datas.index(v)]
                                break
                        overall_dict[mk].append(mv)
                else:
                    for i in set(it.combinations(v['options'][0], 2)):
                        if len(i[0]) == 0 or len(i[1]) == 0:
                            continue
                        metrics_dict = n.compute_individual_metrics([i[0]],
                                                                    i[1])
                        for mk, mv in metrics_dict.items():
                            example_dict[mk].append(mv)

                    for mk, mv in example_dict.items():
                        if np.max(mv) > too_sim_threshold:
                            toosim_dict[mk].append(1)
                            if remove_sim:
                                if v in datas:
                                    del output_dict[task][datas.index(v)]
                                break
                        overall_dict[mk].append(np.mean(mv))

        for mk, mv in overall_dict.items():
            overall_result[mk] = np.mean(mv)
        for mk, mv in toosim_dict.items():
            toosim_result[mk] = np.sum(mv)
        print(task, overall_result, "\nToo Sim: ", toosim_result)

    data_dict = defaultdict(lambda: defaultdict(dict))
    for FILE, datas in output_dict.items():
        for data in datas:
            for q in data['questions']:
                dict_id = data['article'].strip() + q.strip()
                dict_id = dict_id.replace(" ", "").lower()
                data_dict[dict_id][FILE] = data

    normalized_dict = defaultdict(list)
    print("Total", len(data_dict))
    for _, testfiles in data_dict.items():
        if len(testfiles) == len(FILE_TEST):
            for fname, fcontent in testfiles.items():
                normalized_dict[fname].append(fcontent)

    print("====output====")
    for f, clist in normalized_dict.items():
        print("Normalized", f, len(clist))
        with jsonlines.open(os.path.join(result_folder, f),
                            mode='w') as writer:
            writer.write_all(clist)
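
The pairwise check in Example #9 boils down to scoring every pair of answer options against each other and flagging the item if any pair scores above too_sim_threshold. A minimal sketch of that idea, assuming an already-constructed NLGEval instance and reduced to Bleu_4 only (the example above loops over all returned metrics):

from itertools import combinations

def max_pairwise_bleu(nlgeval, options):
    """Highest Bleu_4 between any two non-empty options; a rough 'too similar' signal."""
    scores = [
        nlgeval.compute_individual_metrics([a], b)["Bleu_4"]
        for a, b in combinations(options, 2)
        if a and b
    ]
    return max(scores) if scores else 0.0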
Example #10
def main(args):
    good_hypos = load_texts(args.good_gen, False)
    bad_hypos = load_texts(args.bad_gen, False)
    refs = load_texts(args.ref_file, True)

    with open(args.dataset_file) as f:
        data_in = json.load(f)

    if args.webnlg:
        triples = []
        for entry in tqdm(data_in['entries']):
            for e in entry.values():
                triples.append(generate_triples_webnlg(
                    e["modifiedtripleset"]))
    else:
        triples = []
        for article in tqdm(data_in):
            triples.append(generate_triples_agenda(article["relations"]))
            if not args.ignore_isolated:
                for entity in article["entities"]:
                    triples[-1].append((entity, "", entity))

    nlgeval = NLGEval(
        metrics_to_omit=[
            'METEOR',
            'ROUGE_L',
            'CIDEr'
        ], no_skipthoughts=True, no_glove=True
    )

    interesting_samples = []
    for i, (gh, bh, ref_list, tripset) in tqdm(list(
            enumerate(zip(good_hypos, bad_hypos, refs, triples)))):
        metrics_dict = nlgeval.compute_individual_metrics(ref_list, gh)
        good_bleu = metrics_dict['Bleu_4']
        metrics_dict = nlgeval.compute_individual_metrics(ref_list, bh)
        bad_bleu = metrics_dict['Bleu_4']

        graph = generate_graph(tripset)
        num_ccs = nx.number_connected_components(graph)
        num_nodes = len(graph)
        mean_cc_size = num_nodes / num_ccs

        diameters = []
        for subgraph in [graph.subgraph(cc).copy() for cc in nx.connected_components(graph)]:
            diameters.append(nx.diameter(subgraph))
        max_diameter = max(diameters)

        good_rep_rate = compute_repetition_rate(gh)[-1]
        bad_rep_rate = compute_repetition_rate(bh)[-1]

        if (good_bleu > bad_bleu) and\
           (args.min_mean_cc_size <= mean_cc_size < args.max_mean_cc_size) and\
           (args.min_max_diameter <= max_diameter < args.max_max_diameter) and\
           (good_rep_rate < bad_rep_rate):
            interesting_samples.append(
                (i, good_bleu, bad_bleu, good_rep_rate, bad_rep_rate,
                 mean_cc_size, max_diameter))

    for i, gb, bb, grep, brep, mean_cc, max_dia in sorted(
            interesting_samples, key=lambda x: x[3])[:args.num_samples]:
        print('Sample #', i)
        print('Good BLEU:', gb, 'Bad BLEU:', bb, 'Difference:', gb-bb)
        print('Good RepRate:', grep, 'Bad RepRate:', brep)
        print('Mean CC size:', mean_cc, 'Max diameter:', max_dia)
        print('Good Hypo:', good_hypos[i])
        print('Bad Hypo:', bad_hypos[i])
        for r in refs[i]:
            print('R:', r)
        print('---')
Example #11
class Metrics_Calculator(object):

    def __init__(self,hparams,glove_comparer):

      
        super(Metrics_Calculator, self).__init__()
        self.nlg_eval = NLGEval(metrics_to_omit=['EmbeddingAverageCosineSimilairty', 'EmbeddingAverageCosineSimilarity','GreedyMatchingScore','SkipThoughtCS','VectorExtremaCosineSimilarity'])
        self.list_dict_track  = {"data":[]}
        self.hparams = hparams
        self.glove_comparer = glove_comparer

        
    def build_json_results(self,
                           context,
                           generated_question_list,
                           target_question_list,
                           row_mean_metrics):

        """
        Cria json para cada linha que será salvo para monitorar as métricas em self.list_dict_track
        """
        new_info = {}
        new_info["context"] =context
        new_info["generated_question_list"] =generated_question_list
        new_info["target_question_list"] =target_question_list
        new_info["row_mean_metrics"] =row_mean_metrics

        
        return new_info

    def track_metrics_row(self, original_target, gen_target_options_list):
        """
        Computes the metrics for each question-context pair.
        """
        bleu_1_list = []
        bleu_2_list = []
        bleu_3_list = []
        bleu_4_list = []
        CIDEr_list = []
        ROUGE_L_list = []
        cossine_similarity_list = []

        for gen_target_option in gen_target_options_list:
          
          metrics_dict = self.nlg_eval.compute_individual_metrics(ref=[original_target],hyp=gen_target_option)#ref:List[str] , hyp:str
          bleu_1_list.append(metrics_dict['Bleu_1'])
          bleu_2_list.append(metrics_dict['Bleu_2'])
          bleu_3_list.append(metrics_dict['Bleu_3'])
          bleu_4_list.append(metrics_dict['Bleu_4'])
          CIDEr_list.append(metrics_dict['CIDEr'])
          ROUGE_L_list.append(metrics_dict['ROUGE_L'])
          cs = self.glove_comparer.compare_sentences_with_cossine_similarity(original_target,gen_target_option)
          cossine_similarity_list.append(cs)
          
          

        row_metrics_dict = {"Bleu_1":np.mean(bleu_1_list),
                             "Bleu_2":np.mean(bleu_2_list),
                             "Bleu_3":np.mean(bleu_3_list),
                             "Bleu_4":np.mean(bleu_4_list),
                             "CIDEr":np.mean(CIDEr_list),
                             "ROUGE_L":np.mean(ROUGE_L_list),
                             "Glove_Cossine_Similarity":np.mean(cossine_similarity_list)}

        return row_metrics_dict


    
    def generate_sentences_and_track_metrics_batch(self, logits, original_targets_batch, original_sources_batch, save_track_dict=False):
        """
        Computes the metrics for the whole batch.
        """
        batch_bleu_1_list = []
        batch_bleu_2_list = []
        batch_bleu_3_list = []
        batch_bleu_4_list = []
        batch_CIDEr_list = []
        batch_ROUGE_L_list = []
        batch_Glove_Cossine_Similarity_list = []


        # batch
        for i, (original_target, original_source) in enumerate(zip(original_targets_batch, original_sources_batch)):
          # row
          relevant_logits = logits[i*self.hparams.num_gen_sentences:self.hparams.num_gen_sentences+i*self.hparams.num_gen_sentences]
          gen_target_options_list = [self.hparams.tokenizer.decode(l, skip_special_tokens=True) for l in relevant_logits]
          row_metrics_dict = self.track_metrics_row(original_target=original_target,gen_target_options_list=gen_target_options_list)

          if save_track_dict:
            self.list_dict_track["data"].append(self.build_json_results(context=original_source,
                                  generated_question_list=gen_target_options_list,
                                  target_question_list=original_target,
                                  row_mean_metrics = row_metrics_dict))
          
          batch_bleu_1_list.append(row_metrics_dict['Bleu_1'])
          batch_bleu_2_list.append(row_metrics_dict['Bleu_2'])
          batch_bleu_3_list.append(row_metrics_dict['Bleu_3'])
          batch_bleu_4_list.append(row_metrics_dict['Bleu_4'])
          batch_CIDEr_list.append(row_metrics_dict['CIDEr'])
          batch_ROUGE_L_list.append(row_metrics_dict['ROUGE_L'])
          batch_Glove_Cossine_Similarity_list.append(row_metrics_dict['Glove_Cossine_Similarity'])


        batch_metrics_dict = {"Batch_Bleu_1":np.mean(batch_bleu_1_list),
                              "Batch_Bleu_2":np.mean(batch_bleu_2_list),
                              "Batch_Bleu_3":np.mean(batch_bleu_3_list),
                              "Batch_Bleu_4":np.mean(batch_bleu_4_list),
                              "Batch_CIDEr":np.mean(batch_CIDEr_list),
                              "Batch_ROUGE_L":np.mean(batch_ROUGE_L_list),
                              "Batch_Glove_Cossine_Similarity":np.mean(batch_Glove_Cossine_Similarity_list)
                             }

        return batch_metrics_dict
Example #12
def gen_compare(cider_in, table_in, summary_ours_in, summary_with_unk,
                summary_gold_in, file_out):
    '''
    gen file for error case compare
    '''

    with open(table_in) as f:
        table = f.readlines()

    with open(summary_ours_in) as f:
        res_ours = f.readlines()

    with open(summary_with_unk) as f:
        res_ours_unk = f.readlines()

    with open(summary_gold_in) as f:
        res_gold = f.readlines()

    with open(cider_in) as f:
        res_cider = json.load(f)
        score_cider = res_cider["CIDEr"]
        score_cider_d = res_cider["CIDErD"]

        print(len(score_cider_d))

    nlgeval = NLGEval()

    out = open(file_out, "w")
    i = 0
    num_write = 0
    avg_len_right = 0.0
    avg_len_wrong = 0.0

    for this_cider, this_box, this_ours, this_unk, this_gold in zip(
            score_cider_d, table, res_ours, res_ours_unk, res_gold):

        references = [this_gold.strip().split()]
        hypothesis = this_ours.strip().split()
        this_bleu = sentence_bleu(references, hypothesis)
        this_cider = float(this_cider)
        metrics_dict = nlgeval.compute_individual_metrics([this_gold.strip()],
                                                          this_ours.strip())
        this_meteor = metrics_dict["meteor"]

        i += 1

        #if this_bleu < 0.1:
        #if this_bleu > 0.1 and this_bleu < 0.2:
        #if this_bleu > 0.2 and this_bleu < 0.3:
        if this_cider < 1.0:

            avg_len_wrong += len(this_box.strip().split("\t"))
            num_write += 1
            out.write("########## Test " + str(i) + " ##########\n")

            for each_token in this_box.strip().split("\t"):
                out.write(each_token + "\n")

            out.write("########## Gold ##########\n")
            out.write(this_gold.strip() + "\n")
            out.write("########## Ours ##########\n")
            out.write(this_ours.strip() + "\n")
            out.write("########## Ours with unk ##########\n")
            out.write(this_unk.strip() + "\n")
            out.write("########## bleu ##########\n")
            out.write(str(this_bleu) + "\n")
            out.write("########## cider-d ##########\n")
            out.write(str(this_cider) + "\n")
            out.write("########## meteor ##########\n")
            out.write(str(this_meteor) + "\n")
            out.write("\n")

        else:
            avg_len_right += len(this_box.strip().split("\t"))

    out.close()

    print("All: ", i)
    print("Write: ", num_write)

    print("avg_len_wrong: ", float(avg_len_wrong) / num_write)
    print("avg_len_right: ", float(avg_len_right) / (i - num_write))
Example #13
def finetune_model(num_epochs=20,
                   k_fold=10,
                   predict_test=True,
                   calculate_scores=False):
    model = seq2seq(mode='finetune')
    saver = tf.train.Saver()
    tf.logging.set_verbosity(logging.INFO)
    config = tf.ConfigProto()

    config.gpu_options.allow_growth = True

    #Reading data from csv file
    data = pd.read_csv(model.active_passive_csv_file, usecols=[2, 3])
    print(data.head())
    data_clean = data.applymap(clean_str)

    # nlgeval is used unconditionally below (per-beam and corpus metrics), so create it here
    nlgeval = NLGEval(no_skipthoughts=True, no_glove=False)
    #bleu_1_scores, bleu_2_scores, bleu_3_scores, bleu_4_scores = [],[],[],[]
    bleu_1_scores_top_beam, bleu_2_scores_top_beam, bleu_3_scores_top_beam, bleu_4_scores_top_beam = [],[],[],[]
    meteor_scores_top_beam = []
    bleu_4_best_of_10_result_scores, meteor_best_of_10_result_scores = [], []
    rouge_scores_top_beam, rouge_scores_best_of_10_result_scores = [], []
    greedy_embedding_top_beam, greedy_embedding_best_of_10_result_scores = [],[]
    ter_top_beam, ter_best_of_10_result_scores_with_best_bleu, ter_best_of_10_result_scores_with_best_meteor = [],[],[]

    for i in range(k_fold):
        iter_no = 0
        train, test = train_test_split(data_clean, test_size=0.2)
        print('\nTrain head: ', train.head())
        print('\n\nTest head: ', test.head())
        next_batch = model.train_batchify_pandas(train)

        print(
            '\n\nRetraining again for #{} validations----------------------------------\n\n'
            .format(i))
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.get_checkpoint_state(
                os.path.dirname(os.path.join(model.model_dir,
                                             model.model_dir)))
            if ckpt and ckpt.model_checkpoint_path:
                print("Found checkpoints. Restoring variables..")
                saver.restore(sess, ckpt.model_checkpoint_path)
                print("Checkpoints restored...")

            for epoch in range(num_epochs):
                train_cost = 0
                train_op = model.get_train_op(epoch)
                print('Training op --- ', train_op)
                for _ in range(model.samples // model.batch_size):
                    iter_no = iter_no + 1
                    batch = next_batch()
                    feed = batch
                    feed[model.iter_no] = iter_no
                    v = sess.run([train_op, model.loss, model.global_step],
                                 batch)
                    train_cost += v[1]
                    print('Global Step: ', v[2], ', Step Loss: ', v[1])
    #                print(v[3][-1][-1])

                train_cost /= model.samples / model.batch_size
                print("Epoch", (epoch + 1), train_cost)

            print("Saving model at " + str(v[2]) + " iterations...")
            saver.save(sess,
                       os.path.join(model.new_dir, 'model.ckpt'),
                       global_step=model.global_step)
            print("Model saved..")

            #Running for test split and compute BLEU scores
            test_next_batch = model.train_batchify_pandas(test)
            hyp_top_beam_result, hyp_best_of_10_beam_result, ref = [], [], []
            hyp_best_of_10_meteor_result, hyp_best_of_10_rouge_result = [], []
            hyp_best_of_10_greedy_emb_result = []

            #        hyp, ref = [],[]
            print("Predictions of test batch: \n")
            for _ in range(len(test) // model.batch_size):
                test_batch = test_next_batch()
                temp_ref = [
                    model.to_str(string).replace('</S>',
                                                 '').replace('<S>',
                                                             '').strip()
                    for string in test_batch[model.outputs]
                ]
                ref.extend(temp_ref)

                v = sess.run([model.global_step, model.predictions],
                             test_batch)
                for i, pred in enumerate(v[1]['preds']):
                    max_bleu = 0.0
                    max_meteor = 0.0
                    max_rouge = 0.0
                    max_greedy_emb = 0.0

                    best_bleu_str = ''
                    best_meteor_str = ''
                    best_rouge_str = ''
                    best_greedy_emb_str = ''

                    for j in range(model.beam_width):
                        predicted = model.to_str(pred[:, j]).replace(
                            '</S>', '').replace('<S>', '').strip()
                        if j == 0:
                            hyp_top_beam_result.append(predicted)

                        metrics_dict = nlgeval.compute_individual_metrics(
                            [temp_ref[i]], predicted)
                        if metrics_dict['Bleu_4'] >= max_bleu:
                            max_bleu = metrics_dict['Bleu_4']
                            best_bleu_str = predicted

                        if metrics_dict['METEOR'] >= max_meteor:
                            max_meteor = metrics_dict['METEOR']
                            best_meteor_str = predicted

                        if metrics_dict['ROUGE_L'] >= max_rouge:
                            max_rouge = metrics_dict['ROUGE_L']
                            best_rouge_str = predicted

                        if metrics_dict[
                                'GreedyMatchingScore'] >= max_greedy_emb:
                            max_greedy_emb = metrics_dict[
                                'GreedyMatchingScore']
                            best_greedy_emb_str = predicted

                    hyp_best_of_10_beam_result.append(best_bleu_str)
                    hyp_best_of_10_meteor_result.append(best_meteor_str)
                    hyp_best_of_10_rouge_result.append(best_rouge_str)
                    hyp_best_of_10_greedy_emb_result.append(
                        best_greedy_emb_str)
                    """
                    predicted = model.to_str(seq).replace('</S>','').replace('<S>','').strip()
    #                print(i,predicted)
                    hyp.append(predicted)
                    """

    #        print(ref,hyp)
            print('Ref len: ', len(ref), ', Hyp len: ',
                  len(hyp_top_beam_result))
            metrics_dict_top_beam_result = nlgeval.compute_metrics(
                [ref], hyp_top_beam_result)
            metrics_dict_best_of_5_beam = nlgeval.compute_metrics(
                [ref], hyp_best_of_10_beam_result)
            metrics_dict_best_of_5_meteor = nlgeval.compute_metrics(
                [ref], hyp_best_of_10_meteor_result)
            metrics_dict_best_of_5_rouge = nlgeval.compute_metrics(
                [ref], hyp_best_of_10_rouge_result)
            metrics_dict_best_of_5_greedy_emb = nlgeval.compute_metrics(
                [ref], hyp_best_of_10_greedy_emb_result)

            ter_top = calculate_ter(ref, hyp_top_beam_result)
            ter_bleu = calculate_ter(ref, hyp_best_of_10_beam_result)
            ter_meteor = calculate_ter(ref, hyp_best_of_10_meteor_result)

            print("BLEU 1-4 : ", metrics_dict_top_beam_result['Bleu_1'],
                  metrics_dict_top_beam_result['Bleu_2'],
                  metrics_dict_top_beam_result['Bleu_3'],
                  metrics_dict_top_beam_result['Bleu_4'])

            print("METEOR, ROUGE_L, GreedyEmbedding : ",
                  metrics_dict_top_beam_result['METEOR'],
                  metrics_dict_top_beam_result['ROUGE_L'],
                  metrics_dict_top_beam_result['GreedyMatchingScore'])

            print(
                "CIDER, EmbeddingAverageCosineSimilaity, VectorExtremaCosineSimilarity : ",
                metrics_dict_top_beam_result['CIDEr'],
                metrics_dict_top_beam_result[
                    'EmbeddingAverageCosineSimilairty'],
                metrics_dict_top_beam_result['VectorExtremaCosineSimilarity'])
            print("TER top beam, TER Bleu, TER METEOR", ter_top, ter_bleu,
                  ter_meteor)

            #        metrics_dict = nlgeval.compute_metrics([ref],hyp)
            # Top beam search results
            bleu_1_scores_top_beam.append(
                metrics_dict_top_beam_result['Bleu_1'])
            bleu_2_scores_top_beam.append(
                metrics_dict_top_beam_result['Bleu_2'])
            bleu_3_scores_top_beam.append(
                metrics_dict_top_beam_result['Bleu_3'])
            bleu_4_scores_top_beam.append(
                metrics_dict_top_beam_result['Bleu_4'])
            meteor_scores_top_beam.append(
                metrics_dict_top_beam_result['METEOR'])
            rouge_scores_top_beam.append(
                metrics_dict_top_beam_result['ROUGE_L'])
            greedy_embedding_top_beam.append(
                metrics_dict_top_beam_result['GreedyMatchingScore'])
            ter_top_beam.append(ter_top)

            # Best of 5 search results
            ter_best_of_10_result_scores_with_best_bleu.append(ter_bleu)
            ter_best_of_10_result_scores_with_best_meteor.append(ter_meteor)
            bleu_4_best_of_10_result_scores.append(
                metrics_dict_best_of_5_beam['Bleu_4'])
            meteor_best_of_10_result_scores.append(
                metrics_dict_best_of_5_meteor['METEOR'])
            rouge_scores_best_of_10_result_scores.append(
                metrics_dict_best_of_5_rouge['ROUGE_L'])
            greedy_embedding_best_of_10_result_scores.append(
                metrics_dict_best_of_5_greedy_emb['GreedyMatchingScore'])

    print('Bleu 1 scores top: ', bleu_1_scores_top_beam)
    print('Bleu 2 scores top: ', bleu_2_scores_top_beam)
    print('Bleu 3 scores top: ', bleu_3_scores_top_beam)
    print('Bleu 4 scores top: ', bleu_4_scores_top_beam)
    print('METEOR scores top: ', meteor_scores_top_beam)
    print('ROUGE scores top: ', rouge_scores_top_beam)
    print('Greedy Embedding top: ', greedy_embedding_top_beam)
    print('Translation Edit Rate top: ', ter_top_beam)
    print('\n\nBest beam results:---------\n')
    print('Bleu 4 scores best beam : ', bleu_4_best_of_10_result_scores)
    print('METEOR scores best beam : ', meteor_best_of_10_result_scores)
    print('ROUGE scores best beam: ', rouge_scores_best_of_10_result_scores)
    print('Greedy Embedding best beam: ',
          greedy_embedding_best_of_10_result_scores)
    print('Translation Edit Rate Bleu: ',
          ter_best_of_10_result_scores_with_best_bleu)
    print('Translation Edit Rate METEOR: ',
          ter_best_of_10_result_scores_with_best_meteor)
Example #14
File: stat.py Project: voidful/DG
print(len(data_dict_beam))
print(len(data_dict_ori))
print(len(data_dict_random))

n = NLGEval(
    metrics_to_omit=['METEOR', 'EmbeddingAverageCosineSimilairty', 'SkipThoughtCS', 'VectorExtremaCosineSimilarity',
                     'GreedyMatchingScore'])
# n = NLGEval()

print("====similarity between gold and generated distractor====")
overall_dict_jaccard = defaultdict(list)
overall_result_jaccard = dict()
for k, v in data_dict_jaccard.items():
    example_dict = defaultdict(list)
    for i in v['options']:
        metrics_dict = n.compute_individual_metrics(v["gold_distractor"], i)
        for mk, mv in metrics_dict.items():
            example_dict[mk].append(mv)
    for mk, mv in example_dict.items():
        # if not 0 in mv and not any(n < 0 for n in mv):
        #     overall_dict_jaccard[mk].append(statistics.harmonic_mean(mv))
        # else:
        #     overall_dict_jaccard[mk].append(np.mean(mv))
        overall_dict_jaccard[mk].append(np.mean(mv))
for mk, mv in overall_dict_jaccard.items():
    overall_result_jaccard[mk] = np.mean(mv)
print("jaccard", overall_result_jaccard)

overall_dict_ori = defaultdict(list)
overall_result_ori = dict()
for k, v in data_dict_ori.items():
Example #15
def evaluate(gen, oracle, output, no_score=False):
    if not no_score:
        n = NLGEval()

    print('Start evaluation...')
    total_samples = oracle.test_size

    with open(output, 'w') as fout:
        writer = csv.writer(fout, delimiter='\t')

        if not no_score:
            writer.writerow([
                'original (cond)', 'sample (pos)', 'generated (neg)', 'BLEU',
                'METEOR'
            ])
            total_bleu = 0
            total_meteor = 0
        else:
            writer.writerow(
                ['original (cond)', 'sample (pos)', 'generated (neg)'])

        i = 0
        end_of_dataset = False
        while not end_of_dataset:
            # Retrieve test sample from test set
            pos, pos_len, cond_id, end_of_dataset = oracle.sample(1, gpu=False)
            cond, cond_len = oracle.fetch_cond_samples(cond_id, gpu=False)
            pos, pos_len, cond, cond_len = pos[0], pos_len[0], cond[
                0], cond_len[0]

            # Generate paraphrase
            generated = gen.sample_until_end(cond, max_len=100)

            # Turn to string
            pos_str = tensor_to_sent(pos, oracle)
            cond_str = tensor_to_sent(cond, oracle)
            generated_str = tensor_to_sent(generated, oracle)

            # Calculate BLEU and METEOR scores
            if not no_score:
                if len(generated_str) > 0:
                    scores = n.compute_individual_metrics([cond_str, pos_str],
                                                          generated_str)
                    bleu = scores['Bleu_2']
                    meteor = scores['METEOR']
                else:
                    bleu = 0
                    meteor = 0

                total_bleu += bleu
                total_meteor += meteor

            # Output to tsv file
            if not no_score:
                writer.writerow(
                    [cond_str, pos_str, generated_str, bleu, meteor])
            else:
                writer.writerow([cond_str, pos_str, generated_str])

            i += 1
            if i % int(total_samples / 5) == 0:  # Print progress every 20%
                print('.', end='')
                sys.stdout.flush()

        if not no_score:
            avg_bleu = total_bleu / i
            avg_meteor = total_meteor / i
            print(
                f'Average BLEU score: {avg_bleu}\tAverage METEOR score: {avg_meteor}'
            )
Example #16
with open(hyp_file, "r") as f:
    hyp_dict = {
        line.strip().split(" ")[0]: " ".join(line.strip().split(" ")[1:])
        for line in f.readlines()
    }

keys = [k for k, v in hyp_dict.items()]
labels = [ref_dict[k] for k, _ in hyp_dict.items()]
decoded_preds = [v for k, v in hyp_dict.items()]

metric = load_metric("bertscore")
result_bert = metric.compute(
    predictions=decoded_preds,
    references=labels,
    lang="en",
)

nlg = NLGEval()  # loads the models
print("Key", "\t", "METEOR", "\t", "ROUGE-L")
for (key, ref, hyp) in zip(keys, labels, decoded_preds):
    metrics_dict = nlg.compute_individual_metrics([ref], hyp)
    print(key, "\t", metrics_dict["METEOR"], "\t", metrics_dict["ROUGE_L"])
refs = [[x] for x in labels]
metrics_dict = nlg.compute_metrics(ref_list=[labels], hyp_list=decoded_preds)
metric = load_metric("rouge")
result = metric.compute(predictions=decoded_preds, references=labels)
result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

print(f"RESULT {result['rouge1']} {result['rouge2']} {result['rougeL']} \
    {metrics_dict['METEOR']*100.0} {100*np.mean(result_bert['precision'])}")
Example #17
    def test_compute_metrics_oo(self):
        # Create the object in the test so that it can be garbage collected once the test is done.
        n = NLGEval()

        # Individual Metrics
        scores = n.compute_individual_metrics(
            ref=["this is a test", "this is also a test"],
            hyp="this is a good test")
        self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
        self.assertAlmostEqual(0.5108729, scores['Bleu_3'], places=5)
        self.assertAlmostEqual(0.0000903602, scores['Bleu_4'], places=5)
        self.assertAlmostEqual(0.44434387, scores['METEOR'], places=5)
        self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
        self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
        self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
        self.assertAlmostEqual(0.980075,
                               scores['EmbeddingAverageCosineSimilairty'],
                               places=5)
        self.assertAlmostEqual(0.94509,
                               scores['VectorExtremaCosineSimilarity'],
                               places=5)
        self.assertAlmostEqual(0.960771,
                               scores['GreedyMatchingScore'],
                               places=5)
        self.assertEqual(11, len(scores))

        scores = n.compute_metrics(
            ref_list=[
                [
                    "this is one reference sentence for sentence1",
                    "this is a reference sentence for sentence2 which was generated by your model"
                ],
                [
                    "this is one more reference sentence for sentence1",
                    "this is the second reference sentence for sentence2"
                ],
            ],
            hyp_list=[
                "this is the model generated sentence1 which seems good enough",
                "this is sentence2 which has been generated by your model"
            ])
        self.assertAlmostEqual(0.55, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(0.428174, scores['Bleu_2'], places=5)
        self.assertAlmostEqual(0.284043, scores['Bleu_3'], places=5)
        self.assertAlmostEqual(0.201143, scores['Bleu_4'], places=5)
        self.assertAlmostEqual(0.295797, scores['METEOR'], places=5)
        self.assertAlmostEqual(0.522104, scores['ROUGE_L'], places=5)
        self.assertAlmostEqual(1.242192, scores['CIDEr'], places=5)
        self.assertAlmostEqual(0.626149, scores['SkipThoughtCS'], places=5)
        self.assertAlmostEqual(0.88469,
                               scores['EmbeddingAverageCosineSimilairty'],
                               places=5)
        self.assertAlmostEqual(0.568696,
                               scores['VectorExtremaCosineSimilarity'],
                               places=5)
        self.assertAlmostEqual(0.784205,
                               scores['GreedyMatchingScore'],
                               places=5)
        self.assertEqual(11, len(scores))

        # Non-ASCII tests.
        scores = n.compute_individual_metrics(
            ref=["Test en français.", "Le test en français."],
            hyp="Le test est en français.")
        self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
        self.assertAlmostEqual(0.0000051, scores['Bleu_3'], places=5)
        self.assertAlmostEqual(0, scores['Bleu_4'], places=5)
        self.assertAlmostEqual(0.48372379050300296, scores['METEOR'], places=5)
        self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
        self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
        self.assertAlmostEqual(0.9192341566085815,
                               scores['SkipThoughtCS'],
                               places=5)
        self.assertAlmostEqual(0.906562,
                               scores['EmbeddingAverageCosineSimilairty'],
                               places=5)
        self.assertAlmostEqual(0.815158,
                               scores['VectorExtremaCosineSimilarity'],
                               places=5)
        self.assertAlmostEqual(0.940959,
                               scores['GreedyMatchingScore'],
                               places=5)
        self.assertEqual(11, len(scores))

        scores = n.compute_individual_metrics(ref=["テスト"], hyp="テスト")
        self.assertAlmostEqual(0.99999999, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(1.0, scores['METEOR'], places=3)
        self.assertAlmostEqual(1.0, scores['ROUGE_L'], places=3)
        self.assertAlmostEqual(0.0, scores['CIDEr'], places=3)
        self.assertAlmostEqual(1.0, scores['SkipThoughtCS'], places=3)
        self.assertAlmostEqual(1.0, scores['GreedyMatchingScore'], places=3)
        self.assertEqual(11, len(scores))
Example #18
def eval(eval_filename, vocab_filename, alias2scientific_filename):
    def remove_stopwords(sent, stop_word_set):
        items = sent.split()
        items = [ite for ite in items if ite not in stop_word_set]
        return " ".join(items)

    with open("data/stopwords.txt") as f:
        stopwords = f.read().strip().split()
        stopwords = set(stopwords)

    bleu_nlgeval = NLGEval(metrics_to_omit=[
        "METEOR", "CIDEr", "ROUGE_L", "SkipThoughtCS",
        "EmbeddingAverageCosineSimilairty", "VectorExtremaCosineSimilarity",
        "GreedyMatchingScore"
    ])
    rouge_eval = RougeEval()
    disease2x = pandas.read_csv(vocab_filename)
    disease2x = disease2x[disease2x["Is_know"] > 0]
    disease2x = dict(zip(list(disease2x["Word"]), list(disease2x["Is_know"])))
    distinct_eval = DistinctEval(grams=[1, 2])

    with open(eval_filename) as f:
        sessions = json.load(f)

    gths = [[episode["gth"] for episode in session["session"]]
            for session in sessions]
    hyps = [[episode["hyp"] for episode in session["session"]]
            for session in sessions]
    entity_gths = [[
        " ".join([i for i in x.split(" ") if i in disease2x]) for x in y
    ] for y in gths]
    entity_hyps = [[
        " ".join([i for i in x.split(" ") if i in disease2x]) for x in y
    ] for y in hyps]

    def flat(lists):
        tmp = []
        for items in lists:
            tmp += items
        return tmp

    gths = flat(gths)
    hyps = flat(hyps)
    entity_gths = flat(entity_gths)
    entity_hyps = flat(entity_hyps)

    gths = [remove_stopwords(gth, stopwords) for gth in gths]
    hyps = [remove_stopwords(hyp, stopwords) for hyp in hyps]

    ret_metrics = OrderedDict()
    ret_metric = OrderedDict()

    bleu_score_matrix = [
        bleu_nlgeval.compute_individual_metrics([gth], hyp)
        for gth, hyp in zip(gths, hyps)
    ]
    b2s = [b["Bleu_2"] for b in bleu_score_matrix]
    ret_metrics["B@2"] = b2s
    bleu_score = bleu_nlgeval.compute_metrics([gths], hyps)
    b2 = bleu_score["Bleu_2"]
    ret_metric["B@2"] = b2
    rouge1, rouge2, r1s, r2s = rouge_eval.rouge_score(hyps,
                                                      gths,
                                                      ret_matrix=True)
    ret_metrics["R@2"] = r2s
    ret_metric["R@2"] = rouge2
    dist_scores = distinct_eval.distinct_score(hyps)
    ret_metric["D@1"] = dist_scores[0]
    ret_metric["D@2"] = dist_scores[1]
    ret_metrics["D@1"] = float("nan")
    ret_metrics["D@2"] = float("nan")
    eps = 1e-24

    def compute_f1(p, r):
        return 2 * p * r / (p + r + eps)

    overlapped_entity = [[i for i in x.split() if i in y.split()]
                         for x, y in zip(entity_hyps, entity_gths)]
    overlapped_entity = [list(set(x)) for x in overlapped_entity]
    hyp_entity = [set(y.split()) for y in entity_hyps]
    gth_entity = [set(y.split()) for y in entity_gths]
    entity2prf = OrderedDict()
    for oe, he, ge in zip(overlapped_entity, hyp_entity, gth_entity):
        for e in oe:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            entity2prf[e]["TP"] += 1

        for e in he:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            if e not in oe:
                entity2prf[e]["FP"] += 1

        for e in ge:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            if e not in oe:
                entity2prf[e]["FN"] += 1

    counter = Counter()
    for gth in gth_entity:
        counter.update(gth)
    need_entity_ind = [x[0] for x in counter.most_common() if x[1] > 5]
    print("len(need_entity_ind) = {}".format(len(need_entity_ind)))
    ret_metrics["ma-P"] = [
        entity2prf[e]["TP"] / (entity2prf[e]["TP"] + entity2prf[e]["FP"] + eps)
        for e in need_entity_ind
    ]
    ret_metrics["ma-R"] = [
        entity2prf[e]["TP"] / (entity2prf[e]["TP"] + entity2prf[e]["FN"] + eps)
        for e in need_entity_ind
    ]
    ret_metrics["ma-F1"] = [
        compute_f1(p, r)
        for (p, r) in zip(ret_metrics["ma-P"], ret_metrics["ma-R"])
    ]
    ret_metric["ma-P"] = float(np.mean(ret_metrics["ma-P"]))
    ret_metric["ma-R"] = float(np.mean(ret_metrics["ma-R"]))
    ret_metric["ma-F1"] = compute_f1(ret_metric["ma-P"], ret_metric["ma-R"])
    mi_precision = [
        len(x) / (len(y) + 1e-14) for x, y in zip(
            overlapped_entity, [set(y.split()) for y in entity_hyps])
    ]
    mi_recall = [
        len(x) / (len(y) + 1e-14) for x, y in zip(
            overlapped_entity, [set(y.split()) for y in entity_gths])
    ]
    gth_n = [len(set(ws.split())) for ws in entity_gths]
    hyp_n = [len(set(ws.split())) for ws in entity_hyps]
    ret_metric["mi-P"] = np.sum([p * w for (p, w) in zip(mi_precision, hyp_n)
                                 ]) / np.sum(hyp_n)
    ret_metric["mi-R"] = np.sum([r * w for (r, w) in zip(mi_recall, gth_n)
                                 ]) / np.sum(gth_n)
    ret_metric["mi-F1"] = compute_f1(ret_metric["mi-P"], ret_metric["mi-R"])
    ret_metrics["mi-P"] = mi_precision
    ret_metrics["mi-R"] = mi_recall
    ret_metrics["mi-F1"] = [
        compute_f1(p, r) for (p, r) in zip(mi_precision, mi_recall)
    ]
    with open("data/word2embedding.txt") as f:
        content = f.read().strip()
    single_word2embedding = {}
    for line in content.split("\n"):
        item = line.split()
        word = item[0]
        embedding = np.asarray([float(x) for x in item[1:]])
        single_word2embedding[word] = embedding
    alias2scientific = json.load(open(alias2scientific_filename))
    padding_embed = np.zeros(768)

    hyp_emb_avg = [
        np.asarray([
            np.asarray([
                single_word2embedding.get(w, padding_embed)
                for w in alias2scientific.get(e, e)
            ]).mean(0) for e in entity_hyp.split()
        ]).mean(0) if len(entity_hyp.split()) > 0 else padding_embed
        for entity_hyp in entity_hyps
    ]
    gth_emb_avg = [
        np.asarray([
            np.asarray([
                single_word2embedding.get(w, padding_embed)
                for w in alias2scientific.get(e, e)
            ]).mean(0) for e in entity_gth.split()
        ]).mean(0) if len(entity_gth.split()) > 0 else padding_embed
        for entity_gth in entity_gths
    ]
    eas = [cosine_sim(h, g) for h, g in zip(hyp_emb_avg, gth_emb_avg)]
    ea = float(np.mean(eas))
    ret_metrics["EA"] = eas
    ret_metric["EA"] = ea

    hyp_emb_means = [[
        np.asarray([
            single_word2embedding.get(w, padding_embed)
            for w in alias2scientific.get(e, e)
        ]).mean(0) for e in entity_hyp.split()
    ] if len(entity_hyp.split()) > 0 else [padding_embed]
                     for entity_hyp in entity_hyps]
    gth_emb_means = [[
        np.asarray([
            single_word2embedding.get(w, padding_embed)
            for w in alias2scientific.get(e, e)
        ]).mean(0) for e in entity_gth.split()
    ] if len(entity_gth.split()) > 0 else [padding_embed]
                     for entity_gth in entity_gths]

    def eval_embed_greedy(a, b):
        scores = []

        for j in b:
            score = []
            for i in a:
                s = cosine_sim(i, j)
                score.append(s)
            scores.append(score)

        if len(b) == 1 and b[0].sum() == 0.0:
            return None
        else:
            scores = np.asarray(scores)
            score1 = scores.max(0).mean()
            score2 = scores.max(1).mean()
            return (float(score1) + float(score2)) / 2.0

    eg_scores = [
        x for x in [
            eval_embed_greedy(a, b)
            for (a, b) in zip(hyp_emb_means, gth_emb_means)
        ] if x is not None
    ]
    eg_score = np.asarray(eg_scores).mean()
    ret_metrics["EG"] = eg_scores
    ret_metric["EG"] = eg_score

    return ret_metrics, ret_metric