Example #1
def language_eval(dataset, preds, model_id, split):
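    # Legacy json.encoder hack: render floats with three decimal places in JSON output.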
    encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')

    label = {}
    imgs = json.load(open(dataset, 'r'))
    for img in imgs:
        if img["split"] == split:
            label[img["dataid"]] = img

    pred_ids = []
    with open("eval_results/preds.txt", "w") as text_file:
        for pred in preds:
            pred_id = pred["image_id"]
            pred_txt = pred["caption"]
            pred_ids.append(pred_id)
            text_file.write(pred_txt + "\n")

    with open("eval_results/refer.txt", "w") as text_file:
        for pred_id in pred_ids:
            raw_sent = label[pred_id]["sentences"][0]["raw"].lower()
            text_file.write(raw_sent + "\n")

    metrics_dict = compute_metrics(hypothesis='eval_results/preds.txt',
                                   references=['eval_results/refer.txt'])

    return metrics_dict
Example #2
def eval(net, dataloader, beam_size):
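    # Inference mode: disable dropout/batch-norm updates and gradient tracking.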
    net.eval()
    torch.set_grad_enabled(False)
    torch.backends.cudnn.benchmark = False
    sample_count = 0
    out_file = open("output.txt", "w")
    lab_file_names = ["labels{}.txt".format(i) for i in range(5)]
    lab_files = [open(f, "w") for f in lab_file_names]

    for i, data in enumerate(tqdm(dataloader)):
        image, _, all_labels = data
        image_cuda = image.cuda()
        outputs = net.forward_single_inference(image_cuda, beam_size)

        sample_count += image.shape[0]
        for j in range(image.shape[0]):
            h = outputs[j]
            ls = []
            for p, l in enumerate(all_labels[j]):
                ll = l[1:]
                ls.append(ll)
                l_text = utils.get_output_text(net.vectorizer, ll)
                lab_files[p].write(l_text + os.linesep)
            out_text = utils.get_output_text(net.vectorizer, h)
            out_file.write(out_text + os.linesep)

    out_file.close()
    for l in lab_files:
        l.close()
    metrics_dict = compute_metrics(references=lab_file_names,
                                   hypothesis='output.txt',
                                   no_overlap=False,
                                   no_skipthoughts=True,
                                   no_glove=True)
    return metrics_dict
Example #3
def evaluation():
    from nlgeval import compute_metrics
    metrics_dict = compute_metrics(hypothesis=prediction_file_path,
                                   references=[reference_file_path])
    print(metrics_dict)

    from measures import selfbleu
    for n in range(1, 5):
        selfbleuobj = selfbleu.SelfBleu(prediction_file_path, n)
        print("selfbleu-%d" % n, selfbleuobj.get_score())

    # embedding_metrics.metrics_embeddings(reference_file_path,
    #     prediction_file_path)

    eval_log = {}
    for metric in ['bleu', 'rouge', 'accuracy', 'word_accuracy']:
        score = evaluation_utils.evaluate(reference_file_path,
                                          prediction_file_path, metric)
        eval_log[metric] = score
        if metric == "bleu":
            print(
                "  bleu-1, bleu-2, bleu-3, bleu-4: %.5f,  %.5f,  %.5f,  %.5f" %
                score)
        elif metric == "rouge":
            print("  rouge-1, rouge-2, rouge-l: %.5f,  %.5f,  %.5f" % score)
        else:
            print("  %s: %.5f" % (metric, score))
Example #4
    def test_compute_metrics(self):
        # The example from the README.
        root_dir = os.path.join(os.path.dirname(__file__), '..', '..')
        hypothesis = os.path.join(root_dir, 'examples/hyp.txt')
        references = (os.path.join(root_dir, 'examples/ref1.txt'),
                      os.path.join(root_dir, 'examples/ref2.txt'))
        scores = nlgeval.compute_metrics(hypothesis, references)
        self.assertAlmostEqual(0.55, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(0.428174, scores['Bleu_2'], places=5)
        self.assertAlmostEqual(0.284043, scores['Bleu_3'], places=5)
        self.assertAlmostEqual(0.201143, scores['Bleu_4'], places=5)
        self.assertAlmostEqual(0.295797, scores['METEOR'], places=5)
        self.assertAlmostEqual(0.522104, scores['ROUGE_L'], places=5)
        self.assertAlmostEqual(1.242192, scores['CIDEr'], places=5)
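        # Note: 'EmbeddingAverageCosineSimilairty' below is misspelled in
        # nlgeval's own output keys, so the test checks the misspelled name.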
        self.assertAlmostEqual(0.626149, scores['SkipThoughtCS'], places=5)
        self.assertAlmostEqual(0.88469,
                               scores['EmbeddingAverageCosineSimilairty'],
                               places=5)
        self.assertAlmostEqual(0.568696,
                               scores['VectorExtremaCosineSimilarity'],
                               places=5)
        self.assertAlmostEqual(0.784205,
                               scores['GreedyMatchingScore'],
                               places=5)
        self.assertEqual(11, len(scores))
Example #5
def sample_test_compute_metrics():
    # The example from the README.
    root_dir = os.path.join(os.path.dirname(__file__), 'evaluation_folder')
    hypothesis = os.path.join(root_dir, 'hyp.txt')
    references = [os.path.join(root_dir, 'ref1.txt')]
    scores = nlgeval.compute_metrics(hypothesis, references)
    return scores
Example #6
def eval_top1_acc(pred, ref, step):

    all_texts = remove_mask(pred)
    top1 = [all_texts[i] for i in range(0, len(all_texts), step)]
    with open(ref, 'r') as f:
        refs = f.readlines()
    metrics_dict = compute_metrics(hypothesis=top1, references=refs)
    metrics_dict = {f'top1_{k}': v for k, v in metrics_dict.items()}

    return metrics_dict
Example #7
def get_all_scores(reply, answer, scores):
    from nlgeval import compute_metrics
    metrics_dict = compute_metrics(hypothesis=reply,
                                   references=[answer])

    print(metrics_dict)

    import codecs
    with codecs.open(scores, 'w', encoding='utf-8') as fo:
        fo.write(str(metrics_dict))
Example #8
def main():
    p = get_params()
    hypothesis = p.i
    references = [os.path.join(p.dataset_dir, "test.q.%s.lc" % p.lang)]
    print(
        compute_metrics(hypothesis,
                        references,
                        no_overlap=False,
                        no_skipthoughts=True,
                        no_glove=True))
Example #9
def getMetrics(df, candidate, reference):
    with open('ref1.txt', 'w') as ref:
        for line in list(reference):
            ref.write(line + '\n')

    with open('hyp.txt', 'w') as hyp:
        for line in list(candidate):
            hyp.write(line + '\n')

    metrics_dict = compute_metrics(hypothesis='hyp.txt', references=['ref1.txt'])
    return metrics_dict
Example #10
def test_compute_metrics(mode='model'):
    # The example from the README.
    root_dir = Path(os.path.dirname(__file__))
    root_dir = root_dir / 'evaluation_folder'
    hypothesis = root_dir / '{}_base_gen.txt'.format(mode)
    references = [root_dir / 'human_base_gen.txt']

    assert hypothesis.exists()
    assert references[0].exists()
    scores = nlgeval.compute_metrics(hypothesis, references)
    return scores
Example #11
def compute(annotations, outputs):
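    # compute_metrics compares plain-text files line by line, so write one
    # sentence per line.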
    with open('ref.txt', 'w') as f:
        f.write('\n'.join(annotations))

    with open('output.txt', 'w') as f:
        f.write('\n'.join(outputs))

    metrics_dict = compute_metrics(hypothesis='output.txt',
                                   references=['ref.txt'],
                                   no_skipthoughts=True,
                                   no_glove=True)
    print()
    return metrics_dict
Example #12
    def score(self):
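        # Collapse embedded newlines so each hypothesis/reference occupies
        # exactly one line in the files nlgeval compares.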
        hyp_test_str = "\n".join(
            [h.replace('\n', '') for h in self.hypothesis])
        ref_test_str = "\n".join(
            [r.replace('\n', '') for r in self.references])
        with open("/tmp/hyp.txt", 'w') as fd_hyp:
            fd_hyp.write(hyp_test_str)
            fd_hyp.close()
        with open("/tmp/ref.txt", 'w') as fd_ref:
            fd_ref.write(ref_test_str)
            fd_ref.close()

        self.metrics_dict = compute_metrics(hypothesis="/tmp/hyp.txt",
                                            references=["/tmp/ref.txt"],
                                            no_glove=True,
                                            no_skipthoughts=True)
Example #13
def eval_topk_acc(pred, ref, step):

    with open(ref, 'r') as f:
        ref_by_idx = f.readlines()
    all_texts = remove_mask(pred)
    gen_by_idx = [
        all_texts[i:i + step] for i in range(0, len(all_texts), step)
    ]
    gens = []
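    # For each reference, keep the generated candidate with the highest BLEU-4.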
    for i in range(len(ref_by_idx)):
        ref = ref_by_idx[i]
        gen = gen_by_idx[i]
        metric = [cie(ref, g)['Bleu_4'] for g in gen]
        gens.append(gen[np.argmax(metric)])
    metrics_dict = compute_metrics(hypothesis=gens, references=ref_by_idx)
    metrics_dict = {f'topk_{k}': v for k, v in metrics_dict.items()}

    return metrics_dict
Example #14
import sys
import codecs

from nltk.tokenize import word_tokenize
from nlgeval import compute_metrics

print(sys.argv[1], sys.argv[2])

x = codecs.open(sys.argv[1], encoding="utf-8")
x = x.readlines()
y = codecs.open(sys.argv[2], encoding="utf-8")
y = y.readlines()

x = [" ".join(word_tokenize(i)) for i in x]
y = [" ".join(word_tokenize(i)) for i in y]

x_w = codecs.open(sys.argv[1], "w", encoding="utf-8")
for i in x:
    x_w.write(i.strip().lower() + " \n")
y_w = codecs.open(sys.argv[2], "w", encoding="utf-8")

for i in y:
    y_w.write(i.strip() + "\n")

x_w.close()
y_w.close()
print(compute_metrics(sys.argv[1], [sys.argv[2]]))
x = codecs.open(sys.argv[1], encoding="utf-8")
x = x.readlines()
x_w = codecs.open(sys.argv[1], "w", encoding="utf-8")
for i in x:
    x_w.write(i.strip() + "\n")
x_w.close()

Example #15
def get_score():
    results = open('./result/tmp.out.txt', 'r', encoding='utf-8').readlines()
    results = results[1015:]
    sources = open('testdata/test.moses.pro', 'r').readlines()
    sources = [x.replace('\n', '') for x in sources]
    ref = pickle.load(open('testdata/test.cus.pkl', 'rb'))
    dics = pickle.load(open('testdata/test_dic.pkl', 'rb'))
    sen2code = pickle.load(open('data/sen2code.pkl', 'rb'))
    sen2code_new = {}
    for key, value in sen2code.items():
        sen2code_new[key.lower()] = value
    del sen2code
    count = 0
    code_exist = {}
    for source in sources:
        try:
            if sen2code_new[source] not in code_exist.keys():
                code_exist[sen2code_new[source]] = 1
            else:
                code_exist[sen2code_new[source]] += 1
        except KeyError:
            count += 1
    # print(count)
    # print(len(code_exist.keys()))
    test_subjects = np.array(results)
    test_targets = np.array(ref)
    test_dics = np.array(dics)
    len_sen = [len(nltk.word_tokenize(x)) for x in sources]
    len_sen = np.array(len_sen)
    # print(len_sen.mean(), len_sen.max(), len_sen.min())
    len_split = [(0, 100000)]

    for len_current in len_split:
        index = np.where((len_sen >= len_current[0]) & (len_sen < len_current[1]))
        ref = test_targets[index].tolist()
        hyp = test_subjects[index].tolist()
        open('./tmp/hyp.txt', 'w', encoding='utf-8').writelines(hyp)
        ref0 = [x[0] for x in ref]
        ref1 = [x[1] for x in ref]
        ref2 = [x[2] for x in ref]
        ref3 = [x[3] for x in ref]
        open('./tmp/ref0.txt', 'w', encoding='utf-8').writelines([x + '\n' for x in ref0])
        open('./tmp/ref1.txt', 'w', encoding='utf-8').writelines([x + '\n' for x in ref1])
        open('./tmp/ref2.txt', 'w', encoding='utf-8').writelines([x + '\n' for x in ref2])
        open('./tmp/ref3.txt', 'w', encoding='utf-8').writelines([x + '\n' for x in ref3])

        dics = test_dics[index].tolist()

        metrics_dict = compute_metrics(hypothesis='./tmp/hyp.txt',
                                       references=['./tmp/ref0.txt', './tmp/ref1.txt', './tmp/ref2.txt', './tmp/ref3.txt'],
                                       no_glove=True, no_overlap=False, no_skipthoughts=True)
        # print(metrics_dict)
        hyp = [nltk.word_tokenize(x) for x in hyp]
        hit = count_hit(hyp, dics)
        com = count_common(hyp)
        BLEU = (metrics_dict['Bleu_1'] + metrics_dict['Bleu_2'] +
                metrics_dict['Bleu_3'] + metrics_dict['Bleu_4']) / 4
        if BLEU < 0.0001:
            BLEU = 0.0001
        if hit < 0.0001:
            hit = 0.0001
        if com < 0.0001:
            com = 0.0001
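        # Ascore: weighted harmonic mean of BLEU, hit and com (weights 4 : 2.25 : 1).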
        Ascore = (1 + 2.25 + 4) / (4 / BLEU + 2.25 / hit + 1 / com)
        return BLEU, hit, com, Ascore
Example #16
def fit_epoch(net, dataloader, lr_rate, train, epoch=1):
    if train:
        net.train()
        optimizer = torch.optim.Adam(net.parameters(), lr_rate)
    else:
        net.eval()
    criterion = torch.nn.CrossEntropyLoss(ignore_index=np.where(
        net.vectorizer.vocab == net.vectorizer.pad_token)[0][0])
    torch.set_grad_enabled(train)
    torch.backends.cudnn.benchmark = train
    losses = 0.0
    att_losses = 0.0
    crit_losses = 0.0
    output_images = []
    accs = 0
    sample_count = 0
    label_texts = []
    output_texts = []
    references = []
    hypotheses = []
    out_file = open("output.txt", "w")
    lab_file_names = ["labels{}.txt".format(i) for i in range(5)]
    lab_files = [open(f, "w") for f in lab_file_names]
    att_loss = torch.tensor(0)
    for i, data in enumerate(tqdm(dataloader)):
        image, labels, all_labels = data
        # labels_lens (seq_length, batch_size)
        if image is None:
            continue
        if train:
            optimizer.zero_grad()
        labels_cuda = labels.cuda()

        outputs, atts = net(image.cuda(), labels_cuda[:, :-1])

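        # Attention regularizer: penalize steps whose total attention deviates
        # from the mean attention value (scaled by 5).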
        if len(atts) != 0:
            att_loss = 5 * ((atts.mean(
                (1, 2)).unsqueeze(-1) - atts.sum(2))**2).mean()

        crit_loss = criterion(outputs, labels_cuda[:, 1:])
        loss = crit_loss + att_loss

        if train:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 5)
            optimizer.step()

        losses += loss.item() * labels.shape[0]
        att_losses += att_loss.item() * labels.shape[0]
        crit_losses += crit_loss.item() * labels.shape[0]

        sample_count += labels_cuda.shape[0]
        for j in range(outputs.shape[0]):
            accs += utils.get_acc(outputs[j].argmax(0), labels_cuda[j, 1:])
            if not train:
                h = outputs[j].detach().argmax(0).cpu().numpy().tolist()
                hypotheses.append(h)
                lab_text = utils.get_output_text(
                    net.vectorizer, labels[j,
                                           1:].detach().cpu().numpy().tolist())
                out_text = utils.get_output_text(net.vectorizer, h)
                ls = []
                for p, l in enumerate(all_labels[j]):
                    ll = l[1:]
                    ls.append(ll)
                    l_text = utils.get_output_text(net.vectorizer, ll)
                    lab_files[p].write(l_text + os.linesep)
                references.append(ls)
                out_file.write(out_text + os.linesep)

        if i >= len(dataloader) - 10:
            image = image[0]
            lab_text = utils.get_output_text(
                net.vectorizer, labels[0, 1:].detach().cpu().numpy().tolist())
            out_text = utils.get_output_text(
                net.vectorizer,
                outputs[0].detach().argmax(0).cpu().numpy().tolist())
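            # Undo ImageNet mean/std normalization for visualization.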
            for t, m, s in zip(image, [0.485, 0.456, 0.406],
                               [0.229, 0.224, 0.225]):
                t.mul_(s).add_(m)
            image = image.permute(1, 2, 0).detach().cpu().numpy()

            plt.gcf().subplots_adjust(bottom=0.15)
            plt.imshow(image)
            plt.xlabel(out_text + '\n' + lab_text)
            output_images.append(plt_to_np(plt))

    out_file.close()
    for l in lab_files:
        l.close()
    metrics_dict = {}
    if not train:
        metrics_dict = compute_metrics(references=lab_file_names,
                                       hypothesis='output.txt',
                                       no_overlap=False,
                                       no_skipthoughts=True,
                                       no_glove=True)
    return (losses / sample_count, att_losses / sample_count,
            crit_losses / sample_count, output_images, accs / sample_count,
            metrics_dict)
Example #17
def main(args):
    metrics_dict = compute_metrics(hypothesis=args.pred_file,
                                   references=[args.ref_file],
                                   no_skipthoughts=True,
                                   no_glove=True)
    return metrics_dict
Example #18
def get_score(config, is_val=True):
    results = open('./result/tmp.out.txt', 'r', encoding='utf-8').readlines()

    txt = open(os.path.join(config.data_dir, config.test_file), 'r').read()
    txt = txt.lower()
    txt = txt.split('\n\n')
    if is_val:
        txt = txt[0:len(txt) // 2]
        results = results[0:len(results)//2]
    else:
        txt = txt[len(txt) // 2:]
        results = results[len(results)//2:]
    src = []
    tar = []
    exist_dic = []
    obtain(txt, exist_dic, src, tar, config)
    for u in tar:
        if len(u) < 4:
            print(u)

    pickle.dump(exist_dic, open('./data/test_dic.pkl', 'wb'))
    pickle.dump(src, open('./data/test.pro.pkl', 'wb'))
    pickle.dump(tar, open('./data/test.cus.pkl', 'wb'))

    sources = pickle.load(open('./data/test.pro.pkl', 'rb'))
    sources = [x.replace('\n', '') for x in sources]
    ref = pickle.load(open('./data/test.cus.pkl', 'rb'))
    dics = pickle.load(open('./data/test_dic.pkl', 'rb'))
    sen2code = pickle.load(open('./data/sen2code.pkl', 'rb'))
    sen2code_new = {}
    for key, value in sen2code.items():
        sen2code_new[key.lower()] = value
    del sen2code
    count = 0
    code_exist = {}
    for source in sources:
        try:
            if sen2code_new[source] not in code_exist.keys():
                code_exist[sen2code_new[source]] = 1
            else:
                code_exist[sen2code_new[source]] += 1
        except KeyError:
            count += 1
    test_subjects = np.array(results)
    test_targets = np.array(ref)
    test_dics = np.array(dics)
    len_sen = [len(nltk.word_tokenize(x)) for x in sources]
    len_sen = np.array(len_sen)
    # print(len_sen.mean(), len_sen.max(), len_sen.min())
    len_split = [(0, 100000)]

    for len_current in len_split:
        index = np.where((len_sen >= len_current[0]) & (len_sen < len_current[1]))
        ref = test_targets[index].tolist()
        hyp = test_subjects[index].tolist()
        open('./tmp/hyp.txt', 'w', encoding='utf-8').writelines(hyp)
        ref0 = [x[0] for x in ref]
        ref1 = [x[1] for x in ref]
        ref2 = [x[2] for x in ref]
        ref3 = [x[3] for x in ref]
        open('./tmp/ref0.txt', 'w', encoding='utf-8').writelines([x + '\n' for x in ref0])
        open('./tmp/ref1.txt', 'w', encoding='utf-8').writelines([x + '\n' for x in ref1])
        open('./tmp/ref2.txt', 'w', encoding='utf-8').writelines([x + '\n' for x in ref2])
        open('./tmp/ref3.txt', 'w', encoding='utf-8').writelines([x + '\n' for x in ref3])

        dics = test_dics[index].tolist()

        metrics_dict = compute_metrics(hypothesis='./tmp/hyp.txt',
                                       references=['./tmp/ref0.txt', './tmp/ref1.txt', './tmp/ref2.txt', './tmp/ref3.txt'],
                                       no_glove=True, no_overlap=False, no_skipthoughts=True)
        hyp = [nltk.word_tokenize(x) for x in hyp]
        hit = count_hit(hyp, dics)

        com = count_common(hyp)
        BLEU = (metrics_dict['Bleu_1'] + metrics_dict['Bleu_2'] +
                metrics_dict['Bleu_3'] + metrics_dict['Bleu_4']) / 4
        if BLEU < 0.0001:
            BLEU = 0.0001
        if hit < 0.0001:
            hit = 0.0001
        if com < 0.0001:
            com = 0.0001
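        # Ascore: weighted harmonic mean of BLEU, hit and com (weights 4 : 2.25 : 1).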
        Ascore = (1 + 2.25 + 4) / (4 / BLEU + 2.25 / hit + 1 / com)
        return BLEU, hit, com, Ascore
Example #19
def Meteor(labels, preds):
    return getListMeteor(preds, labels)


# Train
model.train_model(train_df, eval_data=eval_df, Meteor=Meteor)

# Load the locally trained model
model = BartModel(pretrained_model='Bart/best_model',
                  args=model_args,
                  model_config='Bart/best_model/config.json',
                  vocab_file='Bart/best_model')

# Test
test_list = test_df['input_text'].tolist()
pred_list = model.predict(test_list)
true_list = test_df['target_text'].tolist()

column_name = ['title']
nl_df = pd.DataFrame(true_list, columns=column_name)
nl_df.to_csv('result/code_true_bart.csv', index=False, header=False)
nl_df = pd.DataFrame(pred_list, columns=column_name)
nl_df.to_csv('result/code_pred_bart.csv', index=False, header=False)

compute_metrics(hypothesis='result/code_pred_bart.csv',
                references=['result/code_true_bart.csv'],
                no_glove=True,
                no_skipthoughts=True)
Example #20
from nlgeval import compute_metrics
import sys
metrics_dict = compute_metrics(hypothesis=sys.argv[1],
                               references=[sys.argv[2]],
                               no_skipthoughts=True)
print(metrics_dict)
Example #21
# Which model gets evaluated is selected by swapping in the matching hypothesis
# file and reference directory; the commented-out path variants for the other
# models (word_rnn10/07/05, google_lm, attention_ac, no_attention_ac,
# skip_connections, mle_seqgan, ss, seqgan, rankgan) are omitted here,
# leaving the leakgan run active.
metrics_dict = compute_metrics(
    hypothesis=
    '/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references_correctedbug_human_labels_majority_vote/hypothesis_files/hypothesis_leakgan.txt',
    references=[
        '/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references_correctedbug_human_labels_majority_vote/leakgan/ref1.txt',
        '/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references_correctedbug_human_labels_majority_vote/leakgan/ref2.txt',
        '/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references_correctedbug_human_labels_majority_vote/leakgan/ref3.txt',
        '/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references_correctedbug_human_labels_majority_vote/leakgan/ref4.txt',
        '/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references_correctedbug_human_labels_majority_vote/leakgan/ref5.txt',
        '/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references_correctedbug_human_labels_majority_vote/leakgan/ref6.txt',
        '/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references_correctedbug_human_labels_majority_vote/leakgan/ref7.txt',
        '/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references_correctedbug_human_labels_majority_vote/leakgan/ref8.txt',
        '/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references_correctedbug_human_labels_majority_vote/leakgan/ref9.txt',
        '/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references_correctedbug_human_labels_majority_vote/leakgan/ref10.txt'
    ])
        #'/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references/word_rnn05_most_similar/ref8.txt',
        #'/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references/word_rnn05_most_similar/ref9.txt',
        #'/srv/disk01/ggarbace/EvaluationProj/fake_text/saved_models_normalized/nlp_metrics_eval/most_similar_references/word_rnn05_most_similar/ref10.txt'
    ])
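When compute_metrics receives several reference files, line i of every file is read as an alternative reference for hypothesis line i, which is what the ref1.txt ... ref10.txt groups above supply. A minimal multi-reference sketch (the file names here are placeholders, not paths from this example):

from nlgeval import compute_metrics

# Each reference file must contain exactly one line per hypothesis line.
metrics_dict = compute_metrics(hypothesis='hyp.txt',
                               references=['ref_a.txt', 'ref_b.txt'])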
Example #22
0
from nlgeval import compute_metrics
metrics_dict = compute_metrics(hypothesis='candidate.txt',
                               references=['reference.txt'])
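compute_metrics also takes flags to skip the slower embedding-based metrics, as several other examples in this file do. The same call restricted to the word-overlap scores:

from nlgeval import compute_metrics

# no_skipthoughts / no_glove disable the embedding-based metrics,
# leaving BLEU, METEOR, ROUGE-L and CIDEr.
metrics_dict = compute_metrics(hypothesis='candidate.txt',
                               references=['reference.txt'],
                               no_skipthoughts=True,
                               no_glove=True)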
Example #23
0
    def translate(self,
                  src,
                  tmpl=None,
                  src2=None,
                  tgt=None,
                  src_seq_len=50,
                  tgt_seq_len=50,
                  batch_size=None,
                  attn_debug=False):
        """
		Translate content of `src_data_iter` (if not None) or `src_path`
		and get gold scores if one of `tgt_data_iter` or `tgt_path` is set.

		Note: batch_size must not be None
		Note: one of ('src_path', 'src_data_iter') must not be None

		Args:
			src_path (str): filepath of source data
			tgt_path (str): filepath of target data or None
			batch_size (int): size of examples per mini-batch
			attn_debug (bool): enables the attention logging

		Returns:
			(`list`, `list`)

			* all_scores is a list of `batch_size` lists of `n_best` scores
			* all_predictions is a list of `batch_size` lists
				of `n_best` predictions
		"""
        assert src is not None

        if batch_size is None:
            raise ValueError("batch_size must be set")

        data = inputters.build_dataset(self.fields,
                                       src=src,
                                       tgt=tgt,
                                       src2=src2,
                                       tmpl=tmpl,
                                       src_seq_len=src_seq_len,
                                       tmpl_seq_len=tgt_seq_len,
                                       tgt_seq_len=tgt_seq_len,
                                       use_filter_pred=self.use_filter_pred)

        cur_device = "cuda" if self.cuda else "cpu"

        data_iter = inputters.OrderedIterator(dataset=data,
                                              device=cur_device,
                                              batch_size=batch_size,
                                              train=False,
                                              sort=False,
                                              sort_within_batch=True,
                                              shuffle=False)

        builder = monmt.translate.TranslationBuilder(data, self.fields,
                                                     self.n_best,
                                                     self.replace_unk, tgt)

        # Statistics
        counter = count(1)
        pred_score_total, pred_words_total = 0, 0
        gold_score_total, gold_words_total = 0, 0

        all_scores = []
        all_predictions = []

        for batch in data_iter:
            batch_data = self.translate_batch(batch,
                                              data,
                                              attn_debug,
                                              fast=self.fast,
                                              pass1=self.pass1)
            translations = builder.from_batch(batch_data, pass1=self.pass1)

            for trans in translations:
                all_scores += [trans.pred_scores[:self.n_best]]
                pred_score_total += trans.pred_scores[0]
                pred_words_total += len(trans.pred_sents[0])
                if tgt is not None:
                    gold_score_total += trans.gold_score
                    gold_words_total += len(trans.gold_sent) + 1

                n_best_preds = [
                    " ".join(pred) for pred in trans.pred_sents[:self.n_best]
                ]
                all_predictions += [n_best_preds]
                self.out_file.write('\n'.join(n_best_preds) + '\n')
                self.out_file.flush()

                if self.verbose:
                    sent_number = next(counter)
                    output = trans.log(sent_number)
                    if self.logger:
                        self.logger.info(output)
                    else:
                        os.write(1, output.encode('utf-8'))

                if attn_debug:
                    preds = trans.pred_sents[0]
                    preds.append('<eos>')
                    attns = trans.attns[0].tolist()
                    srcs = trans.src_raw
                    header_format = "{:>10.10} " + "{:>10.7} " * len(srcs)
                    row_format = "{:>10.10} " + "{:>10.7f} " * len(srcs)
                    output = header_format.format("", *srcs) + '\n'
                    for word, row in zip(preds, attns):
                        max_index = row.index(max(row))
                        row_format = row_format.replace(
                            "{:>10.7f} ", "{:*>10.7f} ", max_index + 1)
                        row_format = row_format.replace(
                            "{:*>10.7f} ", "{:>10.7f} ", max_index)
                        output += row_format.format(word, *row) + '\n'
                        row_format = "{:>10.10} " + "{:>10.7f} " * len(srcs)
                    os.write(1, output.encode('utf-8'))

        if self.report_score:
            msg = self._report_score('PRED', pred_score_total,
                                     pred_words_total)
            if self.logger:
                self.logger.info(msg)
            else:
                print(msg)
            if tgt is not None:
                msg = self._report_score('GOLD', gold_score_total,
                                         gold_words_total)
                if self.logger:
                    self.logger.info(msg)
                else:
                    print(msg)

                if self.report_rouge or self.report_bleu:
                    from nlgeval import compute_metrics
                    compute_metrics(self.pred_file, [self.tgt_file],
                                    logger=self.logger)

        if self.dump_beam:
            import json
            json.dump(self.translator.beam_accum,
                      codecs.open(self.dump_beam, 'w', 'utf-8'))
        return all_scores, all_predictions
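A hedged sketch of driving a translator like the one above end to end; the build_translator constructor and its options are assumptions, not part of this example:

# Hypothetical setup: only translate()'s own signature is taken from above.
translator = build_translator(opt)
all_scores, all_predictions = translator.translate(src='test.src',
                                                   tgt='test.tgt',
                                                   batch_size=30)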
Example #24
0
from nlgeval import compute_metrics, compute_individual_metrics
import numpy as np

metrics_lt = {}

for i in range(20):
    metrics_dict = compute_metrics(hypothesis=f'gen_hyp/hyp_{i}.txt', references=[f'gen_hyp/ref_{i}.txt'])
    for name in metrics_dict:
        if name in metrics_lt:
            metrics_lt[name].append(metrics_dict[name])
        else:
            metrics_lt[name] = [metrics_dict[name]]

import scipy.stats

def mean_confidence_interval(data, confidence=0.95):
    # scipy.stats.sem already divides by sqrt(n), so the half-width of the
    # Student-t interval is just t * se (the original multiplied by sqrt(n)
    # again, which inflates the interval to roughly +/- t * std).
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h

print('************ overall result ***************')
for name in metrics_lt:
    lt = metrics_lt[name]
    m, lb, hb = mean_confidence_interval(lt)
    print(f'name: {name}, mean: {m}, CI(95%): [{lb}, {hb}]')
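mean_confidence_interval above is the standard Student-t interval for the mean of the per-run scores. A quick sanity check (illustrative data only):

# Constant data has zero standard error, so the interval collapses to a point.
m, lb, hb = mean_confidence_interval([0.5] * 10)
assert m == 0.5 and lb == hb == 0.5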


Example #25
0
from nlgeval import compute_metrics
file_generate = "/local/ssd_1/chengzhang/SA_dialog/dialogue/result/nlpcc2017_tune_output/nlpcc2017_diverse_e0_output.txt"
file_truth = '/local/ssd_1/chengzhang/SA_dialog/big_data/nlpcc2017_reference.txt'
generate = []
truth = []
with open(file_generate, "r", encoding="utf-8") as f:
    for line in f.readlines():
        generate.append(line.strip())

with open(file_truth, "r", encoding="utf-8") as f:
    for line in f.readlines():
        truth.append([line.strip()])

print(len(truth))
print(len(generate))
print("BLUE score:",
      compute_metrics(hypothesis=file_generate, references=[file_truth]))
Example #26
0
    )
    parser.add_argument('--result_dir',
                        default='eval_seq2seq15200',
                        help='directory to save test predictions in.')
    args = parser.parse_args()
    argparams = vars(args)

    ckpt_name = 'BS_test_data-ckpt_from-' + argparams['result_dir']
    test_generated_file = '../{}/{}/{}'.format(argparams['exp_name'],
                                               ckpt_name,
                                               'test_data_results.txt')
    test_plot_file = '{}/{}'.format(argparams['data_path'], 'test_plot.txt')
    test_ending_file = '{}/{}'.format(argparams['data_path'],
                                      'test_ending.txt')

    print(
        'compute word overlap scores between target ending and generated ending'
    )
    metrics_dict_target_generated = compute_metrics(
        hypothesis=test_generated_file, references=[test_ending_file])
    print('-' * 50)
    # print('compute similarity scores between generated ending and target ending')
    # metrics_dict_plot_generated = compute_metrics(hypothesis=test_generated_file, references=[test_ending_file],
    #                                               no_overlap=True)
    # print('-' * 50)
    print('compute similarity scores between generated ending and plot')
    metrics_dict_plot_generated = compute_metrics(
        hypothesis=test_generated_file,
        references=[test_plot_file],
        no_overlap=True)
    # print('-' * 50)
Example #27
0
import bert_score
import sys

from nlgeval import compute_metrics

print(sys.argv[1])
fn = sys.argv[1]

gt = [
    e.replace("_go", "").replace("_eos", "").strip()
    for e in open("ap_data/valid_freq.tgt").readlines()
]
pred = [
    e.replace("_go", "").replace("_eos", "").strip()
    for e in open(fn).readlines()
]

open("mt_tmp/pred.txt", "w+").writelines([e + "\n" for e in pred])
open("mt_tmp/gt.txt", "w+").writelines([e + "\n" for e in gt])
results = compute_metrics(hypothesis="mt_tmp/pred.txt",
                          references=["mt_tmp/gt.txt"],
                          no_skipthoughts=True,
                          no_glove=True)
open("{0}.meteor".format(fn.split(".")[0]), "w+").write(str(results["METEOR"]))

# torch expects device='cuda' (not 'gpu'); unpack the (P, R, F1) tensors
# returned by bert_score.score instead of shadowing `pred`.
P, R, F1 = bert_score.score(pred, gt, device='cuda', model_type='roberta-base')
scores = F1.tolist()
open("{0}.scores".format(fn.split(".")[0]), "w+").write(str(scores))
Example #28
0
    predicted.append(yhat.split())


def save_to_file(path, text):
    # Append one line of text to the file at `path`.
    fh = open(path, 'a')
    fh.write(text)
    fh.close()


for i in range(len(predicted)):
    line = " ".join(predicted[i]) + "\n"
    save_to_file("/rap_blues/lunwen/paras#/4/predicted.txt", line)

# compute metrics on the predictions
from nlgeval import compute_metrics

metrics_dict = compute_metrics(
    hypothesis='H:/deep_front/nlg-eval-master/test_comparison/predicted.txt',
    references=[
        'H:/deep_front/nlg-eval-master/test_comparison/test_labels2.txt'
    ])
Example #29
0
    def __init__(self, hypothesis_file, reference_file):
        self.metrics_dict = nlgeval.compute_metrics(
            hypothesis=hypothesis_file,
            references=[reference_file],
            no_skipthoughts=True,
            no_glove=True)
Example #30
0
from nlgeval import compute_metrics
import csv
import sys

if len(sys.argv) < 2:
    print("Please provide the result_xxx.csv file as an argument.")
else:
    res_file = sys.argv[1]
    file_name = res_file.split('.')[0]
    ref_file = file_name + '_ref.txt'
    hyp_file = file_name + '_hyp.txt'

    fout_1 = open(ref_file, 'w+')
    fout_2 = open(hyp_file, 'w+')
    # 'rU' was removed in Python 3.11; newline='' is what the csv module expects.
    with open(res_file, newline='') as f:
        for row in csv.reader(f, delimiter=';'):
            reference = row[0]
            hypothesis = row[1]
            fout_1.write(reference + '\n')
            fout_2.write(hypothesis + '\n')
    fout_1.close()
    fout_2.close()
    metrics_dict = compute_metrics(hypothesis=hyp_file, references=[ref_file])
    print(metrics_dict)