def fast_rouge(self, step):
        self.logger.info("Calculating Rouge")

        gold_path = self.args.result_path + '.%d.gold' % step
        can_path = self.args.result_path + '.%d.candidate' % step

        if self.args.dataset in ["DUC2006", "DUC2007"]:
            ## DUC provides multiple references separated by "<x>"; keep only the first
            data = []
            with open(gold_path, 'r') as f:
                for line in f.read().splitlines():
                    data.append(line.strip())

            data = [d.split("<x>")[0].strip() for d in data]
            with open(gold_path, 'w') as f:
                f.write("\n".join(data))
                f.flush()

            print(8 * "=" + "DEBUG TEST FOR DUC" + 8 * "=")
            print(f"reference sample: {data[0]}")

        files_rouge = FilesRouge(can_path, gold_path)
        scores = files_rouge.get_scores(avg=True)

        rouges = {}
        rouges["rouge_l_f_score"] = scores["rouge-l"]["f"]
        rouges["rouge_2_f_score"] = scores["rouge-2"]["f"]
        rouges["rouge_1_f_score"] = scores["rouge-1"]["f"]
        self.logger.info(rouges)

        return rouges
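# Note: the examples on this page mix two FilesRouge calling conventions,
# depending on the installed version of the `rouge` package. A minimal sketch
# of both styles, with hypothetical line-aligned files hyp.txt / ref.txt:
from rouge import FilesRouge

# older interface: the file paths go to the constructor
scores = FilesRouge('hyp.txt', 'ref.txt').get_scores(avg=True)

# newer interface: the constructor takes metric options; paths go to get_scores()
scores = FilesRouge().get_scores('hyp.txt', 'ref.txt', avg=True)

print(scores['rouge-l']['f'])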
Example #2
def print_out_rouge_score(predicted_path, expected_path):

    files_rouge = FilesRouge()
    scores = files_rouge.get_scores(predicted_path, expected_path, avg=True)
    print("ROUGE scores ", scores)

    return 0
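# Usage sketch for the helper above (file names are hypothetical); each file
# holds one summary per line and the two files must be line-aligned:
print_out_rouge_score('predictions.txt', 'references.txt')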
Example #3
def main():
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser(description='Rouge Metric Calculator')
    parser.add_argument('-f', '--file', help="File mode", action='store_true')
    parser.add_argument('-a',
                        '--avg',
                        help="Average mode",
                        action='store_true')
    parser.add_argument('hypothesis', type=str, help='Text or file path')
    parser.add_argument('reference', type=str, help='Text or file path')

    args = parser.parse_args()
    if args.file:
        hyp, ref = args.hypothesis, args.reference
        assert (os.path.isfile(hyp))
        assert (os.path.isfile(ref))

        files_rouge = FilesRouge(hyp, ref)
        scores = files_rouge.get_scores(avg=args.avg)

        print(json.dumps(scores, indent=2))
    else:
        hyp, ref = args.hypothesis, args.reference
        assert (type(hyp) == str)
        assert (type(ref) == str)

        rouge = Rouge()
        scores = rouge.get_scores(hyp, ref, avg=args.avg)

        print(json.dumps(scores, indent=2))
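# Invocation sketch for this CLI (the script name is hypothetical):
#   python rouge_cli.py -f -a hyp.txt ref.txt          # file mode, averaged scores
#   python rouge_cli.py "a cat sat" "the cat sat"      # direct text mode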
Example #4
    def validate(self, data_iter, step, attn_debug=False):

        self.model.eval()
        gold_path = self.args.result_path + 'step.%d.gold_temp' % step
        pred_path = self.args.result_path + 'step.%d.pred_temp' % step
        gold_out_file = codecs.open(gold_path, 'w', 'utf-8')
        pred_out_file = codecs.open(pred_path, 'w', 'utf-8')

        ct = 0
        ext_acc_num = 0
        ext_pred_num = 0
        ext_gold_num = 0

        with torch.no_grad():
            for batch in data_iter:
                output_data, tgt_data, ext_pred, ext_gold = self.translate_batch(
                    batch)
                translations = self.from_batch_dev(output_data, tgt_data)

                for idx in range(len(translations)):
                    if ct % 100 == 0:
                        print("Processing %d" % ct)
                    pred_summ, gold_data = translations[idx]
                    # ext F1 bookkeeping: overlap = |pred| + |gold| - |pred ∪ gold|
                    acc_num = len(ext_pred[idx] + ext_gold[idx]) - len(
                        set(ext_pred[idx] + ext_gold[idx]))
                    pred_num = len(ext_pred[idx])
                    gold_num = len(ext_gold[idx])
                    ext_acc_num += acc_num
                    ext_pred_num += pred_num
                    ext_gold_num += gold_num
                    pred_out_file.write(pred_summ + '\n')
                    gold_out_file.write(gold_data + '\n')
                    ct += 1
                pred_out_file.flush()
                gold_out_file.flush()

        pred_out_file.close()
        gold_out_file.close()

        rouge_results = None
        if step != -1:
            pred_bleu = test_bleu(pred_path, gold_path)
            file_rouge = FilesRouge(hyp_path=pred_path, ref_path=gold_path)
            pred_rouges = file_rouge.get_scores(avg=True)
            f1, p, r = test_f1(ext_acc_num, ext_pred_num, ext_gold_num)
            self.logger.info(
                'Ext Sent Score at step %d: \n>> P/R/F1: %.2f/%.2f/%.2f' %
                (step, p * 100, r * 100, f1 * 100))
            self.logger.info(
                'Gold Length at step %d: %.2f' %
                (step, test_length(gold_path, gold_path, ratio=False)))
            self.logger.info('Prediction Length ratio at step %d: %.2f' %
                             (step, test_length(pred_path, gold_path)))
            self.logger.info('Prediction Bleu at step %d: %.2f' %
                             (step, pred_bleu * 100))
            self.logger.info('Prediction Rouges at step %d: \n%s\n' %
                             (step, rouge_results_to_str(pred_rouges)))
            rouge_results = (pred_rouges["rouge-1"]['f'],
                             pred_rouges["rouge-l"]['f'])
        return rouge_results
Example #5
    def compute_metrics(pred):
        labels = output_dir + '/labels.txt'
        preds = output_dir + '/preds.txt'
        lf = open(labels, 'w')
        pf = open(preds, 'w')
        labels_list = []
        preds_list = []

        for label, prediction in zip(pred.label_ids, pred.predictions):
            de_label = tokenizer.decode(label)
            de_pred = tokenizer.decode(prediction.argmax(-1))
            lf.write(de_label + '\n')
            pf.write(de_pred + '\n')
            labels_list.append([de_label])
            preds_list.append(de_pred)

        lf.close()
        pf.close()

        bleu_score = corpus_bleu(labels_list, preds_list)
        files_rouge = FilesRouge()
        rouge_score = files_rouge.get_scores(preds, labels, avg=True)

        return {
            'bleu score: ': bleu_score * 100,
            'rouge-l: ': rouge_score['rouge-l']['f'] * 100
        }
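# compute_metrics above follows the Hugging Face Trainer callback signature
# (pred.label_ids / pred.predictions). A minimal wiring sketch, assuming
# `model`, `training_args`, `tokenizer`, the datasets and `output_dir` are
# already defined in the enclosing scope:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,  # decodes, dumps preds/labels, scores BLEU + ROUGE-L
)
metrics = trainer.evaluate()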
Example #6
def using_155_processed_data(input_dir):
    fpout_ref = open("tmp.ref", "w")
    fpout_hyp = open("tmp.hyp", "w")
    for i in range(0, 50):
        for line in open(input_dir + "/model/ref." + str(i) + ".txt"):
            line = line.strip()
            if line.startswith("<a name=\"1\">[1]</a> <a href=\"#1\" id=1>"):
                line = line.replace(
                    "<a name=\"1\">[1]</a> <a href=\"#1\" id=1>",
                    "").replace("</a>", "")
                fpout_ref.write(line + "\n")
        for line in open(input_dir + "system/cand." + str(i) + ".txt"):
            line = line.strip()
            if line.startswith("<a name=\"1\">[1]</a> <a href=\"#1\" id=1>"):
                line = line.replace(
                    "<a name=\"1\">[1]</a> <a href=\"#1\" id=1>",
                    "").replace("</a>", "")
                fpout_hyp.write(line + "\n")
    fpout_ref.close()
    fpout_hyp.close()

    # get rouge
    files_rouge = FilesRouge()
    scores = files_rouge.get_scores("tmp.hyp", "tmp.ref", avg=True)
    for item in scores:
        print(item, scores[item])
Example #7
def getRouge(hyp_path, ref_path):
    files_rouge = FilesRouge()
    rouge = Rouge()
    #files_rouge = Rouge155()
    # or
    #scores = rouge.get_scores(hyp_path,ref_path,avg=True)
    scores = files_rouge.get_scores(hyp_path, ref_path, avg=True)
    pprint(scores)
Example #8
def compare_summaries():
    """
    Compares the src summaries with the tgt summaries and prints the ROUGE scores.
    """
    files_rouge = FilesRouge('summaries.src.txt', 'summaries.tgt.txt')
    rouge_scores = files_rouge.get_scores(avg=True)

    print_rouge_scores(rouge_scores)
Example #9
def main():
    parser = argparse.ArgumentParser(description='Rouge Metric Calculator')
    parser.add_argument('-f', '--file', help="File mode", action='store_true')
    parser.add_argument('-a',
                        '--avg',
                        help="Average mode",
                        action='store_true')
    parser.add_argument('--ignore_empty',
                        action='store_true',
                        help="Ignore empty hypothesis")
    parser.add_argument('hypothesis', type=str, help='Text of file path')
    parser.add_argument('reference', type=str, help='Text or file path')
    parser.add_argument("--metrics",
                        nargs="+",
                        type=str.upper,
                        choices=METRICS_CHOICES.keys(),
                        help="Metrics to use (default=all)")
    parser.add_argument("--stats",
                        nargs="+",
                        type=str.upper,
                        choices=STATS_CHOICES,
                        help="Stats to use (default=all)")

    args = parser.parse_args()

    metrics = args.metrics
    stats = args.stats

    if metrics is not None:
        metrics = [METRICS_CHOICES[m] for m in args.metrics]

    if args.file:
        hyp, ref = args.hypothesis, args.reference
        assert (os.path.isfile(hyp))
        assert (os.path.isfile(ref))

        files_rouge = FilesRouge(metrics, stats)
        scores = files_rouge.get_scores(hyp,
                                        ref,
                                        avg=args.avg,
                                        ignore_empty=args.ignore_empty)

        print(json.dumps(scores, indent=2))
    else:
        hyp, ref = args.hypothesis, args.reference
        assert (type(hyp) == str)
        assert (type(ref) == str)

        rouge = Rouge(metrics, stats)
        scores = rouge.get_scores(hyp, ref, avg=args.avg)

        print(json.dumps(scores, indent=2))
Example #10
def get_rouge(path):
    hyp_names = sorted(os.listdir(path), key=lambda x: int(x[x.find('_') + 1: x.find('.txt')]))
    ref_names = sorted(os.listdir(ref_path), key=lambda x: int(x[x.find('_') + 1: x.find('.txt')]))
    all_scores = []
    for hyp_name, ref_name in zip(hyp_names, ref_names):
        files_rouge = FilesRouge(os.path.join(path, hyp_name), os.path.join(ref_path, ref_name))
        scores = files_rouge.get_scores()
        avg_score = {'rouge-1': 0, 'rouge-2': 0, 'rouge-l': 0}
        for x in avg_score:
            avg_score[x] = {'p': sum([score[x]['p'] for score in scores]) / len(scores),
                            'r': sum([score[x]['r'] for score in scores]) / len(scores),
                            'f': sum([score[x]['f'] for score in scores]) / len(scores)}
        all_scores.append(avg_score)
    return all_scores
Example #11
  def call(self, y_true=None, y_pred=None, arguments=None):
    ref_sents = []
    for tgt_path in self.tgt_paths_after_pre_process:
      with open(tgt_path, "r", encoding='utf8') as tgt_f:
        ref_sents.extend(tgt_f.readlines())
    ref_sents = [sent.strip() for sent in ref_sents]

    with open(self.ref_path, "w", encoding="utf-8") as in_f:
      for ref_sent in ref_sents:
        in_f.write(ref_sent)
        in_f.write("\n")

    files_rouge = FilesRouge()
    scores = files_rouge.get_scores(self.hyp_path, self.ref_path, avg=True)
    return self.get_scores_output(scores)
Example #12
def cal_rouge(log_path, print_log):
    ref_path = log_path + 'reference.txt'
    cand_path = log_path + 'candidate.txt'
    scores = FilesRouge(ref_path=ref_path,
                        hyp_path=cand_path).get_scores(avg=True)
    recall = [
        round(scores['rouge-1']['r'] * 100, 2),
        round(scores['rouge-2']['r'] * 100, 2),
        round(scores['rouge-l']['r'] * 100, 2)
    ]

    precision = [
        round(scores['rouge-1']['p'] * 100, 2),
        round(scores['rouge-2']['p'] * 100, 2),
        round(scores['rouge-l']['p'] * 100, 2)
    ]

    f_score = [
        round(scores['rouge-1']['f'] * 100, 2),
        round(scores['rouge-2']['f'] * 100, 2),
        round(scores['rouge-l']['f'] * 100, 2)
    ]
    print_log("F_measure: %s Recall: %s Precision: %s \n" %
              (str(f_score), str(recall), str(precision)))
    return f_score[:], recall[:], precision[:]
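# Usage sketch, with a hypothetical log directory that already contains
# reference.txt and candidate.txt (note that log_path must end with '/'):
f_scores, recalls, precisions = cal_rouge('logs/run1/', print)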
Example #13
    def validate(self, data_iter, step, attn_debug=False):

        self.model.eval()
        gold_path = self.args.result_path + '.step.%d.gold_temp' % step
        pred_path = self.args.result_path + '.step.%d.pred_temp' % step
        gold_out_file = codecs.open(gold_path, 'w', 'utf-8')
        pred_out_file = codecs.open(pred_path, 'w', 'utf-8')

        # pred_results, gold_results = [], []
        ct = 0
        with torch.no_grad():
            for batch in data_iter:
                doc_data, summ_data = self.translate_batch(batch)
                translations = self.from_batch_dev(batch, doc_data)

                for idx in range(len(translations)):
                    if ct % 100 == 0:
                        print("Processing %d" % ct)
                    doc_short_context = translations[idx][1]
                    gold_data = summ_data[idx]
                    pred_out_file.write(doc_short_context + '\n')
                    gold_out_file.write(gold_data + '\n')
                    ct += 1
                pred_out_file.flush()
                gold_out_file.flush()

        pred_out_file.close()
        gold_out_file.close()

        rouge_results = None
        if step != -1:
            pred_bleu = test_bleu(pred_path, gold_path)
            file_rouge = FilesRouge(hyp_path=pred_path, ref_path=gold_path)
            pred_rouges = file_rouge.get_scores(avg=True)
            self.logger.info(
                'Gold Length at step %d: %.2f' %
                (step, test_length(gold_path, gold_path, ratio=False)))
            self.logger.info('Prediction Length ratio at step %d: %.2f' %
                             (step, test_length(pred_path, gold_path)))
            self.logger.info('Prediction Bleu at step %d: %.2f' %
                             (step, pred_bleu * 100))
            self.logger.info('Prediction Rouges at step %d: \n%s\n' %
                             (step, rouge_results_to_str(pred_rouges)))
            rouge_results = (pred_rouges["rouge-1"]['f'],
                             pred_rouges["rouge-l"]['f'])
        return rouge_results
Example #14
        def calculate_scores():
            hyp_fn, ref_fn = 'tmp.%s.src' % mode, 'tmp.%s.tgt' % mode
            write_token_id_arrays_to_text_file(hypotheses,
                                               os.path.join(model_dir, hyp_fn),
                                               tokenizer)
            write_token_id_arrays_to_text_file(references,
                                               os.path.join(model_dir, ref_fn),
                                               tokenizer)

            hyp_fn, ref_fn = os.path.join(model_dir, hyp_fn), os.path.join(
                model_dir, ref_fn)

            files_rouge = FilesRouge(hyp_fn, ref_fn)
            rouge_scores = files_rouge.get_scores(avg=True)

            bleu_score = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)

            return rouge_scores, bleu_score
Example #15
def evaluate(hyp, ref):
    with open(hyp, 'r') as r:
        hypothesis = r.readlines()
        res = {k: [" ".join(v.strip().lower().split())] for k, v in enumerate(hypothesis)}
    with open(ref, 'r') as r:
        references = r.readlines()
        gts = {k: [v.strip().lower()] for k, v in enumerate(references)}
    score_Bleu , stderr = Bleu().compute_score(hyp, ref)
    print("Bleu_4: " + str(score_Bleu))

    score_Meteor, scores_Meteor = Meteor().compute_score(gts, res)
    print("Meteor: "), score_Meteor

    files_rouge = FilesRouge(hyp, ref)
    scores = files_rouge.get_scores(avg=True)
    print('Rouge: ' + str(scores))
    
    score_Cider, scores_Cider = Cider().compute_score(gts, res)
    print("Cider: "), score_Cider
Example #16
def _report_rouge(result_file, golden_file):
    "1"
    #import sys
    #sys.path.append('../Lib/site-packages/rouge')
    #from rouge_wrap import RougeWrapper
    #r=RougeWrapper()
    #results = r.evaluate_for_pair_files(golden_file, result_file)
    "2"
    from rouge import Rouge
    #with open("pred.txt", "r") as f1, open("data/task1_ref0.txt", "r") as f2:
    #    rouge = Rouge()
    #    for l1, l2 in zip(f1, f2):
    #        scores = rouge.get_scores(l1, l2, avg=True)
    #print(scores)
    "3"
    from rouge import FilesRouge
    r = FilesRouge(result_file, golden_file)
    results = r.get_scores(avg=True)
    for k, v in results.items():
        if 'f' not in v:
            continue
        print(k, v)
Example #17
def get_file_rouge(ref_dir, hyp_dir):
    ref_map = {}
    hyp_map = {}
    for filename in os.listdir(ref_dir):
        file_id = filename.split(".")[0]
        with open(ref_dir + filename, 'r') as file:
            ref_map[file_id] = file.read().strip()
    for filename in os.listdir(hyp_dir):
        file_id = filename.split(".")[0]
        with open(hyp_dir + filename, 'r') as file:
            hyp_map[file_id] = file.read().strip()
    fpout_ref = open("tmp.ref", "w")
    fpout_hyp = open("tmp.hyp", "w")
    for file_id in ref_map:
        fpout_ref.write(ref_map[file_id] + "\n")
        fpout_hyp.write(hyp_map[file_id] + "\n")
    fpout_ref.close()
    fpout_hyp.close()

    # get rouge
    files_rouge = FilesRouge()
    scores = files_rouge.get_scores("tmp.hyp", "tmp.ref", avg=True)
    for item in scores:
        print(item, scores[item])
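# Usage sketch (hypothetical directories; both must end with '/' because the
# function concatenates paths directly, and every reference id needs a
# matching hypothesis file):
get_file_rouge('refs/', 'hyps/')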
Example #18
    def translate(self, data_iter, step, attn_debug=False):

        self.model.eval()
        output_path = self.args.result_path + '.%d.output' % step
        output_file = codecs.open(output_path, 'w', 'utf-8')
        gold_path = self.args.result_path + '.%d.gold_test' % step
        pred_path = self.args.result_path + '.%d.pred_test' % step
        gold_out_file = codecs.open(gold_path, 'w', 'utf-8')
        pred_out_file = codecs.open(pred_path, 'w', 'utf-8')
        # pred_results, gold_results = [], []

        ct = 0
        ext_acc_num = 0
        ext_pred_num = 0
        ext_gold_num = 0

        with torch.no_grad():
            rouge = Rouge()
            for batch in data_iter:
                output_data, tgt_data, ext_pred, ext_gold = self.translate_batch(
                    batch)
                translations = self.from_batch_test(batch, output_data,
                                                    tgt_data)

                for idx in range(len(translations)):
                    origin_sent, pred_summ, gold_data = translations[idx]
                    if ct % 100 == 0:
                        print("Processing %d" % ct)
                    output_file.write("ID      : %d\n" % ct)
                    output_file.write("ORIGIN  : \n    " +
                                      origin_sent.replace('<S>', '\n    ') +
                                      "\n")
                    output_file.write("GOLD    : " + gold_data.strip() + "\n")
                    output_file.write("DOC_GEN : " + pred_summ.strip() + "\n")
                    rouge_score = rouge.get_scores(pred_summ, gold_data)
                    bleu_score = sentence_bleu(
                        [gold_data.split()],
                        pred_summ.split(),
                        smoothing_function=SmoothingFunction().method1)
                    output_file.write(
                        "DOC_GEN  bleu & rouge-f 1/2/l:    %.4f & %.4f/%.4f/%.4f\n"
                        % (bleu_score, rouge_score[0]["rouge-1"]["f"],
                           rouge_score[0]["rouge-2"]["f"],
                           rouge_score[0]["rouge-l"]["f"]))
                    # ext F1 bookkeeping: overlap = |pred| + |gold| - |pred ∪ gold|
                    acc_num = len(ext_pred[idx] + ext_gold[idx]) - len(
                        set(ext_pred[idx] + ext_gold[idx]))
                    pred_num = len(ext_pred[idx])
                    gold_num = len(ext_gold[idx])
                    ext_acc_num += acc_num
                    ext_pred_num += pred_num
                    ext_gold_num += gold_num
                    f1, p, r = test_f1(acc_num, pred_num, gold_num)
                    output_file.write(
                        "EXT_GOLD: [" +
                        ','.join([str(i)
                                  for i in sorted(ext_gold[idx])]) + "]\n")
                    output_file.write(
                        "EXT_PRED: [" +
                        ','.join([str(i)
                                  for i in sorted(ext_pred[idx])]) + "]\n")
                    output_file.write(
                        "EXT_SCORE  P/R/F1:    %.4f/%.4f/%.4f\n\n" %
                        (p, r, f1))
                    pred_out_file.write(pred_summ.strip() + '\n')
                    gold_out_file.write(gold_data.strip() + '\n')
                    ct += 1
                pred_out_file.flush()
                gold_out_file.flush()
                output_file.flush()

        pred_out_file.close()
        gold_out_file.close()
        output_file.close()

        if (step != -1):
            pred_bleu = test_bleu(pred_path, gold_path)
            file_rouge = FilesRouge(hyp_path=pred_path, ref_path=gold_path)
            pred_rouges = file_rouge.get_scores(avg=True)
            f1, p, r = test_f1(ext_acc_num, ext_pred_num, ext_gold_num)
            self.logger.info(
                'Ext Sent Score at step %d: \n>> P/R/F1: %.2f/%.2f/%.2f' %
                (step, p * 100, r * 100, f1 * 100))
            self.logger.info(
                'Gold Length at step %d: %.2f' %
                (step, test_length(gold_path, gold_path, ratio=False)))
            self.logger.info('Prediction Length ratio at step %d: %.2f' %
                             (step, test_length(pred_path, gold_path)))
            self.logger.info('Prediction Bleu at step %d: %.2f' %
                             (step, pred_bleu * 100))
            self.logger.info('Prediction Rouges at step %d: \n%s' %
                             (step, rouge_results_to_str(pred_rouges)))
Example #19
    def translate(self, data_iter, step, attn_debug=False):

        self.model.eval()
        output_path = self.args.result_path + '.%d.output' % step
        output_file = codecs.open(output_path, 'w', 'utf-8')
        gold_path = self.args.result_path + '.%d.gold_test' % step
        pred_path = self.args.result_path + '.%d.pred_test' % step
        ex_single_path = self.args.result_path + '.%d.ex_test' % step + ".short"
        ex_context_path = self.args.result_path + '.%d.ex_test' % step + ".long"
        gold_out_file = codecs.open(gold_path, 'w', 'utf-8')
        pred_out_file = codecs.open(pred_path, 'w', 'utf-8')
        short_ex_out_file = codecs.open(ex_single_path, 'w', 'utf-8')
        long_ex_out_file = codecs.open(ex_context_path, 'w', 'utf-8')
        # pred_results, gold_results = [], []

        ct = 0
        with torch.no_grad():
            rouge = Rouge()
            for batch in data_iter:
                doc_data, summ_data = self.translate_batch(batch)
                translations = self.from_batch_test(batch, doc_data)

                for idx in range(len(translations)):
                    origin_sent, doc_extract, context_doc_extract, \
                        doc_pred, lead = translations[idx]
                    if ct % 100 == 0:
                        print("Processing %d" % ct)
                    output_file.write("ID      : %d\n" % ct)
                    output_file.write(
                        "ORIGIN  : " +
                        origin_sent.replace('<S>', '\n          ') + "\n")
                    gold_data = summ_data[idx]
                    output_file.write("GOLD    : " + gold_data + "\n")
                    output_file.write("LEAD    : " + lead + "\n")
                    output_file.write("DOC_EX  : " + doc_extract.strip() +
                                      "\n")
                    output_file.write("DOC_CONT: " +
                                      context_doc_extract.strip() + "\n")
                    output_file.write("DOC_GEN : " + doc_pred.strip() + "\n")

                    gold_list = gold_data.strip().split()
                    lead_list = lead.strip().replace("[unused2]", "").replace(
                        "[unused3]", "").split()
                    rouge_score = rouge.get_scores(lead, gold_data)
                    bleu_score = sentence_bleu(
                        [gold_list],
                        lead_list,
                        smoothing_function=SmoothingFunction().method1)
                    output_file.write(
                        "LEAD     bleu & rouge-f 1/2/l:    %.4f & %.4f/%.4f/%.4f\n"
                        % (bleu_score, rouge_score[0]["rouge-1"]["f"],
                           rouge_score[0]["rouge-2"]["f"],
                           rouge_score[0]["rouge-l"]["f"]))

                    doc_extract_list = doc_extract.strip().replace(
                        "[unused2]", "").replace("[unused3]", "").split()
                    rouge_score = rouge.get_scores(doc_extract, gold_data)
                    bleu_score = sentence_bleu(
                        [gold_list],
                        doc_extract_list,
                        smoothing_function=SmoothingFunction().method1)
                    output_file.write(
                        "DOC_EX   bleu & rouge-f 1/2/l:    %.4f & %.4f/%.4f/%.4f\n"
                        % (bleu_score, rouge_score[0]["rouge-1"]["f"],
                           rouge_score[0]["rouge-2"]["f"],
                           rouge_score[0]["rouge-l"]["f"]))

                    doc_context_list = context_doc_extract.strip().replace(
                        "[unused2]", "").replace("[unused3]", "").split()
                    rouge_score = rouge.get_scores(context_doc_extract,
                                                   gold_data)
                    bleu_score = sentence_bleu(
                        [gold_list],
                        doc_context_list,
                        smoothing_function=SmoothingFunction().method1)
                    output_file.write(
                        "DOC_CONT bleu & rouge-f 1/2/l:    %.4f & %.4f/%.4f/%.4f\n"
                        % (bleu_score, rouge_score[0]["rouge-1"]["f"],
                           rouge_score[0]["rouge-2"]["f"],
                           rouge_score[0]["rouge-l"]["f"]))

                    doc_long_list = doc_pred.strip().replace(
                        "[unused2]", "").replace("[unused3]", "").split()
                    rouge_score = rouge.get_scores(doc_pred, gold_data)
                    bleu_score = sentence_bleu(
                        [gold_list],
                        doc_long_list,
                        smoothing_function=SmoothingFunction().method1)
                    output_file.write(
                        "DOC_GEN  bleu & rouge-f 1/2/l:    %.4f & %.4f/%.4f/%.4f\n\n"
                        % (bleu_score, rouge_score[0]["rouge-1"]["f"],
                           rouge_score[0]["rouge-2"]["f"],
                           rouge_score[0]["rouge-l"]["f"]))

                    short_ex_out_file.write(doc_extract.strip().replace(
                        "[unused2]", "").replace("[unused3]", "") + '\n')
                    long_ex_out_file.write(context_doc_extract.strip().replace(
                        "[unused2]", "").replace("[unused3]", "") + '\n')
                    pred_out_file.write(doc_pred.strip().replace(
                        "[unused2]", "").replace("[unused3]", "") + '\n')
                    gold_out_file.write(gold_data.strip() + '\n')
                    ct += 1
                pred_out_file.flush()
                short_ex_out_file.flush()
                long_ex_out_file.flush()
                gold_out_file.flush()
                output_file.flush()

        pred_out_file.close()
        short_ex_out_file.close()
        long_ex_out_file.close()
        gold_out_file.close()
        output_file.close()

        if (step != -1):
            ex_short_bleu = test_bleu(gold_path, ex_single_path)
            ex_long_bleu = test_bleu(gold_path, ex_context_path)
            pred_bleu = test_bleu(gold_path, pred_path)

            file_rouge = FilesRouge(hyp_path=ex_single_path,
                                    ref_path=gold_path)
            ex_short_rouges = file_rouge.get_scores(avg=True)

            file_rouge = FilesRouge(hyp_path=ex_context_path,
                                    ref_path=gold_path)
            ex_long_rouges = file_rouge.get_scores(avg=True)

            file_rouge = FilesRouge(hyp_path=pred_path, ref_path=gold_path)
            pred_rouges = file_rouge.get_scores(avg=True)

            self.logger.info(
                'Gold Length at step %d: %.2f\n' %
                (step, test_length(gold_path, gold_path, ratio=False)))
            self.logger.info('Short Extraction Length ratio at step %d: %.2f' %
                             (step, test_length(ex_single_path, gold_path)))
            self.logger.info('Short Extraction Bleu at step %d: %.2f' %
                             (step, ex_short_bleu * 100))
            self.logger.info('Short Extraction Rouges at step %d \n%s' %
                             (step, rouge_results_to_str(ex_short_rouges)))
            self.logger.info('Long Extraction Length ratio at step %d: %.2f' %
                             (step, test_length(ex_context_path, gold_path)))
            self.logger.info('Long Extraction Bleu at step %d: %.2f' %
                             (step, ex_long_bleu * 100))
            self.logger.info('Long Extraction Rouges at step %d \n%s' %
                             (step, rouge_results_to_str(ex_long_rouges)))
            self.logger.info('Prediction Length ratio at step %d: %.2f' %
                             (step, test_length(pred_path, gold_path)))
            self.logger.info('Prediction Bleu at step %d: %.2f' %
                             (step, pred_bleu * 100))
            self.logger.info('Prediction Rouges at step %d \n%s' %
                             (step, rouge_results_to_str(pred_rouges)))
Example #20
from rouge import FilesRouge
import sys


def prepare_results(metric, p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(
        metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


#Script arguments
REFERENCE_PATH = sys.argv[1]
HYPOTHESIS_PATH = sys.argv[2]

files_rouge = FilesRouge(HYPOTHESIS_PATH, REFERENCE_PATH)

scores = files_rouge.get_scores(avg=True)

for metric, results in sorted(scores.items(), key=lambda x: x[0]):
    print(prepare_results(metric, results['p'], results['r'], results['f']))
    print()

#print(scores)
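# Invocation sketch (script name is hypothetical); note the argument order:
# reference file first, hypothesis file second.
#   python files_rouge_report.py reference.txt hypothesis.txt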
Example #21
    def evaluate(self, release=False):
        eval_start_time = time.time()
        if self.eval_queue is None:
            self.eval_queue = androidreader.Reader(
                subtoken_to_index=self.subtoken_to_index,
                node_to_index=self.node_to_index,
                target_to_index=self.target_to_index,
                config=self.config,
                is_evaluating=True)
            reader_output = self.eval_queue.get_output()
            self.eval_predicted_indices_op, self.eval_topk_values, _, _, self.method_embedding = \
                self.build_test_graph(reader_output)
            self.eval_true_target_strings_op = reader_output[
                androidreader.TARGET_STRING_KEY]
            self.eval_tag_key_op = reader_output[androidreader.TARGET_TAG_KEY]
            self.saver = tf.train.Saver(max_to_keep=10)

        if self.config.LOAD_PATH and not self.config.TRAIN_PATH:
            self.initialize_session_variables(self.sess)
            self.load_model(self.sess)
            if release:
                release_name = self.config.LOAD_PATH + '.release'
                print('Releasing model, output model: %s' % release_name)
                self.saver.save(self.sess, release_name)
                shutil.copyfile(src=self.config.LOAD_PATH + '.dict',
                                dst=release_name + '.dict')
                return None
        model_dirname = os.path.dirname(self.config.SAVE_PATH if self.config.
                                        SAVE_PATH else self.config.LOAD_PATH)
        ref_file_name = model_dirname + '/ref.txt'
        predicted_file_name = model_dirname + '/pred.txt'
        embedding_file_name = model_dirname + '/embedding.txt'
        if not os.path.exists(model_dirname):
            os.makedirs(model_dirname)

        # print("itern decoder size is " + str(self.config.DECODER_SIZE))
        with open(model_dirname + '/log.txt', 'w') as output_file, \
            open(ref_file_name, 'w') as ref_file, \
            open( predicted_file_name, 'w') as pred_file, \
            open( embedding_file_name, "w") as embedding_file:
            num_correct_predictions = 0 if self.config.BEAM_WIDTH == 0 \
                else np.zeros([self.config.BEAM_WIDTH], dtype=np.int32)
            total_predictions = 0
            total_prediction_batches = 0
            true_positive, false_positive, false_negative = 0, 0, 0
            self.eval_queue.reset(self.sess)
            start_time = time.time()

            try:
                while True:
                    predicted_indices, true_target_strings, top_values, method_embeddings, tag = self.sess.run(
                        [
                            self.eval_predicted_indices_op,
                            self.eval_true_target_strings_op,
                            self.eval_topk_values, self.method_embedding,
                            self.eval_tag_key_op
                        ], )
                    #print( tag.shape )
                    #print( tag[0])
                    #                    print( method_embeddings.shape )
                    #                    print( "0,0 " +  str(method_embeddings[0,0]))
                    #                    print( "MAX_LINE,MAX_COLUMN " + str(method_embeddings[ method_embeddings.shape[0] - 1 , method_embeddings.shape[1] - 1]))
                    #print( true_target_strings )
                    true_target_strings = Common.binary_to_string_list(
                        true_target_strings)
                    ref_file.write('\n'.join([
                        name.replace(Common.internal_delimiter, ' ')
                        for name in true_target_strings
                    ]) + '\n')
                    if self.config.BEAM_WIDTH > 0:
                        # predicted indices: (batch, time, beam_width)
                        predicted_strings = [[[
                            self.index_to_target[i] for i in timestep
                        ] for timestep in example]
                                             for example in predicted_indices]
                        predicted_strings = [
                            list(map(list, zip(*example)))
                            for example in predicted_strings
                        ]  # (batch, top-k, target_length)
                        pred_file.write('\n'.join([
                            ' '.join(Common.filter_impossible_names(words))
                            for words in predicted_strings[0]
                        ]) + '\n')
                    else:
                        predicted_strings = [[
                            self.index_to_target[i] for i in example
                        ] for example in predicted_indices]
                        pred_file.write('\n'.join([
                            ' '.join(Common.filter_impossible_names(words))
                            for words in predicted_strings
                        ]) + '\n')

                    num_correct_predictions = self.update_correct_predictions(
                        num_correct_predictions, output_file,
                        zip(true_target_strings, predicted_strings))
                    true_positive, false_positive, false_negative = self.update_per_subtoken_statistics(
                        zip(true_target_strings, predicted_strings),
                        true_positive, false_positive, false_negative)

                    total_predictions += len(true_target_strings)
                    total_prediction_batches += 1
                    if total_prediction_batches % self.num_batches_to_log == 0:
                        elapsed = time.time() - start_time
                        self.trace_evaluation(output_file,
                                              num_correct_predictions,
                                              total_predictions, elapsed)
                    embedding_file.write('\n'.join([
                        Common.binary_to_string(tag[i]) + ',' + ','.join([
                            str(method_embeddings[i, j])
                            for j in range(method_embeddings.shape[1])
                        ]) for i in range(method_embeddings.shape[0])
                    ]) + '\n')
            except tf.errors.OutOfRangeError:
                pass

            print('Done testing, epoch reached')
            output_file.write(
                str(num_correct_predictions / total_predictions) + '\n')
            # Common.compute_bleu(ref_file_name, predicted_file_name)

        elapsed = int(time.time() - eval_start_time)
        precision, recall, f1 = self.calculate_results(true_positive,
                                                       false_positive,
                                                       false_negative)
        try:
            files_rouge = FilesRouge()
            rouge = files_rouge.get_scores(hyp_path=predicted_file_name,
                                           ref_path=ref_file_name,
                                           avg=True,
                                           ignore_empty=True)
        except ValueError:
            rouge = 0
        print("Evaluation time: %sh%sm%ss" %
              ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
        return num_correct_predictions / total_predictions, \
               precision, recall, f1, rouge
Example #22
    def __call__(self, ref_path, hyp_path):
        from rouge import FilesRouge  # pylint: disable=import-outside-toplevel
        files_rouge = FilesRouge(hyp_path, ref_path)
        rouge_scores = files_rouge.get_scores(avg=True)
        return {name: rouge_scores[name]["f"] for name in self.scores_name}
Example #23
## Output (avg=True): a single dict with average values:
scores = rouge.get_scores(hyps, refs, avg=True)
pprint(scores)
# {'rouge-1': {'f': 0.41111110617777785,
#              'p': 0.4444444444444444,
#              'r': 0.38888888888888884},
#  'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
#  'rouge-l': {'f': 0.40158730158694217,
#              'p': 0.4444444444444444,
#              'r': 0.38888888888888884}}
print(scores['rouge-l']['f'])  # 0.40158730158694217
##################################################################
## Score two files (line by line)
# Given two files hyp_path and ref_path with the same number of lines, compute a score for each line, or the average over the whole file.
from rouge import FilesRouge
files_rouge = FilesRouge(hyp_path, ref_path)
scores = files_rouge.get_scores()

##################################################################
## Rouge for Chinese
from rouge import Rouge
from pprint import pprint
rouge = Rouge()

## for sentence
s_1 = '我是好人'
s_2 = '我是坏人'
score = rouge.get_scores(list(s_1), list(s_2))
print(score)

## for list
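# A minimal sketch of the list form, assuming whitespace-tokenised sentences
# (one hypothesis per reference, same length on both sides):
hyps = ['我 是 好 人', '今天 天气 不错']
refs = ['我 是 坏 人', '今天 天气 很 好']
pprint(rouge.get_scores(hyps, refs))            # one score dict per pair
pprint(rouge.get_scores(hyps, refs, avg=True))  # averaged over the list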
Example #24
def rouge_score(filename_gold, filename_result):
    files_rouge = FilesRouge(filename_result, filename_gold)
    scores = files_rouge.get_scores(avg=True)
    return scores
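# Usage sketch with hypothetical file names (gold file first, system output second):
# scores = rouge_score('gold.txt', 'generated.txt')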
Example #25
    # model parameters
    parser.add_argument('--vocab_len', type=int, default=12269, help='')
    parser.add_argument('--inredient_dim', type=int, default=3925, help='')
    parser.add_argument('--word_dim', type=int, default=256, help='')
    parser.add_argument('--sentEnd_hiddens', type=int, default=512, help='')
    parser.add_argument('--sentEnd_nlayers', type=int, default=1, help='')
    parser.add_argument('--recipe_inDim', type=int, default=1024, help='')
    parser.add_argument('--recipe_hiddens', type=int, default=1024, help='')
    parser.add_argument('--recipe_nlayers', type=int, default=1, help='')
    parser.add_argument('--sentDec_inDim', type=int, default=1024, help='')
    parser.add_argument('--sentDec_hiddens', type=int, default=512, help='')
    parser.add_argument('--sentDec_nlayers', type=int, default=1, help='')
    parser.add_argument('--sentences_sorted', type=int, default=1, help='')

    args = parser.parse_args()

    # Load dictionary mapping index to vocab string (decoded)
    with open('data/index_to_vocab.json', 'r') as vocabFile:
        index_to_vocab = json.load(vocabFile)

    # Generate results files with one recipe per line
    generate(args, saved_model_folder, epochs, split, results_path,
             index_to_vocab, video)

    # Calculate rouge score
    files_rouge = FilesRouge()
    outputs_path = os.path.join(results_path, 'outputs_{}.txt'.format(split))
    ref_path = os.path.join(results_path, 'gt_{}.txt'.format(split))
    scores = files_rouge.get_scores(outputs_path, ref_path, avg=True)
    print("ROUGE scores ", scores)
Example #26
    def score(self, labels_file, predictions_path):
        from rouge import FilesRouge
        files_rouge = FilesRouge(predictions_path, labels_file)
        rouge_scores = files_rouge.get_scores(avg=True)
        return {k: v["f"] for k, v in six.iteritems(rouge_scores)}
Example #27
from rouge import FilesRouge
import os
import os.path

hyp_path = "C:\\Users\\Admin\\Desktop\\virtual\\predicted_post.txt"
ref_path = "C:\\Users\\Admin\\Desktop\\virtual\\answer_true_post.txt"

files_rouge = FilesRouge()
scores = files_rouge.get_scores(hyp_path, ref_path, avg=True)
print(scores)
Example #28
    def evaluate(self):
        if not self.model:
            print('Model is not initialized')
            exit(-1)

        print("Testing...")
        eval_start_time = time.time()

        if self.config.LOAD_PATH and not self.config.TRAIN_PATH:
            model_dirname = os.path.dirname(self.config.LOAD_PATH)
        elif self.config.MODEL_PATH:
            model_dirname = os.path.dirname(self.config.MODEL_PATH)
        else:
            model_dirname = None
            print('Model directory is missing')
            exit(-1)

        ref_file_name = os.path.join(model_dirname, 'ref.txt')
        predicted_file_name = os.path.join(model_dirname, 'pred.txt')
        if not os.path.exists(model_dirname):
            os.makedirs(model_dirname)

        log_file_name = os.path.join(model_dirname, 'log.txt')
        with open(log_file_name, 'w') as output_file, open(
                ref_file_name, 'w') as ref_file, open(predicted_file_name,
                                                      'w') as pred_file:
            num_correct_predictions = 0 if self.config.BEAM_WIDTH == 0 \
                else np.zeros([self.config.BEAM_WIDTH], dtype=np.int32)
            total_predictions = 0
            total_prediction_batches = 0
            true_positive, false_positive, false_negative = 0, 0, 0
            dataset = self.test_dataset_reader.get_dataset()
            start_time = time.time()

            for input_tensors in dataset:
                true_target_strings = input_tensors[reader.TARGET_STRING_KEY]

                batched_contexts = self.model.run_encoder(input_tensors,
                                                          is_training=False)
                outputs, final_states = self.model.run_decoder(
                    batched_contexts, input_tensors, is_training=False)

                if self.config.BEAM_WIDTH > 0:
                    predicted_indices = outputs.predicted_ids
                else:
                    predicted_indices = outputs.sample_id

                true_target_strings = Common.binary_to_string_list(
                    true_target_strings.numpy())
                ref_file.write('\n'.join([
                    name.replace(Common.internal_delimiter, ' ')
                    for name in true_target_strings
                ]) + '\n')
                if self.config.BEAM_WIDTH > 0:
                    # predicted indices: (batch, time, beam_width)
                    predicted_strings = [[
                        [self.index_to_target[i] for i in timestep]
                        for timestep in example
                    ] for example in predicted_indices.numpy()]
                    predicted_strings = [
                        list(map(list, zip(*example)))
                        for example in predicted_strings
                    ]  # (batch, top-k, target_length)
                    pred_file.write('\n'.join([
                        ' '.join(Common.filter_impossible_names(words))
                        for words in predicted_strings[0]
                    ]) + '\n')
                else:
                    predicted_strings = [[
                        self.index_to_target[i] for i in example
                    ] for example in predicted_indices.numpy()]
                    pred_file.write('\n'.join([
                        ' '.join(Common.filter_impossible_names(words))
                        for words in predicted_strings
                    ]) + '\n')

                num_correct_predictions = update_correct_predictions(
                    self.config.BEAM_WIDTH, num_correct_predictions,
                    output_file, zip(true_target_strings, predicted_strings))
                true_positive, false_positive, false_negative = update_per_subtoken_statistics(
                    self.config.BEAM_WIDTH,
                    zip(true_target_strings, predicted_strings), true_positive,
                    false_positive, false_negative)

                total_predictions += len(true_target_strings)
                total_prediction_batches += 1
                if total_prediction_batches % self.num_batches_to_log == 0:
                    elapsed = time.time() - start_time
                    trace_evaluation(output_file, num_correct_predictions,
                                     total_predictions, elapsed)

            print('Done testing, epoch reached', flush=True)
            output_file.write(
                str(num_correct_predictions / total_predictions) + '\n')

        elapsed = int(time.time() - eval_start_time)
        precision, recall, f1 = calculate_results(true_positive,
                                                  false_positive,
                                                  false_negative)
        files_rouge = FilesRouge(predicted_file_name, ref_file_name)
        rouge = files_rouge.get_scores(avg=True, ignore_empty=True)
        print("Evaluation time: %sh%sm%ss" %
              ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
        return num_correct_predictions / total_predictions, precision, recall, f1, rouge
Example #29
import os
import texar.torch as tx
from rouge import FilesRouge
files_rouge = FilesRouge()

def get_all_files(path):
    if os.path.isfile(path): 
        return [path]
    return os.listdir(path)


def replace_type_by_mention(raw_file_path, type_file_path):
    
    raw_file = open(raw_file_path, 'r')
    type_file = open(type_file_path, 'r')

    def get_hypothesis_list(type_file, raw_file):

        raw_file = raw_file.readlines()
        type_file = type_file.readlines()

        hypothesis_list = []
        for types, texts in zip(type_file, raw_file):
            outs = []
            ents = [ent for ent in types.strip().split()]
            for word in texts.strip().split():
                if word == '<ss>':
                    if ents:
                        outs.append(ents[0])
                        ents = ents[1:]
                    else:
Example #30
    def __call__(self, ref_path, hyp_path):
        scorer = FilesRouge(metrics=list(self.scores_name))
        rouge_scores = scorer.get_scores(hyp_path, ref_path, avg=True)
        return {name: rouge_scores[name]["f"] for name in self.scores_name}
Example #31
def _compute_file_rouge(ref_path, hyp_path):
    """hyp_path:predict file"""
    files_rouge = FilesRouge(hyp_path, ref_path)
    scores = files_rouge.get_scores(avg=True)
    return scores
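# Usage sketch with hypothetical paths:
scores = _compute_file_rouge(ref_path='references.txt', hyp_path='predictions.txt')
print(scores['rouge-l']['f'])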