Example #1
def CallRougeForOneSum(hyp_sents, ref_sents, _ngram=1, _l=False, _w=False, _su4=False, _stem=True, _stopw=True,
                       _lenlmt=False, _len=200):
    rouge = Pythonrouge(summary_file_exist=False, summary=[hyp_sents], reference=[ref_sents], xml_dir=rouge_xml_dir,
                        n_gram=_ngram, ROUGE_L=_l, ROUGE_W=_w, ROUGE_SU4=_su4,
                        stemming=_stem, stopwords=_stopw, length_limit=_lenlmt, length=_len)
    score = rouge.calc_score()
    return score
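# --- Usage sketch (not part of the original snippet; illustrative only) ---
# CallRougeForOneSum wraps its arguments once more before handing them to
# Pythonrouge, so hyp_sents must be one summary given as a list of sentence
# strings, and ref_sents a list of reference summaries, each itself a list of
# sentences. rouge_xml_dir is a module-level name the function above assumes.
rouge_xml_dir = "./rouge_xml"  # hypothetical path; must be an existing, writable directory
hyp_sents = ["the cat sat on the mat .", "it looked very comfortable ."]
ref_sents = [["a cat was sitting on the mat .", "it seemed comfortable ."]]
score = CallRougeForOneSum(hyp_sents, ref_sents, _ngram=2, _l=True)
print(score)  # dict of ROUGE scores computed by the perl ROUGE-1.5.5 backend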
Example #2
 def test_rouge_with_length_limit(self):
     data = self.load_test_data()
     rouge = RougeCalculator(stopwords=True, length_limit=50)
     for eval_id in data:
         summaries = data[eval_id]["summaries"]
         references = data[eval_id]["references"]
         for n in [1, 2]:
             for s in summaries:
                 baseline = Pythonrouge(summary_file_exist=False,
                                        summary=[[s]],
                                        reference=[[[r]
                                                    for r in references]],
                                        n_gram=n,
                                        recall_only=False,
                                        length_limit=True,
                                        length=50,
                                        stemming=False,
                                        stopwords=True)
                 b1_v = baseline.calc_score()
                 b2_v = rouge_n(rouge.preprocess(s),
                                [rouge.preprocess(r) for r in references],
                                n, 0.5)
                 v = rouge.rouge_n(s, references, n)
                 self.assertLess(abs(b2_v - v), 1e-5)
                 self.assertLess(abs(b1_v["ROUGE-{}-F".format(n)] - v),
                                 1e-5)  # noqa
Example #3
def lead3_baseline(path_file):
    with open(path_file, 'r') as fd:
        lines = fd.read().splitlines()
    pred_str_bag, ref_str_bag = [], []
    for l in lines:
        name, doc, abst, span_info, gold = l.split('\t')
        doc = doc.split()
        indices = [i for i, x in enumerate(doc) if x == "@@SS@@"]
        abs_str = abst.replace("@@SS@@", "\n").split("\n")
        abs_str = [x for x in abs_str if len(x) > 1]
        if len(indices) > 2:
            sent1 = ' '.join(doc[:indices[0]]).replace("@@SS@@", "")
            sent2 = ' '.join(doc[indices[0]:indices[1]]).replace("@@SS@@", "")
            sent3 = ' '.join(doc[indices[1]:indices[2]]).replace("@@SS@@", "")
            lead3 = [sent1, sent2, sent3]
        else:
            lead3 = ' '.join(doc).replace("@@SS@@", "\n").split("\n")
            lead3 = [x for x in lead3 if len(x) > 1]
        pred_str_bag.append(lead3)
        ref_str_bag.append([abs_str])
    print('Finish reading')
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=pred_str_bag, reference=ref_str_bag,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=True, ROUGE_W=True,
                        ROUGE_W_Weight=1.2,
                        recall_only=False, stemming=True, stopwords=False,
                        word_level=True, length_limit=False, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5, default_conf=True)
    score = rouge.calc_score()
    print(score)
Example #4
 def compute_rouge(candidate: typing.List[int]) -> float:
     hypothesis = [doc_sents[sid] for sid in sorted(candidate)]
     rouge = Pythonrouge(
         summary_file_exist=False, summary=[hypothesis], reference=[[ref_sents]],
         stemming=False, ROUGE_SU4=False)
     score = rouge.calc_score()
     return score['ROUGE-1-F']
Example #5
def evaluate(gen_summary, ref_summary, genref_summaries):
    references = []
    ref_subbed_sentences = re.sub(r'(@)', r'_\1_', ref_summary)
    ref_sentences = nltk.word_tokenize(ref_subbed_sentences)
    references.append(ref_sentences)

    for summary in genref_summaries:
        ref_subbed_sentences = re.sub(r'(@)', r'_\1_', summary)
        ref_sentences = nltk.word_tokenize(ref_subbed_sentences)
        references.append(ref_sentences)

    gen_subbed_sentences = re.sub(r'(@)', r'_\1_', gen_summary)
    gen_sentences = nltk.word_tokenize(gen_subbed_sentences)
    bleu_score = corpus_bleu([references], [gen_sentences], weights=(1, 0))

    rouge = Pythonrouge(summary_file_exist=False,
                        summary=[gen_sentences],
                        reference=[references],
                        n_gram=1,
                        ROUGE_SU4=True,
                        ROUGE_L=False,
                        recall_only=False,
                        stemming=True,
                        stopwords=True,
                        word_level=True,
                        use_cf=False,
                        cf=95,
                        scoring_formula='best',
                        resampling=True,
                        samples=1000,
                        favor=True,
                        p=0.5)
    rouge_score = rouge.calc_score()
    return bleu_score, rouge_score
Example #6
def RougeEvaluation(refFile, summary_sentences_list):
    file_open = open(refFile, "r")
    gold_standard = tokenizer.tokenize(file_open.read())
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=[summary_sentences_list],
                        reference=[[gold_standard]],
                        n_gram=2,
                        ROUGE_SU4=False,
                        ROUGE_L=True,
                        recall_only=False,
                        stemming=True,
                        stopwords=False,
                        word_level=True,
                        length_limit=True,
                        length=5000,
                        use_cf=False,
                        cf=95,
                        scoring_formula='average',
                        resampling=True,
                        samples=1000,
                        favor=True,
                        p=0.5)
    score = rouge.calc_score()
    print(score)
    print('\n')
Example #7
def sentence_rouge(reflex, genlex):
    rouge = Pythonrouge(n_gram=2, ROUGE_SU4=True, ROUGE_L=True, stemming=True, stopwords=True, word_level=True, length_limit=True, \
            length=50, use_cf=False, cf=95, scoring_formula="average", resampling=True, samples=1000, favor=True, p=0.5)
    genlex = [[genlex,]]
    reflex = [[[reflex,]]]
    setting_file = rouge.setting(files=False, summary=genlex, reference=reflex)
    result = rouge.eval_rouge(setting_file, recall_only=False, ROUGE_path=ROUGE_path, data_path=data_path)
    return result['ROUGE-L-F']
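# --- Usage sketch (not part of the original snippet; illustrative only) ---
# sentence_rouge takes plain sentence strings (reference first, generated
# second) and wraps them into the nested lists Pythonrouge expects. ROUGE_path
# and data_path are module-level globals; the paths below are assumptions.
ROUGE_path = "./ROUGE-1.5.5/ROUGE-1.5.5.pl"  # assumed location of the perl script
data_path = "./ROUGE-1.5.5/data"             # assumed location of the ROUGE data directory
print(sentence_rouge("a cat was on the mat .", "the cat sat on the mat ."))  # ROUGE-L F-score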
Example #8
 def similarity_rouge(self, s1, s2):
     rouge = Pythonrouge(summary_file_exist=False,
                         summary=s1, reference=s2,
                         n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                         recall_only=True, stemming=False, stopwords=False,
                         word_level=True, length_limit=True, length=50,
                         use_cf=False, cf=95, scoring_formula='average',
                         resampling=True, samples=1000, favor=True, p=0.5)
     ROUGE_score = rouge.calc_score()
     return list(ROUGE_score.values())[2]
Example #9
def eval_indiv_rouge(hyp_dir, ref_dir):
    print(hyp_dir, 'vs', ref_dir)
    # create tmp dirs with common cases
    hyp_set = set(os.listdir(hyp_dir))
    ref_set = set(os.listdir(ref_dir))
    common_set = hyp_set.intersection(ref_set)

    scores = collections.defaultdict(list)

    for casefile in common_set:
        tmp_hyp = './tmp_hyp'
        tmp_ref = './tmp_ref'
        os.mkdir(tmp_hyp)
        os.mkdir(tmp_ref)

        # copy to tmp dirs
        shutil.copyfile(os.path.join(hyp_dir, casefile),
                        os.path.join(tmp_hyp, casefile))
        shutil.copyfile(os.path.join(ref_dir, casefile),
                        os.path.join(tmp_ref, casefile))

        assert os.listdir(tmp_hyp) == os.listdir(tmp_ref)
        rouge = Pythonrouge(summary_file_exist=True,
                            peer_path=tmp_hyp,
                            model_path=tmp_ref,
                            n_gram=2,
                            ROUGE_SU4=True,
                            ROUGE_L=True,
                            recall_only=True,
                            stemming=True,
                            stopwords=True,
                            word_level=True,
                            length_limit=True,
                            length=50,
                            use_cf=False,
                            cf=95,
                            scoring_formula='average',
                            resampling=True,
                            samples=1000,
                            favor=True,
                            p=0.5)
        score = rouge.calc_score()
        print(casefile, score)
        for key in score:
            scores[key].append(score[key])

        shutil.rmtree(tmp_hyp)
        shutil.rmtree(tmp_ref)

    print([
        "%s: mean %.3f std %.3f" %
        (key, round(sum(scores[key]) / len(scores[key]),
                    3), round(statistics.stdev(scores[key]), 3))
        for key in scores
    ])
Example #10
def rouge_sent(predict_path, golden_path):

    import spacy
    nlp = spacy.load('en_core_web_sm')

    with open(predict_path, 'r') as rf:
        predict_summary = rf.read().replace('\n', ' ')
    with open(golden_path, 'r') as rf:
        golden_summary = rf.read().replace('\n', ' ')

    doc = nlp(predict_summary)
    predict_sent_list = [sent.text for sent in doc.sents]

    doc = nlp(golden_summary)
    golden_sent_list = [sent.text for sent in doc.sents]

    max_rouge_1 = -99999
    max_rouge_2 = -99999

    for p_idx, predict_sent in enumerate(predict_sent_list):
        for g_idx, golden_sent in enumerate(golden_sent_list):

            # Initialize ROUGE settings to evaluate ROUGE-1, 2, and SU4.
            # When scoring sentence lists as above, set summary_file_exist=False.
            # With recall_only=True, calc_score returns recall scores only.
            rouge = Pythonrouge(summary_file_exist=False,
                                summary=[[predict_sent]], reference=[[[golden_sent]]],
                                n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                                recall_only=True, stemming=True, stopwords=True,
                                word_level=True, length_limit=True, length=50,
                                use_cf=False, cf=95, scoring_formula='average',
                                resampling=True, samples=1000, favor=True, p=0.5)
            score = rouge.calc_score()

            if score['ROUGE-1'] > max_rouge_1:
                max_rouge_1 = score['ROUGE-1']
                p_idx_rouge_1_save = p_idx
                g_idx_rouge_1_save = g_idx

            if score['ROUGE-2'] > max_rouge_2:
                max_rouge_2 = score['ROUGE-2']
                p_idx_rouge_2_save = p_idx
                g_idx_rouge_2_save = g_idx

    print("Max ROUGE-1 score among sentences: %.5f" % max_rouge_1)
    print("[Predicted Sentence]\n%s" % predict_sent_list[p_idx_rouge_1_save])
    print("[Golden Sentence]\n%s" % golden_sent_list[g_idx_rouge_1_save])

    print()

    print("Max ROUGE-2 score among sentences: %.5f" % max_rouge_2)
    print("[Predicted Sentence]\n%s" % predict_sent_list[p_idx_rouge_2_save])
    print("[Golden Sentence]\n%s" % golden_sent_list[g_idx_rouge_2_save])
Example #11
def calculate_rouge_scores(summaries,
                           references,
                           max_length,
                           root=None,
                           global_step=None):
    # command to install pythonrouge: pip install git+https://github.com/tagucci/pythonrouge.git
    from pythonrouge.pythonrouge import Pythonrouge

    logging.info('calculate ROUGE scores of %d summaries', len(summaries))
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=summaries,
                        reference=references,
                        n_gram=2,
                        ROUGE_SU4=False,
                        ROUGE_L=True,
                        recall_only=False,
                        stemming=True,
                        stopwords=False,
                        word_level=True,
                        length_limit=max_length is not None,
                        length=max_length,
                        use_cf=False,
                        cf=95,
                        scoring_formula='average',
                        resampling=True,
                        samples=1000,
                        favor=True,
                        p=0.5)
    score = rouge.calc_score()
    logging.info('ROUGE(1/2/L) Scores:')
    logging.info('>   ROUGE-1-R/F1: %f / %f', score['ROUGE-1-R'],
                 score['ROUGE-1-F'])
    logging.info('>   ROUGE-2-R/F1: %f / %f', score['ROUGE-2-R'],
                 score['ROUGE-2-F'])
    logging.info('>   ROUGE-L-R/F1: %f / %f', score['ROUGE-L-R'],
                 score['ROUGE-L-F'])
    avg_token_count = sum(
        len(' '.join(summary).split())
        for summary in summaries) / len(summaries)
    avg_token_count_ref = sum(
        len(' '.join(summary[0]).split())
        for summary in references) / len(references)
    logging.info('>   averageToken: %f / %f', avg_token_count,
                 avg_token_count_ref)

    if root is not None and global_step is not None:
        for key in ['ROUGE-1-F', 'ROUGE-2-F']:
            swriter = tf.summary.FileWriter(os.path.join(root, key))
            summary = tf.Summary(value=[
                tf.Summary.Value(tag='ROUGE(F1)', simple_value=score[key])
            ])
            swriter.add_summary(summary, global_step)
            swriter.close()
Example #12
def get_rouge(input_sentences, summary, references_dir, order=None, verbose=0):
    """
    Calculate ROUGE scores for generated summaries and references.
    :param input_sentences: unmodified input sentences to calculate extractiveness
    :param summary: list of summaries
    :param references_dir: contains reference files
    :param order: order during loading input
    :param verbose: verbosity
    :return: ROUGE scores
    """
    if order is None:
        order = range(len(summary))
    reference_filenames = os.listdir(references_dir)
    references_all = [[] for _ in reference_filenames]
    for i, reference_filename in enumerate(reference_filenames):
        with open(os.path.join(references_dir, reference_filename)) as f:
            for line in f:
                references_all[i].append([line.rstrip()])
    references = [[references_list[o] for references_list in references_all] for o in order]

    if verbose > 0:
        for i, s, r_list in zip(input_sentences, summary, references):
            print('input, generated sentence and references:')
            print('{}'.format(i))
            print('{}'.format(s))
            for r in r_list:
                print('{}'.format(r))
            print('')
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=summary, reference=references,
                        n_gram=2, ROUGE_SU4=False, ROUGE_L=True,
                        recall_only=True,
                        stemming=True, stopwords=False,
                        word_level=False, length_limit=True, length=75,
                        use_cf=True, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
    scores = rouge.calc_score()
    logger.info('ROUGE-1: {ROUGE-1:.4f}  ROUGE-2: {ROUGE-2:.4f}  ROUGE-L: {ROUGE-L:.4f}'.format(**scores))

    words_total = 0
    words_ext = 0
    for i, s in zip(input_sentences, summary):
        words_total += len(set(s[0].split(' ')))
        words_ext += len(set(s[0].split(' ')) & set(i.split(' ')))
    logger.info('{0:.2f}% extractive'.format(100.0 * words_ext / words_total))

    plt.hist([len(s[0]) for s in summary], bins=30)
    plt.xlabel('output characters', fontsize=11)
    plt.savefig('plot.png')

    return scores
Example #13
def eval_rouge(hyp_dir, ref_dir):
    print(hyp_dir, 'vs', ref_dir)
    # create tmp dirs with common cases
    hyp_set = set(os.listdir(hyp_dir))
    ref_set = set(os.listdir(ref_dir))
    common_set = hyp_set.intersection(ref_set)
    # print(len(hyp_set))
    # print(len(ref_set))
    # print(len(common_set))

    tmp_hyp = './tmp_hyp'
    tmp_ref = './tmp_ref'

    if not os.path.isdir(tmp_hyp):
        os.mkdir(tmp_hyp)
    if not os.path.isdir(tmp_ref):
        os.mkdir(tmp_ref)

    # copy to tmp dirs
    for casefile in common_set:
        shutil.copyfile(os.path.join(hyp_dir, casefile),
                        os.path.join(tmp_hyp, casefile))
        shutil.copyfile(os.path.join(ref_dir, casefile),
                        os.path.join(tmp_ref, casefile))

    assert os.listdir(tmp_hyp) == os.listdir(tmp_ref)
    rouge = Pythonrouge(summary_file_exist=True,
                        peer_path=tmp_hyp,
                        model_path=tmp_ref,
                        n_gram=2,
                        ROUGE_SU4=True,
                        ROUGE_L=True,
                        recall_only=True,
                        stemming=True,
                        stopwords=True,
                        word_level=True,
                        length_limit=True,
                        length=50,
                        use_cf=False,
                        cf=95,
                        scoring_formula='average',
                        resampling=True,
                        samples=1000,
                        favor=True,
                        p=0.5)
    score = rouge.calc_score()
    print(score)

    shutil.rmtree(tmp_hyp)
    shutil.rmtree(tmp_ref)
Example #14
def rouge_sent(predict_summary, golden_summary):

    import spacy
    nlp = spacy.load('en_core_web_sm')
    predict_summary = predict_summary.replace('\n', ' ')
    golden_summary = golden_summary.replace('\n', ' ')

    doc = nlp(predict_summary)
    predict_sent_list = [sent.text for sent in doc.sents]

    doc = nlp(golden_summary)
    golden_sent_list = [sent.text for sent in doc.sents]

    max_rouge_1 = -99999
    max_rouge_2 = -99999

    for p_idx, predict_sent in enumerate(predict_sent_list):
        for g_idx, golden_sent in enumerate(golden_sent_list):

            # Initialize ROUGE settings to evaluate ROUGE-1, 2, and SU4.
            # When scoring sentence lists as above, set summary_file_exist=False.
            # With recall_only=True, calc_score returns recall scores only.
            rouge = Pythonrouge(summary_file_exist=False,
                                summary=[[predict_sent]],
                                reference=[[[golden_sent]]],
                                n_gram=2,
                                ROUGE_SU4=True,
                                ROUGE_L=True,
                                recall_only=True,
                                stemming=True,
                                stopwords=True,
                                word_level=True,
                                length_limit=True,
                                length=50,
                                use_cf=False,
                                cf=95,
                                scoring_formula='average',
                                resampling=True,
                                samples=1000,
                                favor=True,
                                p=0.5)
            score = rouge.calc_score()

            if score['ROUGE-1'] > max_rouge_1:
                max_rouge_1 = score['ROUGE-1']

            if score['ROUGE-2'] > max_rouge_2:
                max_rouge_2 = score['ROUGE-2']

    return {"ROUGE-1": max_rouge_1, "ROUGE-2": max_rouge_2}
Example #15
File: evaluate.py  Project: Saichethan/GDES
def eval_str(cs, ref):

    sci_text = str(cs).replace("\n", " ")
    gold_text = str(ref).replace("\n", " ")

    ref_summary = gold_text
    ref_bleu = []
    ref_bleu.append(gold_text.split(" "))

    reference = []
    reference.append([[gold_text]])

    cs_bleu = sci_text.split(" ")

    b = []

    b.append(sentence_bleu(ref_bleu, cs_bleu, weights=(1, 0, 0, 0)))  #1 gram
    b.append(sentence_bleu(ref_bleu, cs_bleu, weights=(0, 1, 0, 0)))  #2 gram
    b.append(sentence_bleu(ref_bleu, cs_bleu, weights=(0, 0, 1, 0)))  #3 gram
    b.append(sentence_bleu(ref_bleu, cs_bleu, weights=(0, 0, 0, 1)))  #4 gram

    answer = []
    answer.append([sci_text])

    r = Pythonrouge(summary_file_exist=False,
                    summary=answer,
                    reference=reference,
                    n_gram=2,
                    ROUGE_SU4=False,
                    ROUGE_L=True,
                    recall_only=False,
                    stemming=True,
                    stopwords=False,
                    word_level=True,
                    length_limit=True,
                    length=600,
                    use_cf=False,
                    cf=95,
                    scoring_formula='best',
                    resampling=True,
                    samples=1,
                    favor=True,
                    p=0.5)

    score = r.calc_score()

    return b[0], b[1], b[2], b[3], score["ROUGE-1-P"], score[
        "ROUGE-1-R"], score["ROUGE-1-F"], score["ROUGE-2-P"], score[
            "ROUGE-2-R"], score["ROUGE-2-F"], score["ROUGE-L-P"], score[
                "ROUGE-L-R"], score["ROUGE-L-F"]
Example #16
 def __init__(self, csv_file, type_reward):
     self.csv_file = csv_file
     self.type_reward = type_reward  # \in {rouge-1, rouge-2, rouge-l, rouge-avg} #
     self.evaluator = Pythonrouge(summary_file_exist=False,
                                  delete_xml=True,
                                  summary=[],
                                  reference=[],
                                  n_gram=2,
                                  ROUGE_SU4=False,
                                  ROUGE_L=True,
                                  f_measure_only=True,
                                  stemming=True,
                                  stopwords=False,
                                  word_level=True,
                                  length_limit=False)
Example #17
def rouge_para(predict_path, golden_path):

    # Initialize ROUGE settings to evaluate ROUGE-1, 2, and SU4.
    # With summary_file_exist=True, specify the predicted summary (peer_path) and golden summary (model_path) directories.
    rouge = Pythonrouge(summary_file_exist=True,
                        peer_path=predict_path, model_path=golden_path,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=True,
                        recall_only=True,
                        stemming=True, stopwords=True,
                        word_level=True, length_limit=True, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)

    score = rouge.calc_score()
    print(score)
Example #18
def evaluate_rouge_scores(evaluation_file_name):
    summaries = []  # model-generated
    references = []  # human-generated
    # articles = {}
    with gzip.open(evaluation_file_name) as json_file:
        json_data = json_file.read()
        data = json.loads(json_data)
        print("%d entries..." % len(data))
        for example in data:
            # datum = example['data']
            # if not datum in articles:
            # articles[datum] = True
            summaries.append(
                remove_tags(example['prediction']).encode('utf-8').split())
            references.append([
                remove_tags(example).encode('utf-8').split()
                for example in example['label']
            ])
    print("%d entries are used for evaluation." % len(summaries))
    # DEBUG: print a couple examples and their respective ROUGE scores
    # print(zip(summaries[5:10], references[5:10]))
    # rouge = Pythonrouge(n_gram=2, ROUGE_SU4=False, ROUGE_L=True, stemming=False, stopwords=False, word_level=True, length_limit=False, length=50, use_cf=True, cf=95, scoring_formula="average", resampling=False, samples=500, favor=False, p=0.5)
    # setting_file = rouge.setting(files=False, summary=summaries[5:10], reference=references[5:10])
    # print(rouge.eval_rouge(setting_file, recall_only=False, ROUGE_path=ROUGE_PATH, data_path=ROUGE_DATA, f_measure_only=False))
    rouge = Pythonrouge(n_gram=2,
                        ROUGE_SU4=False,
                        ROUGE_L=True,
                        stemming=False,
                        stopwords=False,
                        word_level=True,
                        length_limit=False,
                        length=50,
                        use_cf=True,
                        cf=95,
                        scoring_formula="average",
                        resampling=False,
                        samples=500,
                        favor=False,
                        p=0.5)
    setting_file = rouge.setting(files=False,
                                 summary=summaries,
                                 reference=references)
    result = rouge.eval_rouge(setting_file,
                              recall_only=False,
                              ROUGE_path=ROUGE_PATH,
                              data_path=ROUGE_DATA,
                              f_measure_only=False)
    return result
Example #19
def rouge(hyp, ref, n=None):
    # 1 - 4, L
    hyp, ref = " ".join(hyp), " ".join(ref)
    ret = Pythonrouge(summary_file_exist=False,
                      summary=[[hyp]],
                      reference=[[[ref]]],
                      n_gram=4,
                      ROUGE_SU4=True,
                      ROUGE_L=True,
                      recall_only=True,
                      stemming=True,
                      stopwords=True,
                      word_level=True,
                      length_limit=True,
                      length=50,
                      use_cf=False,
                      cf=95,
                      scoring_formula='average',
                      resampling=True,
                      samples=1000,
                      favor=True,
                      p=0.5).calc_score()

    if n is None: return ret
    else: return ret["ROUGE-" + n]  # recall_only=True yields a flat dict keyed 'ROUGE-1', ..., 'ROUGE-L'
Example #20
def compute_perl_rouge(hyp_list, ref_list):
    def clean(x):
        return re.sub(r"-lrb-|-rrb-|-lcb-|-rcb-|-lsb-|-rsb-|``|''",
                      lambda m: perl_remap.get(m.group()), x)

    def preprocess(doc_list, reference=False):
        doc_list_pre = []
        if reference:
            doc_list = doc_list[0]
        for sent_str in doc_list:
            sent_str = clean(sent_str.lower())
            doc_list_pre.append(sent_str)
        if reference:
            doc_list_pre = [doc_list_pre]
        return doc_list_pre

    rouge = Pythonrouge(
        summary_file_exist=False,
        summary=[preprocess(hyp) for hyp in hyp_list],
        reference=[preprocess(ref, reference=True) for ref in ref_list],
        n_gram=2,
        ROUGE_SU4=False,
        ROUGE_L=True,
        ROUGE_W=False,
        ROUGE_W_Weight=1.2,
        recall_only=False,
        f_measure_only=False,
        stemming=True,
        stopwords=False,
        word_level=False,
        length_limit=False,
        length=50,
        use_cf=True,
        cf=95,
        scoring_formula='average',
        resampling=True,
        samples=1000,
        favor=False,
        p=0.5,
    )

    scores = rouge.calc_score()
    rouge_1_f = round(scores['ROUGE-1-F'] * 100., 2)
    rouge_2_f = round(scores['ROUGE-2-F'] * 100., 2)
    rouge_l_f = round(scores['ROUGE-L-F'] * 100., 2)
    return (rouge_1_f, rouge_2_f, rouge_l_f)
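# --- Usage sketch (not part of the original snippet; illustrative only) ---
# hyp_list holds one summary per document as a list of sentence strings;
# ref_list holds one reference summary per document, wrapped in one extra list
# as preprocess(..., reference=True) expects. perl_remap is a module-level dict
# (assumed) that maps -lrb-/-rrb-style tokens back to brackets.
hyps = [["the cat sat quietly on the mat ."]]
refs = [[["a cat was sitting on the mat ."]]]
print(compute_perl_rouge(hyps, refs))  # (ROUGE-1-F, ROUGE-2-F, ROUGE-L-F), scaled to percentages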
Example #21
def oracle_baseline(path_file):
    with open(path_file, 'r') as fd:
        lines = fd.read().splitlines()
    pred_str_bag, ref_str_bag = [], []
    for l in lines:
        name, doc, abst, span_info, gold = l.split('\t')
        doc = doc.split()
        span_info = [int(w) for w in span_info.split()]
        idx_in_span = list(zip(span_info[0::2], span_info[1::2]))
        gold_label = [int(l) for l in gold.split()]

        abs_str = abst.replace("@@SS@@", "\n").split("\n")
        abs_str = [x for x in abs_str if len(x) > 1]
        _buff = []
        for g in gold_label:
            content = doc[idx_in_span[g][0]:idx_in_span[g][1] + 1]
            _buff.append(' '.join(content).replace("@@SS@@", ""))

        pred_str_bag.append(_buff)
        ref_str_bag.append([abs_str])
    print('Finish reading')
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=pred_str_bag,
                        reference=ref_str_bag,
                        n_gram=2,
                        ROUGE_SU4=True,
                        ROUGE_L=True,
                        ROUGE_W=True,
                        ROUGE_W_Weight=1.2,
                        recall_only=False,
                        stemming=True,
                        stopwords=False,
                        word_level=True,
                        length_limit=False,
                        length=50,
                        use_cf=False,
                        cf=95,
                        scoring_formula='average',
                        resampling=True,
                        samples=1000,
                        favor=True,
                        p=0.5,
                        default_conf=True)
    score = rouge.calc_score()
    print(score)
Example #22
def liPaperEvaluation(pred_y, true_y):
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=pred_y,
                        reference=true_y,
                        f_measure_only=True,
                        n_gram=2,
                        ROUGE_SU4=False,
                        ROUGE_L=True,
                        stemming=False,
                        stopwords=False,
                        word_level=False,
                        length_limit=False,
                        use_cf=False,
                        cf=95,
                        scoring_formula="average",
                        resampling=True,
                        samples=1000)
    result = rouge.calc_score()
    print(result)
Example #23
 def __get_score(self,predicted_summ,gold_summ,**kwargs):
     summary=gold_summ
     reference=predicted_summ
     self.rouge = Pythonrouge(summary_file_exist=False,
                     summary=summary, reference=reference,
                     n_gram=3, ROUGE_SU4=False, ROUGE_L=True,
                     recall_only=True, stemming=True, stopwords=True,
                     word_level=True, length_limit=True, length=50,
                     use_cf=False, cf=95, scoring_formula='average',
                     resampling=True, samples=1000, favor=True, p=0.5)
     self.rouge.summary = gold_summ
     self.rouge.reference = predicted_summ
     score = self.rouge.calc_score()
     score = score['ROUGE-1'] + score['ROUGE-2'] * 5 + score['ROUGE-3'] * 2 + score['ROUGE-L'] * 2
     config = kwargs['config']
     score = kwargs['prev_score'] - score
     if (score > config.dqn_options.ERROR_THRESH):
         return score
     return -1
Example #24
def evaluate(system_summary,
             reference_summaries,
             stemming=False,
             stopwords=False,
             use_cf=False,
             ngram=2):
    ROUGE_path = "rouge_files/ROUGE-1.5.5/ROUGE-1.5.5.pl"
    data_path = "rouge_files/ROUGE-1.5.5/data/"

    # Initialize ROUGE settings to evaluate ROUGE-1 and ROUGE-2.
    rouge = Pythonrouge(n_gram=ngram,
                        ROUGE_SU4=False,
                        ROUGE_L=False,
                        stemming=stemming,
                        stopwords=stopwords,
                        word_level=True,
                        length_limit=True,
                        length=100,
                        use_cf=use_cf,
                        cf=95,
                        scoring_formula="average",
                        resampling=True,
                        samples=1000,
                        favor=True,
                        p=0.5)

    # system summary: list of summaries, where each summary is a list of sentences
    summary = [system_summary]

    # reference summaries: list of (list of summaries per article), where each summary is a list of sentences
    reference = [[[summary] for summary in reference_summaries]]

    setting_file = rouge.setting(files=False,
                                 summary=summary,
                                 reference=reference,
                                 temp_root='')

    result = rouge.eval_rouge(setting_file,
                              ROUGE_path=ROUGE_path,
                              data_path=data_path)

    return result
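# --- Usage sketch (not part of the original snippet; illustrative only) ---
# system_summary is one summary as a list of sentences; reference_summaries is
# a list of reference summaries, each given as a single string. The hard-coded
# ROUGE_path and data_path above are assumed to exist on disk.
system = ["the economy grew faster than expected .",
          "analysts revised their forecasts upward ."]
refs = ["economic growth beat expectations and forecasts were revised .",
        "the report showed stronger growth than analysts predicted ."]
print(evaluate(system, refs, stemming=True, ngram=2))  # dict of ROUGE-1/2 scores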
Example #25
def rouge_protocol(list_of_pred, list_of_reference):
    """
    # summary: double list
        summary = [[summaryA_sent1, summaryA_sent2],
                   [summaryB_sent1, summaryB_sent2]]
    # reference: triple list
    reference = [[[summaryA_ref1_sent1, summaryA_ref1_sent2],
                     [summaryA_ref2_sent1, summaryA_ref2_sent2]],
                     [[summaryB_ref1_sent1, summaryB_ref1_sent2],
                     [summaryB_ref2_sent1, summaryB_ref2_sent2]]
    :param list_of_pred: [[]]
    :param list_of_reference:[[[]]]
    :return:
    """
    # print(list_of_pred, list_of_reference)
    if (not isinstance(list_of_pred, List)) or (not isinstance(
            list_of_reference, List)):
        raise TypeError("Input should be list.")
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=list_of_pred,
                        reference=list_of_reference,
                        n_gram=2,
                        ROUGE_SU4=True,
                        ROUGE_L=True,
                        ROUGE_W=True,
                        ROUGE_W_Weight=1.2,
                        recall_only=False,
                        stemming=True,
                        stopwords=False,
                        word_level=True,
                        length_limit=False,
                        length=50,
                        use_cf=False,
                        cf=95,
                        scoring_formula='average',
                        resampling=True,
                        samples=1000,
                        favor=True,
                        p=0.5,
                        default_conf=True)
    score = rouge.calc_score()
    return score
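# --- Usage sketch (not part of the original snippet; illustrative only) ---
# The arguments follow the nesting shown in the docstring above: a double list
# of predicted summaries and a triple list of reference summaries (two
# references per prediction here).
preds = [["summaryA sent one .", "summaryA sent two ."],
         ["summaryB sent one .", "summaryB sent two ."]]
refs = [[["refA1 sent one .", "refA1 sent two ."],
         ["refA2 sent one .", "refA2 sent two ."]],
        [["refB1 sent one .", "refB1 sent two ."],
         ["refB2 sent one .", "refB2 sent two ."]]]
print(rouge_protocol(preds, refs))  # ROUGE-1/2/L/W/SU4 recall, precision and F scores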
Example #26
File: utils.py  Project: MinhajulMU/mysum
def eval_summaries(summaries, docs, logger=None, encoding='utf-8', delete_temps=True):
    if logger is None:
        logger = logging.getLogger(__name__)

    references = []
    hypotheses = []
    for summary, doc in zip(summaries, docs):
        refs = [[' '.join(sent) for sent in doc.summary]]
        hyp = [' '.join(doc.sentences[idx].words) for idx in summary]
        references.append(refs)
        hypotheses.append(hyp)

    assert len(references) == len(hypotheses), 'Number of references and hypotheses mismatch'

    ref_dirname = tempfile.mkdtemp()
    logger.info('References directory: %s', ref_dirname)
    hyp_dirname = tempfile.mkdtemp()
    logger.info('Hypotheses directory: %s', hyp_dirname)
    for doc_id, (refs, hyp) in enumerate(zip(references, hypotheses)):
        # Write references
        for rid, ref in enumerate(refs):
            ref_filename = os.path.join(ref_dirname, f'{doc_id}.{rid}.txt')
            with open(ref_filename, 'w', encoding=encoding) as f:
                print('\n'.join(ref), file=f)
        # Write hypothesis
        hyp_filename = os.path.join(hyp_dirname, f'{doc_id}.txt')
        with open(hyp_filename, 'w', encoding=encoding) as f:
            print('\n'.join(hyp), file=f)

    rouge = Pythonrouge(
        peer_path=hyp_dirname, model_path=ref_dirname, stemming=False, ROUGE_L=True,
        ROUGE_SU4=False)
    score = rouge.calc_score()
    logger.info('ROUGE scores: %s', score)

    if delete_temps:
        logger.info('Deleting temporary files and directories')
        shutil.rmtree(ref_dirname)
        shutil.rmtree(hyp_dirname)

    return score
Example #27
File: dtitle.py  Project: gang4gh/dl
    def _calculate_rouge_scores(self, summaries, references, max_length=None):
        # command to install pythonrouge: pip install git+https://github.com/tagucci/pythonrouge.git
        from pythonrouge.pythonrouge import Pythonrouge

        logging.info('calculate ROUGE scores of %d summaries', len(summaries))
        rouge = Pythonrouge(summary_file_exist=False,
                            summary=summaries,
                            reference=references,
                            n_gram=2,
                            ROUGE_SU4=False,
                            ROUGE_L=True,
                            recall_only=False,
                            stemming=True,
                            stopwords=False,
                            word_level=True,
                            length_limit=max_length is not None,
                            length=max_length,
                            use_cf=False,
                            cf=95,
                            scoring_formula='average',
                            resampling=True,
                            samples=1000,
                            favor=True,
                            p=0.5)
        scores = rouge.calc_score()
        logging.info('ROUGE(1/2/L) Scores:')
        logging.info('>   ROUGE-1-R/F1: %f / %f', scores['ROUGE-1-R'],
                     scores['ROUGE-1-F'])
        logging.info('>   ROUGE-2-R/F1: %f / %f', scores['ROUGE-2-R'],
                     scores['ROUGE-2-F'])
        logging.info('>   ROUGE-L-R/F1: %f / %f', scores['ROUGE-L-R'],
                     scores['ROUGE-L-F'])
        avg_token_count = sum(
            len(' '.join(summary).split())
            for summary in summaries) / len(summaries)
        avg_token_count_ref = sum(
            len(' '.join(summary[0]).split())
            for summary in references) / len(references)
        logging.info('>   averageToken: %f / %f', avg_token_count,
                     avg_token_count_ref)
        return scores
Example #28
 def calc_item_rouge(item):
     """
     Compute ROUGE for a single item.
     :return:
     """
     para = item['data']
     # print(para)
     labels = item['label']
     pattern = '<s>([^<]*)</s>'
     sentences = re.findall(pattern, para)
     # print(len(sentences))
     reference = [re.findall(pattern, i)[0] for i in labels]
     # print(len(labels))
     all_sentences = sentences + reference
     ref = [[reference]]
     res = []
     for i in all_sentences:
         summary = [[i]]
         rouge = Pythonrouge(summary_file_exist=False,
                             summary=summary,
                             reference=ref,
                             n_gram=2,
                             ROUGE_SU4=True,
                             ROUGE_L=False,
                             recall_only=True,
                             stemming=True,
                             stopwords=True,
                             word_level=True,
                             length_limit=True,
                             length=50,
                             use_cf=False,
                             cf=95,
                             scoring_formula='average',
                             resampling=True,
                             samples=1000,
                             favor=True,
                             p=0.5)
         score = rouge.calc_score()
         print(i, score)
         res.append((i, score))
     return res
Example #29
def evaluation(model,data_loader,word_to_idx,idx_to_word):
    summary = []
    reference = []
    count = 0
    for data in data_loader:
        reference.append([])
        reference[count].append(data[1])
        output = sample(model,data[0],data[2],word_to_idx,idx_to_word)
        summary.append([output])
        count +=1
        print(output) 

    rouge = Pythonrouge(summary_file_exist=False,
                        summary=summary, reference=reference,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=False,
                        recall_only=True, stemming=True, stopwords=True,
                        word_level=True, length_limit=True, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
    score = rouge.calc_score()
    print(score)
Example #30
def compute_perl_scores(reference, summary, stem, remove_stop):
    rouge = Pythonrouge(summary_file_exist=False,
                        summary=[[summary]],
                        reference=[[[reference]]],
                        n_gram=3,
                        ROUGE_SU4=False,
                        ROUGE_L=True,
                        recall_only=False,
                        stemming=stem,
                        stopwords=remove_stop,
                        word_level=True,
                        length_limit=False,
                        length=150,
                        use_cf=False,
                        cf=95,
                        scoring_formula='average',
                        resampling=False,
                        samples=1000,
                        favor=True,
                        p=0.5)
    return rouge.calc_score()
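# --- Usage sketch (not part of the original snippet; illustrative only) ---
# Both arguments are single summary strings; the function nests them into the
# [[summary]] / [[[reference]]] shape Pythonrouge expects and returns ROUGE-1,
# ROUGE-2, ROUGE-3 and ROUGE-L recall, precision and F scores.
reference = "a cat was sitting on the mat ."
summary = "the cat sat on the mat ."
print(compute_perl_scores(reference, summary, stem=True, remove_stop=False))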