def testRougeLNonConsecutive(self):
    """Check rougeL when the shared words are not adjacent.

    Scores "testing one two" against "testing two": the two shared words
    are separated by "one" in the first string. Expected values are
    precision 1, recall 2/3 and F-measure 4/5.
    """
    rouge = rouge_scorer.RougeScorer(["rougeL"])
    rouge_l = rouge.score("testing one two", "testing two")["rougeL"]

    self.assertAlmostEqual(1, rouge_l.precision)
    self.assertAlmostEqual(2 / 3, rouge_l.recall)
    self.assertAlmostEqual(4 / 5, rouge_l.fmeasure)
def testInvalidRougeTypes(self, rouge_type):
    """Expect a ValueError when building or using a scorer for `rouge_type`.

    Both the construction and the scoring call sit inside the
    ``assertRaises`` context, so the error may come from either step.
    """
    requested_types = [rouge_type]
    with self.assertRaises(ValueError):
        bad_scorer = rouge_scorer.RougeScorer(requested_types)
        bad_scorer.score("testing one two", "testing")
def testRougeEmpty(self, rouge_type):
    """Check that scoring against an empty string yields all-zero scores.

    Scores "testing one two" against "" for the given `rouge_type` and
    expects precision, recall and F-measure to each be (almost) 0.
    """
    rouge = rouge_scorer.RougeScorer([rouge_type])
    scores = rouge.score("testing one two", "")

    # Same order as the individual checks: precision, recall, fmeasure.
    for field in ("precision", "recall", "fmeasure"):
        self.assertAlmostEqual(0, getattr(scores[rouge_type], field))
def testRouge2(self):
    """Check rouge2 for "testing one two" scored against "testing one".

    Expected values are precision 1, recall 1/2 and F-measure 2/3.
    """
    rouge = rouge_scorer.RougeScorer(["rouge2"])
    bigram_score = rouge.score("testing one two", "testing one")["rouge2"]

    self.assertAlmostEqual(1, bigram_score.precision)
    self.assertAlmostEqual(1 / 2, bigram_score.recall)
    self.assertAlmostEqual(2 / 3, bigram_score.fmeasure)
def testValidRougeTypes(self, rouge_type):
    """Check that the result is keyed by exactly the requested `rouge_type`."""
    requested_types = [rouge_type]
    rouge = rouge_scorer.RougeScorer([rouge_type])
    scores = rouge.score("testing one two", "testing")

    returned_types = list(scores.keys())
    self.assertSameElements(returned_types, requested_types)
from rouge import rouge_scorer from rouge import scoring from summae import p2s_eval from summae import util FLAGS = flags.FLAGS flags.DEFINE_string('data_dir', '.', 'Data directory.') flags.DEFINE_string('eval_subset', 'test', 'which subset (valid/test) to eval/decode.') flags.DEFINE_string('output_dir', '/tmp/12342', 'local directory to save extractive oracle') flags.DEFINE_string('vocab_file', '', 'Subword vocab file.') # for detok first sentence my_rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True) def get_extracts(s): # get 5 sentences as the extractive baselines sents = s.feature_lists.feature_list['untokenized_sentences'].feature assert len(sents) == 5 return tuple([sents[i].bytes_list.value[0] for i in range(5)]) def human_ave(summ_list): """Average pairwise rouge between two human summaries.""" agg = scoring.BootstrapAggregator() for s1_id, s1 in enumerate(summ_list): for s2_id, s2 in enumerate(summ_list): if s1_id >= s2_id: # only compute for s1_id < s2_id
def testAssertsOnInvalidInputFiles(self):
    """Expect a ValueError when the CSV scoring helper gets invalid paths.

    A rouge1 scorer is built first; the call to
    ``io.compute_scores_and_write_to_csv`` with the "invalid*" patterns
    (and the aggregator it receives) is evaluated inside the
    ``assertRaises`` context.
    """
    rouge1_scorer = rouge_scorer.RougeScorer(["rouge1"], False)

    with self.assertRaises(ValueError):
        io.compute_scores_and_write_to_csv(
            "invalid*",
            "invalid*",
            "invalid",
            rouge1_scorer,
            scoring.BootstrapAggregator(),
        )
def testRouge1Multi(self):
    """Check rouge1 via ``score_multi`` with a one-element list.

    Scores ["testing one two"] against "testing"; expected values are
    precision 1, recall 1/3 and F-measure 1/2.
    """
    rouge = rouge_scorer.RougeScorer(["rouge1"])
    unigram_score = rouge.score_multi(["testing one two"], "testing")["rouge1"]

    self.assertAlmostEqual(1, unigram_score.precision)
    self.assertAlmostEqual(1 / 3, unigram_score.recall)
    self.assertAlmostEqual(1 / 2, unigram_score.fmeasure)
def __init__(self):
    """Record the ROUGE metric names and build the scorer for them.

    Sets ``main_metric`` to 'rougeL', ``metrics`` to the three ROUGE
    variants, and ``scorer`` to a ``RougeScorer`` over ``self.metrics``
    created with ``use_stemmer=True``.
    """
    self.main_metric = 'rougeL'
    self.metrics = ['rouge1', 'rouge2', 'rougeL']
    self.scorer = rouge_scorer.RougeScorer(
        self.metrics,
        use_stemmer=True,
    )