Example #1
def evaluate_summarization(test_set: dict, vocabulary: dict, word_cnt,
                           ref_summaries: dict, elim_stopwords, lemmatization,
                           bigrams):
    my_summaries = {}
    for key in test_set:
        my_summaries[key] = []
        for doc in test_set[key]:
            summary = []
            for sentence in doc:
                (result,
                 prob) = predict_sentence_in_sum(vocabulary, sentence,
                                                 word_cnt, 1, elim_stopwords,
                                                 lemmatization, bigrams)
                if result == "yes":
                    summary.append(sentence)
            my_summaries[key].append(summary)
    if bigrams:
        scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=False)
    else:
        scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=False)
    avg_p = 0
    avg_r = 0
    avg_f = 0
    total = 0
    for key in test_set:
        for mysum, ref in zip(my_summaries[key], ref_summaries[key]):
            if bigrams:
                score = scorer.score(" ".join(ref), " ".join(mysum))['rouge2']
            else:
                score = scorer.score(" ".join(ref), " ".join(mysum))['rouge1']
            avg_p += score.precision
            avg_r += score.recall
            avg_f += score.fmeasure
            total += 1
    return avg_p / total, avg_r / total, avg_f / total
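Note (not from any of the collected projects): the snippets above and below all rely on the same small rouge_score API surface. RougeScorer.score(target, prediction) returns a dict mapping each requested ROUGE type to a Score named tuple, so fields can be read by name or by position, which is why some examples index with [0], [1], [2]. A minimal sketch with made-up strings:

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
# Argument order is score(target, prediction).
scores = scorer.score('the cat sat on the mat',        # reference / target
                      'a cat was sitting on the mat')  # system output
print(scores['rouge1'].precision, scores['rouge1'].recall, scores['rouge1'].fmeasure)
print(scores['rougeL'][2])  # position 2 is the same value as .fmeasure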
Example #2
 def create_rouge_scorer(self):
     if self.config is None:
         self.rouge_scorer = rouge_scorer.RougeScorer(
             ['rouge1', 'rouge2', 'rouge3', 'rougeL'], use_stemmer=False)
         self.lemmatized_rouge_scorer = rouge_scorer.RougeScorer(
             ['rouge1', 'rouge2', 'rouge3', 'rougeL'], use_stemmer=True)
     else:
         raise NotImplementedError(
             "Rouge Scorer has not been written yet to handle a config object."
         )
Example #3
def get_rouge_scores(target_summaries_path,
                     predicted_summaries_path,
                     use_stem=False):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],
                                      use_stemmer=use_stem)

    target_summary_files = [
        x for x in Path(target_summaries_path).rglob('*.summary')
    ]

    rouge1 = []
    rouge2 = []
    rougel = []

    for target_summary_path in target_summary_files:
        basename = Path(target_summary_path).stem
        with open(target_summary_path, 'r') as f:
            target_summary = f.read()

        predicted_summary_path = f'{predicted_summaries_path}/{basename}.summary'

        if not os.path.exists(predicted_summary_path):
            continue

        with open(predicted_summary_path, 'r') as f:
            predicted_summary = f.read()

        output = scorer.score(target_summary, predicted_summary)

        rouge1 += [output['rouge1'].fmeasure]
        rouge2 += [output['rouge2'].fmeasure]
        rougel += [output['rougeL'].fmeasure]

    return (sum(rouge1) / len(rouge1) * 100,
            sum(rouge2) / len(rouge2) * 100,
            sum(rougel) / len(rougel) * 100)
Example #4
 def _compute(self,
              preds,
              refs,
              rouge_types=None,
              use_agregator=True,
              use_stemmer=False):
     if rouge_types is None:
         rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
     scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types,
                                       use_stemmer=use_stemmer)
     if use_agregator:
         aggregator = scoring.BootstrapAggregator()
     else:
         scores = []
     for r, p in zip(refs, preds):
         score = scorer.score(r, p)
         if use_agregator:
             aggregator.add_scores(score)
         else:
             scores.append(score)
     if use_agregator:
         y = aggregator.aggregate()
     else:
         y = {}
         for k in scores[0]:
             y[k] = list(score[k] for score in scores)
     return y
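Note (not from any of the collected projects): scoring.BootstrapAggregator, used here and in several later snippets, accepts the per-pair dict returned by RougeScorer.score via add_scores() and, after aggregate(), exposes per ROUGE type an AggregateScore with low/mid/high Score tuples obtained by bootstrap resampling; the mid value is what the examples report. A minimal sketch with toy strings:

from rouge_score import rouge_scorer, scoring

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
aggregator = scoring.BootstrapAggregator()
for ref, pred in [('a quick brown fox', 'the quick brown fox'),
                  ('jumps over the lazy dog', 'jumped over a lazy dog')]:
    aggregator.add_scores(scorer.score(ref, pred))

result = aggregator.aggregate()
# Each entry has .low / .mid / .high, each a (precision, recall, fmeasure) Score.
print(result['rouge1'].mid.fmeasure)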
Example #5
def calculate_rouge(
    pred_lns: List[str],
    tgt_lns: List[str],
    use_stemmer=True,
    rouge_keys=ROUGE_KEYS,
    return_precision_and_recall=False,
    bootstrap_aggregation=True,
    newline_sep=True,
) -> Dict:

    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()
    for tgt, pred in zip(tgt_lns, pred_lns):
        # rougeLsum expects "\n"-separated sentences within a summary
        if newline_sep:
            pred = add_newline_to_end_of_each_sentence(pred)
            tgt = add_newline_to_end_of_each_sentence(tgt)
        scores = scorer.score(tgt, pred)
        aggregator.add_scores(scores)

    if bootstrap_aggregation:
        result = aggregator.aggregate()
        if return_precision_and_recall:
            return extract_rouge_mid_statistics(result)  # here we return dict
        else:
            return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}

    else:
        return aggregator._scores  # here we return defaultdict(list)
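Note (not from any of the collected projects): the newline_sep handling above matters because rougeLsum is the summary-level variant of ROUGE-L and treats "\n" as the sentence boundary inside each string (the helper add_newline_to_end_of_each_sentence is project-specific and not shown here). A minimal sketch with toy text:

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rougeLsum'])
target = "The first sentence.\nThe second sentence."
prediction = "The first sentence.\nA different second sentence."
print(scorer.score(target, prediction)['rougeLsum'].fmeasure)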
Example #6
    def validation_step(self, batch, batch_nb):
        for p in self.model.parameters():
            p.requires_grad = False

        outputs = self.forward(*batch)
        vloss = outputs[0]
        input_ids, output_ids = batch
        input_ids, attention_mask = self._prepare_input(input_ids)
        generated_ids = self.model.generate(input_ids=input_ids, attention_mask=attention_mask,
                                            use_cache=True, max_length=self.args.max_output_len,
                                            num_beams=1)
        generated_str = self.tokenizer.batch_decode(generated_ids.tolist(), skip_special_tokens=True)
        gold_str = self.tokenizer.batch_decode(output_ids.tolist(), skip_special_tokens=True)
        scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=False)
        rouge1 = rouge2 = rougel = rougelsum = 0.0
        for ref, pred in zip(gold_str, generated_str):
            score = scorer.score(ref, pred)
            rouge1 += score['rouge1'].fmeasure
            rouge2 += score['rouge2'].fmeasure
            rougel += score['rougeL'].fmeasure
            rougelsum += score['rougeLsum'].fmeasure
        rouge1 /= len(generated_str)
        rouge2 /= len(generated_str)
        rougel /= len(generated_str)
        rougelsum /= len(generated_str)

        return {'vloss': vloss,
                'rouge1': vloss.new_zeros(1) + rouge1,
                'rouge2': vloss.new_zeros(1) + rouge2,
                'rougeL': vloss.new_zeros(1) + rougel,
                'rougeLsum': vloss.new_zeros(1) + rougelsum, }
Example #7
def evaluate_all(model_type: str):
    gold_dir = "output/" + model_type + "/gold/"
    scorer = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])
    all_scores = {'rouge1': [], 'rouge2': [], 'rougeL': [], 'rougeLsum': []}

    for file in glob.glob(gold_dir + "*.txt"):
        gold = " ".join(open(file, 'r').readlines())
        model = " ".join(open(file.replace("gold", "model"), "r").readlines())
        scores = scorer.score(gold, model)
        for type, score in scores.items():
            all_scores[type].append(score)

    all_averages = {}
    for type in all_scores.keys():
        all_averages[type] = {}
        all_averages[type]["f_mean"] = np.mean(
            [score.fmeasure for score in all_scores[type]])
        all_averages[type]["f_sd"] = np.std(
            [score.fmeasure for score in all_scores[type]])
        all_averages[type]["recall_mean"] = np.mean(
            [score.recall for score in all_scores[type]])
        all_averages[type]["recall_sd"] = np.std(
            [score.recall for score in all_scores[type]])
        all_averages[type]["precision_mean"] = np.mean(
            [score.precision for score in all_scores[type]])
        all_averages[type]["precision_sd"] = np.std(
            [score.precision for score in all_scores[type]])
    print(all_averages)
    return all_scores, all_averages
Example #8
def compute_rouge_data(path="model_output.txt", scoring="rouge1"):
    scorer = rouge_scorer.RougeScorer([scoring])
    fscore = []
    precision = []
    recall = []

    with open(path, "r") as f:
        testing_results = json.load(f)

    for i, element in enumerate(testing_results):
        scores = scorer.score(TOKENIZER.DecodeIds(element['prediction']),
                              TOKENIZER.DecodeIds(element['reference']))
        precision.append(scores[scoring][0])
        recall.append(scores[scoring][1])
        fscore.append(scores[scoring][2])

    return {
        'Max': {
            'Recall': (max(recall), np.argmax(recall)),
            'Precision': (max(precision), np.argmax(precision)),
            'Fscore': (max(fscore), np.argmax(fscore))
        },
        'Min': {
            'Recall': (min(recall), np.argmin(recall)),
            'Precision': (min(precision), np.argmin(precision)),
            'Fscore': (min(fscore), np.argmin(fscore))
        }
    }
Example #9
def rouge_evaluation(y_true, y_pred, tokenizer):
    rouge1_precision = []
    rouge1_recall = []
    rouge1_f = []
    rouge2_precision = []
    rouge2_recall = []
    rouge2_f = []
    rougeL_precision = []
    rougeL_recall = []
    rougeL_f = []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],
                                      use_stemmer=True)
    for i, j in zip(y_true, y_pred):
        y_t = tokenizer.decode(rem_zero(i))
        y_p = tokenizer.decode(rem_zero(j))

        scores = scorer.score(y_t, y_p)

        rouge1_precision.append(scores['rouge1'].precision)
        rouge1_recall.append(scores['rouge1'].recall)
        rouge1_f.append(scores['rouge1'].fmeasure)

        rouge2_precision.append(scores['rouge2'].precision)
        rouge2_recall.append(scores['rouge2'].recall)
        rouge2_f.append(scores['rouge2'].fmeasure)

        rougeL_precision.append(scores['rougeL'].precision)
        rougeL_recall.append(scores['rougeL'].recall)
        rougeL_f.append(scores['rougeL'].fmeasure)

    return scores, rouge1_precision, rouge1_recall, rouge1_f, rouge2_precision, rouge2_recall, rouge2_f, rougeL_precision, rougeL_recall, rougeL_f
Example #10
    def rouge(self, refs, preds):
        """
        Returns `t5` style ROUGE scores. See the related implementation:
        https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

        :param refs:
            A `list` of reference `strs`.
        :param preds:
            A `list` of predicted `strs`.
        """
        rouge_types = ["rouge1", "rouge2", "rougeLsum"]
        scorer = rouge_scorer.RougeScorer(rouge_types)

        # Add newlines between sentences to correctly compute `rougeLsum`.

        def _prepare_summary(summary):
            summary = summary.replace(" . ", ".\n")
            return summary

        # Accumulate confidence intervals.
        aggregator = scoring.BootstrapAggregator()
        for ref, pred in zip(refs, preds):
            ref = _prepare_summary(ref)
            pred = _prepare_summary(pred)
            aggregator.add_scores(scorer.score(ref, pred))
        result = aggregator.aggregate()
        return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
Example #11
def bbc_dataset_rouge():
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],
                                      use_stemmer=True)
    test_data = get_bbc_dataset_files()
    generic_baseline, generic_improved, generic_improved_redundancy_removal = None, None, None
    for v in test_data:
        text = v['full']
        summary = v['summary']
        scores = scorer.score(summarize(text, ratio=0.4), summary)
        generic_baseline = update_score(generic_baseline, scores)
        scores = scorer.score(
            improved_summarize(text, ratio=0.4, redundancy_removal=False),
            summary)
        generic_improved = update_score(generic_improved, scores)
        scores = scorer.score(
            improved_summarize(text, ratio=0.4, redundancy_removal=True),
            summary)
        generic_improved_redundancy_removal = update_score(
            generic_improved_redundancy_removal, scores)
    total_news = len(test_data)
    return {
        'generic_baseline':
        get_score_avg(generic_baseline, total_news),
        'generic_improved_redundancy_removal':
        get_score_avg(generic_improved_redundancy_removal, total_news),
        'generic_improved':
        get_score_avg(generic_improved, total_news)
    }
Example #12
def calc_rouge_scores(pred_summaries, gold_summaries,
                      keys=['rouge1', 'rougeL'], use_stemmer=True):
    # Calculate ROUGE scores, averaged over all summary pairs
    scorer = rouge_scorer.RougeScorer(keys, use_stemmer=use_stemmer)

    n = len(pred_summaries)
    scores = [scorer.score(pred_summaries[j], gold_summaries[j])
              for j in range(n)]

    dict_scores = {}
    for key in keys:
        precision_list = [scores[j][key][0] for j in range(n)]
        recall_list = [scores[j][key][1] for j in range(n)]
        f1_list = [scores[j][key][2] for j in range(n)]

        dict_scores[key] = {'recall': np.mean(recall_list),
                            'precision': np.mean(precision_list),
                            'f1': np.mean(f1_list)}

    return dict_scores
Example #13
def cal_rouge_score(filename1, filename2):
    with open(filename1, 'r') as f1, open(filename2, 'r') as f2:
        summary = f1.readlines()
        reference = f2.readlines()
    """
    for i in range(len(summary)):
        summary[i] = re.sub('[%s]' % re.escape(string.punctuation), '', summary[i])
        reference[i] = re.sub('[%s]' % re.escape(string.punctuation), '', reference[i])
    """
    for i in range(len(summary)):
        summary[i] = summary[i].strip().replace('<t>', '').replace('</t>',
                                                                   '').strip()
        reference[i] = reference[i].strip().replace('<t>',
                                                    '').replace('</t>',
                                                                '').strip()
    print(len(summary))
    print(len(reference))
    #summary = summary[1:1000]
    #reference = reference[1:1000]
    scorer = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    scores = {"rouge1": [], "rouge2": [], "rougeL": [], "rougeLsum": []}
    for i in range(len(summary)):
        s = scorer.score(summary[i], reference[i])
        if i % 1000 == 0:
            print(i)
        for k, v in s.items():
            scores[k].append(s[k].fmeasure)

    for k, v in scores.items():
        scores[k] = sum(v) / len(v)
    return scores
Example #14
def test_rouge(temp_dir, cand, ref):
    candidates = [line.strip() for line in open(cand, encoding='utf-8')]
    references = [line.strip() for line in open(ref, encoding='utf-8')]
    print("test number", len(candidates))
    assert len(candidates) == len(references)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
    results = {'rouge1': {'precision': 0, 'recall': 0, 'fmeasure': 0},
               'rouge2': {'precision': 0, 'recall': 0, 'fmeasure': 0},
               'rougeL': {'precision': 0, 'recall': 0, 'fmeasure': 0}}

    cnt = len(candidates)
    for i in range(cnt):
        scores = scorer.score(candidates[i], references[i])
        results['rouge1']['precision'] += scores['rouge1'].precision
        results['rouge1']['recall'] += scores['rouge1'].recall
        results['rouge1']['fmeasure'] += scores['rouge1'].fmeasure

        results['rouge2']['precision'] += scores['rouge2'].precision
        results['rouge2']['recall'] += scores['rouge2'].recall
        results['rouge2']['fmeasure'] += scores['rouge2'].fmeasure

        results['rougeL']['precision'] += scores['rougeL'].precision
        results['rougeL']['recall'] += scores['rougeL'].recall
        results['rougeL']['fmeasure'] += scores['rougeL'].fmeasure

    for k, v in results.items():
        for key, value in v.items():
            results[k][key] = value / cnt

    return results
Example #15
    def _compute(self,
                 predictions,
                 references,
                 rouge_types=None,
                 use_agregator=True,
                 use_stemmer=False):
        if rouge_types is None:
            rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

        scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types,
                                          use_stemmer=use_stemmer)
        if use_agregator:
            aggregator = scoring.BootstrapAggregator()
        else:
            scores = []

        for ref, pred in zip(references, predictions):
            score = scorer.score(ref, pred)
            if use_agregator:
                aggregator.add_scores(score)
            else:
                scores.append(score)

        if use_agregator:
            result = aggregator.aggregate()
        else:
            result = {}
            for key in scores[0]:
                result[key] = list(score[key] for score in scores)

        return result
Example #16
def evaluate_on_rouge_scores(targets: List, preds: List) -> Dict:

    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"],
                                      use_stemmer=True)
    scores = [
        scorer.score(target, pred) for target, pred in zip(targets, preds)
    ]

    rouge1_f1, rouge2_f1, rougeL_f1 = 0.0, 0.0, 0.0

    # TODO: Very ugly - refactor needed
    for score in scores:
        for k, v in score.items():
            if k == "rouge1":
                rouge1_f1 += v.fmeasure
            if k == "rouge2":
                rouge2_f1 += v.fmeasure
            if k == "rougeL":
                rougeL_f1 += v.fmeasure

    eval_dict = {
        "rouge1": rouge1_f1 / len(scores),
        "rouge2": rouge2_f1 / len(scores),
        "rougeL": rougeL_f1 / len(scores),
    }

    return eval_dict
Example #17
    def _compute(self,
                 predictions,
                 references,
                 use_agregator=True,
                 use_stemmer=False):

        rouge_types = ["rougeL"]
        predictions = " ".join([str(p) for p in predictions])
        references = " ".join([str(r) for r in references])

        scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types,
                                          use_stemmer=use_stemmer)
        if use_agregator:
            aggregator = scoring.BootstrapAggregator()
        else:
            scores = []

        # `predictions` and `references` were joined into single strings above,
        # so compute one score over them; zipping the two strings would
        # compare them character by character.
        score = scorer.score(references, predictions)
        if use_agregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

        if use_agregator:
            result = aggregator.aggregate()
        else:
            result = {}
            for key in scores[0]:
                result[key] = list(score[key] for score in scores)

        return result
Example #18
def text_eval(encoder,
              features_iter,
              model_dir,
              global_step,
              eval_tag,
              enable_logging,
              inputs_pattern="^inputs[0-9]*$",
              targets_key="targets",
              predictions_key="outputs",
              additional_keys=(),
              num_reserved=None):
  """Evaluates a set of text targets/predictions."""
  decode_fn = lambda x: ids2str(encoder, x, num_reserved)
  scorers_dict = {}
  scorers_dict[_ROUGE_METRIC] = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=True)
  scorers_dict[_BLEU_METRIC] = bleu_scorer.BleuScorer()
  scorers_dict[_REPETITION_METRIC] = repetition_scorer.RepetitionScorer(
    ["regs1", "regs2", "regs3", "regsTCR"])
  scorers_dict[_LENGTH_METRIC] = length_scorer.LengthScorer(["word", "char"])
  aggregators_dict = {k: scoring.BootstrapAggregator() for k in scorers_dict}

  with LogWriter(additional_keys, model_dir, global_step, eval_tag,
                 enable_logging) as log_writer:
    for i, features in enumerate(features_iter):
      inputs_list = []
      for k in sorted(features):
        if re.match(inputs_pattern, k):
          single_inputs = decode_matrix(decode_fn, features[k])
          if isinstance(single_inputs, list):
            inputs_list.extend(single_inputs)
          else:
            inputs_list.append(single_inputs)

      inputs = "\n".join(inputs_list)
      targets = decode_fn(features[targets_key])
      preds = decode_fn(features[predictions_key])
      text_dict = {
        "inputs": inputs_list,
        "targets": targets,
        "predictions": preds
      }

      for key in additional_keys:
        if key == "selected_ids":
          text_dict[key] = decode_selected_indices(decode_fn, features)
        else:
          text_dict[key] = decode_matrix(decode_fn, features[key])

      log_writer.write(text_dict, i)

      for key, scorer in scorers_dict.items():
        scores_i = scorer.score(targets, preds)
        aggregators_dict[key].add_scores(scores_i)

  aggregates_dict = {k: v.aggregate() for k, v in aggregators_dict.items()}
  length_histograms = scorers_dict[_LENGTH_METRIC].histograms(as_string=True)
  _write_aggregates(model_dir, global_step, eval_tag, aggregates_dict,
                    length_histograms)
  _write_aggregate_summaries(model_dir, global_step, eval_tag, aggregates_dict)
Example #19
    def run_rouge(self):
        ''' Computes the ROUGE score between the set of hypothesis 
            and reference summaries.
        '''
        print('\n===== ROUGE =====\n')
        rouge = rouge_scorer.RougeScorer(self.rouge_metrics, use_stemmer=True)

        for hyps_path, refs_path in zip(self.hyps_paths, self.refs_paths):
            self.load_summs(hyps_path, refs_path)
            hyps, refs = self.hyps, self.refs

            start_time = time.time()
            scores = []
            for c, r in tqdm(zip(hyps, refs)):
                c = c.replace('. ', '\n')
                r = r.replace('. ', '\n')
                hyp = text_normalization(c)
                ref = text_normalization(r)
                rouge_scores = rouge.score(ref, hyp)
                scores.append(
                    [rouge_scores[m].fmeasure for m in self.rouge_metrics])

            self.df_scores.loc[self.df_scores['hyps_path'] == hyps_path,
                               ROUGE_METRICS] = scores
            self.save_temp_csv()
Example #20
def simple_rouge(generated_answers, possible_aswers_list):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rouge3'],
                                      use_stemmer=True)
    rouge_1_list = []
    rouge_2_list = []
    rouge_3_list = []
    for ind, elem in enumerate(generated_answers):
        if (elem != "We can't recognize objects for comparision"):
            generated_answer = generated_answers[ind]
            scores = [
                scorer.score(generated_answer, answ)
                for answ in possible_aswers_list[ind]
            ]
            sorted_scores_1 = sorted(scores,
                                     key=lambda x: x['rouge1'].fmeasure,
                                     reverse=True)
            sorted_scores_2 = sorted(scores,
                                     key=lambda x: x['rouge2'].fmeasure,
                                     reverse=True)
            sorted_scores_3 = sorted(scores,
                                     key=lambda x: x['rouge3'].fmeasure,
                                     reverse=True)
            rouge_1_list.append(sorted_scores_1[0]['rouge1'])
            rouge_2_list.append(sorted_scores_2[0]['rouge2'])
            rouge_3_list.append(sorted_scores_3[0]['rouge3'])
    return {
        'rouge1': rouge_1_list,
        'rouge2': rouge_2_list,
        'rouge3': rouge_3_list
    }
Example #21
    def __init__(
            self,
            model_name_or_path: str = 'albert-large-uncased',  # './vocab.txt'
            datasets_loader: str = 'race',  # 'RACELocalLoader.py'
            task_name: str = 'all',
            max_seq_length: int = 512,
            train_batch_size: int = 32,
            eval_batch_size: int = 32,
            num_workers: int = 8,
            num_preprocess_processes: int = 8,
            use_sentence_selection: bool = True,
            best_k_sentences: int = 5,
            **kwargs):
        super().__init__()
        self.model_name_or_path = model_name_or_path
        self.dataset_loader = datasets_loader
        self.task_name = task_name
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size
        self.num_workers = num_workers
        self.num_preprocess_processes = num_preprocess_processes
        self.use_sentence_selection = use_sentence_selection
        self.best_k_sentences = best_k_sentences

        self.tokenizer = AlbertTokenizerFast.from_pretrained(
            self.model_name_or_path, use_fast=True, do_lower_case=True)
        self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'],
                                               use_stemmer=True)
        self.dataset = None
Example #22
    def __init__(self, metric):
        self.metric = metric

        if self.metric == 'rouge':
            # TODO: does this package do lower case, normalization
            self.scorer = rouge_scorer.RougeScorer(
                ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
Example #23
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold_file")
    parser.add_argument("--predictions_file")
    parser.add_argument("--output_file", required=False)
    args = parser.parse_args()

    # Load the Rouge metric
    metric = rouge_scorer.RougeScorer(['rougeL'])

    # Load the gold and predictions file
    gold_lines = {l['id']: l for l in jsonlines.open(args.gold_file)}
    pred_lines = {l['id']: l for l in jsonlines.open(args.predictions_file)}

    scores = []
    for id in gold_lines:
        if id not in pred_lines:
            print(f'Could not find a generated summary for ID {id}. '
                  f'Assigning a score of 0 for this instance.')
            scores.append(0)
        else:
            score = metric.score(
                gold_lines[id]['summary'],
                pred_lines[id]['generated_summary'])['rougeL'].fmeasure
            scores.append(score)
            pred_lines[id]['rougeL'] = score

    print(f'Rouge-L score: {sum(scores)/len(scores)*100:.2f}')

    # If an output_file is provided, we write out the instance-wise
    # rouge scores to file
    if args.output_file:
        with open(args.output_file, 'w', encoding='utf-8') as f:
            for l in pred_lines.values():
                f.write(json.dumps(l, ensure_ascii=False) + '\n')
Example #24
 def compute_scores(self):
     self.preprocess()

     rtype, rfield = self.rouge_type[0], self.rouge_type[1]
     scorer = rouge_scorer.RougeScorer([rtype], use_stemmer=True)
     self._scores = [
         max(scorer.score(p, x)[rtype][rfield],
             scorer.score(p, y)[rtype][rfield],
             scorer.score(p, z)[rtype][rfield])
         for p, x, y, z in zip(self._predictions, self._references[0],
                               self._references[1], self._references[2])
     ]
Example #25
    def __init__(self, model_name="t5-small", **config_kw):
        self.config = T5ModelConfig(**config_kw)
        self.tokenizer = transformers.T5Tokenizer.from_pretrained(model_name)
        self.model = transformers.TFT5ForConditionalGeneration.from_pretrained(
            model_name, output_hidden_states=True, output_attentions=True)

        # TODO(gehrmann): temp solution for ROUGE.
        self._scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
Example #26
def computeROUGE(originalTextList, paraphraseTextList):
    outputScores = []
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
    for originalSent, paraphraseSent in zip(originalTextList, paraphraseTextList):
        scores = scorer.score(originalSent, paraphraseSent)
        outputScores.append(scores['rougeL'][0])

    return outputScores
Example #27
def calc_rouge_score(orig_text: str, gen_text: str):
    """
        :param orig_text: original speech as a string
        :param gen_text: generated speech as a string
        :return: rouge score between the two speeches (f1 measure)
        """
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    scores = scorer.score(orig_text, gen_text)
    return scores["rouge1"][2]
Example #28
 def __init__(self, sequence_transform_fn, batch_size):
     self.scorer = rouge_scorer.RougeScorer(
         [f'rouge{i}' for i in ROUGE_VARIATIONS],
         use_stemmer=True,
     )
     self.sequence_transform_fn = sequence_transform_fn
     self.batch_size = batch_size
     self.target_batches = []
     self.prediction_batches = []
Example #29
def rouge(references, pred):
    rouge_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_names, use_stemmer=True)
    scores = {rouge_name: 0 for rouge_name in rouge_names}
    for reference in references:
        ref_scores = scorer.score(reference, pred)
        for rouge_name, cur_score in scores.items():
            scores[rouge_name] = max(scores[rouge_name],
                                     ref_scores[rouge_name].fmeasure)
    return scores
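A hypothetical call to the function above, with invented strings, showing how it keeps the best F-measure against any of several references:

refs = ["the cat sat on the mat", "a cat was sitting on the mat"]
print(rouge(refs, "the cat is on the mat"))
# e.g. {'rouge1': ..., 'rouge2': ..., 'rougeL': ...} with the max fmeasure per metric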
Example #30
def calculate_rouge(output_lns: List[str], reference_lns: List[str], use_stemmer=True) -> Dict:
    scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()

    for reference_ln, output_ln in zip(reference_lns, output_lns):
        scores = scorer.score(reference_ln, output_ln)
        aggregator.add_scores(scores)

    result = aggregator.aggregate()
    return {k: v.mid.fmeasure for k, v in result.items()}