Example #1
def run_bertscore(mt: list,
                  ref: list,
                  model_type="xlm-roberta-base",
                  language=False,
                  idf=False) -> (list, list, list):
    """ Runs BERTScores and returns precision, recall and F1 BERTScores ."""
    if language:
        precision, recall, f1 = bert_score.score(
            cands=mt,
            refs=ref,
            idf=idf,
            batch_size=32,
            lang=language,
            rescale_with_baseline=False,
            verbose=True,
            nthreads=4,
        )
    else:
        precision, recall, f1 = bert_score.score(
            cands=mt,
            refs=ref,
            idf=idf,
            batch_size=32,
            model_type=model_type,
            rescale_with_baseline=False,
            verbose=True,
            nthreads=4,
        )
    return precision, recall, f1
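A minimal usage sketch for the helper above (not from the original source): it assumes `bert_score` is imported at module level, and the sentence pair is made up. Passing `language` selects bert_score's default model for that language; otherwise the explicit `model_type` is used.

import bert_score  # assumed module-level import used by run_bertscore

mt_outputs = ["The cat sat on the mat."]        # hypothetical machine translations
references = ["A cat is sitting on the mat."]   # hypothetical references

# Language-based model selection
p, r, f1 = run_bertscore(mt_outputs, references, language="en")
print(f"P={p.mean().item():.4f} R={r.mean().item():.4f} F1={f1.mean().item():.4f}")

# Explicit model selection (xlm-roberta-base is the default when language is not given)
p, r, f1 = run_bertscore(mt_outputs, references, model_type="xlm-roberta-base")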
Example #2
def read_a_business(review_list):

    pair_list = []
    for text in review_list:
        adj_list, noun_list = pos_tagging(text)

        if adj_list == [] or noun_list == []:
            continue

        text = pre_processing(text)

        P_noun, _, _ = score(noun_list,
                             len(noun_list) * [text], "bert-base-uncased")
        P_adj, _, _ = score(adj_list,
                            len(adj_list) * [text], "bert-base-uncased")

        adj_index = torch.argmax(P_adj).item()
        noun_index = torch.argmax(P_noun).item()

        pair = (adj_list[adj_index], noun_list[noun_index])
        pair_list.append(pair)

    return pair_list
Example #3
def bert_agreement(pred, ref, concept):
    (P, R, F1), hash_code = score(pred,
                                  ref,
                                  lang="en",
                                  verbose=True,
                                  return_hash=True,
                                  idf=True)
    # (P_pivot, R_pivot, F1_pivot), _ = score(concept, ref, lang="en", verbose=True, return_hash=True, idf=False)
    print(hash_code)
    # print(f"System level P score: {(P.mean()-P_pivot.mean())*100:.2f}")
    # print(f"System level R score: {(R.mean()-R_pivot.mean())*100:.2f}")
    # print(f"System level F1 score: {(F1.mean()-F1_pivot.mean())*100:.2f}")

    print(f"System level P score: {(P.mean())*100:.2f}")
    print(f"System level R score: {(R.mean())*100:.2f}")
    print(f"System level F1 score: {(F1.mean())*100:.2f}")

    (P_pivot, R_pivot, F1_pivot), _ = score(concept,
                                            ref,
                                            lang="en",
                                            verbose=True,
                                            return_hash=True,
                                            idf=False)
    print(f"Base System level P score: {(P_pivot.mean())*100:.2f}")
    print(f"Base System level R score: {(R_pivot.mean())*100:.2f}")
    print(f"Base System level F1 score: {(F1_pivot.mean())*100:.2f}")
Example #4
    def _calc_metrics_info(self, generate_corpus, reference_corpus):

        transformers.tokenization_utils.logger.setLevel(logging.ERROR)
        transformers.configuration_utils.logger.setLevel(logging.ERROR)
        transformers.modeling_utils.logger.setLevel(logging.ERROR)

        generate_corpus = [
            self._preprocess(generate_sentence)
            for generate_sentence in generate_corpus
        ]
        reference_corpus = [
            self._preprocess(reference_sentence)
            for reference_sentence in reference_corpus
        ]

        result = {}
        if self.model is None:
            P, R, F1 = score(generate_corpus,
                             reference_corpus,
                             lang=self.lang,
                             verbose=False)
        else:
            if self.num_layers is None:
                raise ValueError("num_layers should be an integer")
            P, R, F1 = score(generate_corpus,
                             reference_corpus,
                             model_type=self.model,
                             num_layers=self.num_layers,
                             verbose=False)
        result['bert-score'] = F1.tolist()
        return result
Example #5
def bert_pairwise_cos_sim(src, tgt, idf=False):
    from bert_score import score
    if idf:
        p, _, _ = score([src], [tgt], lang='en', idf=True)
        return p.item()

    p, _, _ = score([src], [tgt], lang='en')
    return p.item()
Example #6
def get_bertscore(cand_sentences, ref_sentences, model, layer, language, scoring_approach):
    """
    BERTScore metric, from the paper https://arxiv.org/pdf/1904.09675.pdf

    Args:
        - :param: `cand_sentences` (list of str): candidate summary sentences
        - :param: `ref_sentences` (list of str): reference summary sentences
        - :param: `model` (str): the specific bert model to use
        - :param: `layer` (int): the layer of representation to use.
        - :param: `language` (str): language of the inputs.
                  performance may vary for non-English languages on English pre-trained BERT models
        - :param: `scoring_approach` (str): defines whether to use the argmax or mean-based scoring approaches.
                  argmax returns the score of the highest scoring reference sentence for each candidate sentence 
                  mean-based returns the mean of all reference sentence scores for each candidate sentence 

    Return:
        - :param: precision score (float): precision score for the candidate summary 
        - :param: recall score (float): recall score for the candidate summary 
        - :param: f1 score (float): f1 score for the candidate summary 
    """
    
    final_precision_scores = []
    final_recall_scores = []
    final_f1_scores = []

    if scoring_approach == 'argmax':
        for cand_sent in cand_sentences:
            p, r, f1 = bert_score.score([cand_sent], [ref_sentences], model_type=model, num_layers=layer, lang=language)  # BERTScore defaults to taking the argmax over references when multiple references are given for one candidate sentence
            final_precision_scores.append(p.tolist()[0])
            final_recall_scores.append(r.tolist()[0])
            final_f1_scores.append(f1.tolist()[0])

    elif scoring_approach == 'mean':
        for cand_sent in cand_sentences:
            precision_scores = 0.0
            recall_scores = 0.0
            f1_scores = 0.0
            for ref_sent in ref_sentences:
                p, r, f1 = bert_score.score([cand_sent], [ref_sent], model_type=model, num_layers=layer, lang=language)  # BERTScore is the argmax of each word comparison; we take the mean of the per-reference scores for each candidate sentence
                precision_scores += p.tolist()[0]
                recall_scores += r.tolist()[0]
                f1_scores += f1.tolist()[0]
            
            # Divide with len(ref_sentences) to get the mean BERTscore for each candidate sentence
            final_precision_scores.append(precision_scores / len(ref_sentences))
            final_recall_scores.append(recall_scores / len(ref_sentences))
            final_f1_scores.append(f1_scores / len(ref_sentences))
    
    else:
        print("scoring_approach parameter must be defined as either 'argmax' or 'mean'. Check the README for descriptions of each.")
        return None

    # Final score is simply the average of the precision, recall and f1 score of each sentence in the candidate summary
    precision_score = sum(final_precision_scores)  / len(final_precision_scores)
    recall_score = sum(final_recall_scores)  / len(final_recall_scores)
    f1_score = sum(final_f1_scores)  / len(final_f1_scores)

    return precision_score, recall_score, f1_score
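A hedged usage sketch for get_bertscore (not part of the original file), contrasting the two scoring approaches described in its docstring; the sentences, model, and layer choice are illustrative assumptions.

cand_sents = ["The model improves summary quality.", "It is tested on two datasets."]
ref_sents = ["The proposed model produces better summaries.", "Experiments use two benchmark datasets."]

# argmax: each candidate sentence keeps the score of its best-matching reference sentence
p_max, r_max, f1_max = get_bertscore(cand_sents, ref_sents, model="bert-base-uncased",
                                     layer=9, language="en", scoring_approach="argmax")

# mean: each candidate sentence is averaged over all reference sentences
p_mean, r_mean, f1_mean = get_bertscore(cand_sents, ref_sents, model="bert-base-uncased",
                                        layer=9, language="en", scoring_approach="mean")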
Example #7
    def calc_scores(self):
        super().calc_scores()

        # write input_tsv
        self.create_input_tsv()

        # read data
        with open(self.config['input_tsv'], 'r+', encoding='utf-8') as f_in:
            cands = []
            refs = []
            reader = csv.DictReader(f_in, dialect='excel-tab')  # tsv reader
            for row in reader:
                refs.append(row['sentence1'])
                cands.append(row['sentence2'])

        # calc scores
        P, R, F = bert_score.score(cands,
                                   refs,
                                   idf=False,
                                   lang='en',
                                   rescale_with_baseline=True)

        # write scores
        output_str = '\n'.join(['{:.5f}'.format(e) for e in F.tolist()]) + '\n'
        with open(self.config['cache_file'], 'w', encoding='utf-8') as f_out:
            f_out.write(output_str)
Example #8
def official_bert_score(fidlist, reflist, predlist, outpath):
    from bert_score import score
    refs = list()
    preds = list()
    p_bert = dict()
    r_bert = dict()
    f1_bert = dict()
    for ref, pred in zip(reflist, predlist):
        ref = ' '.join(ref).strip()
        pred = ' '.join(pred).strip()
        refs.append(ref)
        preds.append(pred)
    p, r, f1 = score(preds, refs, lang='en', rescale_with_baseline=True)
    for fid, pscore, rscore, f1score in zip(fidlist, p.numpy(), r.numpy(), f1.numpy()):
        p_bert[fid] = pscore
        r_bert[fid] = rscore
        f1_bert[fid] = f1score
    pickle.dump(p_bert, open(outpath+'/pbert.pkl', 'wb'))
    pickle.dump(r_bert, open(outpath+'/rbert.pkl', 'wb'))
    pickle.dump(f1_bert, open(outpath+'/f1bert.pkl', 'wb'))
    avg_p = sum(list(p_bert.values()))/len(p_bert)
    avg_r = sum(list(r_bert.values()))/len(r_bert)
    avg_f1 = sum(list(f1_bert.values()))/len(f1_bert)
    
    ret = ('for %s functions\n' % (len(predlist)))
    ret+= 'precision bertscore using official bert-score repo %s\n' % (round(avg_p*100, 2))
    ret+= 'recall bertscore using official bert-score repo %s\n' % (round(avg_r*100, 2))
    ret+= 'f1 bertscore using official bert-score repo %s\n' % (round(avg_f1*100, 2))
    return ret
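A hedged usage sketch for official_bert_score (not from the original repo); the token lists and output directory are made up, and the function writes three pickle files under outpath before returning the formatted report string.

fidlist = [101, 102]                                   # hypothetical function ids
reflist = [["returns", "the", "sum"], ["opens", "a", "file"]]
predlist = [["return", "the", "sum"], ["open", "the", "file"]]

report = official_bert_score(fidlist, reflist, predlist, outpath="/tmp")
print(report)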
Example #9
def read_a_business(review_list):

    most_useful_adj_list = collections.Counter()
    most_useful_noun_list = collections.Counter()

    for text in review_list:
        adj_list, noun_list = pos_tagging(text)
        P_noun, _, _ = score(noun_list,
                             len(noun_list) * [text], "bert-base-uncased")
        P_adj, _, _ = score(adj_list,
                            len(adj_list) * [text], "bert-base-uncased")

        most_useful_adj_list.update([adj_list[torch.argmax(P_adj).item()]])
        most_useful_noun_list.update([noun_list[torch.argmax(P_noun).item()]])

    return most_useful_adj_list, most_useful_noun_list
Example #10
    def compute_score(self, gts, res):

        assert(gts.keys() == res.keys())
        imgIds = gts.keys()

        hyp_input = []
        ref_input = []
        same_indices = []
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) >= 1)

            hyp_input += [hypo[0]] * len(ref)
            ref_input += ref
            same_indices.append(len(ref_input))

        p, r, f_scores = score(hyp_input, ref_input, model_type="bert-base-uncased")
 
        prev_idx = 0
        aggreg_f1_scores = []
        for idx in same_indices:
            aggreg_f1_scores.append(f_scores[prev_idx: idx].mean().cpu().item())
            prev_idx = idx

        return sum(aggreg_f1_scores)/len(aggreg_f1_scores), aggreg_f1_scores
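A small, assumed input for the scorer above, mirroring the COCO-style interface its assertions expect: res maps each image id to a single-hypothesis list, gts to one or more references. The wrapper class name is hypothetical.

gts = {1: ["a dog runs across the field", "a dog is running on grass"],
       2: ["a man rides a bicycle"]}
res = {1: ["a dog running in a field"],
       2: ["a person riding a bike"]}

scorer = BertScoreEvaluator()   # hypothetical class exposing compute_score above
corpus_f1, per_image_f1 = scorer.compute_score(gts, res)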
Example #11
    def test_idf_score_rescale_fast_tokenizer(self):
        (P, R, F), hash_code = bert_score.score(
            cands,
            refs,
            model_type="roberta-large",
            num_layers=17,
            idf=True,
            batch_size=3,
            return_hash=True,
            lang="en",
            rescale_with_baseline=True,
            use_fast_tokenizer=True,
        )
        self.assertAreTensors(P, R, F)
        self.assertTensorsAlmostEqual(
            P, [0.9060347080230713, 0.8529528975486755, 0.4002779722213745])
        self.assertTensorsAlmostEqual(
            R, [0.907024621963501, 0.8212453722953796, 0.514383852481842])
        self.assertTensorsAlmostEqual(
            F, [0.9066815376281738, 0.8373198509216309, 0.45761245489120483])

        self.assertEqual(
            hash_code,
            f"roberta-large_L17_idf_version={bert_score.__version__}(hug_trans={ht_version})-rescaled_fast-tokenizer",
        )
Example #12
    def compute_score(self, reference_sents, generated_sents, verbose=False):
        """
        Main function to compute CIDEr score
        :param  res (list) : list of dictionaries with image ic and tokenized hypothesis / candidate sentence
                gts (dict)  : dictionary with key <image id> and value <tokenized reference sentence>
        :return: cider (float) : computed CIDEr score for the corpus
        """
        if verbose:
            print("======== generated sentences ========\n", generated_sents)
            print("======== reference sentences ========\n", reference_sents)
        output = score(generated_sents,
                       reference_sents,
                       lang=self.lang,
                       verbose=verbose,
                       rescale_with_baseline=True,
                       idf=True)
        precision, recall, f1_scores = output

        if self.metric == 'recall':
            scores = recall
        elif self.metric == 'precision':
            scores = precision
        else:
            scores = f1_scores

        scores = np.array(scores)
        return scores.mean(), scores
Example #13
    def test_idf_score(self):
        (P, R, F), hash_code = bert_score.score(cands,
                                                refs,
                                                model_type='roberta-large',
                                                num_layers=17,
                                                idf=True,
                                                batch_size=3,
                                                return_hash=True)
        # print(P.tolist(), R.tolist(), F.tolist())

        self.assertTrue(torch.is_tensor(P))
        self.assertTrue(torch.is_tensor(R))
        self.assertTrue(torch.is_tensor(F))
        self.assertEqual(
            hash_code,
            f'roberta-large_L17_idf_version={bert_score.__version__}(hug_trans={ht_version})'
        )
        self.assertTrue((P - torch.tensor(
            [0.9837872385978699, 0.9754738807678223, 0.8947395086288452])
                         ).abs_().max() < EPS)
        self.assertTrue((R - torch.tensor(
            [0.9827190637588501, 0.9697767496109009, 0.9172918796539307])
                         ).abs_().max() < EPS)
        self.assertTrue((F - torch.tensor(
            [0.9832529425621033, 0.972616970539093, 0.9058753848075867])
                         ).abs_().max() < EPS)
Example #14
    def test_idf_score_rescale(self):
        (P, R, F), hash_code = bert_score.score(cands,
                                                refs,
                                                model_type='roberta-large',
                                                num_layers=17,
                                                idf=True,
                                                batch_size=3,
                                                return_hash=True,
                                                rescale_with_baseline=True)
        # print(P.tolist(), R.tolist(), F.tolist())

        self.assertTrue(torch.is_tensor(P))
        self.assertTrue(torch.is_tensor(R))
        self.assertTrue(torch.is_tensor(F))
        self.assertEqual(
            hash_code,
            f'roberta-large_L17_idf_version={bert_score.__version__}(hug_trans={ht_version})-rescaled'
        )
        self.assertTrue((P - torch.tensor(
            [0.903778135776520, 0.854439020156860, 0.375287383794785])
                         ).abs_().max() < EPS)
        self.assertTrue((R - torch.tensor(
            [0.897446095943451, 0.820639789104462, 0.509167850017548])
                         ).abs_().max() < EPS)
        self.assertTrue((F - torch.tensor(
            [0.900772094726562, 0.837753534317017, 0.442304641008377])
                         ).abs_().max() < EPS)
Example #15
File: utils.py  Project: oriern/SuperPAL
def calculate_metric_scores(cands, refs):
    """ calculate Rouge-1 precision, Bert precision
    and Entailment Scores
    """
    # calculate rouge-1 precision
    rouge = Rouge()
    rouge1_p = []
    for r, c in tqdm(zip(refs, cands)):
        r = " ".join(list(nlp_parser.tokenize(r))).lower()
        c = " ".join(list(nlp_parser.tokenize(c))).lower()
        scores = rouge.get_scores(c, r)[0]
        rouge1_p.append(round(scores['rouge-1']['p'], 4))
    # calculate bert precision
    P, _, _ = score(cands, refs, lang='en', verbose=True)
    P = [round(x, 4) for x in P.tolist()]
    # calculate entailment score
    url = 'http://localhost:5003/roberta_mnli_classifier'  # 'http://nlp1.cs.unc.edu:5003/roberta_mnli_classifier'
    mnli_data = []
    for p, h in zip(refs, cands):
        mnli_data.append({'premise': p, 'hypo': h})
    r = requests.post(url, json=mnli_data)
    results = r.json()
    ent_scores = []
    for ind, d in enumerate(results):
        ent_scores.append(float(d['entailment']))

    return rouge1_p, P, ent_scores
Example #16
    def test_score_rescale(self):
        (P, R, F), hash_code = bert_score.score(cands,
                                                refs,
                                                model_type='roberta-large',
                                                num_layers=17,
                                                idf=False,
                                                batch_size=3,
                                                return_hash=True,
                                                rescale_with_baseline=True)
        # print(P.tolist(), R.tolist(), F.tolist())

        self.assertTrue(torch.is_tensor(P))
        self.assertTrue(torch.is_tensor(R))
        self.assertTrue(torch.is_tensor(F))
        self.assertEqual(
            hash_code,
            f'roberta-large_L17_no-idf_version={bert_score.__version__}(hug_trans={ht_version})-rescaled'
        )
        self.assertTrue((P - torch.tensor(
            [0.907000780105591, 0.900435566902161, 0.477955609560013])
                         ).abs_().max() < EPS)
        self.assertTrue((R - torch.tensor(
            [0.895456790924072, 0.841467440128326, 0.527785062789917])
                         ).abs_().max() < EPS)
        self.assertTrue((F - torch.tensor(
            [0.901383399963379, 0.871010780334473, 0.503565192222595])
                         ).abs_().max() < EPS)
Example #17
    def test_score(self):
        (P, R, F), hash_code = bert_score.score(cands,
                                                refs,
                                                model_type='roberta-large',
                                                num_layers=17,
                                                idf=False,
                                                batch_size=3,
                                                return_hash=True)
        # print(P.tolist(), R.tolist(), F.tolist())

        self.assertTrue(torch.is_tensor(P))
        self.assertTrue(torch.is_tensor(R))
        self.assertTrue(torch.is_tensor(F))
        self.assertEqual(
            hash_code,
            f'roberta-large_L17_no-idf_version={bert_score.__version__}(hug_trans={ht_version})'
        )
        self.assertTrue((P - torch.tensor(
            [0.9843302369117737, 0.9832239747047424, 0.9120386242866516])
                         ).abs_().max() < EPS)
        self.assertTrue((R - torch.tensor(
            [0.9823839068412781, 0.9732863903045654, 0.920428991317749])
                         ).abs_().max() < EPS)
        self.assertTrue((F - torch.tensor(
            [0.9833561182022095, 0.9782299995422363, 0.916214644908905])
                         ).abs_().max() < EPS)
Example #18
    def score(
            self, hypothesis: List[str], references: List[List[str]],
            tags: Optional[List[List[str]]] = None
    ) -> VizSeqScore:
        corpus_score, sent_scores, group_scores = None, None, None

        import bert_score as bs
        import langid
        import logging
        logging.getLogger('pytorch_pretrained_bert').setLevel(logging.WARNING)
        logging.getLogger('langid').setLevel(logging.WARNING)

        lang = langid.classify(references[0][0])[0]

        sent_scores = bs.score(
            hypothesis, references[0], nthreads=self.n_workers, lang=lang,
            verbose=self.verbose
        )[2].tolist()

        if self.corpus_level:
            corpus_score = np.mean(sent_scores)

        if tags is not None:
            tag_set = self._unique(tags)
            group_scores = {}
            for t in tag_set:
                indices = [i for i, cur in enumerate(tags) if t in cur]
                group_scores[t] = np.mean([sent_scores[i] for i in indices])

        return VizSeqScore.make(
                corpus_score=corpus_score, sent_scores=sent_scores,
                group_scores=group_scores
            )
Example #19
def bert_score(x_gt_pred_file, yy1_gt_pred_file, yy2_gt_pred_file,
               yy3_gt_pred_file):

    cands, refs = load_f(x_gt_pred_file, yy1_gt_pred_file, yy2_gt_pred_file,
                         yy3_gt_pred_file)
    P, R, F = score(cands, refs,
                    model_type="bert-base-uncased")
    return P, R, F
Example #20
def bert_pairwise_cos_sim(sentences, idf=False):
    """BERTScore similarity func
    """
    src_len = len(sentences)

    refs = [[sentence] * src_len for sentence in sentences]
    refs = list(itertools.chain(*refs))
    hyps = sentences * src_len

    if idf:
        p, _, _ = score(refs, hyps, lang="en", idf=True)
        p = p.reshape(src_len, -1)
        return p.detach().numpy()

    p, _, _ = score(refs, hyps, lang="en")
    p = p.reshape(src_len, -1)
    return p.detach().numpy()
Example #21
 def test_multi_refs(self):
     cands = ['I like lemons.']
     refs = [['I am proud of you.', 'I love lemons.', 'Go go go.']]
     P_mul, R_mul, F_mul = bert_score.score(cands,
                                            refs,
                                            batch_size=3,
                                            return_hash=False,
                                            lang="en",
                                            rescale_with_baseline=True)
     P_best, R_best, F_best = bert_score.score(cands, [refs[0][1]],
                                               batch_size=3,
                                               return_hash=False,
                                               lang="en",
                                               rescale_with_baseline=True)
     self.assertTrue((P_mul - P_best).abs_().max() < EPS)
     self.assertTrue((R_mul - R_best).abs_().max() < EPS)
     self.assertTrue((F_mul - F_best).abs_().max() < EPS)
Example #22
def bert_pairwise_cos_sim(sentences, idf=False):
    import itertools
    from bert_score import score
    src_len = len(sentences)

    refs = [[sentence] * src_len for sentence in sentences]
    refs = list(itertools.chain(*refs))
    hyps = sentences * src_len

    if idf:
        p, _, _ = score(refs, hyps, lang='en', idf=True)
        p = p.reshape(src_len, -1)
        return p.detach().numpy()

    p, _, _ = score(refs, hyps, lang='en')
    p = p.reshape(src_len, -1)
    return p.detach().numpy()
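A brief usage note for bert_pairwise_cos_sim above: for n input sentences it returns an n x n matrix of BERTScore precision values that can be read as a pairwise similarity matrix. The sentences below are made up.

sentences = ["I like lemons.", "I love citrus fruit.", "The car is red."]
sim_matrix = bert_pairwise_cos_sim(sentences)   # numpy array of shape (3, 3)
print(sim_matrix)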
Example #23
def bert_score_reward_no_stop_word(pred_str_list, pred_sent_2d_list,
                                   trg_str_list, trg_sent_2d_list, batch_size,
                                   device):
    stop_words = stopwords.words('english')
    # remove stop words
    pred_str_list = [w for w in pred_str_list if w not in stop_words]
    trg_str_list = [w for w in trg_str_list if w not in stop_words]
    P, R, F1 = score([' '.join(pred_str_list)], [' '.join(trg_str_list)], lang='en')
    return F1.to(device)
Example #24
def gen_bert_score(df, sent1, sent2, model_type, layer):

    p, r, f1 = bert_score.score(df[sent1].values.tolist(),
                                df[sent2].values.tolist(),
                                model_type=model_type,
                                num_layers=layer,
                                lang='en',
                                batch_size=256)

    return f1.tolist()
Example #25
def bert_score_compute(st_cands, st_ref, lang):
    cands = st_cands.split(".")
    refs = st_ref.split(".")
    P, R, F1 = score(cands,
                     refs,
                     lang=lang,
                     model_type="bert-base-multilingual-cased",
                     verbose=True)
    return (round(float(P.mean()), 3), round(float(R.mean()), 3),
            round(float(F1.mean()), 3))
Example #26
def cal_BERTScore(refer, candidate):
    # note: computing BERTScore here is slow for large corpora
    _, _, bert_scores = score(candidate,
                              refer,
                              lang='zh',
                              rescale_with_baseline=True)
    bert_scores = bert_scores.tolist()
    bert_scores = [
        0.5 if math.isnan(s) else s for s in bert_scores
    ]
    return np.mean(bert_scores)
Example #27
        def _run(self,
                 summaries_list: List[List[SummaryType]],
                 references_list: List[List[ReferenceType]]) -> List[List[MetricsDict]]:
            summaries_list = [[flatten(summary) for summary in summaries] for summaries in summaries_list]
            references_list = [[flatten(reference) for reference in references] for references in references_list]

            # Create the candidate and reference lists for passing to the scoring function
            input_candidates = []
            input_references = []
            empty_inputs = set()
            for i, (summaries, references) in enumerate(zip(summaries_list, references_list)):
                for j, summary in enumerate(summaries):
                    if len(summary) == 0:
                        empty_inputs.add((i, j))
                    else:
                        input_candidates.append(summary)
                        input_references.append(references)

            # Score the summaries
            precisions, recalls, f1s = bert_score.score(
                input_candidates,
                input_references,
                model_type=self.model_type,
                num_layers=self.num_layers,
                idf=False,
                nthreads=self.nthreads,
                batch_size=self.batch_size,
                lang=self.lang,
                verbose=self.verbose
            )

            # Remap the scores to the summaries
            index = 0
            metrics_lists = []
            for i, summaries in enumerate(summaries_list):
                metrics_lists.append([])
                for j, summary in enumerate(summaries):
                    if (i, j) in empty_inputs:
                        precision, recall, f1 = 0.0, 0.0, 0.0
                    else:
                        precision = precisions[index].item()
                        recall = recalls[index].item()
                        f1 = f1s[index].item()
                        index += 1

                    metrics_lists[-1].append(MetricsDict({
                        'bertscore': {
                            'precision': precision,
                            'recall': recall,
                            'f1': f1,
                        }
                    }))

            return metrics_lists
Example #28
 def evaluate_example(self, summary, reference):
     assert not self.idf, "idf mode not supported for evaluating a single example"
     if isinstance(reference, str):
         reference = [reference]
     all_preds, hash_code = bert_score.score([summary], reference, model_type=self.model_type, \
                                             num_layers=self.num_layers,
                                             verbose=self.verbose, idf=self.idf, batch_size=self.batch_size,
                                             nthreads=self.nthreads, lang=self.lang, return_hash=True,
                                             rescale_with_baseline=self.rescale_with_baseline)
     print(f"hash_code: {hash_code}")
     score = [{"bert_score_precision": p.cpu().item(), "bert_score_recall": r.cpu().item(), "bert_score_f1": \
              f.cpu().item()} for (p, r, f) in all_preds]
     return score
Example #29
 def test_multi_refs(self):
     cands = ["I like lemons."]
     refs = [["I am proud of you.", "I love lemons.", "Go go go."]]
     P_mul, R_mul, F_mul = bert_score.score(
         cands,
         refs,
         batch_size=3,
         return_hash=False,
         lang="en",
         rescale_with_baseline=True,
     )
     P_best, R_best, F_best = bert_score.score(
         cands,
         [refs[0][1]],
         batch_size=3,
         return_hash=False,
         lang="en",
         rescale_with_baseline=True,
     )
     self.assertTensorsAlmostEqual(P_mul, P_best)
     self.assertTensorsAlmostEqual(R_mul, R_best)
     self.assertTensorsAlmostEqual(F_mul, F_best)
Example #30
def calc_bert_score(cands, refs):
    """ Compute BERTScore.

    Args:
        cands (List[str]): candidate sentences
        refs (List[str]): reference sentences

    Returns:
        (List[float], List[float], List[float]): (precision, recall, F1) scores
    """
    Precision, Recall, F1 = score(cands, refs, lang="ja", verbose=True)
    return Precision.numpy().tolist(), Recall.numpy().tolist(), F1.numpy().tolist()
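A short usage sketch for calc_bert_score above; the Japanese sentence pair is illustrative.

cands = ["今日は良い天気です。"]
refs = ["本日は晴天です。"]
precision, recall, f1 = calc_bert_score(cands, refs)
print(precision[0], recall[0], f1[0])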