def run_bertscore(mt: list, ref: list, model_type="xlm-roberta-base", language=False, idf=False):
    """Runs BERTScore and returns precision, recall and F1 BERTScores."""
    if language:
        precision, recall, f1 = bert_score.score(
            cands=mt,
            refs=ref,
            idf=idf,
            batch_size=32,
            lang=language,
            rescale_with_baseline=False,
            verbose=True,
            nthreads=4,
        )
    else:
        precision, recall, f1 = bert_score.score(
            cands=mt,
            refs=ref,
            idf=idf,
            batch_size=32,
            model_type=model_type,
            rescale_with_baseline=False,
            verbose=True,
            nthreads=4,
        )
    return precision, recall, f1
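# Illustrative usage sketch for run_bertscore (not from the original repo):
# the sentences below are made up, and `bert_score` must be installed.
def _demo_run_bertscore():
    mt = ["the cat sat on the mat"]
    ref = ["a cat was sitting on the mat"]
    # Passing language selects the default model for that language.
    p, r, f1 = run_bertscore(mt, ref, language="en")
    print(f"P={p.mean():.4f} R={r.mean():.4f} F1={f1.mean():.4f}")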
def read_a_business(review_list):
    pair_list = []
    for text in review_list:
        adj_list, noun_list = pos_tagging(text)
        if not adj_list or not noun_list:
            continue
        text = pre_processing(text)
        # Score every noun/adjective against the full review text.
        P_noun, _, _ = score(noun_list, len(noun_list) * [text], "bert-base-uncased")
        P_adj, _, _ = score(adj_list, len(adj_list) * [text], "bert-base-uncased")
        # Keep the highest-scoring adjective and noun for this review.
        adj_index = torch.argmax(P_adj).item()
        noun_index = torch.argmax(P_noun).item()
        pair_list.append((adj_list[adj_index], noun_list[noun_index]))
    return pair_list
def bert_agreement(pred, ref, concept):
    # concept: pivot/baseline sentences to score against the same references.
    (P, R, F1), hash_code = score(pred, ref, lang="en", verbose=True, return_hash=True, idf=True)
    print(hash_code)
    print(f"System level P score: {P.mean() * 100:.2f}")
    print(f"System level R score: {R.mean() * 100:.2f}")
    print(f"System level F1 score: {F1.mean() * 100:.2f}")
    (P_pivot, R_pivot, F1_pivot), _ = score(concept, ref, lang="en", verbose=True, return_hash=True, idf=False)
    print(f"Base System level P score: {P_pivot.mean() * 100:.2f}")
    print(f"Base System level R score: {R_pivot.mean() * 100:.2f}")
    print(f"Base System level F1 score: {F1_pivot.mean() * 100:.2f}")
def _calc_metrics_info(self, generate_corpus, reference_corpus):
    transformers.tokenization_utils.logger.setLevel(logging.ERROR)
    transformers.configuration_utils.logger.setLevel(logging.ERROR)
    transformers.modeling_utils.logger.setLevel(logging.ERROR)

    generate_corpus = [self._preprocess(generate_sentence) for generate_sentence in generate_corpus]
    reference_corpus = [self._preprocess(reference_sentence) for reference_sentence in reference_corpus]

    result = {}
    if self.model is None:
        P, R, F1 = score(generate_corpus, reference_corpus, lang=self.lang, verbose=False)
    else:
        if self.num_layers is None:
            raise ValueError("num_layers should be an integer")
        P, R, F1 = score(generate_corpus, reference_corpus, model_type=self.model, num_layers=self.num_layers, verbose=False)
    result['bert-score'] = F1.tolist()
    return result
def bert_pairwise_cos_sim(src, tgt, idf=False):
    from bert_score import score

    p, _, _ = score([src], [tgt], lang='en', idf=idf)
    return p.item()
def get_bertscore(cand_sentences, ref_sentences, model, layer, language, scoring_approach):
    """
    BERTScore metric, from the paper https://arxiv.org/pdf/1904.09675.pdf

    Args:
        - :param: `cand_sentences` (list of str): candidate summary sentences
        - :param: `ref_sentences` (list of str): reference summary sentences
        - :param: `model` (str): the specific BERT model to use
        - :param: `layer` (int): the layer of representation to use
        - :param: `language` (str): language of the inputs; performance may vary for
          non-English languages on English pre-trained BERT models
        - :param: `scoring_approach` (str): either 'argmax' or 'mean'.
          'argmax' returns the score of the highest-scoring reference sentence for each
          candidate sentence; 'mean' returns the mean of all reference sentence scores
          for each candidate sentence.

    Return:
        - :param: precision score (float): precision score for the candidate summary
        - :param: recall score (float): recall score for the candidate summary
        - :param: f1 score (float): f1 score for the candidate summary
    """
    final_precision_scores = []
    final_recall_scores = []
    final_f1_scores = []

    if scoring_approach == 'argmax':
        for cand_sent in cand_sentences:
            # BERTScore defaults to taking the argmax value when multiple references
            # are given for one candidate sentence.
            p, r, f1 = bert_score.score([cand_sent], [ref_sentences], model_type=model, num_layers=layer, lang=language)
            final_precision_scores.append(p.tolist()[0])
            final_recall_scores.append(r.tolist()[0])
            final_f1_scores.append(f1.tolist()[0])
    elif scoring_approach == 'mean':
        for cand_sent in cand_sentences:
            precision_scores = 0.0
            recall_scores = 0.0
            f1_scores = 0.0
            for ref_sent in ref_sentences:
                # BERTScore is the argmax of each word comparison; we take the mean of
                # the per-reference scores for each candidate sentence.
                p, r, f1 = bert_score.score([cand_sent], [ref_sent], model_type=model, num_layers=layer, lang=language)
                precision_scores += p.tolist()[0]
                recall_scores += r.tolist()[0]
                f1_scores += f1.tolist()[0]
            # Divide by len(ref_sentences) to get the mean BERTScore per candidate sentence.
            final_precision_scores.append(precision_scores / len(ref_sentences))
            final_recall_scores.append(recall_scores / len(ref_sentences))
            final_f1_scores.append(f1_scores / len(ref_sentences))
    else:
        print("scoring_approach parameter must be either 'argmax' or 'mean'. Check the README for descriptions of each.")
        return None

    # The final score is the average of the per-sentence precision, recall and F1 scores.
    precision_score = sum(final_precision_scores) / len(final_precision_scores)
    recall_score = sum(final_recall_scores) / len(final_recall_scores)
    f1_score = sum(final_f1_scores) / len(final_f1_scores)

    return precision_score, recall_score, f1_score
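# Illustrative sketch of the two scoring approaches in get_bertscore; the model,
# layer, and sentences below are assumptions, not values from the original repo.
def _demo_get_bertscore():
    cands = ["The economy grew rapidly.", "Unemployment fell sharply."]
    refs = ["The economy expanded quickly.", "Joblessness dropped."]
    # 'argmax': each candidate sentence is scored against its best reference.
    p, r, f1 = get_bertscore(cands, refs, "roberta-large", 17, "en", "argmax")
    # 'mean': each candidate sentence is scored against the average over all references.
    p_m, r_m, f1_m = get_bertscore(cands, refs, "roberta-large", 17, "en", "mean")
    print(f1, f1_m)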
def calc_scores(self):
    super().calc_scores()
    # Write the input TSV.
    self.create_input_tsv()
    # Read data.
    with open(self.config['input_tsv'], 'r+', encoding='utf-8') as f_in:
        cands = []
        refs = []
        reader = csv.DictReader(f_in, dialect='excel-tab')  # TSV reader
        for row in reader:
            refs.append(row['sentence1'])
            cands.append(row['sentence2'])
    # Calculate scores.
    P, R, F = bert_score.score(cands, refs, idf=False, lang='en', rescale_with_baseline=True)
    # Write scores.
    output_str = '\n'.join('{:.5f}'.format(e) for e in F.tolist()) + '\n'
    with open(self.config['cache_file'], 'w', encoding='utf-8') as f_out:
        f_out.write(output_str)
def official_bert_score(fidlist, reflist, predlist, outpath):
    import pickle
    from bert_score import score

    refs = []
    preds = []
    p_bert = {}
    r_bert = {}
    f1_bert = {}
    for ref, pred in zip(reflist, predlist):
        refs.append(' '.join(ref).strip())
        preds.append(' '.join(pred).strip())
    p, r, f1 = score(preds, refs, lang='en', rescale_with_baseline=True)
    for fid, pscore, rscore, f1score in zip(fidlist, p.numpy(), r.numpy(), f1.numpy()):
        p_bert[fid] = pscore
        r_bert[fid] = rscore
        f1_bert[fid] = f1score
    pickle.dump(p_bert, open(outpath + '/pbert.pkl', 'wb'))
    pickle.dump(r_bert, open(outpath + '/rbert.pkl', 'wb'))
    pickle.dump(f1_bert, open(outpath + '/f1bert.pkl', 'wb'))
    avg_p = sum(p_bert.values()) / len(p_bert)
    avg_r = sum(r_bert.values()) / len(r_bert)
    avg_f1 = sum(f1_bert.values()) / len(f1_bert)
    ret = 'for %s functions\n' % (len(predlist))
    ret += 'precision bertscore using official bert-score repo %s\n' % (round(avg_p * 100, 2))
    ret += 'recall bertscore using official bert-score repo %s\n' % (round(avg_r * 100, 2))
    ret += 'f1 bertscore using official bert-score repo %s\n' % (round(avg_f1 * 100, 2))
    return ret
def read_a_business(review_list):
    most_useful_adj_counter = collections.Counter()
    most_useful_noun_counter = collections.Counter()
    for text in review_list:
        adj_list, noun_list = pos_tagging(text)
        P_noun, _, _ = score(noun_list, len(noun_list) * [text], "bert-base-uncased")
        P_adj, _, _ = score(adj_list, len(adj_list) * [text], "bert-base-uncased")
        # Count the highest-scoring adjective and noun for each review.
        most_useful_adj_counter.update([adj_list[torch.argmax(P_adj).item()]])
        most_useful_noun_counter.update([noun_list[torch.argmax(P_noun).item()]])
    return most_useful_adj_counter, most_useful_noun_counter
def compute_score(self, gts, res):
    assert gts.keys() == res.keys()
    imgIds = gts.keys()

    hyp_input = []
    ref_input = []
    same_indices = []
    for id in imgIds:
        hypo = res[id]
        ref = gts[id]

        # Sanity check.
        assert type(hypo) is list
        assert len(hypo) == 1
        assert type(ref) is list
        assert len(ref) >= 1

        # Repeat the single hypothesis once per reference so they align pairwise.
        hyp_input += [hypo[0]] * len(ref)
        ref_input += ref
        same_indices.append(len(ref_input))

    p, r, f_scores = score(hyp_input, ref_input, model_type="bert-base-uncased")

    # Average the F1 scores over the references of each image.
    prev_idx = 0
    aggreg_f1_scores = []
    for idx in same_indices:
        aggreg_f1_scores.append(f_scores[prev_idx:idx].mean().cpu().item())
        prev_idx = idx

    return sum(aggreg_f1_scores) / len(aggreg_f1_scores), aggreg_f1_scores
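# Hypothetical COCO-caption-style inputs for compute_score (sketch only):
# one hypothesis and a variable number of references per image id, with
# `scorer` standing in for an instance of the enclosing class.
#
#   gts = {0: ["a dog runs on grass", "a dog is running"], 1: ["a red car"]}
#   res = {0: ["a dog running on the grass"], 1: ["a red vehicle"]}
#   mean_f1, per_image_f1 = scorer.compute_score(gts, res)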
def test_idf_score_rescale_fast_tokenizer(self):
    (P, R, F), hash_code = bert_score.score(
        cands,
        refs,
        model_type="roberta-large",
        num_layers=17,
        idf=True,
        batch_size=3,
        return_hash=True,
        lang="en",
        rescale_with_baseline=True,
        use_fast_tokenizer=True,
    )
    self.assertAreTensors(P, R, F)
    self.assertTensorsAlmostEqual(P, [0.9060347080230713, 0.8529528975486755, 0.4002779722213745])
    self.assertTensorsAlmostEqual(R, [0.907024621963501, 0.8212453722953796, 0.514383852481842])
    self.assertTensorsAlmostEqual(F, [0.9066815376281738, 0.8373198509216309, 0.45761245489120483])
    self.assertEqual(
        hash_code,
        f"roberta-large_L17_idf_version={bert_score.__version__}(hug_trans={ht_version})-rescaled_fast-tokenizer",
    )
def compute_score(self, reference_sents, generated_sents, verbose=False):
    """
    Main function to compute the BERTScore for the corpus.

    :param reference_sents (list): tokenized reference sentences
    :param generated_sents (list): tokenized hypothesis / candidate sentences
    :return: (mean score, per-sentence scores) for the configured metric
    """
    if verbose:
        print("======== generated sentences ========\n", generated_sents)
        print("======== reference sentences ========\n", reference_sents)

    output = score(generated_sents, reference_sents, lang=self.lang, verbose=verbose, rescale_with_baseline=True, idf=True)
    precision, recall, f1_scores = output

    if self.metric == 'recall':
        scores = recall
    elif self.metric == 'precision':
        scores = precision
    else:
        scores = f1_scores

    scores = np.array(scores)
    return scores.mean(), scores
def test_idf_score(self):
    (P, R, F), hash_code = bert_score.score(
        cands,
        refs,
        model_type='roberta-large',
        num_layers=17,
        idf=True,
        batch_size=3,
        return_hash=True,
    )
    self.assertTrue(torch.is_tensor(P))
    self.assertTrue(torch.is_tensor(R))
    self.assertTrue(torch.is_tensor(F))
    self.assertEqual(
        hash_code,
        f'roberta-large_L17_idf_version={bert_score.__version__}(hug_trans={ht_version})'
    )
    self.assertTrue((P - torch.tensor([0.9837872385978699, 0.9754738807678223, 0.8947395086288452])).abs_().max() < EPS)
    self.assertTrue((R - torch.tensor([0.9827190637588501, 0.9697767496109009, 0.9172918796539307])).abs_().max() < EPS)
    self.assertTrue((F - torch.tensor([0.9832529425621033, 0.972616970539093, 0.9058753848075867])).abs_().max() < EPS)
def test_idf_score_rescale(self):
    (P, R, F), hash_code = bert_score.score(
        cands,
        refs,
        model_type='roberta-large',
        num_layers=17,
        idf=True,
        batch_size=3,
        return_hash=True,
        rescale_with_baseline=True,
    )
    self.assertTrue(torch.is_tensor(P))
    self.assertTrue(torch.is_tensor(R))
    self.assertTrue(torch.is_tensor(F))
    self.assertEqual(
        hash_code,
        f'roberta-large_L17_idf_version={bert_score.__version__}(hug_trans={ht_version})-rescaled'
    )
    self.assertTrue((P - torch.tensor([0.903778135776520, 0.854439020156860, 0.375287383794785])).abs_().max() < EPS)
    self.assertTrue((R - torch.tensor([0.897446095943451, 0.820639789104462, 0.509167850017548])).abs_().max() < EPS)
    self.assertTrue((F - torch.tensor([0.900772094726562, 0.837753534317017, 0.442304641008377])).abs_().max() < EPS)
def calculate_metric_scores(cands, refs):
    """Calculate ROUGE-1 precision, BERT precision and entailment scores."""
    # Calculate ROUGE-1 precision.
    rouge = Rouge()
    rouge1_p = []
    for r, c in tqdm(zip(refs, cands)):
        r = " ".join(list(nlp_parser.tokenize(r))).lower()
        c = " ".join(list(nlp_parser.tokenize(c))).lower()
        scores = rouge.get_scores(c, r)[0]
        rouge1_p.append(round(scores['rouge-1']['p'], 4))

    # Calculate BERT precision.
    P, _, _ = score(cands, refs, lang='en', verbose=True)
    P = [round(x, 4) for x in P.tolist()]

    # Calculate entailment scores via an MNLI classifier service.
    url = 'http://localhost:5003/roberta_mnli_classifier'  # 'http://nlp1.cs.unc.edu:5003/roberta_mnli_classifier'
    mnli_data = [{'premise': p, 'hypo': h} for p, h in zip(refs, cands)]
    r = requests.post(url, json=mnli_data)
    results = r.json()
    ent_scores = [float(d['entailment']) for d in results]

    return rouge1_p, P, ent_scores
def test_score_rescale(self):
    (P, R, F), hash_code = bert_score.score(
        cands,
        refs,
        model_type='roberta-large',
        num_layers=17,
        idf=False,
        batch_size=3,
        return_hash=True,
        rescale_with_baseline=True,
    )
    self.assertTrue(torch.is_tensor(P))
    self.assertTrue(torch.is_tensor(R))
    self.assertTrue(torch.is_tensor(F))
    self.assertEqual(
        hash_code,
        f'roberta-large_L17_no-idf_version={bert_score.__version__}(hug_trans={ht_version})-rescaled'
    )
    self.assertTrue((P - torch.tensor([0.907000780105591, 0.900435566902161, 0.477955609560013])).abs_().max() < EPS)
    self.assertTrue((R - torch.tensor([0.895456790924072, 0.841467440128326, 0.527785062789917])).abs_().max() < EPS)
    self.assertTrue((F - torch.tensor([0.901383399963379, 0.871010780334473, 0.503565192222595])).abs_().max() < EPS)
def test_score(self):
    (P, R, F), hash_code = bert_score.score(
        cands,
        refs,
        model_type='roberta-large',
        num_layers=17,
        idf=False,
        batch_size=3,
        return_hash=True,
    )
    self.assertTrue(torch.is_tensor(P))
    self.assertTrue(torch.is_tensor(R))
    self.assertTrue(torch.is_tensor(F))
    self.assertEqual(
        hash_code,
        f'roberta-large_L17_no-idf_version={bert_score.__version__}(hug_trans={ht_version})'
    )
    self.assertTrue((P - torch.tensor([0.9843302369117737, 0.9832239747047424, 0.9120386242866516])).abs_().max() < EPS)
    self.assertTrue((R - torch.tensor([0.9823839068412781, 0.9732863903045654, 0.920428991317749])).abs_().max() < EPS)
    self.assertTrue((F - torch.tensor([0.9833561182022095, 0.9782299995422363, 0.916214644908905])).abs_().max() < EPS)
def score(
    self,
    hypothesis: List[str],
    references: List[List[str]],
    tags: Optional[List[List[str]]] = None,
) -> VizSeqScore:
    corpus_score, sent_scores, group_scores = None, None, None

    import bert_score as bs
    import langid
    import logging
    logging.getLogger('pytorch_pretrained_bert').setLevel(logging.WARNING)
    logging.getLogger('langid').setLevel(logging.WARNING)

    # Detect the language from the first reference sentence.
    lang = langid.classify(references[0][0])[0]
    sent_scores = bs.score(
        hypothesis, references[0], nthreads=self.n_workers, lang=lang, verbose=self.verbose
    )[2].tolist()
    if self.corpus_level:
        corpus_score = np.mean(sent_scores)
    if tags is not None:
        tag_set = self._unique(tags)
        group_scores = {}
        for t in tag_set:
            indices = [i for i, cur in enumerate(tags) if t in cur]
            group_scores[t] = np.mean([sent_scores[i] for i in indices])

    return VizSeqScore.make(
        corpus_score=corpus_score, sent_scores=sent_scores, group_scores=group_scores
    )
def bert_score(x_gt_pred_file, yy1_gt_pred_file, yy2_gt_pred_file, yy3_gt_pred_file):
    cands, refs = load_f(x_gt_pred_file, yy1_gt_pred_file, yy2_gt_pred_file, yy3_gt_pred_file)
    P, R, F = score(cands, refs, model_type="bert-base-uncased")
    return P, R, F
def bert_pairwise_cos_sim(sentences, idf=False): """BERTScore similarity func """ src_len = len(sentences) refs = [[sentence] * src_len for sentence in sentences] refs = list(itertools.chain(*refs)) hyps = sentences * src_len if idf: p, _, _ = score(refs, hyps, lang="en", idf=True) p = p.reshape(src_len, -1) return p.detach().numpy() p, _, _ = score(refs, hyps, lang="en") p = p.reshape(src_len, -1) return p.detach().numpy()
def test_multi_refs(self):
    cands = ['I like lemons.']
    refs = [['I am proud of you.', 'I love lemons.', 'Go go go.']]
    P_mul, R_mul, F_mul = bert_score.score(
        cands, refs, batch_size=3, return_hash=False, lang="en", rescale_with_baseline=True
    )
    P_best, R_best, F_best = bert_score.score(
        cands, [refs[0][1]], batch_size=3, return_hash=False, lang="en", rescale_with_baseline=True
    )
    self.assertTrue((P_mul - P_best).abs_().max() < EPS)
    self.assertTrue((R_mul - R_best).abs_().max() < EPS)
    self.assertTrue((F_mul - F_best).abs_().max() < EPS)
def bert_pairwise_cos_sim(sentences, idf=False):
    import itertools
    from bert_score import score

    src_len = len(sentences)
    refs = [[sentence] * src_len for sentence in sentences]
    refs = list(itertools.chain(*refs))
    hyps = sentences * src_len
    p, _, _ = score(refs, hyps, lang='en', idf=idf)
    p = p.reshape(src_len, -1)
    return p.detach().numpy()
def bert_score_reward_no_stop_word(pred_str_list, pred_sent_2d_list, trg_str_list, trg_sent_2d_list, batch_size, device):
    stop_words = stopwords.words('english')
    # Remove stop words.
    pred_str_list = [w for w in pred_str_list if w not in stop_words]
    trg_str_list = [w for w in trg_str_list if w not in stop_words]
    # bert_score.score expects lists of sentences (and a lang or model_type),
    # so wrap the joined strings in single-element lists.
    P, R, F1 = score([' '.join(pred_str_list)], [' '.join(trg_str_list)], lang='en')
    return F1.to(device)
def gen_bert_score(df, sent1, sent2, model_type, layer):
    p, r, f1 = bert_score.score(
        df[sent1].values.tolist(),
        df[sent2].values.tolist(),
        model_type=model_type,
        num_layers=layer,
        lang='en',
        batch_size=256,
    )
    return f1.tolist()
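# Sketch of gen_bert_score on a toy DataFrame; the column names and model
# choice here are assumptions for illustration, not from the original repo.
def _demo_gen_bert_score():
    import pandas as pd
    df = pd.DataFrame({
        "sentence1": ["A man is playing guitar."],
        "sentence2": ["Someone plays a guitar."],
    })
    return gen_bert_score(df, "sentence1", "sentence2", "roberta-large", 17)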
def bert_score_compute(st_cands, st_ref, lang):
    # Split on sentence boundaries; drop empty fragments left by trailing periods.
    cands = [s for s in st_cands.split(".") if s.strip()]
    refs = [s for s in st_ref.split(".") if s.strip()]
    P, R, F1 = score(cands, refs, lang=lang, model_type="bert-base-multilingual-cased", verbose=True)
    return round(float(P.mean()), 3), round(float(R.mean()), 3), round(float(F1.mean()), 3)
def cal_BERTScore(refer, candidate):
    # Note: BERTScore is slow to compute on CPU.
    _, _, bert_scores = score(candidate, refer, lang='zh', rescale_with_baseline=True)
    bert_scores = bert_scores.tolist()
    # Baseline rescaling can produce NaN for degenerate inputs; fall back to 0.5.
    bert_scores = [0.5 if math.isnan(s) else s for s in bert_scores]
    return np.mean(bert_scores)
def _run(self, summaries_list: List[List[SummaryType]], references_list: List[List[ReferenceType]]) -> List[List[MetricsDict]]:
    summaries_list = [[flatten(summary) for summary in summaries] for summaries in summaries_list]
    references_list = [[flatten(reference) for reference in references] for references in references_list]

    # Create the candidate and reference lists for passing to the scoring function.
    input_candidates = []
    input_references = []
    empty_inputs = set()
    for i, (summaries, references) in enumerate(zip(summaries_list, references_list)):
        for j, summary in enumerate(summaries):
            if len(summary) == 0:
                empty_inputs.add((i, j))
            else:
                input_candidates.append(summary)
                input_references.append(references)

    # Score the summaries.
    precisions, recalls, f1s = bert_score.score(
        input_candidates,
        input_references,
        model_type=self.model_type,
        num_layers=self.num_layers,
        idf=False,
        nthreads=self.nthreads,
        batch_size=self.batch_size,
        lang=self.lang,
        verbose=self.verbose,
    )

    # Remap the scores to the summaries.
    index = 0
    metrics_lists = []
    for i, summaries in enumerate(summaries_list):
        metrics_lists.append([])
        for j, summary in enumerate(summaries):
            if (i, j) in empty_inputs:
                precision, recall, f1 = 0.0, 0.0, 0.0
            else:
                precision = precisions[index].item()
                recall = recalls[index].item()
                f1 = f1s[index].item()
                index += 1
            metrics_lists[-1].append(MetricsDict({
                'bertscore': {
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                }
            }))

    return metrics_lists
def evaluate_example(self, summary, reference):
    assert not self.idf, "idf mode not supported for evaluating a single example"
    if isinstance(reference, str):
        reference = [reference]
    all_preds, hash_code = bert_score.score(
        [summary],
        reference,
        model_type=self.model_type,
        num_layers=self.num_layers,
        verbose=self.verbose,
        idf=self.idf,
        batch_size=self.batch_size,
        nthreads=self.nthreads,
        lang=self.lang,
        return_hash=True,
        rescale_with_baseline=self.rescale_with_baseline,
    )
    print(f"hash_code: {hash_code}")
    # all_preds is a (P, R, F) tuple of tensors; zip them to get per-example triples.
    score = [
        {
            "bert_score_precision": p.cpu().item(),
            "bert_score_recall": r.cpu().item(),
            "bert_score_f1": f.cpu().item(),
        }
        for (p, r, f) in zip(*all_preds)
    ]
    return score
def test_multi_refs(self):
    cands = ["I like lemons."]
    refs = [["I am proud of you.", "I love lemons.", "Go go go."]]
    P_mul, R_mul, F_mul = bert_score.score(
        cands,
        refs,
        batch_size=3,
        return_hash=False,
        lang="en",
        rescale_with_baseline=True,
    )
    P_best, R_best, F_best = bert_score.score(
        cands,
        [refs[0][1]],
        batch_size=3,
        return_hash=False,
        lang="en",
        rescale_with_baseline=True,
    )
    self.assertTensorsAlmostEqual(P_mul, P_best)
    self.assertTensorsAlmostEqual(R_mul, R_best)
    self.assertTensorsAlmostEqual(F_mul, F_best)
def calc_bert_score(cands, refs):
    """Compute BERTScore.

    Args:
        cands (List[str]): candidate sentences
        refs (List[str]): reference sentences

    Returns:
        (List[float], List[float], List[float]): (precision, recall, F1 scores)
    """
    Precision, Recall, F1 = score(cands, refs, lang="ja", verbose=True)
    return Precision.numpy().tolist(), Recall.numpy().tolist(), F1.numpy().tolist()
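# Illustrative call with made-up Japanese sentences; with lang="ja", bert_score
# falls back to a multilingual BERT model by default.
def _demo_calc_bert_score():
    cands = ["今日は良い天気です。"]
    refs = ["本日は晴天です。"]
    p, r, f1 = calc_bert_score(cands, refs)
    print(p, r, f1)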