def get_scores(enc_sources, enc_target_sents, model, device, tokenizersrc,
               tokenizertrg, search="greedy", n=4):
    """
    Takes a list of encoded sentences and their encoded translations and
    returns corpus-level score objects.
    model is the trained transformer model.
    tokenizersrc / tokenizertrg are the spm sentencepiece vocabularies in the
    form "name".model.
    search is the decoding strategy, either "greedy" or "beam".
    n is the beam width in beam search.
    """
    model.eval()
    sp.Load(tokenizertrg)
    target_str = [sp.DecodeIds(sent.tolist()) for sent in enc_target_sents]
    output_str = []

    if search == "greedy":
        # Translate in chunks of 100 sentences and print rough progress.
        for sents in divide_chunks(enc_sources, 100):
            print((len(output_str) / len(enc_sources)) * 100, end="\r")
            output_str.extend(
                translate_enc_sentences(model, sents, device, tokenizertrg,
                                        max_length=150))
    elif search == "beam":
        # Beam-search decode one sentence at a time.
        for source in enc_sources:
            output_str.append(
                beam_search(source, device, tokenizersrc, tokenizertrg, n))

    bleu = sacrebleu.corpus_bleu(output_str, [target_str])
    chrf = sacrebleu.corpus_chrf(output_str, [target_str])
    ter = sacrebleu.corpus_ter(output_str, [target_str])
    return bleu, chrf, ter
def eval_metric(args, hypos, ref):
    if args.metric == "bleu":
        score = sacrebleu.corpus_bleu(hypos, [ref]).score
    else:
        score = sacrebleu.corpus_ter(hypos, [ref]).score
    return score
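# Illustrative usage sketch (not part of the original code): eval_metric
# expects `hypos` as a flat list of hypothesis strings and `ref` as a parallel
# list of reference strings; the [ref] wrapping inside the function is the
# "list of reference streams" shape sacrebleu requires. The argparse namespace
# below is only a stand-in for the real `args` object.
import argparse

_example_args = argparse.Namespace(metric="bleu")
_example_hypos = ["the cat sat on the mat", "hello world"]
_example_refs = ["the cat sat on a mat", "hello there world"]
print(eval_metric(_example_args, _example_hypos, _example_refs))  # corpus BLEU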
def _compute_bleu_ter(self, batch, user_vec, glob_vec, output, target):
    vocab = self.bert_tok.ids_to_tokens
    batch_data = self.translator.translate_batch(batch, vocab, False)
    translations = self.from_batch(batch_data)

    sys = []
    for trans in translations:
        line = ' '.join(trans[0]).replace(' ##', '').replace('##', '')
        sys.append(line)

    references = []
    for tgt in batch.tgt.squeeze(-1).transpose(0, 1):
        references.append(self._build_target_tokens(tgt))

    refs = []
    for ref in references:
        line = ' '.join(ref[1:]).replace(' ##', '').replace('##', '')
        refs.append(line)

    bleu = sacrebleu.corpus_bleu(sys, [refs], force=True)
    ter = sacrebleu.corpus_ter(sys, [refs])
    bleu_stats = onmt.utils.Statistics(bleu=bleu.score * len(sys), sent=len(sys))
    ter_stats = onmt.utils.Statistics(ter=ter.score * len(sys), sent=len(sys))
    return (bleu_stats, ter_stats)
def compute_metrics(hyp_dec_all, ref_dec_all, use_sacrebleu=True,
                    use_torchtext=True, use_ter=False):
    metrics = {}

    # Sacrebleu
    if use_sacrebleu:
        metrics["sacrebleu_rawcorpusbleu"] = sacrebleu.raw_corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_bleu"] = sacrebleu.corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_chrf"] = sacrebleu.corpus_chrf(
            hyp_dec_all, [ref_dec_all]).score
        if use_ter:  # Quite slow
            metrics["sacrebleu_ter"] = sacrebleu.corpus_ter(
                hyp_dec_all, [ref_dec_all]).score

    # Torchtext
    if use_torchtext:
        m_bleu_score = bleu_score([x.split(" ") for x in hyp_dec_all],
                                  [[x.split(" ")] for x in ref_dec_all])
        metrics["torchtext_bleu"] = m_bleu_score * 100

    return metrics
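# Illustrative usage sketch (not part of the original code): both arguments
# are lists of already-detokenized sentence strings, one hypothesis per
# reference. The torchtext branch re-splits on spaces, so whitespace
# tokenization is assumed, and sacrebleu / torchtext's bleu_score are assumed
# to be imported at module level as the function above requires.
_example_hyps = ["the cat sat on the mat", "hello world"]
_example_refs = ["the cat sat on a mat", "hello there world"]
_example_metrics = compute_metrics(_example_hyps, _example_refs, use_ter=True)
# e.g. {"sacrebleu_rawcorpusbleu": ..., "sacrebleu_bleu": ...,
#       "sacrebleu_chrf": ..., "sacrebleu_ter": ..., "torchtext_bleu": ...}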
def compute_metrics(ref, hyp, hyp_order, metric):
    # Read sentences
    refs = []
    hyps = []
    for id in hyp_order:
        for segment in hyp[id]:
            hyps.append(segment)
        try:
            for segment in ref[id]:
                refs.append(segment)
        except KeyError:
            sys.stderr.write('Error: there are no references for document'
                             + ' "' + id + '"\n')
            sys.exit(-1)

    scores = []
    dir = os.path.dirname(os.path.realpath(sys.argv[0]))
    for n in range(len(hyps)):
        if metric == 'bleu':
            try:
                score = sacrebleu.corpus_bleu([hyps[n]], [[refs[n]]])
            except EOFError:
                sys.stderr.write('Error: source and reference have different'
                                 + ' lengths.\n')
                sys.exit(-1)
        elif metric == 'ter':
            try:
                score = sacrebleu.corpus_ter([hyps[n]], [[refs[n]]])
            except EOFError:
                sys.stderr.write('Error: source and reference have different'
                                 + ' lengths.\n')
                sys.exit(-1)
        else:
            hyps_file = save_to_file(hyps[n])
            refs_file = save_to_file(refs[n])
            try:
                process = subprocess.Popen((dir + '/beer_2.0/beer -s '
                                            + hyps_file + ' -r '
                                            + refs_file).split(),
                                           stdout=subprocess.PIPE)
                score, error = process.communicate()
            except FileNotFoundError:
                sys.stderr.write('Error: Beer requirement has not been'
                                 + ' satisfied.\n')
                sys.exit(-1)
            # Delete aux files
            process = subprocess.Popen(('rm ' + hyps_file + ' '
                                        + refs_file).split(),
                                       stdout=subprocess.PIPE)
            output, error = process.communicate()

        if metric == 'beer':
            scores.append([float(score.split()[-1])])
        else:
            scores.append([score.score])

    return scores
def ter(items):
    """Translation Error Rate is an error metric for machine translation that
    measures the number of edits required to change a system output into one
    of the references.
    Source: http://www.cs.umd.edu/~snover/tercom/
    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf

    Lower is better
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_ter(preds, refs).score
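# Hypothetical usage (not part of the original code): each item in `items` is
# assumed to be a (references, prediction) pair, with _sacreformat reshaping
# them into sacrebleu's (hypotheses, list-of-reference-streams) layout. The
# direct call below shows that layout for a single reference per segment.
import sacrebleu

_preds = ["the cat sat on the mat"]
_refs = [["the cat sat on a mat"]]  # one reference stream, parallel to _preds
print(sacrebleu.corpus_ter(_preds, _refs).score)  # lower is better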
def compute_metrics(ref, hyp, hyp_order):
    # Read sentences
    refs = []
    hyps = []
    for id in hyp_order:
        for segment in hyp[id]:
            hyps.append(segment)
        try:
            for segment in ref[id]:
                refs.append(segment)
        except KeyError:
            sys.stderr.write('Error: there are no references for document'
                             + ' "' + id + '"\n')
            sys.exit(-1)

    # Compute BLEU and TER
    try:
        bleu = sacrebleu.corpus_bleu(hyps, [refs])
        ter = sacrebleu.corpus_ter(hyps, [refs])
    except EOFError:
        sys.stderr.write('Error: source and reference have different'
                         + ' lengths.\n')
        sys.exit(-1)

    # Create aux files for BEER
    dir = os.path.dirname(os.path.realpath(sys.argv[0]))
    hyps_file = save_to_file(hyps)
    refs_file = save_to_file(refs)

    # Compute BEER
    try:
        process = subprocess.Popen((dir + '/beer_2.0/beer -s ' + hyps_file
                                    + ' -r ' + refs_file).split(),
                                   stdout=subprocess.PIPE)
        beer, error = process.communicate()
    except FileNotFoundError:
        sys.stderr.write('Error: Beer requirement has not been satisfied.\n')
        sys.exit(-1)

    # Delete aux files
    process = subprocess.Popen(('rm ' + hyps_file + ' ' + refs_file).split(),
                               stdout=subprocess.PIPE)
    output, error = process.communicate()

    return bleu.score, ter.score, float(beer.split()[-1])
def __call__(self, ref_path, hyp_path):
    with tf.io.gfile.GFile(ref_path) as ref_stream, \
            tf.io.gfile.GFile(hyp_path) as sys_stream:
        ter = sacrebleu.corpus_ter(sys_stream, [ref_stream])
        return ter.score
def __call__(self, ref_path, hyp_path):
    sys_stream = _get_lines(hyp_path)
    ref_stream = _get_lines(ref_path)
    ter = sacrebleu.corpus_ter(sys_stream, [ref_stream])
    return ter.score
def test(
    experiment: str,
    test_set: str,
    cuda: bool,
    seed: int,
    sample: bool,
    top_p: float,
    temperature: float,
    num_beams: int,
    to_json: str,
) -> None:
    """Tests a trained model on its ability to rank candidate answers and
    generate replies.
    """
    logging.disable(logging.WARNING)
    model = PersonaGPT2.from_experiment(experiment)
    seed_everything(seed)

    cuda = cuda and torch.cuda.is_available()
    if cuda:
        model.to("cuda")

    with open(test_set, "r", encoding="utf-8") as f:
        dataset = json.loads(f.read())

    replies, rankings = [], []
    for dialog in tqdm(dataset, desc="Scoring dialogs...", dynamic_ncols=True):
        # 1) Prepares Persona
        persona = dialog["personality"].copy()
        persona_ids = [model.tokenizer.encode(s) for s in persona]

        for utterance in dialog["utterances"]:
            # 2) Saves Ground-Truth
            ground_truth_reply = utterance["candidates"][-1]

            # 3) Prepares History
            history = utterance["history"][-(2 * model.hparams.max_history + 1):]
            history_ids = [model.tokenizer.encode(h) for h in history]

            # 4) Rank Candidates in batch:
            batch = []
            for j, candidate in enumerate(utterance["candidates"]):
                candidate_ids = model.tokenizer.encode(candidate)
                instance = DataModule.build_input(
                    tokenizer=model.tokenizer,
                    persona=persona_ids,
                    history=history_ids,
                    reply=candidate_ids,
                )
                batch.append(instance)

            # from list of dictionaries to dictionary of lists
            batch = {k: [d[k] for d in batch] for k in batch[0]}
            batch = DataModule.pad_dataset(batch)

            if cuda:
                batch = {k: torch.LongTensor(v).cuda() for k, v in batch.items()}
            else:
                batch = {k: torch.LongTensor(v) for k, v in batch.items()}

            mc_logits = model(**batch).mc_logits
            rankings.append({
                "persona": persona,
                "history": history,
                "candidates": utterance["candidates"],
                "ranking": torch.topk(
                    mc_logits, len(utterance["candidates"])).indices.tolist(),
            })

            # 5) Generates Reply
            bot_input = DataModule.build_input(tokenizer=model.tokenizer,
                                               persona=persona_ids,
                                               history=history_ids)
            input_ids = torch.LongTensor([bot_input["input_ids"]])
            token_type_ids = torch.LongTensor([bot_input["token_type_ids"]])
            if cuda:
                input_ids = input_ids.cuda()
                token_type_ids = token_type_ids.cuda()

            # Nucleus Sampling
            if sample:
                history_ids = model.generate(
                    input_ids=input_ids,
                    token_type_ids=token_type_ids,
                    max_length=200,
                    do_sample=True,
                    top_p=top_p,
                    temperature=temperature,
                )
            # Beam Search
            else:
                history_ids = model.generate(
                    input_ids=input_ids,
                    token_type_ids=token_type_ids,
                    max_length=200,
                    num_beams=num_beams,
                    no_repeat_ngram_size=2,
                    early_stopping=True,
                )

            bot_reply_ids = history_ids[:, len(bot_input["input_ids"]):][0]
            bot_reply = model.tokenizer.decode(bot_reply_ids,
                                               skip_special_tokens=True)
            replies.append({
                "persona": persona,
                "history": history,
                "bot": " ".join(wordpunct_tokenize(bot_reply.lower())),
                "human": ground_truth_reply,
            })

    # 6) Runs Ranking Metrics
    hits_1, hits_5, hits_10 = [], [], []
    for ranks in rankings:
        hits_1.append((len(ranks["candidates"]) - 1) in ranks["ranking"][:1])
        hits_5.append((len(ranks["candidates"]) - 1) in ranks["ranking"][:5])
        hits_10.append((len(ranks["candidates"]) - 1) in ranks["ranking"][:10])
    click.secho("Hits@1: {}".format(sum(hits_1) / len(hits_1)), fg="yellow")
    click.secho("Hits@5: {}".format(sum(hits_5) / len(hits_5)), fg="yellow")
    click.secho("Hits@10: {}".format(sum(hits_10) / len(hits_10)), fg="yellow")

    # 7) Runs Generation Metrics
    refs = [[s["human"] for s in replies]]
    sys = [s["bot"] for s in replies]

    bleu = sacrebleu.corpus_bleu(sys, refs, lowercase=True,
                                 tokenize="intl").score
    click.secho(f"BLEU: {bleu}", fg="blue")

    ter = sacrebleu.corpus_ter(sys, refs, no_punct=True).score
    click.secho(f"TER: {ter}", fg="blue")

    # BERTScore returns precision, recall, f1.. we will use F1
    bertscore = float(
        bert_score.score(
            cands=sys,
            refs=refs[0],
            lang="en",
            verbose=False,
            nthreads=4,
        )[2].mean())
    click.secho(f"BERTScore: {bertscore}", fg="blue")

    # 8) Saves results.
    if isinstance(to_json, str):
        data = {
            "results": {
                "BLEU": bleu,
                "TER": ter,
                "BERTScore": bertscore,
                "Hits@1": sum(hits_1) / len(hits_1),
                "Hits@5": sum(hits_5) / len(hits_5),
                "Hits@10": sum(hits_10) / len(hits_10),
            },
            "generation": replies,
            "ranking": rankings,
        }
        with open(to_json, "w") as outfile:
            json.dump(data, outfile, ensure_ascii=False, indent=4)
        click.secho(f"Predictions saved in: {to_json}.", fg="yellow")
def get_ter(in_sent, target_sent):
    ter = sacrebleu.corpus_ter([in_sent], [[target_sent]])
    out = " ".join(map(str, [ter.score, ter.num_edits, ter.ref_length]))
    return out
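# Illustrative usage sketch (not part of the original code): get_ter scores a
# single hypothesis/reference pair by wrapping each string into the
# corpus-level shapes sacrebleu expects, then joins the score, edit count and
# reference length into one space-separated string.
print(get_ter("the cat sat on the mat", "the cat sat on a mat"))
# -> "<score> <num_edits> <ref_length>"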
def compute_ter_score(hyps: Iterable[str], refs: List[Iterable[str]]) -> float:
    result = sacrebleu.corpus_ter(hyps, refs)
    return float(np.round(float(result.score) * 100, 2))
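# Illustrative usage sketch (not part of the original code): `refs` is a list
# of reference streams, each stream being a list of strings parallel to
# `hyps`, so a single-reference corpus is passed as [ref_stream]. The result
# is rounded to two decimals; the * 100 rescaling is how this wrapper chooses
# to report TER and may be redundant with sacrebleu versions that already
# return the score as a percentage.
_hyps = ["the cat sat on the mat", "hello world"]
_refs = [["the cat sat on a mat", "hello there world"]]
print(compute_ter_score(_hyps, _refs))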