def test_bleu_scores(self, pair, min_bleu_score):
    # note: this test does not measure best-case performance since it only
    # evaluates a small batch, but it should be enough to detect a regression
    # in the output quality
    mname = f"facebook/wmt19-{pair}"
    tokenizer = self.get_tokenizer(mname)
    model = self.get_model(mname)

    src_sentences = bleu_data[pair]["src"]
    tgt_sentences = bleu_data[pair]["tgt"]

    batch = tokenizer(src_sentences, return_tensors="pt", truncation=True, padding="longest").to(torch_device)
    outputs = model.generate(
        input_ids=batch.input_ids,
        num_beams=8,
    )
    decoded_sentences = tokenizer.batch_decode(
        outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    scores = calculate_bleu(decoded_sentences, tgt_sentences)
    print(scores)
    self.assertGreaterEqual(scores["bleu"], min_bleu_score)
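
# --- Hedged sketch (not part of the original test file) ---
# `calculate_bleu` is referenced above but not defined in this section. A
# minimal sketch, assuming it wraps sacrebleu's corpus_bleu; the real helper
# may differ. sacrebleu scores on a 0-100 scale, which is what makes a
# threshold like `min_bleu_score` meaningful.
from sacrebleu import corpus_bleu


def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:
    """Corpus-level BLEU over decoded outputs vs. reference translations."""
    return {"bleu": round(corpus_bleu(output_lns, [refs_lns], **kwargs).score, 4)}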
def calc_generative_metrics(self, preds, target) -> dict:
    # BLEU serves as the generative quality metric for translation fine-tuning
    return calculate_bleu(preds, target)
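
# Hedged usage sketch: with the `calculate_bleu` sketch above, identical
# predictions and targets score exactly 100.0 on sacrebleu's 0-100 scale.
preds = ["the cat sat on the mat", "hello world"]
target = ["the cat sat on the mat", "hello world"]
assert calculate_bleu(preds, target) == {"bleu": 100.0}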
def translation_metrics(pred: EvalPrediction) -> Dict:
    pred_str, label_str = decode_pred(pred)
    bleu: Dict = calculate_bleu(pred_str, label_str)
    # report average generation length (in non-pad tokens) alongside BLEU
    gen_len = np.mean(lmap(non_pad_len, pred.predictions))
    bleu.update({"gen_len": gen_len})
    return bleu
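
# --- Hedged sketch (helpers assumed, not shown in the original) ---
# `decode_pred`, `lmap`, and `non_pad_len` are referenced above but not
# defined in this section. Plausible implementations, assuming a `tokenizer`
# is in scope; the real helpers may differ.
from typing import Callable, Iterable, List, Tuple

import numpy as np
from transformers import EvalPrediction


def lmap(f: Callable, x: Iterable) -> List:
    """Shorthand for list(map(f, x))."""
    return list(map(f, x))


def non_pad_len(tokens: np.ndarray) -> int:
    """Length of a generated sequence, excluding padding tokens."""
    return np.count_nonzero(tokens != tokenizer.pad_token_id)


def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]:
    """Decode predictions and gold labels back to whitespace-stripped text."""
    pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
    return lmap(str.strip, pred_str), lmap(str.strip, label_str)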