def get_total_metrics(sys1_path, sys2_path, ref_path, lowercase=False, max_order=1,
                      sys1_name='sys1', sys2_name='sys2', filepath=None):
    with open(sys1_path, 'rt') as sys1_file:
        with open(sys2_path, 'rt') as sys2_file:
            with open(ref_path, 'rt') as ref_file:
                sys1_list = [line.strip() for line in sys1_file.readlines()]
                sys2_list = [line.strip() for line in sys2_file.readlines()]
                ref_list = [line.strip() for line in ref_file.readlines()]

    mf1_sys1 = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], lowercase=lowercase,
                                        average='macro', max_order=max_order)
    mf1_sys2 = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], lowercase=lowercase,
                                        average='macro', max_order=max_order)
    mf1_sys1_f1 = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], lowercase=lowercase,
                                           average='macro', max_order=max_order, measure_name='f1')
    mf1_sys2_f1 = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], lowercase=lowercase,
                                           average='macro', max_order=max_order, measure_name='f1')
    mf1_sys1_new = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], average='macro',
                                            word_class=True, max_order=max_order)
    mf1_sys2_new = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], average='macro',
                                            word_class=True, max_order=max_order)
    mf1_sys1_new_f1 = sacrebleu.corpus_rebleu2(sys1_list, [ref_list], average='macro',
                                               word_class=True, max_order=max_order, measure_name='f1')
    mf1_sys2_new_f1 = sacrebleu.corpus_rebleu2(sys2_list, [ref_list], average='macro',
                                               word_class=True, max_order=max_order, measure_name='f1')

    bleu_sys1 = sacrebleu.corpus_bleu(sys1_list, [ref_list], lowercase=lowercase)
    bleu_sys2 = sacrebleu.corpus_bleu(sys2_list, [ref_list], lowercase=lowercase)
    chrf_sys1 = sacrebleu.corpus_chrf(sys1_list, ref_list)
    chrf_sys2 = sacrebleu.corpus_chrf(sys2_list, ref_list)

    micro_perc_sys1, macro_perc_sys1, total_list_sys1, en_list_sys1, total_sys1, en_sys1 = get_percent_en(sys1_list)
    micro_perc_sys2, macro_perc_sys2, total_list_sys2, en_list_sys2, total_sys2, en_sys2 = get_percent_en(sys2_list)

    # bleurt_checkpoint = "/Users/weiqiuyou/Documents/USC_ISI/QUM/tools/bleurt/bleurt/bleurt-base-128"
    # scorer = score.BleurtScorer(bleurt_checkpoint)
    # bleurt_sys1 = np.mean(scorer.score(ref_list, sys1_list))
    # bleurt_sys2 = np.mean(scorer.score(ref_list, sys2_list))

    report = ''
    report += f'mf1_{sys1_name}: {mf1_sys1}\n'
    report += f'mf1_{sys2_name}: {mf1_sys2}\n'
    report += f'mf1_{sys1_name}_f1: {mf1_sys1_f1}\n'
    report += f'mf1_{sys2_name}_f1: {mf1_sys2_f1}\n'
    report += f'mf1_new_{sys1_name}: {mf1_sys1_new}\n'
    report += f'mf1_new_{sys2_name}: {mf1_sys2_new}\n'
    report += f'mf1_new_{sys1_name}_f1: {mf1_sys1_new_f1}\n'
    report += f'mf1_new_{sys2_name}_f1: {mf1_sys2_new_f1}\n'
    report += f'bleu_{sys1_name}: {bleu_sys1}\n'
    report += f'bleu_{sys2_name}: {bleu_sys2}\n'
    report += f'chrf_{sys1_name}: {chrf_sys1}\n'
    report += f'chrf_{sys2_name}: {chrf_sys2}\n'
    report += f'micro_perc_{sys1_name}: {micro_perc_sys1}\tmacro_perc_{sys1_name}: {macro_perc_sys1}\n'
    report += f'micro_perc_{sys2_name}: {micro_perc_sys2}\tmacro_perc_{sys2_name}: {macro_perc_sys2}\n'
    # report += f'bleurt_{sys1_name}: {bleurt_sys1}\n'
    # report += f'bleurt_{sys2_name}: {bleurt_sys2}\n'

    print(report)
    if filepath is not None:
        with open(filepath, 'wt') as output_file:
            output_file.write(report)
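# Hedged usage sketch for get_total_metrics above. The file paths below are
# placeholders, and corpus_rebleu2 / get_percent_en come from the surrounding
# project (a custom sacrebleu fork), so this only illustrates the calling convention.
get_total_metrics(
    sys1_path='outputs/system1.txt',
    sys2_path='outputs/system2.txt',
    ref_path='data/reference.txt',
    lowercase=True,
    max_order=1,
    sys1_name='baseline',
    sys2_name='ours',
    filepath='reports/metrics.txt',  # pass None to only print the report
)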
def get_scores(enc_sources, enc_target_sents, model, device, tokenizersrc, tokenizertrg,
               search="greedy", n=4):
    """
    Takes a list of encoded source sentences and their encoded translations
    and returns score objects.

    model is the trained transformer model
    tokenizersrc / tokenizertrg are the spm sentencepiece vocabularies in the form "name".model
    search is the decoding strategy, either greedy or beam search
    n is the beam width in beam search
    """
    model.eval()
    sp.Load(tokenizertrg)

    targets = []
    outputs = []
    target_str = [sp.DecodeIds(sent.tolist()) for sent in enc_target_sents]
    output_str = []

    if search == "greedy":
        # translate in chunks of 100 sentences, printing the progress in percent
        for sents in divide_chunks(enc_sources, 100):
            print((len(output_str) / len(enc_sources)) * 100, end="\r")
            y = translate_enc_sentences(model, sents, device, tokenizertrg, max_length=150)
            output_str.extend(y)
        bleu = sacrebleu.corpus_bleu(output_str, [target_str])
        chrf = sacrebleu.corpus_chrf(output_str, [target_str])
        ter = sacrebleu.corpus_ter(output_str, [target_str])
        return bleu, chrf, ter
    elif search == "beam":
        # decode one sentence at a time with beam search;
        # target_str already holds the decoded references from above
        for source, target in zip(enc_sources, enc_target_sents):
            prediction = beam_search(source, device, tokenizersrc, tokenizertrg, n)
            target = sp.DecodeIds(target.tolist())
            targets.append([target.split()])
            outputs.append(prediction.split())
            output_str.append(prediction)

    bleu = sacrebleu.corpus_bleu(output_str, [target_str])
    chrf = sacrebleu.corpus_chrf(output_str, [target_str])
    ter = sacrebleu.corpus_ter(output_str, [target_str])
    return bleu, chrf, ter
def compute_metrics(ref, hyp, hyp_order):
    refs = []
    hyps = []
    for id in hyp_order:
        for segment in hyp[id]:
            hyps.append(segment)
        try:
            for segment in ref[id]:
                refs.append(segment)
        except KeyError:
            sys.stderr.write('Error: there are no references for document'
                             + ' "' + id + '"\n')
            sys.exit(-1)

    metrics = []
    for n in range(len(hyps)):
        try:
            bleu = sacrebleu.corpus_bleu([hyps[n]], [[refs[n]]])
            chrf = sacrebleu.corpus_chrf([hyps[n]], [[refs[n]]])
        except EOFError:
            sys.stderr.write('Error: source and reference have different'
                             + ' lengths.\n')
            sys.exit(-1)
        metrics.append([bleu.score] + [chrf.score])
    return metrics
def test_chrf_keep_whitespace(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses, [references],
                                  char_order=6, word_order=0, beta=3,
                                  remove_whitespace=False).score
    assert abs(score - expected_score) < EPSILON
def test_chrf_eff_order(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses, [references],
                                  char_order=6, word_order=0, beta=3,
                                  eps_smoothing=False).score
    assert abs(score - expected_score) < EPSILON
def test_chrf_keep_whitespace(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses, references, 6, 3,
                                  remove_whitespace=False)
    assert abs(score - expected_score) < EPSILON
def compute_metrics(hyp_dec_all, ref_dec_all, use_sacrebleu=True, use_torchtext=True, use_ter=False):
    metrics = {}

    # Sacrebleu
    if use_sacrebleu:
        metrics["sacrebleu_rawcorpusbleu"] = sacrebleu.raw_corpus_bleu(hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_bleu"] = sacrebleu.corpus_bleu(hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_chrf"] = sacrebleu.corpus_chrf(hyp_dec_all, [ref_dec_all]).score
        if use_ter:  # Quite slow
            metrics["sacrebleu_ter"] = sacrebleu.corpus_ter(hyp_dec_all, [ref_dec_all]).score

    # Torchtext
    if use_torchtext:
        m_bleu_score = bleu_score([x.split(" ") for x in hyp_dec_all],
                                  [[x.split(" ")] for x in ref_dec_all])
        metrics["torchtext_bleu"] = m_bleu_score * 100

    return metrics
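# Minimal usage sketch for compute_metrics above (toy sentences, not project data).
# Assumes sacrebleu is installed; torchtext's bleu_score is skipped here.
hyps = ["the cat sat on the mat", "a dog barks loudly"]
refs = ["the cat sat on the mat", "the dog barks loudly"]
print(compute_metrics(hyps, refs, use_sacrebleu=True, use_torchtext=False, use_ter=False))
# -> dict with sacrebleu_rawcorpusbleu, sacrebleu_bleu, sacrebleu_chrf on a 0-100 scale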
def __call__(self, ref_path: str, hyp_path: str) -> float:
    ref_streams = load_ref_streams(ref_path, detok=True)
    sys_stream = load_sys_stream(hyp_path, detok=True)
    chrf3_score = sacrebleu.corpus_chrf(sys_stream, ref_streams,
                                        order=6, beta=3, remove_whitespace=True)
    return np.round(float(chrf3_score.score * 100), 2)
def chrf(hypotheses, references):
    """
    Character F-score from sacrebleu

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return:
    """
    return sacrebleu.corpus_chrf(hypotheses=hypotheses, references=references)
def chrf(hypotheses, references):
    """
    Character F-score from sacrebleu

    :param hypotheses:
    :param references:
    :return:
    """
    return sacrebleu.corpus_chrf(hypotheses=hypotheses, references=references)
def raw_corpus_chrf(hypotheses: Iterable[str], references: Iterable[str]) -> float:
    """
    Simple wrapper around sacreBLEU's chrF implementation, without tokenization.

    :param hypotheses: Hypotheses stream.
    :param references: Reference stream.
    :return: chrF score as float between 0 and 1.
    """
    return sacrebleu.corpus_chrf(hypotheses, [references]).score
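# Usage sketch for raw_corpus_chrf above (toy strings). Note that the scale of
# .score depends on the installed sacrebleu: roughly 0-1 before v2.0 and 0-100
# from v2.0 on, so the "between 0 and 1" docstring only holds for older releases.
hyps = ["ein Hund bellt", "die Katze schläft"]
refs = ["ein Hund bellt laut", "die Katze schläft"]
print(raw_corpus_chrf(hyps, refs))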
def raw_corpus_chrf(hypotheses: Iterable[str], references: Iterable[str]) -> float:
    """
    Simple wrapper around sacreBLEU's chrF implementation, without tokenization.

    :param hypotheses: Hypotheses stream.
    :param references: Reference stream.
    :return: chrF score as float between 0 and 1.
    """
    return sacrebleu.corpus_chrf(hypotheses, references,
                                 order=sacrebleu.CHRF_ORDER,
                                 beta=sacrebleu.CHRF_BETA,
                                 remove_whitespace=True)
def calculate_score_report(sys, ref, score_only):
    chrf = sacrebleu.corpus_chrf(sys, ref)
    bleu = sacrebleu.corpus_bleu(sys, ref)
    prefix = 'BLEU = ' if score_only else ''
    print('#### Score Report ####')
    print(chrf)
    print('{}{}'.format(prefix, bleu.format(score_only=score_only)))
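# Usage sketch for calculate_score_report above. It assumes the caller already
# wraps the references as a list of reference streams (a list of lists), which is
# what sacrebleu's corpus_* functions expect; the sentences are illustrative only.
system_out = ["the cat sat on the mat"]
references = [["the cat sat on the mat"]]
calculate_score_report(system_out, references, score_only=False)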
def chrf(items):
    """chrF++ is a tool for automatic evaluation of machine translation output
    based on character n-gram precision and recall enhanced with word n-grams.

    Source: https://github.com/m-popovic/chrF
    Paper: https://www.aclweb.org/anthology/W15-3049.pdf

    Higher is better.
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_chrf(preds, refs).score
def chrf(hypotheses, references, remove_whitespace=True):
    """
    Character F-score from sacrebleu

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :param remove_whitespace: (bool)
    :return:
    """
    return sacrebleu.corpus_chrf(hypotheses=hypotheses,
                                 references=[references],
                                 remove_whitespace=remove_whitespace).score
def chrf(hypotheses, references, remove_whitespace=True):
    """
    Character F-score from sacrebleu

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :param remove_whitespace: (bool)
    :return: character f-score (0 <= chf <= 1)
             see Breaking Change in sacrebleu v2.0
    """
    score = sacrebleu.corpus_chrf(hypotheses=hypotheses,
                                  references=[references],
                                  remove_whitespace=remove_whitespace).score
    return score / 100
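# Hedged comparison of the two chrf wrappers above: with sacrebleu >= 2.0,
# corpus_chrf(...).score is on a 0-100 scale, so the second wrapper divides by 100
# to recover the 0-1 range that older code expected. Toy sentences, identical on
# purpose, so chrF is exactly 100 (or 1.0 after rescaling).
import sacrebleu

hyps = ["the cat sat on the mat"]
refs = ["the cat sat on the mat"]
raw = sacrebleu.corpus_chrf(hypotheses=hyps, references=[refs]).score
print(raw, raw / 100)  # 100.0 vs. 1.0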
def evaluate_batch(self, summaries, references, aggregate=True):
    if aggregate:
        score = sacrebleu.corpus_chrf(summaries, references,
                                      order=self.ncorder, beta=self.beta)
        score_dict = {"chrf": score.score}
        return score_dict
    else:
        p = Pool(processes=self.n_workers)
        results = p.starmap(self.evaluate_example, zip(summaries, references))
        p.close()
        return results
def write_evals(writer, experiment, translation, file_path, ref, src):
    writer = SummaryWriter("runs/{}-{}".format(experiment, translation))
    steps = int(translation)
    output_path = "translations/{}/{}".format(experiment, translation)
    with open(output_path, "r", encoding="utf-8") as infile:
        system_output = [x.strip() for x in infile.readlines()]
    bleu = sacrebleu.corpus_bleu(system_output, [ref])
    chrf = sacrebleu.corpus_chrf(system_output, [ref])
    rhyme_score, copied, reconstructed = concurrent_score(system_output, languages[experiment], ref, src)
    print(experiment, translation, bleu.score, rhyme_score, copied, reconstructed)
    wall = os.stat(file_path).st_mtime
    writer.add_scalar(experiment + "/CHRF", chrf.score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/BLEU", bleu.score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Rhyme", rhyme_score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Copied", copied, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Reconstructed", reconstructed, global_step=steps, walltime=wall)
    writer.flush()
def eval_measure(gold, sys, eval_type='bleu'):
    '''
    Evaluation measure

    This takes in gold labels and system outputs and evaluates their accuracy.
    It currently supports:
    * BLEU score (bleu)
    * chrF3 (chrf3)
    * hLEPOR (hlepor)

    :param gold: the correct labels (reference)
    :param sys: the system outputs (hypothesis)
    :param eval_type: The type of evaluation to do (bleu, chrf3, hlepor)
    '''
    if eval_type == EVAL_TYPE_BLEU:
        # make sure score is 0-based instead of 100-based
        return corpus_bleu(sys, [gold]).score / 100.
    elif eval_type == EVAL_TYPE_CHRF3:
        return corpus_chrf(sys, [gold], beta=3).score
    elif eval_type == EVAL_TYPE_HLEPOR:
        return hlepor_score(sys, gold)
    else:
        raise NotImplementedError('Unknown eval type in eval_measure: %s' % eval_type)
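# Usage sketch for eval_measure above (hedged: assumes the surrounding module
# defines EVAL_TYPE_BLEU = 'bleu', EVAL_TYPE_CHRF3 = 'chrf3', and so on, and
# imports corpus_bleu / corpus_chrf from sacrebleu; the sentences are toy examples).
gold = ["the cat sat on the mat", "a dog barked"]
system = ["the cat sat on a mat", "a dog barked"]
print(eval_measure(gold, system, eval_type='bleu'))   # rescaled to 0-1
print(eval_measure(gold, system, eval_type='chrf3'))  # scale follows the installed sacrebleu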
def chrf(self, hypo, groundtruth=None, lc=False):
    if groundtruth is None:
        ref = self._refs_for_sacre
    else:
        if isinstance(groundtruth[0], str):
            ref = [groundtruth]
        else:
            ref = groundtruth
    try:
        chrf = sacrebleu.corpus_chrf(
            [(x.lower() if lc else x) for x in hypo],
            [[(x.lower() if lc else x) for x in y] for y in ref])
        return chrf.score
    except IndexError:
        logging.info("Found empty lines.")
        print(traceback.format_exc())
        return 0.
    except ZeroDivisionError:
        logging.info("Empty reference")
        print(traceback.format_exc())
        return 0.
def validate(model, test_data, golden_file, beam_size=8, alpha=0.6, max_time_step=100):
    """For development Only"""
    pp = PostProcess()

    ref_stream = []
    for line in open(golden_file + '.input_clean'):
        if line.startswith('# ::tokens '):
            o = json.loads(line[len('# ::tokens '):].strip())
            ref_stream.append(' '.join(o).lower())
    # gold model output
    graph, gold_sys_stream, _, abstract = read_file(golden_file + '.preproc')
    ref_streams = [ref_stream]

    sys_stream = []
    for batch in test_data:
        res = generate_batch(model, batch, beam_size, alpha, max_time_step)
        sys_stream.extend(res['token'])

    assert len(sys_stream) == len(ref_stream)
    sys_stream = [
        pp.post_process(o, abstract[i], graph[i]) for i, o in enumerate(sys_stream)
    ]

    bleu = sacrebleu.corpus_bleu(sys_stream, ref_streams,
                                 force=True, lowercase=True, tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)
    return bleu, chrf
def validate(model, test_data, beam_size=8, alpha=0.6, max_time_step=100):
    """For development Only"""
    pp = PostProcess()

    ref_stream = []
    sys_stream = []
    for batch in test_data:
        res = generate_batch(model, batch, beam_size, alpha, max_time_step)
        sys_stream.extend(res['token'])
        ref_stream.extend(batch['target'])

    assert len(sys_stream) == len(ref_stream)
    sys_stream = [pp.post_process(o) for o in sys_stream]
    ref_stream = [' '.join(o) for o in ref_stream]
    ref_streams = [ref_stream]

    bleu = sacrebleu.corpus_bleu(sys_stream, ref_streams,
                                 force=True, lowercase=False, tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)
    return bleu, chrf
def score_corpus_multiprocess(self, hypothesis: List[str],
                              references: List[List[str]]) -> float:
    if self.n_workers == 1:
        corpus_score = sb.corpus_chrf(hypothesis, references[0]).score
    else:
        batches = list(
            self._batch(hypothesis, references, n_batches=self.n_workers))
        corpus_statistics = [0 for _ in range(sb.CHRF_ORDER * 3)]
        with ProcessPoolExecutor(max_workers=self.n_workers) as executor:
            futures = [
                executor.submit(sb.get_corpus_statistics, b[0], b[1][0])
                for b in batches
            ]
            progress = as_completed(futures)
            if self.verbose:
                progress = tqdm(progress)
            for future in progress:
                stats = future.result()
                for i in range(sb.CHRF_ORDER * 3):
                    corpus_statistics[i] += stats[i]
        avg_precision, avg_recall = sb._avg_precision_and_recall(
            corpus_statistics, sb.CHRF_ORDER)
        corpus_score = sb._chrf(avg_precision, avg_recall)
    return corpus_score
def raw_corpus_chrf(hypotheses: Iterable[str], references: Iterable[str]) -> float:
    return sacrebleu.corpus_chrf(hypotheses, references,
                                 order=sacrebleu.CHRF_ORDER,
                                 beta=sacrebleu.CHRF_BETA,
                                 remove_whitespace=True)
def test_chrf(hypotheses, references, expected_score):
    score = sacrebleu.corpus_chrf(hypotheses, [references], 6, 3).score
    assert abs(score - expected_score) < EPSILON
# sys = ['The dog runs home hi.', 'the dog runs home hi.']
# ref = ['The dog ran home hi.', 'the dog runs home hello.']
# mf1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True)
# mf1_old = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False)
# print(mf1.score)
# print(mf1_old.score)

sys = ['The dog runs home now.']
ref = ['The dog ran home now.']
mf1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True)
mf1_f1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True, measure_name='f1')
mf1_old = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False)
mf1_old_f1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False, measure_name='f1')
chrf = sacrebleu.corpus_chrf(sys, ref)
print("sys:", sys)
print("ref:", ref)
print("mf1 new:", mf1.score)
print("mf1 f1 new:", mf1_f1.score)
print("mf1 old:", mf1_old.score)
print("mf1 f1 old:", mf1_old_f1.score)
print("chrf:", chrf)
print("-------------")

sys = ['The dog runs home now.']
ref = ['The dog runs home later.']
mf1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True)
mf1_f1 = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=True, measure_name='f1')
mf1_old = sacrebleu.corpus_rebleu2(sys, [ref], average='macro', word_class=False)
prev = [' '.join(o) for o in pred_sys_stream]

# choose one (gold or pred) and postprocess
sys_stream = pred_sys_stream
sys_stream = [
    pp.post_process(o, abstract[i], graph[i]) for i, o in enumerate(sys_stream)
]

bleu = sacrebleu.corpus_bleu(sys_stream, ref_streams,
                             force=True, lowercase=True, tokenize='none').score
chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)
all_sent_chrf = [
    sacrebleu.sentence_chrf(x, y) for x, y in zip(sys_stream, ref_stream)
]
avg_sent_chrf = sum(all_sent_chrf) / len(all_sent_chrf)

if args.output:
    with open(args.pred_file + '.final', 'w') as fo:
        for x in sys_stream:
            fo.write(x + '\n')
    with open(args.pred_file + '.ref', 'w') as fo:
        for x in ref_stream:
            fo.write(x + '\n')

print(avg_sent_chrf)
print(bleu, chrf)
def score(self, src: List[str], cand: List[str], ref: List[str]) -> chrFResult:
    chrf = sacrebleu.corpus_chrf(cand, [ref])
    return chrFResult(chrf.score / 100, [], src, cand, ref, self.name)
def score_individual_books(
    book_dict: dict,
    src_iso: str,
    predictions_detok_path: str,
    scorers: Set[str],
    config: Config,
    ref_projects: Set[str],
):
    overall_sys: List[str] = []
    book_scores: List[PairScore] = []
    for book in book_dict.keys():
        for trg_iso, book_tuple in book_dict[book].items():
            pair_sys = book_tuple[0]
            pair_refs = book_tuple[1]
            overall_sys.extend(pair_sys)

            bleu_score = None
            if "bleu" in scorers:
                bleu_score = sacrebleu.corpus_bleu(
                    pair_sys,
                    pair_refs,
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            if "sentencebleu" in scorers:
                write_sentence_bleu(
                    predictions_detok_path,
                    pair_sys,
                    pair_refs,
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            other_scores: Dict[str, float] = {}
            if "chrf3" in scorers:
                chrf3_score = sacrebleu.corpus_chrf(pair_sys, pair_refs,
                                                    order=6, beta=3,
                                                    remove_whitespace=True)
                other_scores["CHRF3"] = np.round(float(chrf3_score.score * 100), 2)

            if "meteor" in scorers:
                meteor_score = compute_meteor_score(trg_iso, pair_sys, pair_refs)
                if meteor_score is not None:
                    other_scores["METEOR"] = meteor_score

            if "wer" in scorers:
                wer_score = compute_wer_score(pair_sys, cast(List[str], pair_refs))
                if wer_score >= 0:
                    other_scores["WER"] = wer_score

            if "ter" in scorers:
                ter_score = compute_ter_score(pair_sys, pair_refs)
                if ter_score >= 0:
                    other_scores["TER"] = ter_score

            score = PairScore(book, src_iso, trg_iso, bleu_score, len(pair_sys),
                              ref_projects, other_scores)
            book_scores.append(score)
    return book_scores
def test_checkpoint(
    config: Config,
    force_infer: bool,
    by_book: bool,
    ref_projects: Set[str],
    checkpoint_path: Path,
    step: int,
    scorers: Set[str],
    books: Set[int],
) -> List[PairScore]:
    config.set_seed()

    vref_paths: List[str] = []
    features_file_names: List[str] = []
    predictions_file_names: List[str] = []
    refs_patterns: List[str] = []
    predictions_detok_file_names: List[str] = []

    suffix_str = "_".join(map(lambda n: book_number_to_id(n), sorted(books)))
    if len(suffix_str) > 0:
        suffix_str += "-"
    suffix_str += "avg" if step == -1 else str(step)

    features_file_name = "test.src.txt"
    if (config.exp_dir / features_file_name).is_file():
        # all test data is stored in a single file
        vref_paths.append("test.vref.txt")
        features_file_names.append(features_file_name)
        predictions_file_names.append(f"test.trg-predictions.txt.{suffix_str}")
        refs_patterns.append("test.trg.detok*.txt")
        predictions_detok_file_names.append(f"test.trg-predictions.detok.txt.{suffix_str}")
    else:
        # test data is split into separate files
        for src_iso in sorted(config.src_isos):
            for trg_iso in sorted(config.trg_isos):
                if src_iso == trg_iso:
                    continue
                prefix = f"test.{src_iso}.{trg_iso}"
                features_file_name = f"{prefix}.src.txt"
                if (config.exp_dir / features_file_name).is_file():
                    vref_paths.append(f"{prefix}.vref.txt")
                    features_file_names.append(features_file_name)
                    predictions_file_names.append(f"{prefix}.trg-predictions.txt.{suffix_str}")
                    refs_patterns.append(f"{prefix}.trg.detok*.txt")
                    predictions_detok_file_names.append(
                        f"{prefix}.trg-predictions.detok.txt.{suffix_str}")

    checkpoint_name = "averaged checkpoint" if step == -1 else f"checkpoint {step}"

    features_paths: List[Union[str, List[str]]] = []
    predictions_paths: List[str] = []
    for i in range(len(predictions_file_names)):
        predictions_path = config.exp_dir / predictions_file_names[i]
        if force_infer or not predictions_path.is_file():
            features_path = config.exp_dir / features_file_names[i]
            vref_path = config.exp_dir / vref_paths[i]
            if vref_path.is_file():
                features_paths.append([str(features_path), str(vref_path)])
            else:
                features_paths.append(str(features_path))
            predictions_paths.append(str(predictions_path))

    if len(predictions_paths) > 0:
        runner = create_runner(config)
        print(f"Inferencing {checkpoint_name}...")
        runner.infer_multiple(features_paths, predictions_paths,
                              checkpoint_path=str(checkpoint_path))

    print(f"Scoring {checkpoint_name}...")
    default_src_iso = config.default_src_iso
    scores: List[PairScore] = []
    overall_sys: List[str] = []
    overall_refs: List[List[str]] = []
    for vref_file_name, features_file_name, predictions_file_name, refs_pattern, predictions_detok_file_name in zip(
            vref_paths, features_file_names, predictions_file_names, refs_patterns,
            predictions_detok_file_names):
        src_iso = default_src_iso
        if features_file_name != "test.src.txt":
            src_iso = features_file_name.split(".")[1]
        dataset, book_dict = load_test_data(
            vref_file_name,
            features_file_name,
            predictions_file_name,
            refs_pattern,
            predictions_detok_file_name,
            ref_projects,
            config,
            books,
            by_book,
        )

        for trg_iso, (pair_sys, pair_refs) in dataset.items():
            start_index = len(overall_sys)
            overall_sys.extend(pair_sys)
            for i, ref in enumerate(pair_refs):
                if i == len(overall_refs):
                    overall_refs.append([""] * start_index)
                overall_refs[i].extend(ref)
            # ensure that all refs are the same length as the sys
            for overall_ref in filter(lambda r: len(r) < len(overall_sys), overall_refs):
                overall_ref.extend([""] * (len(overall_sys) - len(overall_ref)))

            bleu_score = None
            if "bleu" in scorers:
                bleu_score = sacrebleu.corpus_bleu(
                    pair_sys,
                    cast(List[Iterable[str]], pair_refs),
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            if "sentencebleu" in scorers:
                write_sentence_bleu(
                    predictions_detok_file_name,
                    pair_sys,
                    cast(List[List[str]], pair_refs),
                    lowercase=True,
                    tokenize=config.data.get("sacrebleu_tokenize", "13a"),
                )

            other_scores: Dict[str, float] = {}
            if "chrf3" in scorers:
                chrf3_score = sacrebleu.corpus_chrf(
                    pair_sys, cast(List[Iterable[str]], pair_refs),
                    order=6, beta=3, remove_whitespace=True)
                other_scores["CHRF3"] = np.round(float(chrf3_score.score * 100), 2)

            if "meteor" in scorers:
                meteor_score = compute_meteor_score(
                    trg_iso, pair_sys, cast(List[Iterable[str]], pair_refs))
                if meteor_score is not None:
                    other_scores["METEOR"] = meteor_score

            if "wer" in scorers:
                wer_score = compute_wer_score(pair_sys, cast(List[str], pair_refs))
                if wer_score >= 0:
                    other_scores["WER"] = wer_score

            if "ter" in scorers:
                ter_score = compute_ter_score(
                    pair_sys, cast(List[Iterable[str]], pair_refs))
                if ter_score >= 0:
                    other_scores["TER"] = ter_score

            scores.append(
                PairScore("ALL", src_iso, trg_iso, bleu_score, len(pair_sys),
                          ref_projects, other_scores))

        if by_book is True:
            if len(book_dict) != 0:
                book_scores = score_individual_books(
                    book_dict, src_iso, predictions_detok_file_name, scorers,
                    config, ref_projects)
                scores.extend(book_scores)
            else:
                print("Error: book_dict did not load correctly. Not scoring individual books.")

    if len(config.src_isos) > 1 or len(config.trg_isos) > 1:
        bleu = sacrebleu.corpus_bleu(overall_sys,
                                     cast(List[Iterable[str]], overall_refs),
                                     lowercase=True)
        scores.append(PairScore("ALL", "ALL", "ALL", bleu, len(overall_sys), ref_projects))

    scores_file_root = f"scores-{suffix_str}"
    if len(ref_projects) > 0:
        ref_projects_suffix = "_".join(sorted(ref_projects))
        scores_file_root += f"-{ref_projects_suffix}"
    with (config.exp_dir / f"{scores_file_root}.csv").open("w", encoding="utf-8") as scores_file:
        if scores is not None:
            scores[0].writeHeader(scores_file)
        for results in scores:
            results.write(scores_file)
    return scores