def get_all_scores(
    orig_sents: List[str],
    sys_sents: List[str],
    refs_sents: List[List[str]],
    lowercase: bool = False,
    tokenizer: str = '13a',
    metrics: List[str] = DEFAULT_METRICS,
):
    scores = OrderedDict()
    if 'bleu' in metrics:
        scores['BLEU'] = corpus_bleu(sys_sents, refs_sents, force=True, tokenize=tokenizer,
                                     lowercase=lowercase).score
    if 'sari' in metrics:
        scores['SARI'] = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if 'samsa' in metrics:
        from easse.samsa import corpus_samsa
        scores['SAMSA'] = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
    if 'fkgl' in metrics:
        scores['FKGL'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)
    quality_estimation_scores = corpus_quality_estimation(
        orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase
    )
    scores = add_dicts(
        scores,
        quality_estimation_scores,
    )
    return {key: round(value, 2) for key, value in scores.items()}
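# A minimal usage sketch for get_all_scores (not part of the original module).
# The file paths are hypothetical; read_lines is the line-reading helper used
# elsewhere in EASSE and is assumed to be importable from easse.utils.helpers.
def _example_get_all_scores():
    from easse.utils.helpers import read_lines

    orig_sents = read_lines('data/orig.txt')  # hypothetical path, one sentence per line
    sys_sents = read_lines('data/sys.txt')  # hypothetical path, aligned with orig.txt
    refs_sents = [read_lines('data/refs.0.txt'), read_lines('data/refs.1.txt')]  # hypothetical reference files
    # Returns an OrderedDict of rounded scores keyed 'BLEU', 'SARI', 'FKGL', etc.,
    # with the quality estimation features merged in by add_dicts.
    return get_all_scores(orig_sents, sys_sents, refs_sents, metrics=['bleu', 'sari', 'fkgl'])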
def test_corpus_samsa():
    orig_sents = get_orig_sents('qats_test')
    refs_sents = get_refs_sents('qats_test')
    samsa_score = corpus_samsa(orig_sents, refs_sents[0], lowercase=False, tokenizer='moses')
    assert samsa_score == pytest.approx(36.94996509406232)
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer='13a',
    metrics=','.join(DEFAULT_METRICS),
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.
    """
    # get the metrics that need to be computed
    metrics = metrics.split(',')
    orig_sents, sys_sents, refs_sents = get_sents(test_set, orig_sents_path, sys_sents_path, refs_sents_paths)
    lowercase = is_test_set_lowercase(test_set)
    # compute each metric
    if 'bleu' in metrics:
        bleu_score = sacrebleu.corpus_bleu(sys_sents, refs_sents, force=True, tokenize=tokenizer,
                                           lowercase=lowercase).score
        click.echo(f'BLEU: {bleu_score:.2f}')
    if 'sari' in metrics:
        sari_score = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        click.echo(f'SARI: {sari_score:.2f}')
    if 'samsa' in metrics:
        samsa_score = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
        click.echo(f'SAMSA: {samsa_score:.2f}')
    if 'fkgl' in metrics:
        fkgl_score = corpus_fkgl(sys_sents, tokenizer=tokenizer)
        click.echo(f'FKGL: {fkgl_score:.2f}')
    if analysis:
        word_level_analysis = corpus_analyse_operations(orig_sents, sys_sents, refs_sents,
                                                        verbose=False, as_str=True)
        click.echo(f'Word-level Analysis: {word_level_analysis}')
    if quality_estimation:
        quality_estimation_scores = corpus_quality_estimation(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase
        )
        quality_estimation_scores = {k: round(v, 2) for k, v in quality_estimation_scores.items()}
        click.echo(f'Quality estimation: {quality_estimation_scores}')
def test_samsa_score_sentence():
    # orig_sentence = "You are waiting for a train , this train will take you far away ."
    # sys_output = "You are waiting for a train . A train that will take you far away ."
    # samsa_score = samsa.corpus_samsa([orig_sentence], [sys_output], lowercase=True)
    # assert samsa_score == pytest.approx(1.0)

    # orig_sentence = ("The river is indeed an ever-present part of the city's decor , and the official entrance "
    #                  "to Lisbon is a broad marble stair mounting from the water to the vast , "
    #                  "arcaded Commerce Square ( Praca do Comercio) .")
    # sys_output = ("The river is indeed an ever-present part of the city's decor , and the entrance to Lisbon "
    #               "is a broad marble stair mounting from the water to the covering , arcaded Commerce "
    #               "Square ( Praca do Comercio) .")
    # samsa_score = samsa.corpus_samsa([orig_sentence], [sys_output], lowercase=True)
    # assert samsa_score == pytest.approx(0.25)

    # orig_sentence = ("The second largest city of Russia and one of the world's major cities , "
    #                  "St . Petersburg has played a vital role in Russian history .")
    # sys_output = ("The second largest city of Russia and one of the world's major cities , "
    #               "St Petersburg , and has played a vital role in Russian history .")
    # samsa_score = samsa.corpus_samsa([orig_sentence], [sys_output], lowercase=True)
    # assert samsa_score == pytest.approx(0.833333333)

    # orig_sentence = ("The incident followed the killing in August of five Egyptian security guards by "
    #                  "Israeli soldiers pursuing militants who had ambushed and killed eight Israelis "
    #                  "along the Israeli-Egyptian border.")
    # sys_output = ("The incident followed the killing in August by Israeli soldiers. "
    #               "Israeli soldiers pursued militants. "
    #               "Militants had ambushed and killed eight Israelis along the Israeli-Egyptian border.")
    # samsa_score = samsa.corpus_samsa([orig_sentence], [sys_output], lowercase=True)
    # assert samsa_score == pytest.approx(0.71875)

    # orig_sentence = ("The injured man was able to drive his car to Cloverhill Prison where he got help. "
    #                  "He is being treated at Tallaght Hospital but his injuries are not thought "
    #                  "to be life-threatening.")
    # sys_output = "The injured man drive his car to Cloverhill Prison he got help."
    # samsa_score = samsa.corpus_samsa([orig_sentence], [sys_output])
    # assert samsa_score == pytest.approx(0.2222222222222222)

    orig_sentence = (
        "for example , king bhumibol was born on monday , "
        "so on his birthday throughout thailand will be decorated with yellow color ."
    )
    sys_output = (
        "for example , king bhumibol was born on monday , "
        "so on his birthday throughout thailand will be decorated with yellow color ."
    )
    samsa_score = samsa.corpus_samsa([orig_sentence], [sys_output], lowercase=False)
    print(samsa_score)
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer='13a',
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    '''
    Evaluate a system output with automatic metrics.
    '''
    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(test_set, orig_sents_path, refs_sents_paths)
    # compute each metric
    metrics_scores = {}
    if 'bleu' in metrics:
        metrics_scores['bleu'] = corpus_bleu(sys_sents, refs_sents, force=True, tokenizer=tokenizer,
                                             lowercase=lowercase)
    if 'sent_bleu' in metrics:
        metrics_scores['sent_bleu'] = corpus_averaged_sentence_bleu(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if 'sari' in metrics:
        metrics_scores['sari'] = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer,
                                             lowercase=lowercase)
    if 'sari_legacy' in metrics:
        metrics_scores['sari_legacy'] = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer,
                                                    lowercase=lowercase, legacy=True)
    if 'samsa' in metrics:
        from easse.samsa import corpus_samsa
        metrics_scores['samsa'] = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase,
                                               verbose=True)
    if 'fkgl' in metrics:
        metrics_scores['fkgl'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)
    if 'f1_token' in metrics:
        metrics_scores['f1_token'] = corpus_f1_token(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if analysis:
        metrics_scores['word_level_analysis'] = corpus_analyse_operations(
            orig_sents, sys_sents, refs_sents, verbose=False, as_str=True)
    if quality_estimation:
        metrics_scores['quality_estimation'] = corpus_quality_estimation(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)
    return metrics_scores
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer="13a",
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.
    """
    for metric in metrics:
        assert metric in VALID_METRICS, f'"{metric}" not a valid metric. Valid metrics: {VALID_METRICS}'
    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(test_set, orig_sents_path, refs_sents_paths)
    # compute each metric
    metrics_scores = {}
    if "bleu" in metrics:
        metrics_scores["bleu"] = corpus_bleu(
            sys_sents,
            refs_sents,
            force=True,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )
    if "sent_bleu" in metrics:
        metrics_scores["sent_bleu"] = corpus_averaged_sentence_bleu(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if "sari" in metrics:
        metrics_scores["sari"] = corpus_sari(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )
    if "sari_legacy" in metrics:
        metrics_scores["sari_legacy"] = corpus_sari(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            legacy=True,
        )
    if "sari_by_operation" in metrics:
        (
            metrics_scores["sari_add"],
            metrics_scores["sari_keep"],
            metrics_scores["sari_del"],
        ) = get_corpus_sari_operation_scores(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )
    if "samsa" in metrics:
        from easse.samsa import corpus_samsa  # Inline import to use EASSE without installing all dependencies

        metrics_scores["samsa"] = corpus_samsa(
            orig_sents,
            sys_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            verbose=True,
        )
    if "fkgl" in metrics:
        metrics_scores["fkgl"] = corpus_fkgl(sys_sents, tokenizer=tokenizer)
    if "f1_token" in metrics:
        metrics_scores["f1_token"] = corpus_f1_token(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if "bertscore" in metrics:
        from easse.bertscore import corpus_bertscore  # Inline import to use EASSE without installing all dependencies

        (
            metrics_scores["bertscore_precision"],
            metrics_scores["bertscore_recall"],
            metrics_scores["bertscore_f1"],
        ) = corpus_bertscore(sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if analysis:
        # Inline import to use EASSE without installing all dependencies
        from easse.annotation.word_level import WordOperationAnnotator

        word_operation_annotator = WordOperationAnnotator(tokenizer=tokenizer, lowercase=lowercase, verbose=True)
        metrics_scores["word_level_analysis"] = word_operation_annotator.analyse_operations(
            orig_sents, sys_sents, refs_sents, as_str=True)
    if quality_estimation:
        metrics_scores["quality_estimation"] = corpus_quality_estimation(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)
    return metrics_scores
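# A minimal usage sketch for evaluate_system_output (not part of the original
# module). The test set identifier 'turkcorpus_test' and the output file path
# are assumptions for illustration; the output file is one sentence per line,
# aligned with the test set.
def _example_evaluate_system_output():
    metrics_scores = evaluate_system_output(
        'turkcorpus_test',
        sys_sents_path='system_output.txt',  # hypothetical path
        metrics=['bleu', 'sari', 'fkgl'],
    )
    # metrics_scores maps metric names to floats, e.g. {'bleu': ..., 'sari': ..., 'fkgl': ...}
    return metrics_scores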
def evaluate_system_output(
    test_set,
    input_path=None,
    tokenizer='13a',
    metrics=','.join(VALID_METRICS),
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.
    """
    if input_path is not None:
        sys_output = read_lines(input_path)
    else:
        # read the system output
        with click.get_text_stream('stdin', encoding='utf-8') as system_output_file:
            sys_output = system_output_file.read().splitlines()
    # get the metrics that need to be computed
    metrics = metrics.split(',')
    load_orig_sents = ('sari' in metrics) or ('samsa' in metrics) or analysis or quality_estimation
    load_refs_sents = ('sari' in metrics) or ('bleu' in metrics) or analysis
    # get the references from the test set
    if test_set in ['turk', 'turk_valid']:
        lowercase = False
        phase = 'test' if test_set == 'turk' else 'valid'
        if load_orig_sents:
            orig_sents = get_turk_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_turk_refs_sents(phase=phase)
    if test_set in ['pwkp', 'pwkp_valid']:
        lowercase = True
        phase = 'test' if test_set == 'pwkp' else 'valid'
        if load_orig_sents:
            orig_sents = get_pwkp_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_pwkp_refs_sents(phase=phase)
    if test_set == 'hsplit':
        sys_output = sys_output[:70]
        lowercase = True
        if load_orig_sents:
            orig_sents = get_hsplit_orig_sents()
        if load_refs_sents:
            refs_sents = get_hsplit_refs_sents()
    if load_orig_sents:
        assert len(sys_output) == len(orig_sents)
    if load_refs_sents:
        assert len(sys_output) == len(refs_sents[0])
    # compute each metric
    if 'bleu' in metrics:
        bleu_score = sacrebleu.corpus_bleu(sys_output, refs_sents, force=True, tokenize=tokenizer,
                                           lowercase=lowercase).score
        click.echo(f'BLEU: {bleu_score:.2f}')
    if 'sari' in metrics:
        sari_score = corpus_sari(orig_sents, sys_output, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        click.echo(f'SARI: {sari_score:.2f}')
    if 'samsa' in metrics:
        samsa_score = corpus_samsa(orig_sents, sys_output, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
        click.echo(f'SAMSA: {samsa_score:.2f}')
    if 'fkgl' in metrics:
        fkgl_score = corpus_fkgl(sys_output, tokenizer=tokenizer)
        click.echo(f'FKGL: {fkgl_score:.2f}')
    if analysis:
        word_level_analysis = corpus_analyse_operations(orig_sents, sys_output, refs_sents,
                                                        verbose=False, as_str=True)
        click.echo(f'Word-level Analysis: {word_level_analysis}')
    if quality_estimation:
        quality_estimation_scores = corpus_quality_estimation(
            orig_sents, sys_output, tokenizer=tokenizer, lowercase=lowercase
        )
        quality_estimation_scores = {k: round(v, 2) for k, v in quality_estimation_scores.items()}
        click.echo(f'Quality estimation: {quality_estimation_scores}')
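# A sketch of the stdin path above (not part of the original module). This
# variant reads the system output from standard input when no input_path is
# given, so it can be piped to from a shell. The exact CLI command and flag
# names are assumptions, not confirmed by this source:
#
#     cat system_output.txt | easse evaluate --test_set turk --metrics bleu,sari
#
# Calling the function directly with input_path avoids the stdin path entirely:
#
#     evaluate_system_output('turk', input_path='system_output.txt', metrics='bleu,sari')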