Пример #1
0
def get_all_scores(
        orig_sents: List[str], sys_sents: List[str], refs_sents: List[List[str]],
        lowercase: bool = False, tokenizer: str = '13a', metrics: List[str] = DEFAULT_METRICS,
        ):
    """Compute every requested metric plus quality-estimation scores.

    Returns a mapping from metric name to score, each rounded to two
    decimal places.
    """
    scores = OrderedDict()
    if 'bleu' in metrics:
        bleu = corpus_bleu(sys_sents, refs_sents, force=True, tokenize=tokenizer, lowercase=lowercase)
        scores['BLEU'] = bleu.score
    if 'sari' in metrics:
        scores['SARI'] = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if 'samsa' in metrics:
        # Imported lazily so SAMSA's heavy optional dependencies are not required otherwise.
        from easse.samsa import corpus_samsa
        scores['SAMSA'] = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
    if 'fkgl' in metrics:
        scores['FKGL'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)
    # Quality-estimation scores are always merged in on top of the metric scores.
    scores = add_dicts(
        scores,
        corpus_quality_estimation(orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase),
    )
    return {name: round(score, 2) for name, score in scores.items()}
Пример #2
0
def test_corpus_samsa():
    """SAMSA of the first QATS-test reference against its sources matches the pinned value."""
    sources = get_orig_sents('qats_test')
    references = get_refs_sents('qats_test')
    score = corpus_samsa(sources, references[0], lowercase=False, tokenizer='moses')
    assert score == pytest.approx(36.94996509406232)
Пример #3
0
def evaluate_system_output(
        test_set,
        sys_sents_path=None,
        orig_sents_path=None,
        refs_sents_paths=None,
        tokenizer='13a',
        metrics=','.join(DEFAULT_METRICS),
        analysis=False,
        quality_estimation=False,
        ):
    """
    Evaluate a system output with automatic metrics.

    `metrics` is a comma-separated string of metric names; each requested
    score is printed to the console via click.
    """
    # get the metrics that need to be computed
    requested = metrics.split(',')
    orig_sents, sys_sents, refs_sents = get_sents(test_set, orig_sents_path, sys_sents_path, refs_sents_paths)
    # Whether to lowercase is a property of the chosen test set.
    lowercase = is_test_set_lowercase(test_set)

    # compute each metric
    if 'bleu' in requested:
        bleu = sacrebleu.corpus_bleu(sys_sents, refs_sents, force=True,
                                     tokenize=tokenizer, lowercase=lowercase)
        click.echo(f'BLEU: {bleu.score:.2f}')

    if 'sari' in requested:
        sari = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        click.echo(f'SARI: {sari:.2f}')

    if 'samsa' in requested:
        samsa = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
        click.echo(f'SAMSA: {samsa:.2f}')

    if 'fkgl' in requested:
        fkgl = corpus_fkgl(sys_sents, tokenizer=tokenizer)
        click.echo(f'FKGL: {fkgl:.2f}')

    if analysis:
        operations = corpus_analyse_operations(orig_sents, sys_sents, refs_sents,
                                               verbose=False, as_str=True)
        click.echo(f'Word-level Analysis: {operations}')

    if quality_estimation:
        qe_scores = corpus_quality_estimation(orig_sents, sys_sents,
                                              tokenizer=tokenizer, lowercase=lowercase)
        qe_scores = {k: round(v, 2) for k, v in qe_scores.items()}
        click.echo(f'Quality estimation: {qe_scores}')
Пример #4
0
def test_samsa_score_sentence():
    """Smoke-test SAMSA on a single, identical source/output sentence pair.

    NOTE(review): this test only prints the score instead of asserting it;
    pin the expected value with pytest.approx once a reference score is
    confirmed. Several earlier commented-out cases (dead code) were removed.
    """
    orig_sentence = (
        "for example , king bhumibol was born on monday , "
        "so on his birthday throughout thailand will be decorated with yellow color ."
    )
    sys_output = (
        "for example , king bhumibol was born on monday , "
        "so on his birthday throughout thailand will be decorated with yellow color ."
    )
    samsa_score = samsa.corpus_samsa([orig_sentence], [sys_output],
                                     lowercase=False)
    print(samsa_score)
Пример #5
0
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer='13a',
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    '''
    Evaluate a system output with automatic metrics.

    Returns a dict mapping each requested metric name to its score.
    '''
    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(
        test_set, orig_sents_path, refs_sents_paths)

    # compute each metric
    results = {}
    if 'bleu' in metrics:
        results['bleu'] = corpus_bleu(
            sys_sents, refs_sents, force=True, tokenizer=tokenizer, lowercase=lowercase)

    if 'sent_bleu' in metrics:
        results['sent_bleu'] = corpus_averaged_sentence_bleu(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if 'sari' in metrics:
        results['sari'] = corpus_sari(
            orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if 'sari_legacy' in metrics:
        results['sari_legacy'] = corpus_sari(
            orig_sents, sys_sents, refs_sents,
            tokenizer=tokenizer, lowercase=lowercase, legacy=True)

    if 'samsa' in metrics:
        # Lazy import: SAMSA pulls in heavy optional dependencies.
        from easse.samsa import corpus_samsa
        results['samsa'] = corpus_samsa(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase, verbose=True)

    if 'fkgl' in metrics:
        results['fkgl'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)

    if 'f1_token' in metrics:
        results['f1_token'] = corpus_f1_token(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if analysis:
        results['word_level_analysis'] = corpus_analyse_operations(
            orig_sents, sys_sents, refs_sents, verbose=False, as_str=True)

    if quality_estimation:
        results['quality_estimation'] = corpus_quality_estimation(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)

    return results
Пример #6
0
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer="13a",
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.

    Returns a dict mapping each requested metric name to its score.
    """
    # Reject unknown metric names before doing any work.
    for metric in metrics:
        assert metric in VALID_METRICS, f'"{metric}" not a valid metric. Valid metrics: {VALID_METRICS}'
    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(
        test_set, orig_sents_path, refs_sents_paths)

    # compute each metric
    scores = {}
    if "bleu" in metrics:
        scores["bleu"] = corpus_bleu(
            sys_sents, refs_sents, force=True, tokenizer=tokenizer, lowercase=lowercase)

    if "sent_bleu" in metrics:
        scores["sent_bleu"] = corpus_averaged_sentence_bleu(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if "sari" in metrics:
        scores["sari"] = corpus_sari(
            orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if "sari_legacy" in metrics:
        scores["sari_legacy"] = corpus_sari(
            orig_sents, sys_sents, refs_sents,
            tokenizer=tokenizer, lowercase=lowercase, legacy=True)

    if "sari_by_operation" in metrics:
        add_score, keep_score, del_score = get_corpus_sari_operation_scores(
            orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        scores["sari_add"] = add_score
        scores["sari_keep"] = keep_score
        scores["sari_del"] = del_score

    if "samsa" in metrics:
        # Inline import to use EASSE without installing all dependencies
        from easse.samsa import corpus_samsa
        scores["samsa"] = corpus_samsa(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase, verbose=True)

    if "fkgl" in metrics:
        scores["fkgl"] = corpus_fkgl(sys_sents, tokenizer=tokenizer)

    if "f1_token" in metrics:
        scores["f1_token"] = corpus_f1_token(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if "bertscore" in metrics:
        # Inline import to use EASSE without installing all dependencies
        from easse.bertscore import corpus_bertscore
        precision, recall, f1 = corpus_bertscore(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        scores["bertscore_precision"] = precision
        scores["bertscore_recall"] = recall
        scores["bertscore_f1"] = f1

    if analysis:
        # Inline import to use EASSE without installing all dependencies
        from easse.annotation.word_level import WordOperationAnnotator
        annotator = WordOperationAnnotator(
            tokenizer=tokenizer, lowercase=lowercase, verbose=True)
        scores["word_level_analysis"] = annotator.analyse_operations(
            orig_sents, sys_sents, refs_sents, as_str=True)

    if quality_estimation:
        scores["quality_estimation"] = corpus_quality_estimation(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)

    return scores
Пример #7
0
def evaluate_system_output(
        test_set,
        input_path=None,
        tokenizer='13a',
        metrics=','.join(VALID_METRICS),
        analysis=False,
        quality_estimation=False,
        ):
    """
    Evaluate a system output with automatic metrics.

    The system output is read from `input_path` when given, otherwise from
    stdin. `metrics` is a comma-separated string of metric names; each
    requested score is printed via click.

    Raises ValueError for an unknown `test_set` (previously an unknown value
    fell through every branch and surfaced later as a confusing NameError on
    `lowercase` / `orig_sents` / `refs_sents`).
    """
    # Fail fast on an unknown test set instead of crashing later.
    valid_test_sets = ('turk', 'turk_valid', 'pwkp', 'pwkp_valid', 'hsplit')
    if test_set not in valid_test_sets:
        raise ValueError(f"'{test_set}' is not a valid test set. Valid test sets: {valid_test_sets}")

    if input_path is not None:
        sys_output = read_lines(input_path)
    else:
        # read the system output
        with click.get_text_stream('stdin', encoding='utf-8') as system_output_file:
            sys_output = system_output_file.read().splitlines()

    # get the metrics that need to be computed
    metrics = metrics.split(',')

    # Original/reference sentences are only loaded when some metric needs them.
    load_orig_sents = ('sari' in metrics) or ('samsa' in metrics) or analysis or quality_estimation
    load_refs_sents = ('sari' in metrics) or ('bleu' in metrics) or analysis

    # get the references from the test set (the branches are mutually exclusive)
    if test_set in ['turk', 'turk_valid']:
        lowercase = False
        phase = 'test' if test_set == 'turk' else 'valid'
        if load_orig_sents:
            orig_sents = get_turk_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_turk_refs_sents(phase=phase)
    elif test_set in ['pwkp', 'pwkp_valid']:
        lowercase = True
        phase = 'test' if test_set == 'pwkp' else 'valid'
        if load_orig_sents:
            orig_sents = get_pwkp_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_pwkp_refs_sents(phase=phase)
    else:  # test_set == 'hsplit'
        # NOTE(review): only the first 70 sentences are evaluated for HSplit —
        # presumably the size of the HSplit test set; confirm upstream.
        sys_output = sys_output[:70]
        lowercase = True
        if load_orig_sents:
            orig_sents = get_hsplit_orig_sents()
        if load_refs_sents:
            refs_sents = get_hsplit_refs_sents()

    # The system output must align 1:1 with the loaded sentences.
    if load_orig_sents:
        assert len(sys_output) == len(orig_sents)
    if load_refs_sents:
        assert len(sys_output) == len(refs_sents[0])

    # compute each metric
    if 'bleu' in metrics:
        bleu_score = sacrebleu.corpus_bleu(sys_output, refs_sents,
                                           force=True, tokenize=tokenizer, lowercase=lowercase).score
        click.echo(f'BLEU: {bleu_score:.2f}')

    if 'sari' in metrics:
        sari_score = corpus_sari(orig_sents, sys_output, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        click.echo(f'SARI: {sari_score:.2f}')

    if 'samsa' in metrics:
        samsa_score = corpus_samsa(orig_sents, sys_output, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
        click.echo(f'SAMSA: {samsa_score:.2f}')

    if 'fkgl' in metrics:
        fkgl_score = corpus_fkgl(sys_output, tokenizer=tokenizer)
        click.echo(f'FKGL: {fkgl_score:.2f}')

    if analysis:
        word_level_analysis = corpus_analyse_operations(orig_sents, sys_output, refs_sents,
                                                        verbose=False, as_str=True)
        click.echo(f'Word-level Analysis: {word_level_analysis}')

    if quality_estimation:
        quality_estimation_scores = corpus_quality_estimation(
                orig_sents,
                sys_output,
                tokenizer=tokenizer,
                lowercase=lowercase
                )
        quality_estimation_scores = {k: round(v, 2) for k, v in quality_estimation_scores.items()}
        click.echo(f'Quality estimation: {quality_estimation_scores}')