Пример #1
0
def multiple_systems_report(
    test_set,
    sys_sents_paths,
    orig_sents_path=None,
    refs_sents_paths=None,
    report_path='easse_report.html',
    tokenizer='13a',
    lowercase=True,
    metrics=DEFAULT_METRICS,
    system_names=None,
):
    '''
    Write an HTML report that compares the outputs of several systems.

    Every path in `sys_sents_paths` is read with `read_lines`; the original and
    reference sentences are obtained from `get_orig_and_refs_sents`. When
    `system_names` is None, each system is labelled with the file name of its
    output path. All remaining options are forwarded unchanged to
    `write_multiple_systems_html_report`.
    '''
    # One list of sentences per system, in the order the paths were given.
    sys_sents_list = []
    for sys_sents_path in sys_sents_paths:
        sys_sents_list.append(read_lines(sys_sents_path))

    orig_sents, refs_sents = get_orig_and_refs_sents(
        test_set, orig_sents_path, refs_sents_paths)

    # Fall back to the output file names when no explicit labels are provided.
    if system_names is None:
        system_names = [Path(path).name for path in sys_sents_paths]

    report_options = dict(
        system_names=system_names,
        test_set=test_set,
        lowercase=lowercase,
        tokenizer=tokenizer,
        metrics=metrics,
    )
    write_multiple_systems_html_report(report_path, orig_sents, sys_sents_list,
                                       refs_sents, **report_options)
Пример #2
0
def test_corpus_sari():
    """Check corpus-level SARI of the ACCESS output on 'turkcorpus_test'."""
    test_set = 'turkcorpus_test'
    orig_sents = get_orig_sents(test_set)
    refs_sents = get_refs_sents(test_set)
    access_output_path = get_system_outputs_dir(test_set) / "ACCESS"
    hyp_sents = read_lines(access_output_path)

    sari_score = sari.corpus_sari(orig_sents, hyp_sents, refs_sents)

    # Scores from MUSS https://arxiv.org/abs/2005.00352
    expected_score = 41.381013
    assert sari_score == pytest.approx(expected_score)
Пример #3
0
def test_corpus_sari_plain():
    """Check corpus SARI with ``tokenizer='plain'`` on the Turk test files.

    Each lowercased system output is scored against the same original
    sentences and eight reference files, and compared with a hard-coded
    expected value.
    """
    orig_sents = read_lines(DATA_DIR / "test_sets/turk/test.8turkers.tok.norm")
    ref_sents = [
        read_lines(DATA_DIR / f"test_sets/turk/test.8turkers.tok.turk.{n}")
        for n in range(8)
    ]

    # (system output file, expected SARI), checked in this order.
    expected_by_system = (
        ("system_outputs/turk/lower/Dress-Ls.tok.low", 36.73586275692667),
        ("system_outputs/turk/lower/Dress.tok.low", 36.5859900146575),
        ("system_outputs/turk/lower/EncDecA.tok.low", 34.73946658449856),
        ("system_outputs/turk/lower/Hybrid.tok.low", 31.008109926854227),
        ("system_outputs/turk/lower/PBMT-R.tok.low", 37.817966679481013),
        ("system_outputs/turk/lower/SBMT-SARI.tok.low", 39.360477024519125),
    )
    for output_file, expected_score in expected_by_system:
        hyp_sents = read_lines(DATA_DIR / output_file)
        sari_score = sari.corpus_sari(orig_sents,
                                      hyp_sents,
                                      ref_sents,
                                      tokenizer='plain')
        assert sari_score == pytest.approx(expected_score)
Пример #4
0
def get_sys_sents(test_set, sys_sents_path=None):
    """Return the system sentences to be evaluated.

    Reads `sys_sents_path` with `read_lines` when it is given; otherwise the
    sentences are taken from standard input (decoded as UTF-8), one per line.
    `test_set` is accepted but not used by this function.
    """
    if sys_sents_path is None:
        # No file given: the system output is expected on stdin.
        with click.get_text_stream('stdin',
                                   encoding='utf-8') as system_output_file:
            stdin_text = system_output_file.read()
            return stdin_text.splitlines()
    return read_lines(sys_sents_path)
Пример #5
0
def get_orig_and_refs_sents(test_set,
                            orig_sents_path=None,
                            refs_sents_paths=None):
    """Return the original sentences and the reference sentences of a test set.

    Args:
        test_set: ``'custom'`` to read the sentences from the given paths; any
            other value is resolved through `get_orig_sents` / `get_refs_sents`.
        orig_sents_path: path of the original sentences (required for
            ``'custom'``).
        refs_sents_paths: reference paths, either an iterable of paths or a
            single comma-separated string (required for ``'custom'``).

    Returns:
        A tuple ``(orig_sents, refs_sents)`` where ``refs_sents`` holds one list
        of sentences per reference file.

    Raises:
        AssertionError: if a required path is missing for ``'custom'``, or if a
            reference does not have as many lines as the original sentences.
    """
    if test_set == 'custom':
        assert orig_sents_path is not None, \
            "orig_sents_path is required when test_set='custom'"
        assert refs_sents_paths is not None, \
            "refs_sents_paths is required when test_set='custom'"
        # isinstance also recognises str subclasses, which an exact type
        # comparison would wrongly treat as an iterable of one-character paths.
        if isinstance(refs_sents_paths, str):
            refs_sents_paths = refs_sents_paths.split(',')
        orig_sents = read_lines(orig_sents_path)
        refs_sents = [
            read_lines(ref_sents_path) for ref_sents_path in refs_sents_paths
        ]
    else:
        orig_sents = get_orig_sents(test_set)
        refs_sents = get_refs_sents(test_set)
    # Final checks: every reference must be line-aligned with the originals.
    assert all(
        len(orig_sents) == len(ref_sents) for ref_sents in refs_sents
    ), f'Not same number of lines for test_set={test_set}, orig_sents_path={orig_sents_path}, refs_sents_paths={refs_sents_paths}'  # noqa: E501
    return orig_sents, refs_sents
Пример #6
0
def get_orig_and_refs_sents(test_set,
                            orig_sents_path=None,
                            refs_sents_paths=None):
    """Return the original sentences and the reference sentences of a test set.

    Args:
        test_set: ``"custom"`` to read the sentences from the given paths; any
            other value is resolved through `get_orig_sents` / `get_refs_sents`.
        orig_sents_path: path of the original sentences (required for
            ``"custom"``).
        refs_sents_paths: reference paths, either an iterable of paths or a
            single comma-separated string (required for ``"custom"``).

    Returns:
        A tuple ``(orig_sents, refs_sents)`` where ``refs_sents`` holds one list
        of sentences per reference file.

    Raises:
        AssertionError: if a required path is missing for ``"custom"``, or if a
            reference does not have as many lines as the original sentences.
    """
    if test_set == "custom":
        assert orig_sents_path is not None, \
            'orig_sents_path is required when test_set="custom"'
        assert refs_sents_paths is not None, \
            'refs_sents_paths is required when test_set="custom"'
        # isinstance also recognises str subclasses, which an exact type
        # comparison would wrongly treat as an iterable of one-character paths.
        if isinstance(refs_sents_paths, str):
            refs_sents_paths = refs_sents_paths.split(",")
        orig_sents = read_lines(orig_sents_path)
        refs_sents = [
            read_lines(ref_sents_path) for ref_sents_path in refs_sents_paths
        ]
    else:
        orig_sents = get_orig_sents(test_set)
        refs_sents = get_refs_sents(test_set)
    # Final checks: every reference must be line-aligned with the originals.
    assert all(
        len(orig_sents) == len(ref_sents) for ref_sents in refs_sents
    ), f'Not same number of lines for test_set={test_set}, orig_sents_path={orig_sents_path}, refs_sents_paths={refs_sents_paths}'  # noqa: E501
    return orig_sents, refs_sents
Пример #7
0
def get_sents(test_set, orig_sents_path=None, sys_sents_path=None, refs_sents_paths=None):
    """Load the original, system and reference sentences for an evaluation.

    Args:
        test_set: ``'custom'`` to use the given paths; any other value is looked
            up in `TEST_SETS_PATHS` and must not be combined with explicit
            original/reference paths.
        orig_sents_path: path of the original sentences (``'custom'`` only).
        sys_sents_path: path of the system output; when None the system output
            is read from standard input (decoded as UTF-8).
        refs_sents_paths: reference paths, either an iterable of paths or a
            single comma-separated string (``'custom'`` only).

    Returns:
        A tuple ``(orig_sents, sys_sents, refs_sents)`` where ``refs_sents``
        holds one list of sentences per reference file.

    Raises:
        AssertionError: if the path arguments do not match `test_set`, or if
            the original/reference files are not line-aligned with the system
            output.
    """
    if sys_sents_path is not None:
        sys_sents = read_lines(sys_sents_path)
    else:
        # read the system output
        with click.get_text_stream('stdin', encoding='utf-8') as system_output_file:
            sys_sents = system_output_file.read().splitlines()

    # isinstance also recognises str subclasses, which an exact type comparison
    # would wrongly treat as an iterable of one-character paths.
    if isinstance(refs_sents_paths, str):
        refs_sents_paths = refs_sents_paths.split(',')

    if test_set != 'custom':
        assert orig_sents_path is None, \
            f'orig_sents_path must not be given for test_set={test_set}'
        assert refs_sents_paths is None, \
            f'refs_sents_paths must not be given for test_set={test_set}'
        orig_sents_path = TEST_SETS_PATHS[(test_set, 'orig')]
        refs_sents_paths = TEST_SETS_PATHS[(test_set, 'refs')]
    assert orig_sents_path is not None, 'orig_sents_path is required'
    assert refs_sents_paths is not None, 'refs_sents_paths is required'
    orig_sents = read_lines(orig_sents_path)
    refs_sents = [read_lines(ref_sents_path) for ref_sents_path in refs_sents_paths]
    assert len(sys_sents) == len(orig_sents), \
        f'System output has {len(sys_sents)} lines but {orig_sents_path} has {len(orig_sents)}'
    assert all(len(sys_sents) == len(ref_sents) for ref_sents in refs_sents), \
        f'Not every reference in {refs_sents_paths} has {len(sys_sents)} lines like the system output'
    return orig_sents, sys_sents, refs_sents
Пример #8
0
def test_corpus_sari_tokenize():
    """Check corpus SARI with the default tokenizer on the Turk test files.

    Each lowercased system output is scored against the same original
    sentences and eight reference files, and compared with a hard-coded
    expected value.
    """
    orig_sents = read_lines(DATA_DIR / "test_sets/turk/test.8turkers.tok.norm")
    ref_sents = [
        read_lines(DATA_DIR / f"test_sets/turk/test.8turkers.tok.turk.{n}")
        for n in range(8)
    ]

    # (system output file, expected SARI), checked in this order.
    expected_by_system = (
        ("system_outputs/turk/lower/Dress-Ls.tok.low", 37.266058818588216),
        ("system_outputs/turk/lower/Dress.tok.low", 37.08210095744638),
        ("system_outputs/turk/lower/EncDecA.tok.low", 35.65754396121206),
        ("system_outputs/turk/lower/Hybrid.tok.low", 31.39665078989411),
        ("system_outputs/turk/lower/PBMT-R.tok.low", 38.558843050332037),
        ("system_outputs/turk/lower/SBMT-SARI.tok.low", 39.964857928109127),
    )
    for output_file, expected_score in expected_by_system:
        hyp_sents = read_lines(DATA_DIR / output_file)
        sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
        assert sari_score == pytest.approx(expected_score)
Пример #9
0
def test_corpus_sari_legacy():
    """Check corpus SARI with ``legacy=True`` on 'turkcorpus_test_legacy'.

    The system outputs are read from the ``tok.low`` files under the
    'turkcorpus_test' system outputs directory and each score is compared with
    a hard-coded expected value.
    """
    orig_sents = get_orig_sents('turkcorpus_test_legacy')
    refs_sents = get_refs_sents('turkcorpus_test_legacy')
    system_outputs_dir = get_system_outputs_dir('turkcorpus_test')

    # (system output file, expected SARI), checked in this order.
    expected_by_system = (
        ("tok.low/Dress-Ls.tok.low", 37.266058818588216),
        ("tok.low/Dress.tok.low", 37.08210095744638),
        ("tok.low/EncDecA.tok.low", 35.65754396121206),
        ("tok.low/Hybrid.tok.low", 31.39665078989411),
        ("tok.low/PBMT-R.tok.low", 38.558843050332037),
        ("tok.low/SBMT-SARI.tok.low", 39.964857928109127),
    )
    for output_file, expected_score in expected_by_system:
        hyp_sents = read_lines(system_outputs_dir / output_file)
        sari_score = sari.corpus_sari(orig_sents,
                                      hyp_sents,
                                      refs_sents,
                                      legacy=True)
        assert sari_score == pytest.approx(expected_score)
Пример #10
0
def report(test_set, input_path=None, report_path='report.html', tokenizer='13a', metrics=','.join(DEFAULT_METRICS)):
    """
    Create a HTML report file with automatic metrics, plots and samples.

    Args:
        test_set: one of 'turk', 'turk_valid' or 'hsplit'.
        input_path: path of the system output; when None the system output is
            read from standard input (decoded as UTF-8).
        report_path: forwarded to `write_html_report` as the report location.
        tokenizer: forwarded to `write_html_report`.
        metrics: forwarded to `write_html_report`.

    Raises:
        ValueError: if `test_set` is not one of the handled test sets.
    """
    # Reject unknown test sets up front: previously they fell through every
    # branch and crashed with an UnboundLocalError at the final call.
    if test_set not in ['turk', 'turk_valid', 'hsplit']:
        raise ValueError(
            f'Unsupported test_set {test_set!r}: expected one of turk, turk_valid, hsplit'
        )
    if input_path is not None:
        sys_output = read_lines(input_path)
    else:
        # read the system output
        with click.get_text_stream('stdin', encoding='utf-8') as system_output_file:
            sys_output = system_output_file.read().splitlines()
    if test_set == 'hsplit':
        # Only the first 70 system sentences are kept for hsplit
        # (presumably the size of the hsplit references - TODO confirm).
        sys_output = sys_output[:70]
        lowercase = True
        refs_sents = get_hsplit_refs_sents()
        orig_sents = get_hsplit_orig_sents()
    else:
        # 'turk' maps to the test phase, 'turk_valid' to the validation phase.
        lowercase = False
        phase = 'test' if test_set == 'turk' else 'valid'
        refs_sents = get_turk_refs_sents(phase=phase)
        orig_sents = get_turk_orig_sents(phase=phase)
    write_html_report(
            report_path, orig_sents, sys_output, refs_sents, test_set_name=test_set,
            lowercase=lowercase, tokenizer=tokenizer, metrics=metrics,
            )
Пример #11
0
def evaluate_system_output(
        test_set,
        input_path=None,
        tokenizer='13a',
        metrics=','.join(VALID_METRICS),
        analysis=False,
        quality_estimation=False,
        ):
    """
    Evaluate a system output with automatic metrics.

    Args:
        test_set: one of 'turk', 'turk_valid', 'pwkp', 'pwkp_valid' or 'hsplit'.
        input_path: path of the system output; when None the system output is
            read from standard input (decoded as UTF-8).
        tokenizer: tokenizer name forwarded to the metric functions.
        metrics: comma-separated metric names; 'bleu', 'sari', 'samsa' and
            'fkgl' are the ones handled here.
        analysis: also print the word-level operations analysis.
        quality_estimation: also print the quality estimation scores.

    Each requested score is printed with `click.echo`; nothing is returned.

    Raises:
        ValueError: if `test_set` is not handled while the requested
            metrics/options need the test set sentences.
        AssertionError: if the system output is not line-aligned with the
            loaded original or reference sentences.
    """
    if input_path is not None:
        sys_output = read_lines(input_path)
    else:
        # read the system output
        with click.get_text_stream('stdin', encoding='utf-8') as system_output_file:
            sys_output = system_output_file.read().splitlines()

    # get the metrics that need to be computed
    metrics = metrics.split(',')

    # Only load what the requested metrics actually need.
    load_orig_sents = ('sari' in metrics) or ('samsa' in metrics) or analysis or quality_estimation
    load_refs_sents = ('sari' in metrics) or ('bleu' in metrics) or analysis
    # get the references from the test set
    if test_set in ['turk', 'turk_valid']:
        lowercase = False
        phase = 'test' if test_set == 'turk' else 'valid'
        if load_orig_sents:
            orig_sents = get_turk_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_turk_refs_sents(phase=phase)
    elif test_set in ['pwkp', 'pwkp_valid']:
        lowercase = True
        phase = 'test' if test_set == 'pwkp' else 'valid'
        if load_orig_sents:
            orig_sents = get_pwkp_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_pwkp_refs_sents(phase=phase)
    elif test_set == 'hsplit':
        # Only the first 70 system sentences are kept for hsplit
        # (presumably the size of the hsplit references - TODO confirm).
        sys_output = sys_output[:70]
        lowercase = True
        if load_orig_sents:
            orig_sents = get_hsplit_orig_sents()
        if load_refs_sents:
            refs_sents = get_hsplit_refs_sents()
    elif load_orig_sents or load_refs_sents:
        # Unknown test sets used to crash later with an UnboundLocalError as
        # soon as the test set sentences were needed. Runs that need no test
        # set data (e.g. FKGL only) are still allowed.
        raise ValueError(
            f'Unsupported test_set {test_set!r}: expected one of '
            'turk, turk_valid, pwkp, pwkp_valid, hsplit'
        )

    if load_orig_sents:
        assert len(sys_output) == len(orig_sents), \
            f'System output has {len(sys_output)} lines, original sentences have {len(orig_sents)}'
    if load_refs_sents:
        assert len(sys_output) == len(refs_sents[0]), \
            f'System output has {len(sys_output)} lines, first reference has {len(refs_sents[0])}'

    # compute each metric
    if 'bleu' in metrics:
        bleu_score = sacrebleu.corpus_bleu(sys_output, refs_sents,
                                           force=True, tokenize=tokenizer, lowercase=lowercase).score
        click.echo(f'BLEU: {bleu_score:.2f}')

    if 'sari' in metrics:
        sari_score = corpus_sari(orig_sents, sys_output, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        click.echo(f'SARI: {sari_score:.2f}')

    if 'samsa' in metrics:
        samsa_score = corpus_samsa(orig_sents, sys_output, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
        click.echo(f'SAMSA: {samsa_score:.2f}')

    if 'fkgl' in metrics:
        fkgl_score = corpus_fkgl(sys_output, tokenizer=tokenizer)
        click.echo(f'FKGL: {fkgl_score:.2f}')

    if analysis:
        word_level_analysis = corpus_analyse_operations(orig_sents, sys_output, refs_sents,
                                                        verbose=False, as_str=True)
        click.echo(f'Word-level Analysis: {word_level_analysis}')

    if quality_estimation:
        quality_estimation_scores = corpus_quality_estimation(
                orig_sents,
                sys_output,
                tokenizer=tokenizer,
                lowercase=lowercase
                )
        # Round for display only.
        quality_estimation_scores = {k: round(v, 2) for k, v in quality_estimation_scores.items()}
        click.echo(f'Quality estimation: {quality_estimation_scores}')
Пример #12
0
def get_turk_orig_sents(phase):
    """Read the original sentences of the Turk test set for `phase`.

    `phase` must be 'valid' or 'test'; the 'valid' phase is stored in the files
    whose name starts with 'tune'.
    """
    assert phase in ['valid', 'test']
    # The validation files on disk use the 'tune' prefix.
    phase = 'tune' if phase == 'valid' else phase
    orig_path = DATA_DIR / f'test_sets/turk/{phase}.8turkers.tok.norm'
    return read_lines(orig_path)
Пример #13
0
def get_turk_refs_sents(phase):
    """Read the eight reference files of the Turk test set for `phase`.

    `phase` must be 'valid' or 'test'; the 'valid' phase is stored in the files
    whose name starts with 'tune'. Returns one list of sentences per reference
    file, in file-index order.
    """
    assert phase in ['valid', 'test']
    # The validation files on disk use the 'tune' prefix.
    phase = 'tune' if phase == 'valid' else phase
    refs_sents = []
    for i in range(8):
        ref_path = DATA_DIR / f'test_sets/turk/{phase}.8turkers.tok.turk.{i}'
        refs_sents.append(read_lines(ref_path))
    return refs_sents
Пример #14
0
def get_orig_sents(test_set):
    """Read the original sentences registered for `test_set`.

    The name is first passed through
    `maybe_map_deprecated_test_set_to_new_test_set`, then the path is looked up
    in `TEST_SETS_PATHS` under the ``(test_set, 'orig')`` key.
    """
    resolved_test_set = maybe_map_deprecated_test_set_to_new_test_set(test_set)
    orig_sents_path = TEST_SETS_PATHS[(resolved_test_set, 'orig')]
    return read_lines(orig_sents_path)
Пример #15
0
def get_refs_sents(test_set):
    """Read every reference file registered for `test_set`.

    The name is first passed through
    `maybe_map_deprecated_test_set_to_new_test_set`, then the paths are looked
    up in `TEST_SETS_PATHS` under the ``(test_set, 'refs')`` key. Returns one
    list of sentences per reference path, in the registered order.
    """
    resolved_test_set = maybe_map_deprecated_test_set_to_new_test_set(test_set)
    refs_sents = []
    for ref_sents_path in TEST_SETS_PATHS[(resolved_test_set, 'refs')]:
        refs_sents.append(read_lines(ref_sents_path))
    return refs_sents
Пример #16
0
def get_hsplit_refs_sents():
    """Read the four hsplit reference files (numbered 1 to 4).

    Returns one list of sentences per reference file, in file-number order.
    """
    refs_sents = []
    for i in range(4):
        # File names are 1-based while the loop index is 0-based.
        ref_path = DATA_DIR / f'test_sets/hsplit/hsplit.tok.{i+1}'
        refs_sents.append(read_lines(ref_path))
    return refs_sents
Пример #17
0
def get_pwkp_refs_sents(phase):
    """Read the single pwkp reference file (``.dst``) for `phase`.

    `phase` must be 'valid' or 'test'. The sentences are wrapped in a
    one-element list, i.e. one list of sentences per reference.
    """
    assert phase in ['valid', 'test']
    ref_path = DATA_DIR / f'test_sets/pwkp/pwkp.{phase}.dst'
    only_ref_sents = read_lines(ref_path)
    return [only_ref_sents]
Пример #18
0
def get_pwkp_orig_sents(phase):
    """Read the pwkp original sentences file (``.src``) for `phase`.

    `phase` must be 'valid' or 'test'.
    """
    assert phase in ['valid', 'test']
    orig_path = DATA_DIR / f'test_sets/pwkp/pwkp.{phase}.src'
    return read_lines(orig_path)