Example #1
def evaluate_sentences(comps,
                       simps,
                       simp_preds,
                       calc_simp_pred_bleu=False,
                       calc_comp_simp_bleu=False):

    comps = [sent.lower() for sent in comps]
    if isinstance(simps[0], list):
        simps = [[sent.lower() for sent in ref_set] for ref_set in simps]
    else:
        simps = [sent.lower() for sent in simps]
    simp_preds = [sent.lower() for sent in simp_preds]
    if isinstance(simps[0], list):
        refs = simps
    else:
        refs = [simps]

    bleu = corpus_bleu(simp_preds,
                       refs,
                       force=True,
                       tokenizer='none',
                       lowercase=True)
    sari = corpus_sari(comps,
                       simp_preds,
                       refs,
                       tokenizer="none",
                       lowercase=True)
    fkgl = corpus_fkgl(simp_preds, tokenizer='none')
    result = (bleu, sari, fkgl)
    if calc_simp_pred_bleu:
        result = result + (corpus_bleu(simp_preds, [comps]), )
    if calc_comp_simp_bleu:
        result = result + (corpus_bleu(comps, refs), )
    return result
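This snippet presumably relies on EASSE's metric helpers; a minimal usage sketch under that assumption (imports and sentences are illustrative, not from the original project):

from easse.bleu import corpus_bleu   # assumed source of the helpers used above
from easse.sari import corpus_sari
from easse.fkgl import corpus_fkgl

comps = ["the cat sat on the mat ."]        # complex source sentences
simps = [["the cat is on the mat ."]]       # one list of reference simplifications per annotator
simp_preds = ["a cat sat on the mat ."]     # system outputs, aligned with comps

bleu, sari, fkgl = evaluate_sentences(comps, simps, simp_preds)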
Example #2
def get_all_scores(
        orig_sents: List[str], sys_sents: List[str], refs_sents: List[List[str]],
        lowercase: bool = False, tokenizer: str = '13a', metrics: List[str] = DEFAULT_METRICS,
        ):
    scores = OrderedDict()
    if 'bleu' in metrics:
        scores['BLEU'] = corpus_bleu(sys_sents, refs_sents, force=True, tokenize=tokenizer, lowercase=lowercase).score
    if 'sari' in metrics:
        scores['SARI'] = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
    if 'samsa' in metrics:
        from easse.samsa import corpus_samsa
        scores['SAMSA'] = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
    if 'fkgl' in metrics:
        scores['FKGL'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)
    quality_estimation_scores = corpus_quality_estimation(
            orig_sents,
            sys_sents,
            tokenizer=tokenizer,
            lowercase=lowercase
            )
    scores = add_dicts(
            scores,
            quality_estimation_scores,
            )
    return {key: round(value, 2) for key, value in scores.items()}
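A hypothetical call with toy sentences, assuming two reference sets aligned with a single system sentence:

orig_sents = ["About 95 species are currently accepted ."]
sys_sents = ["About 95 species are accepted ."]
refs_sents = [["About 95 species are currently known ."],
              ["About 95 species are now accepted ."]]
scores = get_all_scores(orig_sents, sys_sents, refs_sents, metrics=['bleu', 'sari', 'fkgl'])
# scores is an OrderedDict with 'BLEU', 'SARI', 'FKGL' plus the quality-estimation features, rounded to 2 decimals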
Example #3
def test_corpus_sari():
    orig_sents = get_orig_sents('turkcorpus_test')
    refs_sents = get_refs_sents('turkcorpus_test')
    system_outputs_dir = get_system_outputs_dir('turkcorpus_test')
    hyp_sents = read_lines(system_outputs_dir / "ACCESS")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, refs_sents)
    assert sari_score == pytest.approx(
        41.381013)  # Scores from MUSS https://arxiv.org/abs/2005.00352
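In these tests, refs_sents is a list of reference corpora: the outer index is the reference set (annotator), the inner index is the sentence, so refs_sents[i][j] is the i-th reference for the j-th source sentence. A toy illustration (sentences invented, not from TurkCorpus):

orig_sents = ["the quick brown fox jumped over the lazy dog ."]
hyp_sents = ["the fox jumped over the dog ."]
refs_sents = [
    ["the quick fox jumped over the dog ."],   # reference set 0
    ["a brown fox jumped over a lazy dog ."],  # reference set 1
]
toy_score = sari.corpus_sari(orig_sents, hyp_sents, refs_sents)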
Example #4
def compute_sari(source, decoded_sents, refs):
    from easse.sari import corpus_sari
    score_list = []
    for source_sent, decoded, ref in zip(source, decoded_sents, refs):
        decoded = [decoded]
        ref = [[ref]]
        source_sent = [source_sent]
        # print("decoded:", decoded)
        # print("Ref:", ref)
        sari_score = corpus_sari(orig_sents=source_sent, sys_sents=decoded, refs_sents=ref)
        score_list.append(sari_score)
    # sari_score = corpus_sari(orig_sents=refs, sys_sents=decoded_sents, refs_sents=refs)
    score = get_cuda(T.FloatTensor(score_list)).sum() / len(score_list)
    return score
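For reference, the per-sentence averaging above can be written without torch; a minimal sketch assuming the same EASSE corpus_sari signature:

from easse.sari import corpus_sari

def average_sentence_sari(sources, predictions, references):
    # Score each (source, prediction, reference) triple as a one-sentence corpus, then average.
    scores = [
        corpus_sari(orig_sents=[src], sys_sents=[pred], refs_sents=[[ref]])
        for src, pred, ref in zip(sources, predictions, references)
    ]
    return sum(scores) / len(scores)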
Example #5
def evaluate_system_output(
        test_set,
        sys_sents_path=None,
        orig_sents_path=None,
        refs_sents_paths=None,
        tokenizer='13a',
        metrics=','.join(DEFAULT_METRICS),
        analysis=False,
        quality_estimation=False,
        ):
    """
    Evaluate a system output with automatic metrics.
    """
    # get the metrics that need to be computed
    metrics = metrics.split(',')
    orig_sents, sys_sents, refs_sents = get_sents(test_set, orig_sents_path, sys_sents_path, refs_sents_paths)
    lowercase = is_test_set_lowercase(test_set)

    # compute each metric
    if 'bleu' in metrics:
        bleu_score = sacrebleu.corpus_bleu(sys_sents, refs_sents,
                                           force=True, tokenize=tokenizer, lowercase=lowercase).score
        click.echo(f'BLEU: {bleu_score:.2f}')

    if 'sari' in metrics:
        sari_score = corpus_sari(orig_sents, sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        click.echo(f'SARI: {sari_score:.2f}')

    if 'samsa' in metrics:
        samsa_score = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
        click.echo(f'SAMSA: {samsa_score:.2f}')

    if 'fkgl' in metrics:
        fkgl_score = corpus_fkgl(sys_sents, tokenizer=tokenizer)
        click.echo(f'FKGL: {fkgl_score:.2f}')

    if analysis:
        word_level_analysis = corpus_analyse_operations(orig_sents, sys_sents, refs_sents,
                                                        verbose=False, as_str=True)
        click.echo(f'Word-level Analysis: {word_level_analysis}')

    if quality_estimation:
        quality_estimation_scores = corpus_quality_estimation(
                orig_sents,
                sys_sents,
                tokenizer=tokenizer,
                lowercase=lowercase
                )
        quality_estimation_scores = {k: round(v, 2) for k, v in quality_estimation_scores.items()}
        click.echo(f'Quality estimation: {quality_estimation_scores}')
Example #6
def test_corpus_sari_plain():
    orig_sents = read_lines(DATA_DIR / "test_sets/turk/test.8turkers.tok.norm")
    ref_sents = []
    for n in range(8):
        ref_lines = read_lines(DATA_DIR /
                               f"test_sets/turk/test.8turkers.tok.turk.{n}")
        ref_sents.append(ref_lines)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/Dress-Ls.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  ref_sents,
                                  tokenizer='plain')
    assert sari_score == pytest.approx(36.73586275692667)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/Dress.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  ref_sents,
                                  tokenizer='plain')
    assert sari_score == pytest.approx(36.5859900146575)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/EncDecA.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  ref_sents,
                                  tokenizer='plain')
    assert sari_score == pytest.approx(34.73946658449856)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/Hybrid.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  ref_sents,
                                  tokenizer='plain')
    assert sari_score == pytest.approx(31.008109926854227)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/PBMT-R.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  ref_sents,
                                  tokenizer='plain')
    assert sari_score == pytest.approx(37.817966679481013)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/SBMT-SARI.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  ref_sents,
                                  tokenizer='plain')
    assert sari_score == pytest.approx(39.360477024519125)
Example #7
def test_corpus_sari_legacy():
    orig_sents = get_orig_sents('turkcorpus_test_legacy')
    refs_sents = get_refs_sents('turkcorpus_test_legacy')
    system_outputs_dir = get_system_outputs_dir('turkcorpus_test')

    hyp_sents = read_lines(system_outputs_dir / "tok.low/Dress-Ls.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  refs_sents,
                                  legacy=True)
    assert sari_score == pytest.approx(37.266058818588216)

    hyp_sents = read_lines(system_outputs_dir / "tok.low/Dress.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  refs_sents,
                                  legacy=True)
    assert sari_score == pytest.approx(37.08210095744638)

    hyp_sents = read_lines(system_outputs_dir / "tok.low/EncDecA.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  refs_sents,
                                  legacy=True)
    assert sari_score == pytest.approx(35.65754396121206)

    hyp_sents = read_lines(system_outputs_dir / "tok.low/Hybrid.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  refs_sents,
                                  legacy=True)
    assert sari_score == pytest.approx(31.39665078989411)

    hyp_sents = read_lines(system_outputs_dir / "tok.low/PBMT-R.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  refs_sents,
                                  legacy=True)
    assert sari_score == pytest.approx(38.558843050332037)

    hyp_sents = read_lines(system_outputs_dir / "tok.low/SBMT-SARI.tok.low")
    sari_score = sari.corpus_sari(orig_sents,
                                  hyp_sents,
                                  refs_sents,
                                  legacy=True)
    assert sari_score == pytest.approx(39.964857928109127)
Example #8
def get_rf_from_dev(dev_df, preds_dev, max_depth=None, random_state=19):
    preds_df = preds_dev.copy()
    dev_df_grouped = dev_df.groupby("input").agg({
        "output": list,
        "cosine_sim": list,
        "rouge_l": list,
        "input_len": max,
        "output_len": list
    }).reset_index()
    preds_df["ref"] = [
        l for sublist in dev_df_grouped["output"].apply(
            lambda x: [x] * 5).tolist() for l in sublist
    ]
    preds_df["ref"] = preds_df["ref"].apply(lambda x: [[i] for i in x])

    preds_df["pred_len"] = preds_df["pred"].apply(
        lambda x: len(get_word_tokens(x)))
    preds_df["input_len"] = preds_df["input"].apply(
        lambda x: len(get_word_tokens(x)))

    preds_df["sari"] = preds_df.apply(lambda x: corpus_sari(
        orig_sents=[x["input"]],
        sys_sents=[x["pred"]],
        refs_sents=x["ref"],
    ),
                                      axis=1)

    rf = RandomForestRegressor(n_estimators=1000,
                               max_depth=max_depth,
                               n_jobs=-1,
                               random_state=random_state)

    X_train = preds_df[["cosine_sim", "rouge_l", "input_len", "pred_len"]]
    y_train = preds_df["sari"]

    rf.fit(X_train, y_train)

    return rf, preds_df
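A hypothetical use of the returned regressor, reranking new candidates by predicted SARI (dev_df, preds_dev and candidates_df are placeholder DataFrames with the same columns as above):

rf, preds_df = get_rf_from_dev(dev_df, preds_dev)
X_new = candidates_df[["cosine_sim", "rouge_l", "input_len", "pred_len"]]
candidates_df["predicted_sari"] = rf.predict(X_new)
best_candidates = candidates_df.sort_values("predicted_sari", ascending=False)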
Example #9
def test_corpus_sari_tokenize():
    orig_sents = read_lines(DATA_DIR / "test_sets/turk/test.8turkers.tok.norm")
    ref_sents = []
    for n in range(8):
        ref_lines = read_lines(DATA_DIR /
                               f"test_sets/turk/test.8turkers.tok.turk.{n}")
        ref_sents.append(ref_lines)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/Dress-Ls.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(37.266058818588216)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/Dress.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(37.08210095744638)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/EncDecA.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(35.65754396121206)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/Hybrid.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(31.39665078989411)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/PBMT-R.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(38.558843050332037)

    hyp_sents = read_lines(DATA_DIR /
                           "system_outputs/turk/lower/SBMT-SARI.tok.low")
    sari_score = sari.corpus_sari(orig_sents, hyp_sents, ref_sents)
    assert sari_score == pytest.approx(39.964857928109127)
Example #10
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer='13a',
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    '''
    Evaluate a system output with automatic metrics.
    '''
    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(test_set, orig_sents_path,
                                                     refs_sents_paths)

    # compute each metric
    metrics_scores = {}
    if 'bleu' in metrics:
        metrics_scores['bleu'] = corpus_bleu(sys_sents,
                                             refs_sents,
                                             force=True,
                                             tokenizer=tokenizer,
                                             lowercase=lowercase)

    if 'sent_bleu' in metrics:
        metrics_scores['sent_bleu'] = corpus_averaged_sentence_bleu(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if 'sari' in metrics:
        metrics_scores['sari'] = corpus_sari(orig_sents,
                                             sys_sents,
                                             refs_sents,
                                             tokenizer=tokenizer,
                                             lowercase=lowercase)

    if 'sari_legacy' in metrics:
        metrics_scores['sari_legacy'] = corpus_sari(orig_sents,
                                                    sys_sents,
                                                    refs_sents,
                                                    tokenizer=tokenizer,
                                                    lowercase=lowercase,
                                                    legacy=True)

    if 'samsa' in metrics:
        from easse.samsa import corpus_samsa
        metrics_scores['samsa'] = corpus_samsa(orig_sents,
                                               sys_sents,
                                               tokenizer=tokenizer,
                                               lowercase=lowercase,
                                               verbose=True)

    if 'fkgl' in metrics:
        metrics_scores['fkgl'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)

    if 'f1_token' in metrics:
        metrics_scores['f1_token'] = corpus_f1_token(sys_sents,
                                                     refs_sents,
                                                     tokenizer=tokenizer,
                                                     lowercase=lowercase)

    if analysis:
        metrics_scores['word_level_analysis'] = corpus_analyse_operations(
            orig_sents, sys_sents, refs_sents, verbose=False, as_str=True)

    if quality_estimation:
        metrics_scores['quality_estimation'] = corpus_quality_estimation(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)

    return metrics_scores
Example #11
def get_qualitative_examples_html(orig_sents, sys_sents, refs_sents):
    title_key_print = [
        ('Randomly sampled simplifications',
         lambda c, s, refs: 0,
         lambda value: ''),
        ('Best simplifications according to SARI',
         lambda c, s, refs: -corpus_sari([c], [s], [refs]),
         lambda value: f'SARI={-value:.2f}'),
        ('Worst simplifications according to SARI',
         lambda c, s, refs: corpus_sari([c], [s], [refs]),
         lambda value: f'SARI={value:.2f}'),
        ('Simplifications with the most compression',
         lambda c, s, refs: get_compression_ratio(c, s),
         lambda value: f'compression_ratio={value:.2f}'),
        ('Simplifications with a high amount of paraphrasing',
         lambda c, s, refs: get_levenshtein_similarity(c, s) / get_compression_ratio(c, s),
         lambda value: f'levenshtein_similarity={value:.2f}'),
        ('Simplifications with the most sentence splits (if any)',
         lambda c, s, refs: -(count_sentences(s) - count_sentences(c)),
         lambda value: f'#sentence_splits={-value:.2f}'),
    ]

    def get_one_sample_html(orig_sent, sys_sent, ref_sents, sort_key, print_func):
        orig_sent, sys_sent, *ref_sents = [html.escape(sent) for sent in [orig_sent, sys_sent, *ref_sents]]
        doc = Doc()
        with doc.tag('div', klass='mb-2 p-1'):
            # Sort key
            with doc.tag('div', klass='text-muted small'):
                doc.asis(print_func(sort_key(orig_sent, sys_sent, ref_sents)))
            with doc.tag('div', klass='ml-2'):
                orig_sent_bold, sys_sent_bold = make_differing_words_bold(orig_sent, sys_sent, make_text_bold_html)
                # Source
                with doc.tag('div'):
                    doc.asis(orig_sent_bold)
                # Prediction
                with doc.tag('div'):
                    doc.asis(sys_sent_bold)
                # References
                collapse_id = get_random_html_id()
                with doc.tag('div', klass='position-relative'):
                    with doc.tag('a', ('data-toggle', 'collapse'), ('href', f'#{collapse_id}'),
                                 klass='stretched-link small'):
                        doc.text('References')
                    with doc.tag('div', klass='collapse', id=collapse_id):
                        for ref_sent in ref_sents:
                            _, ref_sent_bold = make_differing_words_bold(orig_sent, ref_sent, make_text_bold_html)
                            with doc.tag('div', klass='text-muted'):
                                doc.asis(ref_sent_bold)
        return doc.getvalue()

    doc = Doc()
    for title, sort_key, print_func in title_key_print:
        with doc.tag('div', klass='container-fluid mt-4 p-2 border'):
            collapse_id = get_random_html_id()
            with doc.tag('a', ('data-toggle', 'collapse'), ('href', f'#{collapse_id}')):
                doc.line('h3', klass='m-2', text_content=title)
            # Now let's print the examples
            sample_generator = sorted(
                    zip(orig_sents, sys_sents, zip(*refs_sents)),
                    key=lambda args: sort_key(*args),
            )
            # Samples displayed by default
            with doc.tag('div', klass='collapse', id=collapse_id):
                n_samples = 50
                for i, (orig_sent, sys_sent, refs) in enumerate(sample_generator):
                    if i >= n_samples:
                        break
                    doc.asis(get_one_sample_html(orig_sent, sys_sent, refs, sort_key, print_func))
    return doc.getvalue()
Example #12
def get_relative_sari(orig_sent, sys_sents, refs_sents, system_idx):
    # SARI of one system's output relative to the average SARI of all systems on the same sentence
    saris = [corpus_sari([orig_sent], [sys_sent], refs_sents) for sys_sent in sys_sents]
    return saris[system_idx] / np.average(saris)
Example #13
            row = row.replace("\n", "").replace(",", " ").replace(
                "article: ",
                "").replace("ref: ", "").replace("dec: ", "").replace('"', '')
            pair += row + ","

    test_file.close()
    example_sentences.close()

    ###############################
    file = open(csv_file_name, 'r', encoding='utf-8')
    file2 = open(score_file_name, 'w', encoding='utf-8')

    for i, row in enumerate(file):
        if i == 0:
            # write the score-file header; the first input row is skipped
            file2.write(
                "article,reference,decoded,rouge1,rouge2,rouge_L,sari\n")
            continue
        row = row.split("\n")[0]
        row = row.split(",")
        rouge_score = rg1.get_scores(row[2], row[1])
        sari_score = corpus_sari(orig_sents=[row[0]],
                                 sys_sents=[row[2]],
                                 refs_sents=[[row[1]]])
        pair = row[0] + "," + row[1] + "," + row[2] + "," + str(rouge_score[0]['rouge-1']['f']) + "," \
               + str(rouge_score[0]['rouge-2']['f']) + "," + \
               str(rouge_score[0]['rouge-l']['f']) + "," + str(sari_score) + "\n"
        file2.write(pair)
    print("score file with name", score_file_name, "written to disk")
    file2.close()
    file.close()
Example #14
def evaluate_system_output(
    test_set,
    sys_sents_path=None,
    orig_sents_path=None,
    refs_sents_paths=None,
    tokenizer="13a",
    lowercase=True,
    metrics=DEFAULT_METRICS,
    analysis=False,
    quality_estimation=False,
):
    """
    Evaluate a system output with automatic metrics.
    """
    for metric in metrics:
        assert metric in VALID_METRICS, f'"{metric}" not a valid metric. Valid metrics: {VALID_METRICS}'
    sys_sents = get_sys_sents(test_set, sys_sents_path)
    orig_sents, refs_sents = get_orig_and_refs_sents(test_set, orig_sents_path,
                                                     refs_sents_paths)

    # compute each metric
    metrics_scores = {}
    if "bleu" in metrics:
        metrics_scores["bleu"] = corpus_bleu(
            sys_sents,
            refs_sents,
            force=True,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    if "sent_bleu" in metrics:
        metrics_scores["sent_bleu"] = corpus_averaged_sentence_bleu(
            sys_sents, refs_sents, tokenizer=tokenizer, lowercase=lowercase)

    if "sari" in metrics:
        metrics_scores["sari"] = corpus_sari(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    if "sari_legacy" in metrics:
        metrics_scores["sari_legacy"] = corpus_sari(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            legacy=True,
        )

    if "sari_by_operation" in metrics:
        (
            metrics_scores["sari_add"],
            metrics_scores["sari_keep"],
            metrics_scores["sari_del"],
        ) = get_corpus_sari_operation_scores(
            orig_sents,
            sys_sents,
            refs_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
        )

    if "samsa" in metrics:
        from easse.samsa import corpus_samsa

        metrics_scores["samsa"] = corpus_samsa(
            orig_sents,
            sys_sents,
            tokenizer=tokenizer,
            lowercase=lowercase,
            verbose=True,
        )

    if "fkgl" in metrics:
        metrics_scores["fkgl"] = corpus_fkgl(sys_sents, tokenizer=tokenizer)

    if "f1_token" in metrics:
        metrics_scores["f1_token"] = corpus_f1_token(sys_sents,
                                                     refs_sents,
                                                     tokenizer=tokenizer,
                                                     lowercase=lowercase)

    if "bertscore" in metrics:
        from easse.bertscore import corpus_bertscore  # Inline import to use EASSE without installing all dependencies
        (
            metrics_scores["bertscore_precision"],
            metrics_scores["bertscore_recall"],
            metrics_scores["bertscore_f1"],
        ) = corpus_bertscore(sys_sents,
                             refs_sents,
                             tokenizer=tokenizer,
                             lowercase=lowercase)

    if analysis:
        from easse.annotation.word_level import WordOperationAnnotator  # Inline import to use EASSE without installing all dependencies
        word_operation_annotator = WordOperationAnnotator(tokenizer=tokenizer,
                                                          lowercase=lowercase,
                                                          verbose=True)
        metrics_scores[
            "word_level_analysis"] = word_operation_annotator.analyse_operations(
                orig_sents, sys_sents, refs_sents, as_str=True)

    if quality_estimation:
        metrics_scores["quality_estimation"] = corpus_quality_estimation(
            orig_sents, sys_sents, tokenizer=tokenizer, lowercase=lowercase)

    return metrics_scores
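A hypothetical call, assuming a test set name supported by get_sys_sents/get_orig_and_refs_sents (as in the SARI tests above) and a plain-text file of system outputs, one simplification per line:

scores = evaluate_system_output(
    'turkcorpus_test',                      # test set name used in the tests above
    sys_sents_path='my_system_output.txt',  # hypothetical path to system outputs
    metrics=['bleu', 'sari', 'fkgl'],
)
print(scores)  # e.g. {'bleu': ..., 'sari': ..., 'fkgl': ...}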
Example #15
def get_qualitative_html_examples(orig_sents, sys_sents, refs_sents):
    title_key_print = [
        ('Randomly sampled simplifications',
         lambda c, s, refs: 0,
         lambda value: ''),
        ('Best simplifications according to SARI',
         lambda c, s, refs: -corpus_sari([c], [s], [refs]),
         lambda value: f'SARI={-value:.2f}'),
        ('Worst simplifications according to SARI',
         lambda c, s, refs: corpus_sari([c], [s], [refs]),
         lambda value: f'SARI={value:.2f}'),
        ('Simplifications with only one differing word',
         lambda c, s, refs: -(count_words(c) == count_words(s) == len(get_lcs(to_words(c), to_words(s))) + 1),
         lambda value: ''),
        ('Simplifications with the most compression',
         lambda c, s, refs: get_compression_ratio(c, s),
         lambda value: f'compression_ratio={value:.2f}'),
        ('Simplifications that are longer than the source',
         lambda c, s, refs: -get_compression_ratio(c, s),
         lambda value: f'compression_ratio={-value:.2f}'),
        ('Simplifications that paraphrase the source',
         lambda c, s, refs: get_levenshtein_similarity(c, s) / get_compression_ratio(c, s),
         lambda value: f'levenshtein_similarity={value:.2f}'),
        ('Simplifications that are the most similar to the source (excluding exact matches)',
         lambda c, s, refs: -get_levenshtein_similarity(c, s) * int(c != s),
         lambda value: f'levenshtein_similarity={-value:.2f}'),
        ('Simplifications with the most sentence splits (if there are any)',
         lambda c, s, refs: -count_sentence_splits(c, s),
         lambda value: f'nb_sentences_ratio={-value:.2f}'),
    ]

    def get_one_sample_html(orig_sent, sys_sent, ref_sents, sort_key, print_func):
        doc = Doc()
        with doc.tag('div', klass='mb-2 p-1'):
            # Sort key
            with doc.tag('div', klass='text-muted small'):
                doc.asis(print_func(sort_key(orig_sent, sys_sent, ref_sents)))
            with doc.tag('div', klass='ml-2'):
                orig_sent_bold, sys_sent_bold = make_differing_words_bold(orig_sent, sys_sent, make_text_bold_html)
                # Source
                with doc.tag('div'):
                    doc.asis(orig_sent_bold)
                # Prediction
                with doc.tag('div'):
                    doc.asis(sys_sent_bold)
                # References
                collapse_id = get_random_html_id()
                with doc.tag('div', klass='position-relative'):
                    with doc.tag('a', ('data-toggle', 'collapse'), ('href', f'#{collapse_id}'),
                                 klass='stretched-link small'):
                        doc.text('References')
                    with doc.tag('div', klass='collapse', id=collapse_id):
                        for ref_sent in ref_sents:
                            _, ref_sent_bold = make_differing_words_bold(orig_sent, ref_sent, make_text_bold_html)
                            with doc.tag('div', klass='text-muted'):
                                doc.asis(ref_sent_bold)
        return doc.getvalue()

    doc = Doc()
    for title, sort_key, print_func in title_key_print:
        # stretched-link needs position-relative
        with doc.tag('div', klass='container-fluid mt-4 p-2 position-relative border'):
            doc.line('h3', klass='m-2', text_content=title)
            # Make whole div clickable to collapse / uncollapse examples
            collapse_id = get_random_html_id()
            with doc.tag('a', ('data-toggle', 'collapse'), ('href', f'#{collapse_id}'), klass='stretched-link'):
                pass  # doc.stag and doc.line don't seem to work with stretched-link
            # Now let's print the examples
            sample_generator = sorted(
                    zip(orig_sents, sys_sents, zip(*refs_sents)),
                    key=lambda args: sort_key(*args),
            )
            # Samples displayed by default
            with doc.tag('div', klass='collapse show', id=collapse_id):
                n_samples = 10
                for i, (orig_sent, sys_sent, refs) in enumerate(sample_generator):
                    if i >= n_samples:
                        break
                    doc.asis(get_one_sample_html(orig_sent, sys_sent, refs, sort_key, print_func))
    return doc.getvalue()
Example #16
File: cli.py Project: ml-lab/easse
def evaluate_system_output(
        test_set,
        input_path=None,
        tokenizer='13a',
        metrics=','.join(VALID_METRICS),
        analysis=False,
        quality_estimation=False,
        ):
    """
    Evaluate a system output with automatic metrics.
    """
    if input_path is not None:
        sys_output = read_lines(input_path)
    else:
        # read the system output
        with click.get_text_stream('stdin', encoding='utf-8') as system_output_file:
            sys_output = system_output_file.read().splitlines()

    # get the metrics that need to be computed
    metrics = metrics.split(',')

    load_orig_sents = ('sari' in metrics) or ('samsa' in metrics) or analysis or quality_estimation
    load_refs_sents = ('sari' in metrics) or ('bleu' in metrics) or analysis
    # get the references from the test set
    if test_set in ['turk', 'turk_valid']:
        lowercase = False
        phase = 'test' if test_set == 'turk' else 'valid'
        if load_orig_sents:
            orig_sents = get_turk_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_turk_refs_sents(phase=phase)

    if test_set in ['pwkp', 'pwkp_valid']:
        lowercase = True
        phase = 'test' if test_set == 'pwkp' else 'valid'
        if load_orig_sents:
            orig_sents = get_pwkp_orig_sents(phase=phase)
        if load_refs_sents:
            refs_sents = get_pwkp_refs_sents(phase=phase)

    if test_set == 'hsplit':
        sys_output = sys_output[:70]
        lowercase = True
        if load_orig_sents:
            orig_sents = get_hsplit_orig_sents()
        if load_refs_sents:
            refs_sents = get_hsplit_refs_sents()

    if load_orig_sents:
        assert len(sys_output) == len(orig_sents)
    if load_refs_sents:
        assert len(sys_output) == len(refs_sents[0])

    # compute each metric
    if 'bleu' in metrics:
        bleu_score = sacrebleu.corpus_bleu(sys_output, refs_sents,
                                           force=True, tokenize=tokenizer, lowercase=lowercase).score
        click.echo(f'BLEU: {bleu_score:.2f}')

    if 'sari' in metrics:
        sari_score = corpus_sari(orig_sents, sys_output, refs_sents, tokenizer=tokenizer, lowercase=lowercase)
        click.echo(f'SARI: {sari_score:.2f}')

    if 'samsa' in metrics:
        samsa_score = corpus_samsa(orig_sents, sys_output, tokenizer=tokenizer, verbose=True, lowercase=lowercase)
        click.echo(f'SAMSA: {samsa_score:.2f}')

    if 'fkgl' in metrics:
        fkgl_score = corpus_fkgl(sys_output, tokenizer=tokenizer)
        click.echo(f'FKGL: {fkgl_score:.2f}')

    if analysis:
        word_level_analysis = corpus_analyse_operations(orig_sents, sys_output, refs_sents,
                                                        verbose=False, as_str=True)
        click.echo(f'Word-level Analysis: {word_level_analysis}')

    if quality_estimation:
        quality_estimation_scores = corpus_quality_estimation(
                orig_sents,
                sys_output,
                tokenizer=tokenizer,
                lowercase=lowercase
                )
        quality_estimation_scores = {k: round(v, 2) for k, v in quality_estimation_scores.items()}
        click.echo(f'Quality estimation: {quality_estimation_scores}')