# Imports assumed by the examples below; the JSONL helpers (load_jsonl_file,
# save_jsonl_file, load_jsonl_file_and_build_lookup) and the data classes
# (TurnId, TurnPrediction, TurnAnswer, Turn, Dialogue, ...) come from the
# surrounding dataflow codebase.
import csv
import dataclasses
import json
import os
from typing import Dict, List, Optional, Tuple

import jsons
import pandas as pd


def evaluate_prediction_file(predictions_jsonl: str, gold_jsonl: str,
                             datum_ids_jsonl: Optional[str]) -> float:
    preds = list(
        load_jsonl_file(predictions_jsonl, TurnPrediction, verbose=False))
    golds = list(load_jsonl_file(gold_jsonl, TurnAnswer, verbose=False))
    datum_ids = (None if datum_ids_jsonl is None else set(
        load_jsonl_file(data_jsonl=datum_ids_jsonl, cls=TurnId,
                        verbose=False)))
    return evaluate_predictions_exact_match(collate(preds, golds, datum_ids))
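

# collate and evaluate_predictions_exact_match are project helpers; a minimal
# sketch of the exact-match computation they might perform, assuming collate
# yields (TurnPrediction, TurnAnswer) pairs matched by datum id and that both
# carry a `lispress` field (hypothetical names):
def exact_match_sketch(pairs) -> float:
    matches = [pred.lispress == gold.lispress for pred, gold in pairs]
    return sum(matches) / len(matches) if matches else 0.0
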
def main(
    prediction_report_tsv: str,
    datum_ids_jsonl: Optional[str],
    scores_json: str,
) -> None:
    prediction_report_df = pd.read_csv(
        prediction_report_tsv,
        sep="\t",
        encoding="utf-8",
        quoting=csv.QUOTE_ALL,
        na_values=None,
        keep_default_na=False,
    )
    assert not prediction_report_df.isnull().any().any()

    if datum_ids_jsonl:
        datum_ids = set(
            load_jsonl_file(data_jsonl=datum_ids_jsonl,
                            cls=TurnId,
                            verbose=False))
        mask_datum_id = [
            TurnId(dialogue_id=row.get("dialogueId"),
                   turn_index=row.get("turnIndex")) in datum_ids
            for _, row in prediction_report_df.iterrows()
        ]
        prediction_report_df = prediction_report_df.loc[mask_datum_id]

    scores = evaluate_dataset(prediction_report_df)
    with open(scores_json, "w") as fp:
        fp.write(jsons.dumps(scores, jdkwargs={"indent": 2}))
        fp.write("\n")
def main(dataflow_dialogues_dir: str, subsets: List[str], outdir: str):
    os.makedirs(outdir, exist_ok=True)
    dialogue_report_dfs = []
    for subset in subsets:
        dataflow_dialogues = list(
            load_jsonl_file(
                data_jsonl=os.path.join(dataflow_dialogues_dir,
                                        f"{subset}.dataflow_dialogues.jsonl"),
                cls=Dialogue,
                unit=" dialogues",
            ))

        dialogue_report_df, refer_turn_ids, revise_turn_ids = build_dialogue_report(
            dataflow_dialogues)
        dialogue_report_dfs.append(dialogue_report_df)

        save_jsonl_file(
            data=refer_turn_ids,
            data_jsonl=os.path.join(outdir, f"{subset}.refer_turn_ids.jsonl"),
        )
        save_jsonl_file(
            data=revise_turn_ids,
            data_jsonl=os.path.join(outdir, f"{subset}.revise_turn_ids.jsonl"),
        )

        basic_stats, percentile_stats = compute_stats(dialogue_report_df)
        with open(os.path.join(outdir, f"{subset}.basic_stats.json"),
                  "w") as fp:
            fp.write(json.dumps(dataclasses.asdict(basic_stats), indent=2))
            fp.write("\n")
        with open(os.path.join(outdir, f"{subset}.percentile_stats.json"),
                  "w") as fp:
            fp.write(json.dumps(percentile_stats, indent=2))
            fp.write("\n")

    if len(subsets) > 1:
        basic_stats, percentile_stats = compute_stats(
            pd.concat(dialogue_report_dfs))
        with open(
                os.path.join(outdir, f"{'-'.join(subsets)}.basic_stats.json"),
                "w") as fp:
            fp.write(json.dumps(dataclasses.asdict(basic_stats), indent=2))
            fp.write("\n")
        with open(
                os.path.join(outdir,
                             f"{'-'.join(subsets)}.percentile_stats.json"),
                "w") as fp:
            fp.write(json.dumps(percentile_stats, indent=2))
            fp.write("\n")
Example #4
def main(
    dialogues_jsonl: str,
    datum_id_jsonl: str,
    src_txt: str,
    ref_txt: str,
    nbest_txt: str,
    nbest: int,
    outbase: str,
) -> None:
    """Creates 1-best predictions and saves them to files."""
    datum_lookup: Dict[str, Dict[int, Turn]] = {
        dialogue.dialogue_id: {turn.turn_index: turn
                               for turn in dialogue.turns}
        for dialogue in load_jsonl_file(
            data_jsonl=dialogues_jsonl, cls=Dialogue, unit=" dialogues")
    }

    prediction_report_jsonl = create_onmt_prediction_report(
        datum_lookup=datum_lookup,
        datum_id_jsonl=datum_id_jsonl,
        src_txt=src_txt,
        ref_txt=ref_txt,
        nbest_txt=nbest_txt,
        nbest=nbest,
        outbase=outbase,
    )

    predictions_lookup = load_jsonl_file_and_build_lookup(
        data_jsonl=prediction_report_jsonl,
        cls=OnmtPredictionReportDatum,
        primary_key_getter=lambda x: x.datum_id.dialogue_id,
        secondary_key_getter=lambda x: x.datum_id.turn_index,
    )
    dataflow_dialogues = build_dataflow_dialogues(predictions_lookup)
    save_jsonl_file(dataflow_dialogues, f"{outbase}.dataflow_dialogues.jsonl")
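

# A minimal sketch of the nested-lookup construction that
# load_jsonl_file_and_build_lookup performs, assuming it groups records first
# by the primary key and then by the secondary key:
def build_lookup_sketch(records, primary_key_getter, secondary_key_getter):
    lookup: Dict[str, Dict[int, OnmtPredictionReportDatum]] = {}
    for record in records:
        by_secondary = lookup.setdefault(primary_key_getter(record), {})
        by_secondary[secondary_key_getter(record)] = record
    return lookup
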
Example #5
def main(
    dialogues_file: str,
    no_refer: bool,
    no_revise: bool,
    cheating_mode: str,
    cheating_execution_results_file: Optional[str],
    outbase: str,
) -> Tuple[str, str, str]:
    salience_model: SalienceModelBase
    if no_refer:
        salience_model = DummySalienceModel()
    else:
        salience_model = VanillaSalienceModel()

    cheating_execution_results_lookup = None
    if cheating_execution_results_file is not None:
        cheating_execution_results_lookup = load_jsonl_file_and_build_lookup(
            data_jsonl=cheating_execution_results_file,
            cls=CompleteExecutionResult,
            primary_key_getter=lambda x: x.dialogue_id,
            secondary_key_getter=lambda x: x.turn_index,
        )

    complete_execution_results_file = outbase + ".execution_results.jsonl"
    cheating_report_file = outbase + ".cheating_report.jsonl"
    complete_execution_results_fp = open(complete_execution_results_file, "w")
    cheating_report_fp = open(cheating_report_file, "w")

    for dialogue in load_jsonl_file(
        data_jsonl=dialogues_file, cls=Dialogue, unit=" dialogues"
    ):
        if cheating_execution_results_lookup is None:
            cheating_execution_results = None
        else:
            cheating_execution_results = cheating_execution_results_lookup.get(
                dialogue.dialogue_id
            )
            assert cheating_execution_results is not None

        (
            complete_execution_results,
            cheating_turn_indices,
        ) = execute_programs_for_dialogue(
            dialogue=dialogue,
            salience_model=salience_model,
            no_revise=no_revise,
            cheating_mode=cheating_mode,
            cheating_execution_results=cheating_execution_results,
        )

        for complete_execution_result in complete_execution_results:
            complete_execution_results_fp.write(jsons.dumps(complete_execution_result))
            complete_execution_results_fp.write("\n")

        num_total_turns = len(dialogue.turns)
        assert (
            dialogue.turns[-1].turn_index - dialogue.turns[0].turn_index + 1
            == num_total_turns
        )
        num_cheating_turns = len(cheating_turn_indices)
        cheating_report_fp.write(
            json.dumps(
                {
                    "dialogueId": dialogue.dialogue_id,
                    "startTurnIndex": dialogue.turns[0].turn_index,
                    "numTurns": num_total_turns,
                    "cheatingTurnIndices": cheating_turn_indices,
                    "numCheatingTurns": num_cheating_turns,
                    "pctCheatingTurns": num_cheating_turns / num_total_turns,
                }
            )
        )
        cheating_report_fp.write("\n")
    complete_execution_results_fp.close()
    cheating_report_fp.close()

    cheating_stats_file = outbase + ".cheating_stats.json"
    analyze_cheating_report(cheating_report_file, cheating_stats_file)

    return complete_execution_results_file, cheating_report_file, cheating_stats_file
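

# analyze_cheating_report is project-specific; a minimal sketch of the kind of
# aggregation it might perform over the per-dialogue report written above (the
# field names match the json.dumps call in the loop):
def analyze_cheating_report_sketch(cheating_report_file: str,
                                   cheating_stats_file: str) -> None:
    with open(cheating_report_file) as fp:
        rows = [json.loads(line) for line in fp]
    num_turns = sum(row["numTurns"] for row in rows)
    num_cheating = sum(row["numCheatingTurns"] for row in rows)
    with open(cheating_stats_file, "w") as fp:
        json.dump({"pctCheatingTurns": num_cheating / num_turns}, fp, indent=2)
        fp.write("\n")
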
Example #6
def main(
    exp0_prediction_report_tsv: str,
    exp1_prediction_report_tsv: str,
    datum_ids_jsonl: Optional[str],
    scores_json: str,
) -> None:
    """Loads the two prediction report files and calculates statistical significance.

    For the turn-level and dialogue-level accuracy, we use the McNemar test.
    For the dialogue-level prefix length (i.e., the number of turns before the first error), we use the two-sample permutation test.

    If `datum_ids_jsonl` is given, we only use the subset of turns specified in the file. In this case, only turn-level
    metrics are used since it doesn't make sense to compute dialogue-level metrics with only a subset of turns.
    """
    exp0_prediction_report_df = pd.read_csv(
        exp0_prediction_report_tsv,
        sep="\t",
        encoding="utf-8",
        quoting=csv.QUOTE_ALL,
        na_values=None,
        keep_default_na=False,
    )
    assert not exp0_prediction_report_df.isnull().any().any()

    exp1_prediction_report_df = pd.read_csv(
        exp1_prediction_report_tsv,
        sep="\t",
        encoding="utf-8",
        quoting=csv.QUOTE_ALL,
        na_values=None,
        keep_default_na=False,
    )
    assert not exp1_prediction_report_df.isnull().any().any()

    turn_report_df, dialogue_report_df = get_report_dataframes(
        exp0_prediction_report_df=exp0_prediction_report_df,
        exp1_prediction_report_df=exp1_prediction_report_df,
    )

    if not datum_ids_jsonl:
        turn_statistic, turn_pvalue = run_mcnemar_test(turn_report_df)
        dialogue_statistic, dialogue_pvalue = run_mcnemar_test(dialogue_report_df)
        prefix_pvalue = run_paired_permutation_test(
            xs=dialogue_report_df.loc[:, "prefix_0"].tolist(),
            ys=dialogue_report_df.loc[:, "prefix_1"].tolist(),
        )

        with open(scores_json, "w") as fp:
            fp.write(
                json.dumps(
                    {
                        "turn": {"statistic": turn_statistic, "pvalue": turn_pvalue},
                        "dialogue": {
                            "statistic": dialogue_statistic,
                            "pvalue": dialogue_pvalue,
                        },
                        "prefix": {"pvalue": prefix_pvalue},
                    },
                    indent=2,
                )
            )
            fp.write("\n")

    else:
        datum_ids = set(
            load_jsonl_file(data_jsonl=datum_ids_jsonl, cls=TurnId, verbose=False)
        )
        # Build the mask over turn_report_df itself so that it aligns with the
        # rows selected below; its index supplies (dialogue_id, turn_index).
        mask_datum_id = [
            TurnId(dialogue_id=dialogue_id, turn_index=turn_index) in datum_ids
            for (dialogue_id, turn_index), _row in turn_report_df.iterrows()
        ]
        turn_report_df = turn_report_df.loc[mask_datum_id]
        # NOTE: We only compute turn-level statistics since it doesn't make sense to compute dialogue-level metrics
        # with only a subset of turns.
        turn_statistic, turn_pvalue = run_mcnemar_test(turn_report_df)

        with open(scores_json, "w") as fp:
            fp.write(
                json.dumps(
                    {"turn": {"statistic": turn_statistic, "pvalue": turn_pvalue}},
                    indent=2,
                )
            )
            fp.write("\n")