def main(
    prediction_report_tsv: str,
    datum_ids_jsonl: Optional[str],
    scores_json: str,
) -> None:
    """Scores a prediction report and writes the result as JSON.

    If `datum_ids_jsonl` is given, only the turns whose ids appear in that
    file are scored.
    """
    report_df = pd.read_csv(
        prediction_report_tsv,
        sep="\t",
        encoding="utf-8",
        quoting=csv.QUOTE_ALL,
        na_values=None,
        keep_default_na=False,
    )
    # keep_default_na=False means every cell should be a concrete string.
    assert not report_df.isnull().any().any()

    if datum_ids_jsonl:
        wanted_ids = set(
            load_jsonl_file(data_jsonl=datum_ids_jsonl, cls=TurnId, verbose=False)
        )
        keep_flags = []
        for _, row in report_df.iterrows():
            turn_id = TurnId(
                dialogue_id=row.get("dialogueId"),
                turn_index=row.get("turnIndex"),
            )
            keep_flags.append(turn_id in wanted_ids)
        report_df = report_df.loc[keep_flags]

    scores = evaluate_dataset(report_df)
    with open(scores_json, "w") as fp:
        fp.write(jsons.dumps(scores, jdkwargs={"indent": 2}))
        fp.write("\n")
def build_dialogue_report(
    dataflow_dialogues: List[Dialogue],
) -> Tuple[pd.DataFrame, List[TurnId], List[TurnId]]:
    """Builds a per-dialogue summary report.

    Returns the report dataframe together with the ids of all refer turns
    and all revise turns found in non-skipped turns.
    """
    refer_ids: List[TurnId] = []
    revise_ids: List[TurnId] = []
    rows = []

    for dialogue in dataflow_dialogues:
        kept = 0
        skipped = 0
        refers = 0
        revises = 0
        for turn in dialogue.turns:
            if turn.skip:
                skipped += 1
                continue

            kept += 1
            if is_refer_turn(turn):
                refers += 1
                refer_ids.append(
                    TurnId(
                        dialogue_id=dialogue.dialogue_id,
                        turn_index=turn.turn_index,
                    )
                )
            if is_revise_turn(turn):
                revises += 1
                revise_ids.append(
                    TurnId(
                        dialogue_id=dialogue.dialogue_id,
                        turn_index=turn.turn_index,
                    )
                )

        rows.append(
            {
                "dialogueId": dialogue.dialogue_id,
                "numTurns": len(dialogue.turns),
                "numKeptTurns": kept,
                "numSkippedTurns": skipped,
                "numReferTurns": refers,
                "numReviseTurns": revises,
            }
        )

    return pd.DataFrame(rows), refer_ids, revise_ids
# Example #3
def main(
    dataflow_dialogues_jsonl: str,
    dialogue_id_prefix: str,
    contextualized_turns_file: str,
    turn_answers_file: str,
) -> None:
    """Converts dataflow dialogues into contextualized turns plus gold answers.

    Each non-skipped turn is assigned a fresh random dialogue id (prefixed
    with `dialogue_id_prefix`) so the turns can be shuffled and released
    independently of their source dialogues.
    """
    # Over-generate random ids and dedupe; fail loudly below if the pool is
    # ever exhausted instead of raising a bare IndexError.
    new_dialogue_ids = list({get_random_string(16) for _ in range(500000)})
    new_dialogue_id_index = 0
    contextualized_turns: List[UtteranceWithContext] = []
    turn_predictions: List[TurnAnswer] = []

    # Use a context manager so the input file is closed deterministically
    # (the original `open(...)` inside the for-statement leaked the handle).
    with open(dataflow_dialogues_jsonl) as dialogues_fp:
        for line in tqdm(dialogues_fp, unit=" dialogues"):
            dialogue: Dialogue = jsons.loads(line.strip(), Dialogue)
            for turn_index, turn in enumerate(dialogue.turns):
                if turn.skip:
                    continue
                assert new_dialogue_id_index < len(
                    new_dialogue_ids
                ), "ran out of unique dialogue ids"
                full_dialogue_id = (
                    dialogue_id_prefix
                    + "-"
                    + new_dialogue_ids[new_dialogue_id_index]
                )
                datum_id = TurnId(full_dialogue_id, turn.turn_index)
                contextualized_turns.append(
                    UtteranceWithContext(
                        datum_id=datum_id,
                        user_utterance=turn.user_utterance,
                        # Context is the dialogue history strictly before this turn.
                        context=Dialogue(
                            dialogue_id=full_dialogue_id,
                            turns=dialogue.turns[:turn_index],
                        ),
                    )
                )
                turn_predictions.append(
                    TurnAnswer(
                        datum_id=datum_id,
                        user_utterance=turn.user_utterance.original_text,
                        lispress=" ".join(turn.tokenized_lispress()),
                        program_execution_oracle=turn.program_execution_oracle,
                    )
                )
                new_dialogue_id_index += 1

    random.shuffle(contextualized_turns)
    save_jsonl_file(contextualized_turns, contextualized_turns_file)
    save_jsonl_file(turn_predictions, turn_answers_file)
def create_onmt_text_datum_for_turn(
    dialogue_id: str,
    curr_turn: Turn,
    context_turns: List[Turn],
    include_program: bool,
    include_agent_utterance: bool,
    include_described_entities: bool,
) -> OnmtTextDatum:
    """Creates the OpenNMT text datum for a turn."""
    datum_id_str = jsons.dumps(TurnId(dialogue_id, curr_turn.turn_index))

    # Build the source string twice: once raw, once with the utterance tokenized.
    shared_kwargs = dict(
        curr_turn=curr_turn,
        context_turns=context_turns,
        include_program=include_program,
        include_agent_utterance=include_agent_utterance,
        include_described_entities=include_described_entities,
    )
    src_str = create_source_str(tokenize_utterance=False, **shared_kwargs)
    src_tok_str = create_source_str(tokenize_utterance=True, **shared_kwargs)
    tgt_str = " ".join(curr_turn.tokenized_lispress())

    # Tokenized sequences must not contain consecutive whitespace characters.
    assert re.search(r"\s{2,}", src_tok_str) is None
    assert re.search(r"\s{2,}", tgt_str) is None

    return OnmtTextDatum(
        datum_id_str=datum_id_str,
        src_str=src_str,
        src_tok_str=src_tok_str,
        tgt_str=tgt_str,
    )
# Example #5
def _read_prediction_report(prediction_report_tsv: str) -> pd.DataFrame:
    """Loads a prediction report TSV, treating every cell as a literal string."""
    df = pd.read_csv(
        prediction_report_tsv,
        sep="\t",
        encoding="utf-8",
        quoting=csv.QUOTE_ALL,
        na_values=None,
        keep_default_na=False,
    )
    # keep_default_na=False means every cell should be a concrete string.
    assert not df.isnull().any().any()
    return df


def main(
    exp0_prediction_report_tsv: str,
    exp1_prediction_report_tsv: str,
    datum_ids_jsonl: Optional[str],
    scores_json: str,
) -> None:
    """Loads the two prediction report files and calculates statistical significance.

    For the turn-level and dialogue-level accuracy, we use the McNemar test.
    For the dialogue-level prefix length (i.e., the number of turns before the first error), we use the two-sample permutation test.

    If `datum_ids_jsonl` is given, we only use the subset of turns specified in the file. In this case, only turn-level
    metrics are used since it doesn't make sense to compute dialogue-level metrics with only a subset of turns.
    """
    exp0_prediction_report_df = _read_prediction_report(exp0_prediction_report_tsv)
    exp1_prediction_report_df = _read_prediction_report(exp1_prediction_report_tsv)

    turn_report_df, dialogue_report_df = get_report_dataframes(
        exp0_prediction_report_df=exp0_prediction_report_df,
        exp1_prediction_report_df=exp1_prediction_report_df,
    )

    if not datum_ids_jsonl:
        turn_statistic, turn_pvalue = run_mcnemar_test(turn_report_df)
        dialogue_statistic, dialogue_pvalue = run_mcnemar_test(dialogue_report_df)
        prefix_pvalue = run_paired_permutation_test(
            xs=dialogue_report_df.loc[:, "prefix_0"].tolist(),
            ys=dialogue_report_df.loc[:, "prefix_1"].tolist(),
        )

        with open(scores_json, "w") as fp:
            fp.write(
                json.dumps(
                    {
                        "turn": {"statistic": turn_statistic, "pvalue": turn_pvalue},
                        "dialogue": {
                            "statistic": dialogue_statistic,
                            "pvalue": dialogue_pvalue,
                        },
                        "prefix": {"pvalue": prefix_pvalue},
                    },
                    indent=2,
                )
            )
            fp.write("\n")

    else:
        datum_ids = set(
            load_jsonl_file(data_jsonl=datum_ids_jsonl, cls=TurnId, verbose=False)
        )
        # BUG FIX: iterrows() yields (index, row) pairs and read_csv produces a
        # plain RangeIndex here, so unpacking the index as
        # `(dialogue_id, turn_index)` raised a TypeError. Read the ids from the
        # row columns instead, matching the single-report evaluation script.
        mask_datum_id = [
            TurnId(dialogue_id=row.get("dialogueId"), turn_index=row.get("turnIndex"))
            in datum_ids
            for _, row in exp1_prediction_report_df.iterrows()
        ]
        turn_report_df = turn_report_df.loc[mask_datum_id]
        # NOTE: We only compute turn-level statistics since it doesn't make sense to compute dialogue-level metrics
        # with only a subset of turns.
        turn_statistic, turn_pvalue = run_mcnemar_test(turn_report_df)

        with open(scores_json, "w") as fp:
            fp.write(
                json.dumps(
                    {"turn": {"statistic": turn_statistic, "pvalue": turn_pvalue}},
                    indent=2,
                )
            )
            fp.write("\n")