示例#1
0
def generate_final_annotation_files() -> NoReturn:
    """
    Generate the final gold annotation files.

    For every ``.csv`` file in the arbitration directory that has a matching
    file in the generation directory (same name, with the first dot-separated
    component replaced by ``annot``):

    1. take the arbitration annotations and add the predicates with
       isVerbal==false,false from generation (that haven't been sent to
       consolidation),
    2. anonymize worker-ids,
    3. adjust CSV columns,

    and save the result in the final directory as
    ``annot.final.<arbitration file name without its "arbit." prefix>``.
    """
    gen_dir_path = "files/annotations/gold_set/generation/corrected_filtered"
    arb_dir_path = "files/annotations/gold_set/arbitration"
    dest_path = "files/annotations/gold_set/final"
    arb_prefix = "arbit."

    def arb_name_to_gen_name(name: str) -> str:
        # swap the first dot-separated component of the name for 'annot'
        return '.'.join(['annot'] + name.split('.')[1:])

    arb_names = [fn for fn in os.listdir(arb_dir_path) if fn.endswith(".csv")]
    # list the generation directory once (as a set) instead of once per candidate
    gen_names = set(os.listdir(gen_dir_path))
    ann_files = []
    for arb_name in arb_names:
        gen_name = arb_name_to_gen_name(arb_name)
        if gen_name in gen_names:
            ann_files.append((os.path.join(arb_dir_path, arb_name),
                              os.path.join(gen_dir_path, gen_name)))

    # prepare worker anonymization (dataset-wide)
    anonymization: Dict[str, str] = get_anonymization(all_worker_ids)
    for arb_fn, gen_fn in ann_files:
        arb_df = read_annot_csv(arb_fn)
        gen_df = read_annot_csv(gen_fn)
        # combine arb with (false,false) predicates from gen
        combined_df = combine_to_final_annot(arb_df=arb_df, gen_df=gen_df)
        # make internal aesthetic modifications in the DataFrame
        final_df = convert_to_final_annot(combined_df, anonymization)
        # save: remove the arbitration prefix and put the new one.
        # str.lstrip() strips a *set of characters*, not a prefix, and would
        # also eat leading letters of the remaining name (e.g. "train" -> "n"),
        # so the prefix is removed explicitly.
        fn = os.path.basename(arb_fn)
        if fn.startswith(arb_prefix):
            fn = fn[len(arb_prefix):]
        dest_fn = os.path.join(dest_path, 'annot.final.' + fn)
        save_annot_csv(final_df, dest_fn)
示例#2
0
def generate_pruned_dupl_annot() -> NoReturn:
    """
    Build the final wikinews.dev.5 annotation file from its "dupl" sources.

    Reads the generation and arbitration "dupl" CSVs, passes both to
    ``prune_duplicated_annot`` and saves the returned DataFrame to the
    final gold-set directory.
    """
    generation_df = read_annot_csv(
        "files/annotations/gold_set/generation/corrected_filtered/annot.dupl.wikinews.dev.5.csv")
    arbitration_df = read_annot_csv(
        "files/annotations/gold_set/arbitration/arbit.dupl.wikinews.dev.5.csv")
    save_annot_csv(
        prune_duplicated_annot(generation_df, arbitration_df),
        "files/annotations/gold_set/final/annot.final.wikinews.dev.5.csv")
示例#3
0
def main(proposed_path: str, reference_path: str, sentences_path: str):
    """
    Print the evaluation of a proposed annotation file against a reference one.

    :param proposed_path: path of the annotation CSV to evaluate
    :param reference_path: path of the annotation CSV used as reference
    :param sentences_path: optional path of a sentences CSV; when truthy it is
    read and mapped from ``qasrl_id`` to the whitespace-split ``tokens``
    """
    # NOTE: sent_map is built here but is not used further in this function
    sent_map = None
    if sentences_path:
        sentences = read_csv(sentences_path)
        ids = sentences.qasrl_id
        token_lists = sentences.tokens.apply(str.split)
        sent_map = dict(zip(ids, token_lists))

    proposed_df = read_annot_csv(proposed_path)
    reference_df = read_annot_csv(reference_path)
    print_system_evaluation(proposed_df, reference_df)
示例#4
0
def postprocess_annotation_files(
        orig_dir: str,
        dest_dir: str,
        process_annot_func: Callable[[pd.DataFrame], pd.DataFrame],
        file_name_modification_func: Callable[[str], str] = lambda s: s
) -> NoReturn:
    """
    Apply a processing function to every annotation CSV of a directory.

    :param orig_dir: Directory from which to take the annotations to process (input)
    :param dest_dir: Directory to which the processed annotation files are exported
    :param process_annot_func: a function that gets an annot_df and returns a
    processed (i.e. corrected or changed, to some aspect) annot_df
    :param file_name_modification_func: maps an annotation file-name in the
    source dir to its file-name in the destination dir (identity by default)
    :return:
    """
    csv_names = [name for name in os.listdir(orig_dir)
                 if name.endswith(".csv")]
    for csv_name in csv_names:
        source_df = read_annot_csv(os.path.join(orig_dir, csv_name))
        processed_df = process_annot_func(source_df)
        # export under the (possibly modified) original name, in the destination folder
        dest_fn = os.path.join(dest_dir,
                               file_name_modification_func(csv_name))
        save_annot_csv(processed_df, dest_fn)
        print(f"exported annotations to {dest_fn}")
示例#5
0
def fix_annot_with_corrected(
        orig_annot_fn: str,
        corrected_annot_fn: str,
        dest_dir: str = "files/annotations/production/corrected") -> NoReturn:
    """
    Apply corrections to an annotation file and export the result.

    :param orig_annot_fn: path of the original annotation CSV
    :param corrected_annot_fn: path of the CSV holding the corrected annotations
    :param dest_dir: directory to export to; the original file name is kept
    :return:
    """
    original = read_annot_csv(orig_annot_fn)
    corrections = read_annot_csv(corrected_annot_fn)
    merged = replace_some_annotations(original, corrections)
    # besides the re-annotation correction, drop the rows whose prompt is
    # flagged as invalid
    flagged = find_invalid_prompts(merged)
    valid_rows = flagged[~flagged.invalid_prompt]
    # remove the helper columns before exporting
    helper_columns = ["corrected_verb_form", "invalid_prompt"]
    final_df = valid_rows.drop(helper_columns, axis=1)
    # same file name as the original, but in the destination folder
    dest_fn = os.path.join(dest_dir, os.path.basename(orig_annot_fn))
    save_annot_csv(final_df, dest_fn)
示例#6
0
def fix_annot_with_nmr_blacklist(orig_annot_fn: str,
                                 dest_dir: str) -> NoReturn:
    """
    Pass an annotation file through ``remove_NMR_cases_from_annotations``
    and export the result.

    :param orig_annot_fn: path of the annotation CSV to filter
    :param dest_dir: directory to export to; the original file name is kept
    :return:
    """
    without_nmr = remove_NMR_cases_from_annotations(
        read_annot_csv(orig_annot_fn))
    # same file name as the original, but in the destination folder
    target_fn = os.path.join(dest_dir, os.path.basename(orig_annot_fn))
    save_annot_csv(without_nmr, target_fn)
示例#7
0
def qanom_csv_file_to_jsonl(qanom_csv_fn: str, dest_dir: str) -> NoReturn:
    """
    Convert an annotation CSV file into a ``.jsonl`` file.

    The annotations are grouped by ``qasrl_id``; each group is converted with
    ``sentence_df_to_sentence_jsonl_dict`` and the resulting objects are
    dumped to ``dest_dir`` under the source file name with its last extension
    replaced by ``.jsonl``.

    :param qanom_csv_fn: path of the annotation CSV to convert
    :param dest_dir: directory in which the ``.jsonl`` file is written
    :return:
    """
    annot_df = read_annot_csv(qanom_csv_fn)
    # lazy generator: it is consumed by jsonl.dump below
    sentences_dicts = (
        sentence_df_to_sentence_jsonl_dict(sentence_df)
        for _, sentence_df in annot_df.groupby('qasrl_id'))
    # save jsonl in destination; splitext keeps the whole name when the
    # source file has no extension (instead of producing a bare ".jsonl")
    orig_name = os.path.basename(qanom_csv_fn)
    new_name = os.path.splitext(orig_name)[0] + ".jsonl"
    dest_fn = os.path.join(dest_dir, new_name)
    # context manager guarantees the output file is flushed and closed
    with open(dest_fn, "w") as fout:
        jsonl.dump(sentences_dicts, fout)
示例#8
0
def generate_final_train_annotations() -> NoReturn:
    """
    Produce the final train-set annotation files.

    Every ``.csv`` file of the filtered train-set directory is converted with
    ``convert_to_final_annot`` (worker-id anonymization and CSV column
    adjustments) and saved under the same name in the final directory.
    """
    source_dir = "files/annotations/train_set/filtered"
    target_dir = "files/annotations/train_set/final"
    csv_names = [name for name in os.listdir(source_dir)
                 if name.endswith('.csv')]
    # a single, dataset-wide worker anonymization shared by all files
    worker_aliases: Dict[str, str] = get_anonymization(all_worker_ids)
    for csv_name in csv_names:
        annotations = read_annot_csv(os.path.join(source_dir, csv_name))
        # internal aesthetic modifications of the DataFrame
        finalized = convert_to_final_annot(annotations, worker_aliases)
        save_annot_csv(finalized, os.path.join(target_dir, csv_name))
示例#9
0
def get_worker_statistics_from_file(anot_fn):
    """
    Read the annotation CSV at ``anot_fn`` and return the result of
    ``get_worker_statistics`` on it.
    """
    annotations = read_annot_csv(anot_fn)
    return get_worker_statistics(annotations)
示例#10
0
def main_iaa_per_worker(annotation_path: str):
    """
    Print the number of annotation rows per worker and run the per-worker
    inter-annotator-agreement evaluation.

    :param annotation_path: path of the annotation CSV to evaluate
    """
    annotations = decode_qasrl(read_annot_csv(annotation_path))
    rows_per_worker = annotations.worker_id.value_counts()
    print(rows_per_worker)
    evaluate_per_worker_iaa(annotations)
示例#11
0
def main(annotation_path: str):
    """
    Print the number of annotation rows per worker and run the
    inter-generator agreement evaluation in verbose mode.

    :param annotation_path: path of the annotation CSV to evaluate
    """
    # original annotations, multiple generation tasks per predicate
    annotations = decode_qasrl(read_annot_csv(annotation_path))
    rows_per_worker = annotations.worker_id.value_counts()
    print(rows_per_worker)
    evaluate_inter_generator_agreement(annotations, verbose=True)