def generate_final_annotation_files() -> NoReturn:
    """
    Generate the final gold annotation files.

    For every ``.csv`` file in the arbitration directory that has a matching
    file in the generation directory (same name, with the first dot-separated
    component replaced by ``annot``):

    1. take the arbitration annotations and add the predicates with
       isVerbal==false,false from generation (that haven't been sent to
       consolidation) - done by ``combine_to_final_annot``;
    2. anonymize worker-id and adjust CSV columns - done by
       ``convert_to_final_annot`` with a dataset-wide anonymization mapping;
    3. save the result in the destination directory under the name
       ``annot.final.<arbitration file name without its "arbit." prefix>``.

    :return:
    """
    gen_dir_path = "files/annotations/gold_set/generation/corrected_filtered"
    arb_dir_path = "files/annotations/gold_set/arbitration"
    dest_path = "files/annotations/gold_set/final"
    arb_prefix = "arbit."

    def arb_name_to_gen_name(name: str) -> str:
        # replace the first dot-separated component of the name with 'annot'
        return '.'.join(['annot'] + name.split('.')[1:])

    arb_names = [fn for fn in os.listdir(arb_dir_path) if fn.endswith(".csv")]
    # List the generation directory only once (instead of once per candidate
    # file), and only when there is at least one arbitration file to match.
    gen_names = set(os.listdir(gen_dir_path)) if arb_names else set()
    ann_files = [
        (os.path.join(arb_dir_path, fn),
         os.path.join(gen_dir_path, arb_name_to_gen_name(fn)))
        for fn in arb_names
        if arb_name_to_gen_name(fn) in gen_names
    ]

    # prepare worker anonymization (dataset-wide)
    anonymization: Dict[str, str] = get_anonymization(all_worker_ids)

    for arb_fn, gen_fn in ann_files:
        arb_df = read_annot_csv(arb_fn)
        gen_df = read_annot_csv(gen_fn)
        # combine arb with (false,false) predicates from gen
        combined_df = combine_to_final_annot(arb_df=arb_df, gen_df=gen_df)
        # make internal aesthetic modifications in the DataFrame
        final_df = convert_to_final_annot(combined_df, anonymization)

        # save: remove the arbitration prefix and put the new one.
        # NOTE: str.lstrip("arbit.") must not be used here - it strips any
        # leading run of the characters {a, r, b, i, t, .}, so a name such as
        # "arbit.train.csv" would be mangled into "n.csv".
        fn = os.path.basename(arb_fn)
        if fn.startswith(arb_prefix):
            fn = fn[len(arb_prefix):]
        fn = 'annot.final.' + fn
        dest_fn = os.path.join(dest_path, fn)
        save_annot_csv(final_df, dest_fn)
def postprocess_annotation_files(
        orig_dir: str,
        dest_dir: str,
        process_annot_func: Callable[[
            pd.DataFrame,
        ], pd.DataFrame],
        file_name_modification_func: Callable[[
            str,
        ], str] = lambda s: s) -> NoReturn:
    """
    Apply a processing function to every ``.csv`` annotation file of a
    directory and export the results to another directory.

    :param orig_dir: Directory from which to take the annotations to process (input)
    :param dest_dir: Directory to which the processed annotation files are exported
    :param process_annot_func: a function that gets an annot_df and returns a
        processed (i.e. corrected or changed, to some aspect) annot_df
    :param file_name_modification_func: maps an annotation file-name in the
        source directory to its file-name in the destination directory
        (identity by default)
    :return:
    """
    # The directory listing is taken once, before any file is exported.
    csv_paths = []
    for entry in os.listdir(orig_dir):
        if entry.endswith(".csv"):
            csv_paths.append(os.path.join(orig_dir, entry))

    for source_path in csv_paths:
        processed_df = process_annot_func(read_annot_csv(source_path))
        # export under the (possibly modified) original name, in the destination folder
        source_name = os.path.basename(source_path)
        dest_fn = os.path.join(dest_dir, file_name_modification_func(source_name))
        save_annot_csv(processed_df, dest_fn)
        print(f"exported annotations to {dest_fn}")
def generate_pruned_dupl_annot() -> NoReturn:
    """
    Build the final annotation file of the duplicated wikinews.dev.5 batch.

    Reads the generation and the arbitration annotation files of the batch,
    passes both to ``prune_duplicated_annot`` and saves the returned DataFrame
    as the final annotation file.

    :return:
    """
    out_fn = "files/annotations/gold_set/final/annot.final.wikinews.dev.5.csv"
    # generation annotations are read first, then the arbitration ones
    generation_df = read_annot_csv(
        "files/annotations/gold_set/generation/corrected_filtered/annot.dupl.wikinews.dev.5.csv")
    arbitration_df = read_annot_csv(
        "files/annotations/gold_set/arbitration/arbit.dupl.wikinews.dev.5.csv")
    save_annot_csv(prune_duplicated_annot(generation_df, arbitration_df), out_fn)
def fix_annot_with_nmr_blacklist(orig_annot_fn: str, dest_dir: str) -> NoReturn:
    """
    Filter an annotation file with ``remove_NMR_cases_from_annotations`` and
    export the result to another directory.

    :param orig_annot_fn: path of the annotation CSV file to filter (input)
    :param dest_dir: directory to which the filtered file is exported,
        keeping the original file name
    :return:
    """
    filtered_df = remove_NMR_cases_from_annotations(read_annot_csv(orig_annot_fn))
    # same file name as the input, placed in the destination folder
    file_name = os.path.basename(orig_annot_fn)
    save_annot_csv(filtered_df, os.path.join(dest_dir, file_name))
def jsonl_file_to_csv(qasrl_v2_fn: str, dest_dir: str) -> NoReturn:
    """
    Convert a JSON-lines annotation file into an annotation CSV file.

    Every non-blank line of the input is parsed as JSON and converted to a
    DataFrame by ``sentence_json_to_df``; the per-line DataFrames are
    concatenated, passed through ``decode_qasrl`` and saved in ``dest_dir``
    under the input file name with its last extension replaced by ``.csv``.

    :param qasrl_v2_fn: path of the JSON-lines file to convert (input)
    :param dest_dir: directory to which the CSV file is exported
    :return:
    :raises ValueError: if the input file contains no JSON records
    """
    # NOTE(review): the file is read as latin-1 and the combined DataFrame goes
    # through decode_qasrl before saving - presumably decode_qasrl repairs the
    # text encoding; confirm before changing the encoding here.
    with open(qasrl_v2_fn, encoding='latin-1') as f:
        # skip blank lines, which are not valid JSON documents
        sentence_dfs = [
            sentence_json_to_df(json.loads(jline)) for jline in f
            if jline.strip()
        ]
    if not sentence_dfs:
        # pd.concat would raise an uninformative ValueError on an empty list
        raise ValueError(f"no JSON records found in {qasrl_v2_fn}")
    annot_df = pd.concat(sentence_dfs, ignore_index=True, sort=False)

    # save df in destination; splitext keeps the whole name when the input
    # file has no extension (instead of producing a bare ".csv")
    orig_name = os.path.basename(qasrl_v2_fn)
    new_name = os.path.splitext(orig_name)[0] + ".csv"
    dest_fn = os.path.join(dest_dir, new_name)
    save_annot_csv(decode_qasrl(annot_df), dest_fn)
def fix_annot_with_corrected(
        orig_annot_fn: str,
        corrected_annot_fn: str,
        dest_dir: str = "files/annotations/production/corrected") -> NoReturn:
    """
    Apply re-annotation corrections to an annotation file, drop the rows
    flagged as invalid prompts and export the result.

    :param orig_annot_fn: path of the original annotation CSV file (input)
    :param corrected_annot_fn: path of the CSV file holding the corrected annotations
    :param dest_dir: directory to which the corrected file is exported,
        keeping the original file name
    :return:
    """
    original_df = read_annot_csv(orig_annot_fn)
    corrections_df = read_annot_csv(corrected_annot_fn)
    # re-annotation correction, then flagging of currently invalid prompts
    flagged_df = find_invalid_prompts(
        replace_some_annotations(original_df, corrections_df))
    # keep only the rows that are not flagged as invalid prompts
    valid_df = flagged_df[~flagged_df.invalid_prompt]
    # the helper columns are not part of the exported file
    final_df = valid_df.drop(columns=["corrected_verb_form", "invalid_prompt"])
    # export with the same naming as the original (but in the destination folder)
    target_fn = os.path.join(dest_dir, os.path.basename(orig_annot_fn))
    save_annot_csv(final_df, target_fn)
def generate_final_train_annotations() -> NoReturn:
    """
    Generate the final train-set annotation files.

    Every ``.csv`` file of the filtered train-set directory is converted with
    ``convert_to_final_annot`` (worker-id anonymization and CSV column
    adjustments) and saved under the same file name in the final train-set
    directory.

    :return:
    """
    orig_train_dir_path = "files/annotations/train_set/filtered"
    dest_path = "files/annotations/train_set/final"

    source_paths = []
    for entry in os.listdir(orig_train_dir_path):
        if entry.endswith('.csv'):
            source_paths.append(os.path.join(orig_train_dir_path, entry))

    # worker anonymization is prepared once, dataset-wide
    anonymization: Dict[str, str] = get_anonymization(all_worker_ids)

    for source_path in source_paths:
        # make internal aesthetic modifications in the DataFrame
        final_df = convert_to_final_annot(read_annot_csv(source_path), anonymization)
        # save under the same file name, in the destination folder
        target_path = os.path.join(dest_path, os.path.basename(source_path))
        save_annot_csv(final_df, target_path)