def get_sentences(tag, is_training, debug=False):
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        # d_list = d_list[:10]
        d_list = d_list[:50]
        # d_list = d_list[:200]

    doc_results = common.load_jsonl(
        config.RESULT_PATH / f"doc_retri_results/fever_results/merged_doc_results/m_doc_{tag}.jsonl")
    doc_results_dict = list_dict_data_tool.list_to_dict(doc_results, 'id')
    fever_db_cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_full_wiki_document_forward_item(doc_results_dict, d_list, is_training=is_training,
                                                          db_cursor=fever_db_cursor)

    return forward_items

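# The tag -> split-file dispatch above is repeated in several builders in this module
# (e.g. get_paragraph_forward_pair, get_inference_pair, get_nli_pair, get_sentence_forward_pair).
# A minimal sketch of how it could be factored out; it only relies on the config constants
# already used above and is not part of the original code:
def load_fever_split(tag):
    split_files = {
        'dev': config.FEVER_DEV,
        'train': config.FEVER_TRAIN,
        'test': config.FEVER_TEST,
    }
    if tag not in split_files:
        raise ValueError(f"Tag:{tag} not supported.")
    return common.load_jsonl(split_files[tag])
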
def inspect_upstream_eval():
    is_training = True
    debug_mode = True
    d_list = common.load_jsonl(config.OPEN_SQUAD_DEV_GT)
    in_file_name = config.PRO_ROOT / 'saved_models/05-12-08:44:38_mtr_open_qa_p_level_(num_train_epochs:3)/i(2000)|e(2)|squad|top10(0.6909176915799432)|top20(0.7103122043519394)|seed(12)_eval_results.jsonl'
    cur_eval_results_list = common.load_jsonl(in_file_name)

    top_k = 10
    filter_value = 0.1

    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)
    match_type = 'string'

    if debug_mode:
        d_list = d_list[:100]
        id_set = set([item['question'] for item in d_list])
        cur_eval_results_list = [item for item in cur_eval_results_list if item['qid'] in id_set]

    d_o_dict = list_dict_data_tool.list_to_dict(d_list, 'question')
    copied_d_o_dict = copy.deepcopy(d_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_d_o_dict,
                                                          'qid', 'fid', check=True)

    cur_results_dict_top10 = od_sample_utils.select_top_k_and_to_results_dict(copied_d_o_dict,
                                                                              score_field_name='prob',
                                                                              top_k=top_k,
                                                                              filter_value=filter_value)

    forward_example_items = build_open_qa_forword_item(cur_results_dict_top10, d_list, is_training,
                                                       t_cursor, match_type)
    print(forward_example_items)

def p_eval():
    dev_list = common.load_jsonl(config.FEVER_DEV)
    # common.save_jsonl(cur_eval_results_list, f"fever_p_level_{tag}_results.jsonl")
    cur_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_dev_results.jsonl")

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    copied_dev_d_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)

    cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                              score_field_name='prob',
                                                              top_k=5, filter_value=0.005)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_dev_d_list, cur_results_dict_th0_5,
                                                                   'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list, dev_list, max_evidence=5)

    score_05 = {
        'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1,
    }

    print(score_05)

def post_process():
    from pathlib import Path

    input_file = '/home/easonnie/projects/FunEver/results/pipeline_r/2018_07_24_11:07:41_r(new_model_v1_2_for_realtest)_scaled_0.05_withlb/balance.jsonl'
    nli_results = common.load_jsonl(input_file)

    print("Post Processing enhancement")
    delete_unused_evidence(nli_results)

    print("Deleting Useless Evidence")
    current_pipeline_dir = Path(
        '/home/easonnie/projects/FunEver/results/pipeline_r/2018_07_24_11:07:41_r(new_model_v1_2_for_realtest)_scaled_0.05_withlb')
    dev_sent_file_1 = current_pipeline_dir / "dev_sent_score_1_shared_task_test.jsonl"
    dev_sent_file_2 = current_pipeline_dir / "dev_sent_score_2_shared_task_test.jsonl"
    dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
    dev_sent_list_2 = common.load_jsonl(dev_sent_file_2)

    print("Appending 1 of second Evidence")
    nli_results = simi_sampler.threshold_sampler_insure_unique_merge(nli_results, dev_sent_list_2, 0.9,
                                                                     top_n=5, add_n=1)
    delete_unused_evidence(nli_results)

    # High tolerance enhancement!
    print("Final High Tolerance Enhancement")
    print("Appending all of first Evidence")
    nli_results = simi_sampler.threshold_sampler_insure_unique_merge(nli_results, dev_sent_list_1, -1,
                                                                     top_n=5, add_n=100)
    delete_unused_evidence(nli_results)

    # if build_submission:
    output_file = current_pipeline_dir / "predictions.jsonl"
    build_submission_file(nli_results, output_file)

def inspect_upstream_eval_v1():
    bert_model_name = "bert-base-uncased"
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    do_lower_case = True

    max_pre_context_length = 315
    max_query_length = 64
    doc_stride = 128
    is_training = True
    debug_mode = True

    d_list = common.load_jsonl(config.OPEN_SQUAD_DEV_GT)
    in_file_name = config.PRO_ROOT / 'saved_models/05-12-08:44:38_mtr_open_qa_p_level_(num_train_epochs:3)/i(2000)|e(2)|squad|top10(0.6909176915799432)|top20(0.7103122043519394)|seed(12)_eval_results.jsonl'
    cur_eval_results_list = common.load_jsonl(in_file_name)

    top_k = 10
    filter_value = 0.1
    match_type = 'string'

    tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)

    fitems_dict, read_fitems_list, _ = get_open_qa_item_with_upstream_paragraphs(d_list, cur_eval_results_list,
                                                                                 is_training, tokenizer,
                                                                                 max_pre_context_length,
                                                                                 max_query_length, doc_stride,
                                                                                 debug_mode, top_k, filter_value,
                                                                                 match_type)

    print(len(read_fitems_list))
    print(len(fitems_dict))

def show_nli_binned_plot(y_axis_value):
    dataset_name = 'Natural Language Inference'
    task_name = 'uncertainty_nli'
    snli_data_file = config.CHAOSNLI_SNLI
    mnli_data_file = config.CHAOSNLI_MNLI
    model_pred_file = config.MODEL_PRED_NLI

    d_list_snli = common.load_jsonl(snli_data_file)
    d_list_mnli = common.load_jsonl(mnli_data_file)

    collected_data_dict = {}
    collected_data_dict_snli = list_dict_data_tool.list_to_dict(d_list_snli, key_fields='uid')
    collected_data_dict_mnli = list_dict_data_tool.list_to_dict(d_list_mnli, key_fields='uid')
    collected_data_dict.update(collected_data_dict_snli)
    collected_data_dict.update(collected_data_dict_mnli)

    model_prediction_dict = common.load_json(model_pred_file)

    bin_num = 5
    split_type = 'quantile'
    column_name = 'ChaosNLI-(S+M)'

    bined_item = build_entropy_bins(collected_data_dict, bin_num, type=split_type)
    bined_item_results = calculate_per_bin_results_simplify(bined_item, model_prediction_dict,
                                                            task_name=task_name)
    plot_histogram(bined_item_results, y_axis_value, column_name)

def get_paragraph_forward_pair(tag, ruleterm_doc_results, is_training, debug=False, ignore_non_verifiable=False):
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        d_list = d_list[:100]
        ruleterm_doc_results = ruleterm_doc_results[:100]

    ruleterm_doc_results_dict = list_dict_data_tool.list_to_dict(ruleterm_doc_results, 'id')
    db_cursor = fever_db.get_cursor()
    fitems = build_full_wiki_document_forward_item(ruleterm_doc_results_dict, d_list, is_training, db_cursor,
                                                   ignore_non_verifiable)

    return fitems

def build_anli(path: Path, round=1, version='1.0'):
    data_root_path = (path / "anli")
    if not data_root_path.exists():
        data_root_path.mkdir()

    round_tag = str(round)

    o_train = common.load_jsonl(config.PRO_ROOT / f"data/anli_v{version}/R{round_tag}/train.jsonl")
    o_dev = common.load_jsonl(config.PRO_ROOT / f"data/anli_v{version}/R{round_tag}/dev.jsonl")
    o_test = common.load_jsonl(config.PRO_ROOT / f"data/anli_v{version}/R{round_tag}/test.jsonl")

    d_train = a_nli2std_format(o_train)
    d_dev = a_nli2std_format(o_dev)
    d_test = a_nli2std_format(o_test)

    print(f"ANLI (R{round_tag}) Train size:", len(d_train))
    print(f"ANLI (R{round_tag}) Dev size:", len(d_dev))
    print(f"ANLI (R{round_tag}) Test size:", len(d_test))

    if not (data_root_path / f"r{round_tag}").exists():
        (data_root_path / f"r{round_tag}").mkdir()

    common.save_jsonl(d_train, data_root_path / f"r{round_tag}" / 'train.jsonl')
    common.save_jsonl(d_dev, data_root_path / f"r{round_tag}" / 'dev.jsonl')
    common.save_jsonl(d_test, data_root_path / f"r{round_tag}" / 'test.jsonl')

def single_process_fever_with_dict(start=0, end=None, tag='dev'):
    task_name = 'fever'
    debug = False
    top_k = 20
    query_fieldname = 'claim'
    id_fieldname = 'id'
    debug_name = 'debug' if debug else ""

    g_score_dict = dict()
    g_score_dict = load_from_file(g_score_dict,
                                  config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    # Important: set the start/end numbers here !!!
    print("Total length:", len(d_list))
    # start, end = 0, len(d_list)
    # Important End !!!

    print(f"Task:{task_name}, Tag:{tag}, TopK:{top_k}, Start/End:{start}/{end}")
    d_list = d_list[start:end]
    print("Data length:", len(d_list))

    if debug:
        d_list = d_list[:10]
        start, end = 0, 10

    print("Data length (Pos-filtering):", len(d_list))

    r_item_list = []
    incr_file = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_incr_({start},{end})_{debug_name}.jsonl"
    if incr_file.is_file():
        print("Warning: incremental save file exists.")

    save_path: Path = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_({start},{end})_{debug_name}.jsonl"
    if save_path.is_file():
        print("Warning: save file exists.")

    with open(incr_file, mode='w', encoding='utf-8') as out_f:
        process_func = partial(process_fever_item_with_score_dict, top_k=top_k,
                               query_field=query_fieldname, id_field=id_fieldname,
                               global_score_dict=g_score_dict)

        for item in tqdm(d_list, total=len(d_list)):
            r_item = process_func(item)
            r_item_list.append(r_item)
            out_f.write(json.dumps(item) + '\n')
            out_f.flush()

    print(len(r_item_list))
    common.save_jsonl(r_item_list, save_path)

def multi_process(start=0, end=None, tag='dev'):
    task_name = 'fever'
    debug = False
    top_k = 20
    num_process = 3
    query_fieldname = 'claim'
    id_fieldname = 'id'
    debug_name = 'debug' if debug else ""

    # print(multiprocessing.cpu_count())
    print("CPU Count:", multiprocessing.cpu_count())

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    print("Total length:", len(d_list))

    # Important: set the start/end numbers here !!!
    # start, end = 0, None
    # Important End !!!

    print(f"Task:{task_name}, Tag:{tag}, TopK:{top_k}, Start/End:{start}/{end}")
    d_list = d_list[start:end]
    print("Data length:", len(d_list))

    if debug:
        d_list = d_list[:10]
        start, end = 0, 10

    print("Data length (Pos-filtering):", len(d_list))

    r_list = []
    incr_file = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_incr_({start},{end})_{debug_name}.jsonl"
    if incr_file.is_file():
        print("Warning: incremental save file exists.")

    save_path: Path = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_({start},{end})_{debug_name}.jsonl"
    if save_path.is_file():
        print("Warning: save file exists.")

    with open(incr_file, mode='w', encoding='utf-8') as out_f:
        with Pool(processes=num_process, maxtasksperchild=1000) as pool:
            process_func = partial(process_fever_item_multiprocessing, top_k=top_k,
                                   query_field=query_fieldname, id_field=id_fieldname)

            p_item_list = pool.imap_unordered(process_func, d_list)

            for item in tqdm(p_item_list, total=len(d_list)):
                r_list.append(item)
                out_f.write(json.dumps(item) + '\n')
                out_f.flush()

    print(len(r_list))
    common.save_jsonl(r_list, save_path)

def get_inference_pair(tag, is_training, sent_result_path, debug_num=None, evidence_filtering_threshold=0.01):
    # sent_result_path = ""
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug_num is not None:
        # d_list = d_list[:10]
        d_list = d_list[:50]
        # d_list = d_list[:200]

    d_dict = list_dict_data_tool.list_to_dict(d_list, 'id')

    threshold_value = evidence_filtering_threshold
    # sent_list = common.load_jsonl(
    #     config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/train_sent_results.jsonl")
    # sent_list = common.load_jsonl(
    #     config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/i(5000)|e(0)|s01(0.9170917091709171)|s05(0.8842384238423843)|seed(12)_dev_sent_results.json")
    # debug_num = None if not debug else 2971
    # debug_num = None

    if isinstance(sent_result_path, Path):
        sent_list = common.load_jsonl(sent_result_path, debug_num)
    elif isinstance(sent_result_path, list):
        sent_list = sent_result_path
    else:
        raise ValueError(f"{sent_result_path} is not of a valid argument type which should be [list, Path].")

    list_dict_data_tool.append_subfield_from_list_to_dict(sent_list, d_dict,
                                                          'oid', 'fid', check=True)

    filtered_sent_dict = select_top_k_and_to_results_dict(d_dict, top_k=5,
                                                          threshold=threshold_value)
    list_dict_data_tool.append_item_from_dict_to_list(d_list, filtered_sent_dict,
                                                      'id', ['predicted_evidence', 'predicted_scored_evidence'])

    fever_db_cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_nli_forward_item(d_list, is_training=is_training, db_cursor=fever_db_cursor)

    return forward_items, d_list

def prepare_forward_data(dataset_name, tag, is_training, upstream_top_k=20, distant_gt_top_k=2,
                         down_sample_ratio=None, debug=False):
    if dataset_name == 'webq' and tag == 'test':
        gt_d_list_path = config.OPEN_WEBQ_TEST_GT
    elif dataset_name == 'webq' and tag == 'train':
        gt_d_list_path = config.OPEN_WEBQ_TRAIN_GT
    elif dataset_name == 'curatedtrec' and tag == 'test':
        gt_d_list_path = config.OPEN_CURATEDTERC_TEST_GT
    elif dataset_name == 'curatedtrec' and tag == 'train':
        gt_d_list_path = config.OPEN_CURATEDTERC_TRAIN_GT
    elif dataset_name == 'squad' and tag == 'dev':
        gt_d_list_path = config.OPEN_SQUAD_DEV_GT
    elif dataset_name == 'squad' and tag == 'train':
        gt_d_list_path = config.OPEN_SQUAD_TRAIN_GT
    elif dataset_name == 'wikimovie' and tag == 'test':
        gt_d_list_path = config.OPEN_WIKIM_TEST_GT
    elif dataset_name == 'wikimovie' and tag == 'train':
        gt_d_list_path = config.OPEN_WIKIM_TRAIN_GT
    else:
        raise NotImplementedError()

    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)

    # debug = False
    # upstream_top_k = 20
    # distant_gt_top_k = 2
    # down_sample_ratio = None

    if dataset_name != 'wikimovie':
        upstream_d_list_before_filter = common.load_jsonl(
            config.PRO_ROOT / f"data/p_{dataset_name}/tf_idf_p_level/{dataset_name}_{tag}_para_tfidf.jsonl")
    else:
        upstream_d_list_before_filter = common.load_jsonl(
            config.PRO_ROOT / f"data/p_{dataset_name}/kwm_p_level/{dataset_name}_{tag}_kwm_tfidf.jsonl")

    if debug:
        upstream_d_list_before_filter = upstream_d_list_before_filter[:50]

    upstream_d_list = top_k_filter_score_list(upstream_d_list_before_filter, top_k=upstream_top_k)
    upstream_d_dict = list_dict_data_tool.list_to_dict(upstream_d_list, 'question')

    gt_d_list = common.load_jsonl(gt_d_list_path)
    gt_d_dict = list_dict_data_tool.list_to_dict(gt_d_list, 'question')

    distant_gt_item_list = get_distant_top_k_ground_truth(gt_d_dict, upstream_d_list_before_filter,
                                                          top_k=distant_gt_top_k)
    distant_gt_item_dict = list_dict_data_tool.list_to_dict(distant_gt_item_list, 'qid')

    fitems_list = build_p_level_forward_item(upstream_d_dict, distant_gt_item_dict, upstream_d_list,
                                             is_training, t_cursor)

    if is_training:
        return down_sample_neg(fitems_list, down_sample_ratio)
    else:
        return down_sample_neg(fitems_list, None)

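# Illustrative call (a sketch, not an original entry point): building debug-sized paragraph-level
# forward data for the SQuAD dev split. The argument values are the defaults already used above;
# the demo function name itself is hypothetical.
def _demo_prepare_squad_dev_forward_data():
    fitems = prepare_forward_data('squad', 'dev', is_training=False,
                                  upstream_top_k=20, distant_gt_top_k=2,
                                  down_sample_ratio=None, debug=True)
    print("Number of forward items:", len(fitems))
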
def get_nli_pair(tag, is_training, sent_level_results_list, debug=None, sent_top_k=5, sent_filter_value=0.05):
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        d_list = d_list[:100]

    # sent_dict = list_dict_data_tool.list_to_dict(sent_level_results_list)
    d_dict = list_dict_data_tool.list_to_dict(d_list, 'id')

    if debug:
        id_set = set([item['id'] for item in d_list])
        new_sent_list = []
        for item in sent_level_results_list:
            if item["qid"] in id_set:
                new_sent_list.append(item)
        sent_level_results_list = new_sent_list

    list_dict_data_tool.append_subfield_from_list_to_dict(sent_level_results_list, d_dict,
                                                          'qid', 'fid', check=True)

    filtered_sent_dict = select_top_k_and_to_results_dict(d_dict, score_field_name='prob',
                                                          top_k=sent_top_k, filter_value=sent_filter_value,
                                                          result_field='predicted_evidence')

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(d_list, filtered_sent_dict,
                                                                   'id', ['predicted_evidence',
                                                                          'selected_scored_results'])

    fever_db_cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_nli_forward_item(d_list, is_training=is_training, db_cursor=fever_db_cursor)

    return forward_items, d_list

def eval_ensemble():
    sent_file = config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl"
    dev_sent_filtering_prob = 0.01
    tag = 'dev'
    top_k = 5

    # dev_list = common.load_jsonl(config.FEVER_DEV)
    dev_sent_results_list = common.load_jsonl(sent_file)
    dev_fitems, dev_list = get_nli_pair(tag, is_training=False,
                                        sent_level_results_list=dev_sent_results_list,
                                        debug=False, sent_top_k=top_k,
                                        sent_filter_value=dev_sent_filtering_prob)

    pred_file_list = [
        config.PRO_ROOT / "data/p_fever/fever_nli/04-25-22:02:53_fever_v2_nli_th0.2/ema_i(20000)|e(3)|ss(0.7002700270027002)|ac(0.746024602460246)|pr(0.6141389138913633)|rec(0.8627362736273627)|f1(0.7175148212089147)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT / "data/p_fever/fever_nli/04-26-10:15:39_fever_v2_nli_th0.2/ema_i(14000)|e(2)|ss(0.6991199119911992)|ac(0.7492249224922493)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT / "data/p_fever/fever_nli/04-27-10:03:27_fever_v2_nli_th0.2/ema_i(26000)|e(3)|ss(0.6958695869586958)|ac(0.7447744774477447)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
    ]

    pred_d_list = [common.load_jsonl(file) for file in pred_file_list]
    final_list = ensemble_nli_results(pred_d_list)
    pred_list = final_list

    ema_results_dict = list_dict_data_tool.list_to_dict(pred_list, 'oid')
    copied_dev_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                      'id', 'predicted_label')

    dev_list = common.load_jsonl(config.FEVER_DEV)
    mode = {'standard': True}
    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                    mode=mode, max_evidence=5)
    logging_item = {
        'ss': strict_score, 'ac': acc_score, 'pr': pr, 'rec': rec, 'f1': f1,
    }

    print(logging_item)

def threshold_sampler_insure_unique_merge(org_data_file, full_sent_list, prob_threshold=0.5,
                                          top_n=5, add_n=1):
    """
    Provide samples to the training set by applying a probability threshold
    on the upstream selected sentences.
    """
    if not isinstance(org_data_file, list):
        d_list = common.load_jsonl(org_data_file)
    else:
        d_list = org_data_file

    augmented_dict = dict()

    for sent_item in full_sent_list:
        selection_id = sent_item['selection_id']  # The id for the current selection.
        org_id = int(selection_id.split('<##>')[0])
        remain_str = selection_id.split('<##>')[1]
        if org_id in augmented_dict:
            if remain_str not in augmented_dict[org_id]:
                augmented_dict[org_id][remain_str] = sent_item
        else:
            augmented_dict[org_id] = {remain_str: sent_item}

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            # print("Potential error?")
            cur_predicted_sentids = []
        else:
            cur_predicted_sentids = []  # format: doc_id + c_scorer.SENT_LINE + line_number
            sents = augmented_dict[int(item['id'])].values()
            # Modify the mechanism here to select sentences by some score or label.
            for sent_i in sents:
                if sent_i['prob'] >= prob_threshold:
                    cur_predicted_sentids.append((sent_i['sid'], sent_i['score'],
                                                  sent_i['prob']))  # Important sentences for scaling training. Jul 21.
                # del sent_i['prob']

            cur_predicted_sentids = sorted(cur_predicted_sentids, key=lambda x: -x[1])

        cur_predicted_sentids = cur_predicted_sentids[:add_n]

        # if item['scored_sentids']
        if len(item['predicted_sentids']) >= 5:
            continue
        else:
            item['predicted_sentids'].extend(
                [sid for sid, _, _ in cur_predicted_sentids if sid not in item['predicted_sentids']])
            item['predicted_sentids'] = item['predicted_sentids'][:top_n]
            item['predicted_evidence'] = convert_evidence2scoring_format(item['predicted_sentids'])

        # item['predicted_label'] = item['label']  # give ground truth label

    return d_list

def model_perf_binned(dataset_name, task_name, data_file, model_prediction_file,
                      split_type='quantile', bin_num=5, verbose=True):
    d_list = common.load_jsonl(data_file)
    collected_data_dict = list_dict_data_tool.list_to_dict(d_list, key_fields='uid')
    model_prediction_dict = common.load_json(model_prediction_file)

    bined_item = build_entropy_bins(collected_data_dict, bin_num, type=split_type)
    bined_item_results = calculate_per_bin_results_simplify(bined_item, model_prediction_dict,
                                                            task_name=task_name)

    if verbose:
        print('-' * 60)
        print('Data:', dataset_name)
        for model_name, range_items in bined_item_results.items():
            print('Model: {:20s}'.format(model_name))
            print('\t'.join(['{:18s}'.format('Entropy Range'),
                             '{:15s}'.format('# of Example'),
                             '{:10s}'.format('JSD'),
                             '{:10s}'.format('KL'),
                             '{:10s}'.format('Old Acc.'),
                             '{:10s}'.format('New Acc.')]))
            for range_value, model_item in range_items['bin_results'].items():
                print('\t'.join(['{:5f}-{:5f}'.format(range_value[0], range_value[1]),
                                 '{:15s}'.format(format_number(model_item['total_count'])),
                                 '{:10s}'.format(format_number(model_item['average JS div'])),
                                 '{:10s}'.format(format_number(model_item['average KL div'])),
                                 '{:10s}'.format(format_number(model_item['o_acc'])),
                                 '{:10s}'.format(format_number(model_item['m_acc'])),
                                 ]))
        print('-' * 60)

    return bined_item_results

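# A minimal usage sketch (assumed driver, not in the original file): the combined ChaosNLI report
# in show_nli_binned_plot merges SNLI and MNLI first, so this example only covers the single-file
# case, reusing the config constants referenced above. The demo function name is hypothetical.
def _demo_model_perf_binned_snli():
    model_perf_binned(dataset_name='ChaosNLI-SNLI',
                      task_name='uncertainty_nli',
                      data_file=config.CHAOSNLI_SNLI,
                      model_prediction_file=config.MODEL_PRED_NLI,
                      split_type='quantile', bin_num=5, verbose=True)
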
def experiment_test_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5

    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_test.jsonl")
    g_score_dict = dict()
    load_from_file(g_score_dict,
                   config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    # We need to give gt data None.
    doc_retri_pred_dict = init_results_v8(data_list, None, terms_based_results_list, g_score_dict,
                                          match_filtering_k=match_filtering_k,
                                          term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    common.save_json(doc_retri_pred_dict, "hotpot_test_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(doc_retri_pred_dict,
                                                         multihop_retrieval_top_k=multihop_retrieval_top_k)

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    common.save_json(new_doc_retri_pred_dict, "hotpot_test_doc_retrieval_v8.json")

def spectrum_eval_manual_check():
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-17-12:10:35_mesim_elmo/i(34800)_epoch(5)_dev(0.5563056305630563)_loss(1.6648460462434564)_seed(12)"
    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_15:52:19_r/dev_sent.jsonl"
    IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16:34:19_r/dev_sent.jsonl"
    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16-34-19_r/dev_sent.jsonl"

    dev_sent_result_list = common.load_jsonl(IN_FILE)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    for sc_prob in [0.5, 0.7, 0.8, 0.9, 0.95, 0.98]:
        upstream_dev_list = score_converter_scaled(config.T_FEVER_DEV_JSONL, dev_sent_result_list,
                                                   scale_prob=sc_prob, delete_prob=False)

        dev_fever_data_reader = BasicReader(token_indexers=token_indexers, lazy=lazy)
        complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, upstream_dev_list)
        dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)
        eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
        builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

        print("------------------------------------")
        print("Scaling_prob:", sc_prob)
        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        print(c_scorer.fever_score(builded_dev_data, config.T_FEVER_DEV_JSONL, mode=eval_mode))
        # del upstream_dev_list
        # del complete_upstream_dev_data
        del dev_fever_data_reader
        del dev_instances
        print("------------------------------------")

def score_converter_scaled(org_data_file, full_sent_list, scale_prob=0.5, delete_prob=True):
    """
    :param org_data_file:
    :param full_sent_list: append full_sent_score list to evidence of original data file
    :param scale_prob: probability threshold used to keep sentences (default 0.5)
    :param delete_prob: delete the probability for sanity check
    :return:
    """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']  # The id for the current selection.
        org_id = int(selection_id.split('<##>')[0])
        if org_id in augmented_dict:
            augmented_dict[org_id].append(sent_item)
        else:
            augmented_dict[org_id] = [sent_item]

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            # cur_predicted_sentids = []
            cur_adv_predicted_sentids = []
        else:
            # cur_predicted_sentids = []  # format: doc_id + c_scorer.SENT_LINE + line_number
            cur_adv_predicted_sentids = []
            sents = augmented_dict[int(item['id'])]
            # Modify the mechanism here to select sentences by some score or label.
            for sent_i in sents:
                if sent_i['prob'] >= scale_prob:
                    cur_adv_predicted_sentids.append((sent_i['sid'], sent_i['score'], sent_i['prob']))
                # del sent_i['prob']

            cur_adv_predicted_sentids = sorted(cur_adv_predicted_sentids, key=lambda x: -x[1])

        item['scored_sentids'] = cur_adv_predicted_sentids[:5]  # Important sentences for scaling training. Jul 21.
        item['predicted_sentids'] = [sid for sid, _, _ in item['scored_sentids']][:5]
        item['predicted_evidence'] = convert_evidence2scoring_format(item['predicted_sentids'])
        item['predicted_label'] = item['label']  # give ground truth label

    # Removing all score and prob
    if delete_prob:
        for sent_item in full_sent_list:
            if 'score' in sent_item.keys():
                del sent_item['score']
                del sent_item['prob']

    return d_list

def threshold_sampler_insure_unique(org_data_file, full_sent_list, prob_threshold=0.5, logist_threshold=None,
                                    top_n=5):
    """
    Provide samples to the training set by applying a probability threshold
    on the upstream selected sentences.
    """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict: Dict[int, Dict[str, Dict]] = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']  # The id for the current selection.
        org_id = int(selection_id.split('<##>')[0])
        remain_str = selection_id.split('<##>')[1]
        # doc_id = remain_str.split(c_scorer.SENT_LINE)[0]
        # ln = int(remain_str.split(c_scorer.SENT_LINE)[1])
        if org_id in augmented_dict:
            if remain_str not in augmented_dict[org_id]:
                augmented_dict[org_id][remain_str] = sent_item
            else:
                print("Exist")
        else:
            augmented_dict[org_id] = {remain_str: sent_item}

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            # print("Potential error?")
            cur_predicted_sentids = []
        else:
            cur_predicted_sentids = []  # format: doc_id + c_scorer.SENT_LINE + line_number
            sents = augmented_dict[int(item['id'])].values()
            # Modify the mechanism here to select sentences by some score or label.
            for sent_i in sents:
                if sent_i['prob'] >= prob_threshold:
                    cur_predicted_sentids.append((sent_i['sid'], sent_i['score'],
                                                  sent_i['prob']))  # Important sentences for scaling training. Jul 21.
                # del sent_i['prob']

            cur_predicted_sentids = sorted(cur_predicted_sentids, key=lambda x: -x[1])

        item['scored_sentids'] = cur_predicted_sentids[:top_n]  # Important sentences for scaling training. Jul 21.
        item['predicted_sentids'] = [sid for sid, _, _ in item['scored_sentids']][:top_n]
        item['predicted_evidence'] = convert_evidence2scoring_format(item['predicted_sentids'])
        # item['predicted_label'] = item['label']  # give ground truth label

    return d_list

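# For reference, each element of full_sent_list handled above is expected to carry at least
# 'selection_id', 'sid', 'score' and 'prob'. A hypothetical record, with made-up values,
# would look roughly like this ('selection_id' is the claim id and the sentence id joined
# by '<##>'; 'sid' is doc_id + c_scorer.SENT_LINE + line_number):
#
#     {'selection_id': '137334' + '<##>' + 'Some_Wiki_Page' + c_scorer.SENT_LINE + '0',
#      'sid': 'Some_Wiki_Page' + c_scorer.SENT_LINE + '0',
#      'score': 3.71,
#      'prob': 0.93}
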
def hidden_eval_fever():
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-18-21:07:28_m_esim_wn_elmo_sample_fixed/i(57000)_epoch(8)_dev(0.5755075507550755)_loss(1.7175163737963839)_seed(12)"
    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=360)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    # dev_biterator = BasicIterator(batch_size=batch_size * 2)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size, 1024 + 300),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    print("Model Max length:", model.max_l)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    for item in builded_dev_data:
        del item['label']

    print(c_scorer.fever_score(builded_dev_data, common.load_jsonl(config.T_FEVER_DEV_JSONL), mode=eval_mode))

def eval_nli():
    dev_list = common.load_jsonl(config.FEVER_DEV)
    # prediction_file = config.PRO_ROOT / "data/p_fever/fever_nli/04-25-22:02:53_fever_v2_nli_th0.2/ema_i(20000)|e(3)|ss(0.7002700270027002)|ac(0.746024602460246)|pr(0.6141389138913633)|rec(0.8627362736273627)|f1(0.7175148212089147)|seed(12)/nli_dev_cp_results_th0.2.jsonl"
    # prediction_file = config.PRO_ROOT / "saved_models/04-15-00:15:59_fever_v1_nli/i(18000)|e(2)|ss(0.6154615461546155)|ac(0.6701170117011701)|pr(0.26657540754071885)|rec(0.8852385238523852)|f1(0.40975857963668794)|seed(12)_dev_nli_results.json"
    prediction_file = config.PRO_ROOT / "data/p_fever/non_sent_level/ema_i(32000)|e(4)|ss(0.5592059205920592)|ac(0.6104110411041104)|pr(0.2638851385138135)|rec(0.8928142814281428)|f1(0.4073667130110584)|seed(12)_dev_nli_results.json"
    pred_list = common.load_jsonl(prediction_file)

    mode = {'standard': True}
    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(pred_list, dev_list,
                                                                    mode=mode, max_evidence=5)
    logging_item = {
        'ss': strict_score, 'ac': acc_score, 'pr': pr, 'rec': rec, 'f1': f1,
    }

    print(logging_item)

    fever_scorer.fever_confusion_matrix(pred_list, dev_list)

def get_sentence_forward_pair(tag, ruleterm_doc_results, is_training, debug=False, ignore_non_verifiable=False,
                              top_k=5, filter_value=0.005):
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        d_list = d_list[:100]
        ruleterm_doc_results = ruleterm_doc_results[:100]

    # ruleterm_doc_results_dict = list_dict_data_tool.list_to_dict(ruleterm_doc_results, 'id')
    d_o_dict = list_dict_data_tool.list_to_dict(d_list, 'id')
    copied_d_o_dict = copy.deepcopy(d_o_dict)
    # copied_d_list = copy.deepcopy(d_list)
    list_dict_data_tool.append_subfield_from_list_to_dict(ruleterm_doc_results, copied_d_o_dict,
                                                          'qid', 'fid', check=True)

    cur_results_dict_filtered = select_top_k_and_to_results_dict(copied_d_o_dict,
                                                                 score_field_name='prob',
                                                                 top_k=top_k, filter_value=filter_value)

    db_cursor = fever_db.get_cursor()
    fitems = build_full_wiki_sentence_forward_item(cur_results_dict_filtered, d_list, is_training, db_cursor,
                                                   ignore_non_verifiable)

    return fitems

def build_snli(path: Path):
    snli_data_root_path = (path / "snli")
    if not snli_data_root_path.exists():
        snli_data_root_path.mkdir()

    o_train = common.load_jsonl(config.PRO_ROOT / "data/snli_1.0/snli_1.0_train.jsonl")
    o_dev = common.load_jsonl(config.PRO_ROOT / "data/snli_1.0/snli_1.0_dev.jsonl")
    o_test = common.load_jsonl(config.PRO_ROOT / "data/snli_1.0/snli_1.0_test.jsonl")

    d_train = sm_nli2std_format(o_train)
    d_dev = sm_nli2std_format(o_dev)
    d_test = sm_nli2std_format(o_test)

    print("SNLI examples without gold label have been filtered.")
    print("SNLI Train size:", len(d_train))
    print("SNLI Dev size:", len(d_dev))
    print("SNLI Test size:", len(d_test))

    common.save_jsonl(d_train, snli_data_root_path / 'train.jsonl')
    common.save_jsonl(d_dev, snli_data_root_path / 'dev.jsonl')
    common.save_jsonl(d_test, snli_data_root_path / 'test.jsonl')

def build_mnli(path: Path):
    data_root_path = (path / "mnli")
    if not data_root_path.exists():
        data_root_path.mkdir()

    o_train = common.load_jsonl(config.PRO_ROOT / "data/multinli_1.0/multinli_1.0_train.jsonl")
    o_mm_dev = common.load_jsonl(config.PRO_ROOT / "data/multinli_1.0/multinli_1.0_dev_mismatched.jsonl")
    o_m_dev = common.load_jsonl(config.PRO_ROOT / "data/multinli_1.0/multinli_1.0_dev_matched.jsonl")

    d_train = sm_nli2std_format(o_train)
    d_mm_dev = sm_nli2std_format(o_mm_dev)
    d_m_dev = sm_nli2std_format(o_m_dev)

    print("MNLI examples without gold label have been filtered.")
    print("MNLI Train size:", len(d_train))
    print("MNLI Mismatched Dev size:", len(d_mm_dev))
    print("MNLI Matched Dev size:", len(d_m_dev))

    common.save_jsonl(d_train, data_root_path / 'train.jsonl')
    common.save_jsonl(d_mm_dev, data_root_path / 'mm_dev.jsonl')
    common.save_jsonl(d_m_dev, data_root_path / 'm_dev.jsonl')

def build_fever_nli(path: Path):
    data_root_path = (path / "fever_nli")
    if not data_root_path.exists():
        data_root_path.mkdir()

    o_train = common.load_jsonl(config.PRO_ROOT / "data/nli_fever/train_fitems.jsonl")
    o_dev = common.load_jsonl(config.PRO_ROOT / "data/nli_fever/dev_fitems.jsonl")
    o_test = common.load_jsonl(config.PRO_ROOT / "data/nli_fever/test_fitems.jsonl")

    d_train = fever_nli2std_format(o_train)
    d_dev = fever_nli2std_format(o_dev)
    d_test = fever_nli2std_format(o_test)

    print("FEVER-NLI Train size:", len(d_train))
    print("FEVER-NLI Dev size:", len(d_dev))
    print("FEVER-NLI Test size:", len(d_test))

    common.save_jsonl(d_train, data_root_path / 'train.jsonl')
    common.save_jsonl(d_dev, data_root_path / 'dev.jsonl')
    common.save_jsonl(d_test, data_root_path / 'test.jsonl')

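# A minimal driver sketch (assumed, not part of the original scripts): writes SNLI, MNLI,
# FEVER-NLI and all three ANLI rounds under one output root using the builders defined above.
# The example path in the trailing comment is purely illustrative.
def build_all_nli_data(out_root: Path):
    if not out_root.exists():
        out_root.mkdir(parents=True)
    build_snli(out_root)
    build_mnli(out_root)
    build_fever_nli(out_root)
    for r in (1, 2, 3):
        build_anli(out_root, round=r)


# Example: build_all_nli_data(config.PRO_ROOT / "data/build/nli")  # illustrative output root
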
def get_train_sentence_pair(top_k, is_training, debug=False, cur_train_eval_results_list=None):
    train_list = common.load_json(config.TRAIN_FILE)

    if cur_train_eval_results_list is None:
        cur_train_eval_results_list = common.load_jsonl(
            config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
                              "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")

    if debug:
        train_list = train_list[:100]
        id_set = set([item['_id'] for item in train_list])
        cur_train_eval_results_list = [item for item in cur_train_eval_results_list if item['qid'] in id_set]

    return get_sentence_pair(top_k, train_list, cur_train_eval_results_list, is_training)

def threshold_sampler(org_data_file, full_sent_list, prob_threshold=0.5, logist_threshold=None, top_n=5):
    """
    Provide samples to the training set by applying a probability threshold
    on the upstream selected sentences.
    """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']  # The id for the current selection.
        org_id = int(selection_id.split('<##>')[0])
        if org_id in augmented_dict:
            # change some logic to remove duplicate.
            augmented_dict[org_id].append(sent_item)
        else:
            augmented_dict[org_id] = [sent_item]

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            cur_predicted_sentids = []
        else:
            cur_predicted_sentids = []  # format: doc_id + c_scorer.SENT_LINE + line_number
            sents = augmented_dict[int(item['id'])]
            # Modify the mechanism here to select sentences by some score or label.
            for sent_i in sents:
                if sent_i['prob'] >= prob_threshold:
                    cur_predicted_sentids.append((sent_i['sid'], sent_i['score'],
                                                  sent_i['prob']))  # Important sentences for scaling training. Jul 21.
                # del sent_i['prob']

            cur_predicted_sentids = sorted(cur_predicted_sentids, key=lambda x: -x[1])

        item['scored_sentids'] = cur_predicted_sentids[:top_n]  # Important sentences for scaling training. Jul 21.
        item['predicted_sentids'] = [sid for sid, _, _ in item['scored_sentids']][:top_n]
        item['predicted_evidence'] = convert_evidence2scoring_format(item['predicted_sentids'])
        # item['predicted_label'] = item['label']  # give ground truth label

    return d_list

def score_converter(org_data_file, full_sent_list, top_k=5, prob_thr=0.5):
    """
    Combine sentences of the same claim.
    :param org_data_file:
    :param full_sent_list: append full_sent_score list to evidence of original data file
    :param top_k: top k sentences to be retrieved
    :param prob_thr: probability threshold for retrieved sentences
    :return:
    """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']
        org_id = int(selection_id.split('<##>')[0])
        if org_id in augmented_dict:
            augmented_dict[org_id].append(sent_item)
        else:
            augmented_dict[org_id] = [sent_item]

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            cur_predicted_sentids = []
        else:
            cur_predicted_sentids = []
            sents = augmented_dict[int(item['id'])]
            for sent_i in sents:
                if sent_i['prob'] >= prob_thr:
                    cur_predicted_sentids.append((sent_i['sid'], sent_i['score']))

            cur_predicted_sentids = sorted(cur_predicted_sentids, key=lambda x: -x[1])

        item['scored_sentids'] = cur_predicted_sentids
        item['predicted_sentids'] = [sid for sid, _ in item['scored_sentids']][:top_k]
        item['predicted_evidence'] = convert_evidence2scoring_format(item['predicted_sentids'])
        item['predicted_label'] = item['label']  # give ground truth label (for OFEVER calculation)

    # Removing all score and prob
    for sent_item in full_sent_list:
        if 'score' in sent_item.keys():
            del sent_item['score']
            del sent_item['prob']

    return d_list

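# Illustrative usage (a sketch, not an original call site): converting dev sentence-retrieval
# scores into predicted evidence for the FEVER dev set. `dev_sent_scores` is a hypothetical
# variable name; the result file path is the one loaded in spectrum_eval_manual_check above.
#
#     dev_sent_scores = common.load_jsonl(config.RESULT_PATH / "sent_retri_nn/2018_07_17_16:34:19_r/dev_sent.jsonl")
#     dev_with_evidence = score_converter(config.T_FEVER_DEV_JSONL, dev_sent_scores, top_k=5, prob_thr=0.5)
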
def prepare_data_only_page_view(tokenized_file, eval_file, doc_retrieval_output_file):
    """
    This method prepares document retrieval data using only page view.
    :return:
    """
    doc_retrieval_method = 'pageview'
    print("Method:", doc_retrieval_method)

    haonan_docretri_object = HAONAN_DOCRETRI_OBJECT()

    doc_retrieval_result_list = first_doc_retrieval(haonan_docretri_object, tokenized_file,
                                                    method=doc_retrieval_method, top_k=100)
    eval_list = common.load_jsonl(eval_file)

    disamb.item_resorting(doc_retrieval_result_list)

    print("Evaluating 1st Doc Retrieval")
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    print(c_scorer.fever_score(doc_retrieval_result_list, eval_list, mode=eval_mode, verbose=False))
    print("Max_doc_num_5:", c_scorer.fever_doc_only(doc_retrieval_result_list, eval_list, max_evidence=5))
    print("Max_doc_num_10:", c_scorer.fever_doc_only(doc_retrieval_result_list, eval_list, max_evidence=10))
    print("Max_doc_num_15:", c_scorer.fever_doc_only(doc_retrieval_result_list, eval_list, max_evidence=15))
    print("Max_doc_num_20:", c_scorer.fever_doc_only(doc_retrieval_result_list, eval_list, max_evidence=20))

    # First Document retrieval End.
    common.save_jsonl(doc_retrieval_result_list, doc_retrieval_output_file)