def precompute_forward_items_and_cache():
    # Three places below must stay consistent when switching between train/dev/test:
    # the doc-retrieval results file, the raw data file, and the output path (plus is_training).
    is_training = False
    doc_results = common.load_json(
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_train_doc_retrieval_v8_before_multihop_filtering.json")
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_dev_doc_retrieval_v8_before_multihop_filtering.json")
        config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_test_doc_retrieval_v8_before_multihop_filtering.json")
    doc_results = results_multihop_filtering(doc_results, multihop_retrieval_top_k=3, strict_mode=True)

    # db_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_DB)
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    # data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    # data_list = common.load_json(config.TRAIN_FILE)

    append_baseline_context(doc_results, data_list)
    fitem_list = build_full_wiki_document_forward_item(doc_results, data_list, is_training, t_db_cursor, True)
    print(len(fitem_list))
    common.save_jsonl(fitem_list,
                      config.PDATA_ROOT / "content_selection_forward" / "hotpot_test_p_level_unlabeled.jsonl")
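# A minimal sketch (not part of the original pipeline) showing how the three tag-dependent
# pieces above (doc-retrieval file, raw data file, output path) could be driven from a single
# `tag` argument. The filename patterns are assumptions inferred from the commented-out paths
# and the files loaded later in eval_model_for_downstream; verify before use.
def precompute_forward_items_and_cache_for_tag(tag='test'):
    assert tag in ('train', 'dev', 'test')
    is_training = (tag == 'train')
    doc_results = common.load_json(
        config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8" /
        f"hotpot_{tag}_doc_retrieval_v8_before_multihop_filtering.json")
    doc_results = results_multihop_filtering(doc_results, multihop_retrieval_top_k=3, strict_mode=True)
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    data_file = {'train': config.TRAIN_FILE,
                 'dev': config.DEV_FULLWIKI_FILE,
                 'test': config.TEST_FULLWIKI_FILE}[tag]
    data_list = common.load_json(data_file)
    append_baseline_context(doc_results, data_list)
    fitem_list = build_full_wiki_document_forward_item(doc_results, data_list, is_training, t_db_cursor, True)
    label_tag = 'labeled' if is_training else 'unlabeled'
    common.save_jsonl(fitem_list,
                      config.PDATA_ROOT / "content_selection_forward" / f"hotpot_{tag}_p_level_{label_tag}.jsonl")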
def toy_init_results():
    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print(len(dev_fullwiki_list))

    # Load rindex file
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.score_db['default-tf-idf'] = dict()
    load_from_file(abs_rindexdb.score_db['default-tf-idf'],
                   config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # Load rindex finished

    saved_items = []
    for item in tqdm(dev_fullwiki_list):
        saved_tfidf_item = dict()
        question = item['question']
        qid = item['_id']

        doc_list = get_top_ranked_tf_idf_doc(question, abs_rindexdb, top_k=50)
        saved_tfidf_item['question'] = question
        saved_tfidf_item['qid'] = qid
        saved_tfidf_item['doc_list'] = doc_list

        saved_items.append(saved_tfidf_item)

    common.save_jsonl(saved_items,
                      config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")
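# A small usage sketch (not part of the original script): the records written above have the
# shape {'question': ..., 'qid': ..., 'doc_list': ...}, so they can be read back and inspected
# directly. The ordering of doc_list (top-ranked first) is an assumption.
def _peek_tf_idf_results():
    saved_items = common.load_jsonl(
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")
    print(len(saved_items))
    print(saved_items[0]['qid'], saved_items[0]['question'])
    print(saved_items[0]['doc_list'][:5])  # top-ranked documents first (assumed ordering)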
def multi_process(start=0, end=None, tag='dev'):
    task_name = 'fever'
    debug = False
    top_k = 20
    num_process = 3
    query_fieldname = 'claim'
    id_fieldname = 'id'
    debug_name = 'debug' if debug else ""

    # print(multiprocessing.cpu_count())
    print("CPU Count:", multiprocessing.cpu_count())

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    print("Total length:", len(d_list))
    # Important: set the start/end slice when sharding the data manually.
    # start, end = 0, None
    print(f"Task:{task_name}, Tag:{tag}, TopK:{top_k}, Start/End:{start}/{end}")

    d_list = d_list[start:end]
    print("Data length:", len(d_list))
    if debug:
        d_list = d_list[:10]
        start, end = 0, 10
    print("Data length (Pos-filtering):", len(d_list))

    r_list = []
    incr_file = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_incr_({start},{end})_{debug_name}.jsonl"
    if incr_file.is_file():
        print("Warning: incremental save file already exists.")

    save_path: Path = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_({start},{end})_{debug_name}.jsonl"
    if save_path.is_file():
        print("Warning: save file already exists.")

    with open(incr_file, mode='w', encoding='utf-8') as out_f:
        with Pool(processes=num_process, maxtasksperchild=1000) as pool:
            process_func = partial(process_fever_item_multiprocessing,
                                   top_k=top_k,
                                   query_field=query_fieldname,
                                   id_field=id_fieldname)
            p_item_list = pool.imap_unordered(process_func, d_list)
            for item in tqdm(p_item_list, total=len(d_list)):
                r_list.append(item)
                out_f.write(json.dumps(item) + '\n')
                out_f.flush()

    print(len(r_list))
    common.save_jsonl(r_list, save_path)
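# The worker pattern used above, in isolation: `partial` freezes the keyword arguments,
# `imap_unordered` streams results back as workers finish, and each result is both kept in
# memory and appended to an incremental file so a crash does not lose finished work.
# A self-contained toy version (squares numbers; nothing here belongs to the FEVER pipeline):
import json
from functools import partial
from multiprocessing import Pool


def _square(x, offset=0):
    return {'in': x, 'out': x * x + offset}


def _toy_pool_demo(out_path='/tmp/toy_incr.jsonl'):
    work = list(range(100))
    results = []
    with open(out_path, mode='w', encoding='utf-8') as out_f:
        with Pool(processes=3, maxtasksperchild=1000) as pool:
            func = partial(_square, offset=1)
            for r in pool.imap_unordered(func, work):
                results.append(r)
                out_f.write(json.dumps(r) + '\n')
                out_f.flush()
    return results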
def single_process_fever_with_dict(start=0, end=None, tag='dev'):
    task_name = 'fever'
    debug = False
    top_k = 20
    query_fieldname = 'claim'
    id_fieldname = 'id'
    debug_name = 'debug' if debug else ""

    g_score_dict = dict()
    g_score_dict = load_from_file(g_score_dict,
                                  config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    print("Total length:", len(d_list))
    # Important: set the start/end slice when sharding the data manually.
    # start, end = 0, len(d_list)
    print(f"Task:{task_name}, Tag:{tag}, TopK:{top_k}, Start/End:{start}/{end}")

    d_list = d_list[start:end]
    print("Data length:", len(d_list))
    if debug:
        d_list = d_list[:10]
        start, end = 0, 10
    print("Data length (Pos-filtering):", len(d_list))

    r_item_list = []
    incr_file = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_incr_({start},{end})_{debug_name}.jsonl"
    if incr_file.is_file():
        print("Warning: incremental save file already exists.")

    save_path: Path = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_({start},{end})_{debug_name}.jsonl"
    if save_path.is_file():
        print("Warning: save file already exists.")

    with open(incr_file, mode='w', encoding='utf-8') as out_f:
        process_func = partial(process_fever_item_with_score_dict,
                               top_k=top_k,
                               query_field=query_fieldname,
                               id_field=id_fieldname,
                               global_score_dict=g_score_dict)
        for item in tqdm(d_list, total=len(d_list)):
            r_item = process_func(item)
            r_item_list.append(r_item)
            # Write the processed item so the incremental file matches the final save
            # (the original wrote the raw `item` here).
            out_f.write(json.dumps(r_item) + '\n')
            out_f.flush()

    print(len(r_item_list))
    common.save_jsonl(r_item_list, save_path)
def prepare_data_only_page_view(tokenized_file, eval_file, doc_retrieval_output_file):
    """
    Prepare document retrieval data using only page view.
    :return:
    """
    doc_retrieval_method = 'pageview'
    print("Method:", doc_retrieval_method)
    haonan_docretri_object = HAONAN_DOCRETRI_OBJECT()

    doc_retrieval_result_list = first_doc_retrieval(haonan_docretri_object, tokenized_file,
                                                    method=doc_retrieval_method, top_k=100)
    eval_list = common.load_jsonl(eval_file)

    disamb.item_resorting(doc_retrieval_result_list)

    print("Evaluating 1st Doc Retrieval")
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    print(c_scorer.fever_score(doc_retrieval_result_list, eval_list, mode=eval_mode, verbose=False))
    print("Max_doc_num_5:", c_scorer.fever_doc_only(doc_retrieval_result_list, eval_list, max_evidence=5))
    print("Max_doc_num_10:", c_scorer.fever_doc_only(doc_retrieval_result_list, eval_list, max_evidence=10))
    print("Max_doc_num_15:", c_scorer.fever_doc_only(doc_retrieval_result_list, eval_list, max_evidence=15))
    print("Max_doc_num_20:", c_scorer.fever_doc_only(doc_retrieval_result_list, eval_list, max_evidence=20))
    # First Document retrieval End.

    common.save_jsonl(doc_retrieval_result_list, doc_retrieval_output_file)
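# Hedged usage sketch for the helper above. The tokenized input path is an illustrative
# placeholder; the eval file and output location mirror paths referenced elsewhere in this
# repository, but none of this is produced automatically by the original script.
# prepare_data_only_page_view(
#     tokenized_file=config.RESULT_PATH / "pipeline_r_aaa/t_shared_task_dev.jsonl",   # hypothetical
#     eval_file=config.T_FEVER_DEV_JSONL,
#     doc_retrieval_output_file=config.RESULT_PATH / "doc_retri/docretri.pageview/dev.jsonl")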
def build_anli(path: Path, round=1, version='1.0'):
    data_root_path = (path / "anli")
    if not data_root_path.exists():
        data_root_path.mkdir()

    round_tag = str(round)

    o_train = common.load_jsonl(config.PRO_ROOT / f"data/anli_v{version}/R{round_tag}/train.jsonl")
    o_dev = common.load_jsonl(config.PRO_ROOT / f"data/anli_v{version}/R{round_tag}/dev.jsonl")
    o_test = common.load_jsonl(config.PRO_ROOT / f"data/anli_v{version}/R{round_tag}/test.jsonl")

    d_train = a_nli2std_format(o_train)
    d_dev = a_nli2std_format(o_dev)
    d_test = a_nli2std_format(o_test)

    print(f"ANLI (R{round_tag}) Train size:", len(d_train))
    print(f"ANLI (R{round_tag}) Dev size:", len(d_dev))
    print(f"ANLI (R{round_tag}) Test size:", len(d_test))

    if not (data_root_path / f"r{round_tag}").exists():
        (data_root_path / f"r{round_tag}").mkdir()

    common.save_jsonl(d_train, data_root_path / f"r{round_tag}" / 'train.jsonl')
    common.save_jsonl(d_dev, data_root_path / f"r{round_tag}" / 'dev.jsonl')
    common.save_jsonl(d_test, data_root_path / f"r{round_tag}" / 'test.jsonl')
def build_mnli(path: Path):
    data_root_path = (path / "mnli")
    if not data_root_path.exists():
        data_root_path.mkdir()

    o_train = common.load_jsonl(config.PRO_ROOT / "data/multinli_1.0/multinli_1.0_train.jsonl")
    o_mm_dev = common.load_jsonl(config.PRO_ROOT / "data/multinli_1.0/multinli_1.0_dev_mismatched.jsonl")
    o_m_dev = common.load_jsonl(config.PRO_ROOT / "data/multinli_1.0/multinli_1.0_dev_matched.jsonl")

    d_train = sm_nli2std_format(o_train)
    d_mm_dev = sm_nli2std_format(o_mm_dev)
    d_m_dev = sm_nli2std_format(o_m_dev)

    print("MNLI examples without gold label have been filtered.")
    print("MNLI Train size:", len(d_train))
    print("MNLI Mismatched Dev size:", len(d_mm_dev))
    print("MNLI Matched Dev size:", len(d_m_dev))

    common.save_jsonl(d_train, data_root_path / 'train.jsonl')
    common.save_jsonl(d_mm_dev, data_root_path / 'mm_dev.jsonl')
    common.save_jsonl(d_m_dev, data_root_path / 'm_dev.jsonl')
def build_fever_nli(path: Path):
    data_root_path = (path / "fever_nli")
    if not data_root_path.exists():
        data_root_path.mkdir()

    o_train = common.load_jsonl(config.PRO_ROOT / "data/nli_fever/train_fitems.jsonl")
    o_dev = common.load_jsonl(config.PRO_ROOT / "data/nli_fever/dev_fitems.jsonl")
    o_test = common.load_jsonl(config.PRO_ROOT / "data/nli_fever/test_fitems.jsonl")

    d_train = fever_nli2std_format(o_train)
    d_dev = fever_nli2std_format(o_dev)
    d_test = fever_nli2std_format(o_test)

    print("FEVER-NLI Train size:", len(d_train))
    print("FEVER-NLI Dev size:", len(d_dev))
    print("FEVER-NLI Test size:", len(d_test))

    common.save_jsonl(d_train, data_root_path / 'train.jsonl')
    common.save_jsonl(d_dev, data_root_path / 'dev.jsonl')
    common.save_jsonl(d_test, data_root_path / 'test.jsonl')
def build_snli(path: Path):
    snli_data_root_path = (path / "snli")
    if not snli_data_root_path.exists():
        snli_data_root_path.mkdir()

    o_train = common.load_jsonl(config.PRO_ROOT / "data/snli_1.0/snli_1.0_train.jsonl")
    o_dev = common.load_jsonl(config.PRO_ROOT / "data/snli_1.0/snli_1.0_dev.jsonl")
    o_test = common.load_jsonl(config.PRO_ROOT / "data/snli_1.0/snli_1.0_test.jsonl")

    d_train = sm_nli2std_format(o_train)
    d_dev = sm_nli2std_format(o_dev)
    d_test = sm_nli2std_format(o_test)

    print("SNLI examples without gold label have been filtered.")
    print("SNLI Train size:", len(d_train))
    print("SNLI Dev size:", len(d_dev))
    print("SNLI Test size:", len(d_test))

    common.save_jsonl(d_train, snli_data_root_path / 'train.jsonl')
    common.save_jsonl(d_dev, snli_data_root_path / 'dev.jsonl')
    common.save_jsonl(d_test, snli_data_root_path / 'test.jsonl')
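# A hedged driver sketch that materializes every NLI builder above under one root directory.
# The ANLI round list (rounds 1-3 of v1.0) is an assumption about the downloaded data layout,
# not something the original file states.
def build_all(root: Path):
    if not root.exists():
        root.mkdir(parents=True)
    build_snli(root)
    build_mnli(root)
    build_fever_nli(root)
    for r in (1, 2, 3):  # assumed ANLI v1.0 rounds
        build_anli(root, round=r, version='1.0')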
def pipeline(in_file, eval_file=None, model_path_dict=default_model_path_dict, steps=default_steps): """ :param in_file: The raw input file. :param eval_file: Whether to provide evaluation along the line. :return: """ sentence_retri_1_scale_prob = 0.5 sentence_retri_2_scale_prob = 0.9 sent_retri_1_top_k = 5 sent_retri_2_top_k = 1 sent_prob_for_2doc = 0.1 sent_topk_for_2doc = 5 enhance_retri_1_scale_prob = -1 build_submission = True doc_retrieval_method = 'word_freq' haonan_docretri_object = HAONAN_DOCRETRI_OBJECT() if not PIPELINE_DIR.exists(): PIPELINE_DIR.mkdir() if steps['s1.tokenizing']['do']: time_stamp = utils.get_current_time_str() current_pipeline_dir = PIPELINE_DIR / f"{time_stamp}_r" else: current_pipeline_dir = steps['s1.tokenizing']['out_file'].parent print("Current Result Root:", current_pipeline_dir) if not current_pipeline_dir.exists(): current_pipeline_dir.mkdir() eval_list = common.load_jsonl(eval_file) if eval_file is not None else None in_file_stem = in_file.stem tokenized_file = current_pipeline_dir / f"t_{in_file_stem}.jsonl" # Save code into directory script_name = os.path.basename(__file__) with open(os.path.join(str(current_pipeline_dir), script_name), 'w') as out_f, open(__file__, 'r') as it: out_f.write(it.read()) out_f.flush() # Tokenizing. print("Step 1. Tokenizing.") if steps['s1.tokenizing']['do']: tokenized_claim(in_file, tokenized_file) # Auto Saved print("Tokenized file saved to:", tokenized_file) else: tokenized_file = steps['s1.tokenizing']['out_file'] print("Use preprocessed file:", tokenized_file) # Tokenizing End. # First Document retrieval. print("Step 2. First Document Retrieval") if steps['s2.1doc_retri']['do']: doc_retrieval_result_list = first_doc_retrieval( haonan_docretri_object, tokenized_file, method=doc_retrieval_method) doc_retrieval_file_1 = current_pipeline_dir / f"doc_retr_1_{in_file_stem}.jsonl" common.save_jsonl(doc_retrieval_result_list, doc_retrieval_file_1) print("First Document Retrieval file saved to:", doc_retrieval_file_1) else: doc_retrieval_file_1 = steps['s2.1doc_retri']['out_file'] doc_retrieval_result_list = common.load_jsonl(doc_retrieval_file_1) print("Use preprocessed file:", doc_retrieval_file_1) if eval_list is not None: print("Evaluating 1st Doc Retrieval") eval_mode = {'check_doc_id_correct': True, 'standard': False} print( c_scorer.fever_score(doc_retrieval_result_list, eval_list, mode=eval_mode, verbose=False)) # First Document retrieval End. # First Sentence Selection. print("Step 3. 
First Sentence Selection") if steps['s3.1sen_select']['do']: dev_sent_list_1_e0 = simple_nnmodel.pipeline_first_sent_selection( tokenized_file, doc_retrieval_file_1, model_path_dict['sselector']) dev_sent_file_1_e0 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}.jsonl" common.save_jsonl(dev_sent_list_1_e0, dev_sent_file_1_e0) # Manual setting, delete it later # dev_sent_file_1_e0 = None # dev_sent_list_1_e0 = common.load_jsonl("/home/easonnie/projects/FunEver/results/pipeline_r/2018_07_24_11:07:41_r(new_model_v1_2_for_realtest)_scaled_0.05_selector_em/dev_sent_score_1_shared_task_test.jsonl") # End if steps['s3.1sen_select']['ensemble']: print("Ensemble!") dev_sent_list_1_e1 = simple_nnmodel.pipeline_first_sent_selection( tokenized_file, doc_retrieval_file_1, model_path_dict['sselector_1']) dev_sent_file_1_e1 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_e1.jsonl" common.save_jsonl(dev_sent_list_1_e1, dev_sent_file_1_e1) # exit(0) # dev_sent_list_1_e1 = common.load_jsonl(dev_sent_file_1_e1) dev_sent_list_1_e2 = simple_nnmodel.pipeline_first_sent_selection( tokenized_file, doc_retrieval_file_1, model_path_dict['sselector_2']) dev_sent_file_1_e2 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_e2.jsonl" common.save_jsonl(dev_sent_list_1_e2, dev_sent_file_1_e2) # exit(0) # dev_sent_list_1_e2 = common.load_jsonl(dev_sent_file_1_e2) dev_sent_list_1 = merge_sent_results( [dev_sent_list_1_e0, dev_sent_list_1_e1, dev_sent_list_1_e2]) dev_sent_file_1 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_ensembled.jsonl" common.save_jsonl(dev_sent_list_1, dev_sent_file_1) # exit(0) else: dev_sent_list_1 = dev_sent_list_1_e0 dev_sent_file_1 = dev_sent_file_1_e0 # Merging two results print("First Sentence Selection file saved to:", dev_sent_file_1) else: dev_sent_file_1 = steps['s3.1sen_select']['out_file'] dev_sent_list_1 = common.load_jsonl(dev_sent_file_1) print("Use preprocessed file:", dev_sent_file_1) # exit(0) if eval_list is not None: print("Evaluating 1st Sentence Selection") # sent_select_results_list_1 = simi_sampler.threshold_sampler(tokenized_file, dev_sent_full_list, # sentence_retri_scale_prob, top_n=5) # additional_dev_sent_list = common.load_jsonl("/Users/Eason/RA/FunEver/results/sent_retri_nn/2018_07_20_15-17-59_r/dev_sent_2r.jsonl") # dev_sent_full_list = dev_sent_full_list + additional_dev_sent_list sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique( tokenized_file, dev_sent_list_1, sentence_retri_1_scale_prob, top_n=sent_retri_1_top_k) # sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique_merge(sent_select_results_list_1, # additional_dev_sent_list, # sentence_retri_2_scale_prob, # top_n=5, add_n=1) eval_mode = {'check_sent_id_correct': True, 'standard': False} # for a, b in zip(eval_list, sent_select_results_list_1): # b['predicted_label'] = a['label'] print( c_scorer.fever_score(sent_select_results_list_1, eval_list, mode=eval_mode, verbose=False)) print("Step 4. 
Second Document Retrieval") if steps['s4.2doc_retri']['do']: dev_sent_list_1 = common.load_jsonl(dev_sent_file_1) filtered_dev_instance_1_for_doc2 = simi_sampler.threshold_sampler_insure_unique( tokenized_file, dev_sent_list_1, sent_prob_for_2doc, top_n=sent_topk_for_2doc) filtered_dev_instance_1_for_doc2_file = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_scaled_for_doc2.jsonl" common.save_jsonl(filtered_dev_instance_1_for_doc2, filtered_dev_instance_1_for_doc2_file) dev_sent_1_result = simi_sampler.threshold_sampler_insure_unique( doc_retrieval_file_1, # Remember this name dev_sent_list_1, sentence_retri_1_scale_prob, top_n=sent_topk_for_2doc) dev_doc2_list = second_doc_retrieval( haonan_docretri_object, filtered_dev_instance_1_for_doc2_file, dev_sent_1_result) dev_doc2_file = current_pipeline_dir / f"doc_retr_2_{in_file_stem}.jsonl" common.save_jsonl(dev_doc2_list, dev_doc2_file) print("Second Document Retrieval File saved to:", dev_doc2_file) else: dev_doc2_file = steps['s4.2doc_retri']['out_file'] # dev_doc2_list = common.load_jsonl(dev_doc2_file) print("Use preprocessed file:", dev_doc2_file) print("Step 5. Second Sentence Selection") if steps['s5.2sen_select']['do']: dev_sent_2_list = get_score_multihop( tokenized_file, dev_doc2_file, model_path=model_path_dict['sselector']) dev_sent_file_2 = current_pipeline_dir / f"dev_sent_score_2_{in_file_stem}.jsonl" common.save_jsonl(dev_sent_2_list, dev_sent_file_2) print("First Sentence Selection file saved to:", dev_sent_file_2) else: dev_sent_file_2 = steps['s5.2sen_select']['out_file'] if eval_list is not None: print("Evaluating 1st Sentence Selection") dev_sent_list_1 = common.load_jsonl(dev_sent_file_1) dev_sent_list_2 = common.load_jsonl(dev_sent_file_2) sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique( tokenized_file, dev_sent_list_1, sentence_retri_1_scale_prob, top_n=5) sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique_merge( sent_select_results_list_1, dev_sent_list_2, sentence_retri_2_scale_prob, top_n=5, add_n=sent_retri_2_top_k) eval_mode = {'check_sent_id_correct': True, 'standard': False} # for a, b in zip(eval_list, sent_select_results_list_1): # b['predicted_label'] = a['label'] print( c_scorer.fever_score(sent_select_results_list_1, eval_list, mode=eval_mode, verbose=False)) # print("Step 6. 
NLI") # if steps['s6.nli']['do']: # dev_sent_list_1 = common.load_jsonl(dev_sent_file_1) # dev_sent_list_2 = common.load_jsonl(dev_sent_file_2) # sentence_retri_1_scale_prob = 0.05 # print("Threshold:", sentence_retri_1_scale_prob) # sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique(tokenized_file, dev_sent_list_1, # sentence_retri_1_scale_prob, top_n=5) # # sent_select_results_list_2 = simi_sampler.threshold_sampler_insure_unique_merge(sent_select_results_list_1, # # dev_sent_list_2, # # sentence_retri_2_scale_prob, # # top_n=5, # # add_n=sent_retri_2_top_k) # nli_results = nli.mesim_wn_simi_v1_2.pipeline_nli_run(tokenized_file, # sent_select_results_list_1, # [dev_sent_file_1, dev_sent_file_2], # model_path_dict['nli'], # with_logits=True, # with_probs=True) # # nli_results_file = current_pipeline_dir / f"nli_r_{in_file_stem}.jsonl" # common.save_jsonl(nli_results, nli_results_file) # else: # nli_results_file = steps['s6.nli']['out_file'] # nli_results = common.load_jsonl(nli_results_file) # Ensemble code # dev_sent_list_1 = common.load_jsonl(dev_sent_file_1) # dev_sent_list_2 = common.load_jsonl(dev_sent_file_2) # sentence_retri_1_scale_prob = 0.05 # print("NLI sentence threshold:", sentence_retri_1_scale_prob) # sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique(tokenized_file, dev_sent_list_1, # sentence_retri_1_scale_prob, top_n=5) # # # sent_select_results_list_2 = simi_sampler.threshold_sampler_insure_unique_merge(sent_select_results_list_1, # # dev_sent_list_2, # # sentence_retri_2_scale_prob, # # top_n=5, # # add_n=sent_retri_2_top_k) # # nli_results = nli.mesim_wn_simi_v1_2.pipeline_nli_run(tokenized_file, # # sent_select_results_list_1, # # [dev_sent_file_1, dev_sent_file_2], # # model_path_dict['nli'], with_probs=True, with_logits=True) # # # nli_results = nli.mesim_wn_simi_v1_2.pipeline_nli_run_bigger(tokenized_file, # # sent_select_results_list_1, # # [dev_sent_file_1, dev_sent_file_2], # # model_path_dict['nli_2'], # # with_probs=True, # # with_logits=True) # # nli_results = nli.mesim_wn_simi_v1_2.pipeline_nli_run_bigger(tokenized_file, # sent_select_results_list_1, # [dev_sent_file_1, dev_sent_file_2], # model_path_dict['nli_4'], # with_probs=True, # with_logits=True) # # nli_results_file = current_pipeline_dir / f"nli_r_{in_file_stem}_withlb_e4.jsonl" # common.save_jsonl(nli_results, nli_results_file) # Ensemble code end # exit(0) nli_r_e0 = common.load_jsonl(current_pipeline_dir / "nli_r_shared_task_test_withlb_e0.jsonl") nli_r_e1 = common.load_jsonl(current_pipeline_dir / "nli_r_shared_task_test_withlb_e1.jsonl") nli_r_e2 = common.load_jsonl(current_pipeline_dir / "nli_r_shared_task_test_withlb_e2.jsonl") nli_r_e3 = common.load_jsonl(current_pipeline_dir / "nli_r_shared_task_test_withlb_e3.jsonl") nli_r_e4 = common.load_jsonl(current_pipeline_dir / "nli_r_shared_task_test_withlb_e4.jsonl") nli_results = merge_nli_results( [nli_r_e0, nli_r_e1, nli_r_e2, nli_r_e3, nli_r_e4]) print("Post Processing enhancement") delete_unused_evidence(nli_results) print("Deleting Useless Evidence") dev_sent_list_1 = common.load_jsonl(dev_sent_file_1) dev_sent_list_2 = common.load_jsonl(dev_sent_file_2) print("Appending 1 of second Evidence") nli_results = simi_sampler.threshold_sampler_insure_unique_merge( nli_results, dev_sent_list_2, sentence_retri_2_scale_prob, top_n=5, add_n=sent_retri_2_top_k) delete_unused_evidence(nli_results) # High tolerance enhancement! 
print("Final High Tolerance Enhancement") print("Appending all of first Evidence") nli_results = simi_sampler.threshold_sampler_insure_unique_merge( nli_results, dev_sent_list_1, enhance_retri_1_scale_prob, top_n=100, add_n=100) delete_unused_evidence(nli_results) if build_submission: output_file = current_pipeline_dir / "predictions.jsonl" build_submission_file(nli_results, output_file)
def model_eval(model_save_path): seed = 6 bert_model_name = 'bert-base-uncased' lazy = False forward_size = 16 batch_size = 32 # dev_prob_threshold = 0.05 dev_prob_threshold = 0.1 num_class = 3 # num_train_optimization_steps torch.manual_seed(seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() unk_token_num = {'tokens': 1} # work around for initiating vocabulary. vocab = ExVocabulary(unk_token_num=unk_token_num) vocab.add_token_to_namespace('SUPPORTS', namespace='labels') vocab.add_token_to_namespace('REFUTES', namespace='labels') vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels') vocab.add_token_to_namespace("hidden", namespace="labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels') # Finished build vocabulary. # Load standardized sentence file # dev_upstream_sent_list = common.load_jsonl(config.RESULT_PATH / # "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/i(5000)|e(0)|s01(0.9170917091709171)|s05(0.8842384238423843)|seed(12)_dev_sent_results.json") # dev_upstream_sent_list = common.load_jsonl(config.DATA_ROOT / # "utest_data/dev_sent_score_2_shared_task_dev.jsonl") # "utest_data/dev_sent_score_1_shared_task_dev_docnum(10)_ensembled.jsonl") # dev_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT / # "upstream_sentence_selection_Feb16/dev_sent_pred_scores.jsonl") dev_upstream_sent_list = common.load_jsonl( config.FEVER_DATA_ROOT / "upstream_sentence_selection_Feb16/4-15-dev_sent_pred_scores.jsonl") # # dev_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT / # "upstream_sentence_selection_Feb16/4-15-test_sent_pred_scores.jsonl") # dev_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT / # "upstream_sentence_selection_Feb16/n_dev_sent_pred_scores.jsonl") # dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique_new_format( dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique( config.FEVER_DEV, dev_upstream_sent_list, prob_threshold=dev_prob_threshold, top_n=5) dev_data_list = fever_nli_sampler.select_sent_with_prob_for_eval( config.FEVER_DEV, dev_sent_after_threshold_filter, None, tokenized=True) # dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique( # config.FEVER_TEST, # dev_upstream_sent_list, # prob_threshold=dev_prob_threshold, top_n=5) # # dev_data_list = fever_nli_sampler.select_sent_with_prob_for_eval( # config.FEVER_TEST, dev_sent_after_threshold_filter, # None, tokenized=True, pipeline=True) for item in dev_data_list: item['label'] = 'hidden' dev_list = common.load_jsonl(config.FEVER_DEV) for a, b in zip(dev_list, dev_data_list): del b['label'] b['predicted_label'] = a['label'] eval_mode = {'check_sent_id_correct': True, 'standard': True} fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score( dev_data_list, dev_list, mode=eval_mode, verbose=False) print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score, label_score, pr, rec, f1) print(f"Dev:{fever_score}/{label_score}") bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True) bert_fever_reader = BertReaderFeverNLI(bert_tokenizer, lazy=lazy) dev_instances = bert_fever_reader.read(dev_data_list) biterator = BasicIterator(batch_size=forward_size) biterator.index_with(vocab) # print(list(mnli_dev_instances)) # Load training model # Load training model model_clf = BertForSequenceClassification.from_pretrained( bert_model_name, 
num_labels=num_class) model_clf.load_state_dict(torch.load(model_save_path)) model_clf.to(device) model_clf.eval() if n_gpu > 1: model_clf = nn.DataParallel(model_clf) dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False) # for item in dev_data_list: dev_data_list = hidden_eval(model_clf, dev_iter, dev_data_list, device) common.save_jsonl( dev_data_list, config.PRO_ROOT / "data/fever/upstream_sentence_selection_Feb16/4-15-dev_nli_results.jsonl" ) eval_mode = {'check_sent_id_correct': True, 'standard': True} fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score( dev_data_list, common.load_jsonl(config.FEVER_DEV), mode=eval_mode, verbose=False) print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score, label_score, pr, rec, f1) print(f"Dev:{fever_score}/{label_score}")
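# Hedged usage sketch: `model_eval` only needs the path of a saved BertForSequenceClassification
# state_dict produced by the corresponding training script. The checkpoint filename below is
# illustrative, not a shipped artifact.
# model_eval(config.PRO_ROOT / "saved_models/fever_nli_bert/best_checkpoint.pt")   # hypothetical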
    retri_list = []
    for item in tqdm(fullwiki_list):
        saved_tfidf_item = dict()
        question = item['question']
        qid = item['_id']

        doc_list = lucene_retri_doc(question, top_k=50)
        saved_tfidf_item['question'] = question
        saved_tfidf_item['qid'] = qid
        saved_tfidf_item['doc_list'] = doc_list

        retri_list.append(saved_tfidf_item)

    return retri_list


if __name__ == '__main__':
    lucene_indexing()

    print("retrieve train set ...")
    saved_items = term_based_doc_retri(config.TRAIN_FILE)
    common.save_jsonl(saved_items, config.TRAIN_TERM_BASED)

    print("retrieve dev set ...")
    saved_items = term_based_doc_retri(config.DEV_FULLWIKI_FILE)
    common.save_jsonl(saved_items, config.DEV_TERM_BASED)

    print("retrieve test set ...")
    saved_items = term_based_doc_retri(config.TEST_FULLWIKI_FILE)
    common.save_jsonl(saved_items, config.TEST_TERM_BASED)
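# A hedged sanity-check sketch (not part of the original script): compare the retrieved
# doc_list against the gold supporting-fact titles of HotpotQA. It assumes each entry of
# item['supporting_facts'] is a [title, sent_id] pair and that doc_list holds document titles.
def _gold_title_recall(retri_file, gold_file):
    retri = {r['qid']: set(r['doc_list']) for r in common.load_jsonl(retri_file)}
    gold = common.load_json(gold_file)
    hit, total = 0, 0
    for item in gold:
        gold_titles = set(t for t, _ in item['supporting_facts'])
        total += len(gold_titles)
        hit += len(gold_titles & retri.get(item['_id'], set()))
    print(f"supporting-title recall: {hit}/{total} = {hit / max(total, 1):.4f}")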
        sent_level_results_list=train_sent_results_list, debug=debug_mode,
        sent_top_k=top_k_sent, sent_filter_value=train_sent_filtering_prob)

    test_fitems, test_list = get_nli_pair('test', is_training=False,
                                          sent_level_results_list=test_sent_results_list, debug=debug_mode,
                                          sent_top_k=5, sent_filter_value=test_sent_filtering_prob)

    print(dev_fitems[0])

    common.save_jsonl(dev_fitems, config.PRO_ROOT / "data/p_fever/intermediate_sent_data/dev_fitems.jsonl")
    common.save_jsonl(dev_list, config.PRO_ROOT / "data/p_fever/intermediate_sent_data/dev_list.jsonl")
    common.save_jsonl(train_fitems, config.PRO_ROOT / "data/p_fever/intermediate_sent_data/train_fitems.jsonl")
    common.save_jsonl(train_list, config.PRO_ROOT / "data/p_fever/intermediate_sent_data/train_list.jsonl")
    common.save_jsonl(test_fitems, config.PRO_ROOT / "data/p_fever/intermediate_sent_data/test_fitems.jsonl")
        # print(sfile)
        a = pattern.fullmatch(str(sfile.name))
        if a is None:
            continue
        file_list.append((int(a.group(1)), sfile))

    with open(filename, encoding='utf-8', mode='w') as out_f:
        for _, the_file in sorted(file_list, key=lambda x: x[0]):
            print(the_file)
            with open(the_file, encoding='utf-8', mode='r') as in_f:
                for line in in_f:
                    out_f.write(line)


if __name__ == '__main__':
    dev_d = common.load_jsonl(config.T_FEVER_DEV_JSONL)
    train_d = common.load_jsonl(config.T_FEVER_TRAIN_JSONL)
    dt_d = dev_d + train_d
    common.save_jsonl(dt_d, config.T_FEVER_DT_JSONL)

    # split_by_line("/Users/Eason/RA/FunEver/utest/utest_data/test_rand_data.txt",
    #               out_path="/Users/Eason/RA/FunEver/utest/utest_data/test_rand_data_1.txt.splits")
    #
    # merge_by_line('/Users/Eason/RA/FunEver/utest/utest_data/test_rand_data_1.txt')
    # merge_by_line('/Users/Eason/RA/FunEver/results/sent_retri/2018_07_05_17:17:50_r/train')
    # split_by_line("/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/train.jsonl",
    #               out_path="/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/o_train.splits",
    #               lines=20000)
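# Hedged roundtrip sketch for the split/merge utilities above, mirroring the commented-out
# examples: split a large file into fixed-size shards, then merge the numbered shards back
# into the original filename (per the loop above). The paths are placeholders.
# split_by_line("some/large_file.jsonl", out_path="some/large_file.splits", lines=20000)
# merge_by_line("some/large_file.jsonl")   # re-assembles the shards into this file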
def fever_retrieval_v0(term_retrieval_top_k=3, match_filtering_k=2, tag='dev'): # term_retrieval_top_k = 20 # term_retrieval_top_k = 20 # term_retrieval_top_k = 3 # match_filtering_k = 2 if tag == 'dev': d_list = common.load_jsonl(config.FEVER_DEV) elif tag == 'train': d_list = common.load_jsonl(config.FEVER_TRAIN) elif tag == 'test': d_list = common.load_jsonl(config.FEVER_TEST) else: raise ValueError(f"Tag:{tag} not supported.") d_tf_idf = common.load_jsonl( config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/fever_tf_idf_{tag}.jsonl" ) tf_idf_dict = list_dict_data_tool.list_to_dict(d_tf_idf, 'id') r_list = [] ner_set = get_title_entity_set() g_score_dict = dict() load_from_file( g_score_dict, config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt") keyword_processor = KeywordProcessor(case_sensitive=True) keyword_processor_disamb = KeywordProcessor(case_sensitive=True) print("Build Processor") for kw in tqdm(ner_set): if filter_word(kw) or filter_document_id(kw): continue # if the keyword is filtered by above function or is stopwords else: # matched_key_word is the original matched span. we need to save it for group ordering. matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info={kw: 'kwm'}) keyword_processor.add_keyword(kw, matched_obj) for kw in wiki_util.title_entities_set.disambiguation_group: if filter_word(kw) or filter_document_id(kw): continue # if the keyword is filtered by above function or is stopwords else: if kw in keyword_processor: # if the kw existed in the kw_processor, we update its dict to add more disamb items existing_matched_obj: _MatchedObject = keyword_processor.get_keyword( kw) for disamb_kw in wiki_util.title_entities_set.disambiguation_group[ kw]: if filter_document_id(disamb_kw): continue if disamb_kw not in existing_matched_obj.matched_keywords_info: existing_matched_obj.matched_keywords_info[ disamb_kw] = 'kwm_disamb' else: # If not we add it to the keyword_processor_disamb, which is set to be lower priority # new_dict = dict() matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info=dict()) for disamb_kw in wiki_util.title_entities_set.disambiguation_group[ kw]: if filter_document_id(disamb_kw): continue matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb' # new_dict[disamb_kw] = 'kwm_disamb' keyword_processor_disamb.add_keyword(kw, matched_obj) for item in tqdm(d_list): cur_id = str(item['id']) query = item['claim'] query_terms = get_query_ngrams(query) valid_query_terms = [ term for term in query_terms if term in g_score_dict ] retrieved_set = RetrievedSet() # print(tf_idf_doc_list) get_kw_matching_results(query, valid_query_terms, retrieved_set, match_filtering_k, g_score_dict, keyword_processor, keyword_processor_disamb) tf_idf_doc_list = tf_idf_dict[cur_id]['retrieved_list'] added_count = 0 for score, title in sorted(tf_idf_doc_list, key=lambda x: x[0], reverse=True)[:term_retrieval_top_k + 3]: if not filter_word(title) and not filter_document_id( title) and not title.startswith('List of '): retrieved_set.add_item(RetrievedItem(title, 'tf-idf')) added_count += 1 if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k: break predicted_docids = retrieved_set.to_id_list() # print(retrieved_set) # print(item['claim'], predicted_docids) r_item = dict() r_item['id'] = int(cur_id) r_item['claim'] = item['claim'] r_item['predicted_docids'] = predicted_docids if tag != 'test': r_item['label'] = item['label'] r_list.append(r_item) # r_list = 
common.load_jsonl('dev-debug.jsonl') # We need to modify the existing retrieved document for naming consistency for i, item in enumerate(r_list): predicted_docids = item['predicted_docids'] modified_docids = [] for docid in predicted_docids: docid = docid.replace(' ', '_') docid = reverse_convert_brc(docid) modified_docids.append(docid) item['predicted_docids'] = modified_docids # Modify finished # print(r_list[0:10]) len_list = [] for rset in r_list: len_list.append(len(rset['predicted_docids'])) print(collections.Counter(len_list).most_common(10000)) print(np.mean(len_list)) print(np.std(len_list)) print(np.max(len_list)) print(np.min(len_list)) common.save_jsonl( r_list, f'fever_term_based_retri_results_' f'{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl' ) mode = {'standard': False, 'check_doc_id_correct': True} # fever_scorer.fever_score_analysis(r_list, d_list, mode=mode, max_evidence=None) fever_scorer.fever_score(r_list, d_list, mode=mode, max_evidence=None)
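# Hedged usage sketch: run the keyword + tf-idf document retrieval above for each FEVER split
# with the defaults from the function signature.
# for split in ('dev', 'train', 'test'):
#     fever_retrieval_v0(term_retrieval_top_k=3, match_filtering_k=2, tag=split)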
def train_fever_std_ema_v1(resume_model=None, do_analysis=False): """ This method is created on 26 Nov 2018 08:50 with the purpose of training vc and ss all together. :param resume_model: :param wn_feature: :return: """ num_epoch = 200 seed = 12 batch_size = 32 lazy = True train_prob_threshold = 0.02 train_sample_top_k = 8 dev_prob_threshold = 0.1 dev_sample_top_k = 5 top_k_doc = 5 schedule_sample_dict = defaultdict(lambda: 0.1) ratio_ss_for_vc = 0.2 schedule_sample_dict.update({ 0: 0.1, 1: 0.1, # 200k + 400K 2: 0.1, 3: 0.1, # 200k + 200k ~ 200k + 100k 4: 0.1, 5: 0.1, # 200k + 100k 6: 0.1 # 20k + 20k }) # Eval at beginning of the training. eval_full_epoch = 1 eval_nei_epoches = [2, 3, 4, 5, 6, 7] neg_only = False debug = False experiment_name = f"vc_ss_v17_ratio_ss_for_vc:{ratio_ss_for_vc}|t_prob:{train_prob_threshold}|top_k:{train_sample_top_k}_scheduled_neg_sampler" # resume_model = None print("Do EMA:") print("Dev prob threshold:", dev_prob_threshold) print("Train prob threshold:", train_prob_threshold) print("Train sample top k:", train_sample_top_k) # Get upstream sentence document retrieval data dev_doc_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl" train_doc_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/train_doc.jsonl" complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL, dev_doc_upstream_file, pred=True, top_k=top_k_doc) complete_upstream_train_data = get_full_list(config.T_FEVER_TRAIN_JSONL, train_doc_upstream_file, pred=False, top_k=top_k_doc) if debug: complete_upstream_dev_data = complete_upstream_dev_data[:1000] complete_upstream_train_data = complete_upstream_train_data[:1000] print("Dev size:", len(complete_upstream_dev_data)) print("Train size:", len(complete_upstream_train_data)) # Prepare Data token_indexers = { 'tokens': SingleIdTokenIndexer(namespace='tokens'), # This is the raw tokens 'elmo_chars': ELMoTokenCharactersIndexer( namespace='elmo_characters') # This is the elmo_characters } # Data Reader dev_fever_data_reader = VCSS_Reader(token_indexers=token_indexers, lazy=lazy, max_l=260) train_fever_data_reader = VCSS_Reader(token_indexers=token_indexers, lazy=lazy, max_l=260) # Load Vocabulary biterator = BasicIterator(batch_size=batch_size) vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic") vocab.add_token_to_namespace('true', namespace='labels') vocab.add_token_to_namespace('false', namespace='labels') vocab.add_token_to_namespace("hidden", namespace="labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels') print(vocab.get_token_to_index_vocabulary('labels')) print(vocab.get_vocab_size('tokens')) biterator.index_with(vocab) # Reader and prepare end vc_ss_training_sampler = VCSSTrainingSampler(complete_upstream_train_data) vc_ss_training_sampler.show_info() # Build Model device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0) device_num = -1 if device.type == 'cpu' else 0 model = Model(rnn_size_in=(1024 + 300 + 1, 1024 + 450 + 1), rnn_size_out=(450, 450), weight=weight_dict['glove.840B.300d'], vocab_size=vocab.get_vocab_size('tokens'), mlp_d=900, embedding_dim=300, max_l=300, num_of_class=4) print("Model Max length:", model.max_l) if resume_model is not None: model.load_state_dict(torch.load(resume_model)) model.display() model.to(device) cloned_empty_model = copy.deepcopy(model) ema: EMA = EMA(parameters=model.named_parameters()) # Create Log File file_path_prefix, date = 
save_tool.gen_file_prefix(f"{experiment_name}") # Save the source code. script_name = os.path.basename(__file__) with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it: out_f.write(it.read()) out_f.flush() analysis_dir = None if do_analysis: analysis_dir = Path(file_path_prefix) / "analysis_aux" analysis_dir.mkdir() # Save source code end. # Staring parameter setup best_dev = -1 iteration = 0 start_lr = 0.0001 optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=start_lr) criterion = nn.CrossEntropyLoss() # parameter setup end for i_epoch in range(num_epoch): print("Resampling...") # This is for train # This is for sample candidate data for from result of ss for vc. # This we will need to do after each epoch. if i_epoch == eval_full_epoch: # only eval at 1 print("We now need to eval the whole training set.") print("Be patient and hope good luck!") load_ema_to_model(cloned_empty_model, ema) eval_sent_for_sampler(cloned_empty_model, token_indexers, vocab, vc_ss_training_sampler) elif i_epoch in eval_nei_epoches: # at 2, 3, 4 eval for NEI print("We now need to eval the NEI training set.") print("Be patient and hope good luck!") load_ema_to_model(cloned_empty_model, ema) eval_sent_for_sampler(cloned_empty_model, token_indexers, vocab, vc_ss_training_sampler, nei_only=True) train_data_with_candidate_sample_list = vc_ss.data_wrangler.sample_sentences_for_vc_with_nei( config.T_FEVER_TRAIN_JSONL, vc_ss_training_sampler.sent_list, train_prob_threshold, train_sample_top_k) # We initialize the prob for each sentence so the sampler can work, but we will need to run the model for dev data to work. train_selection_dict = paired_selection_score_dict( vc_ss_training_sampler.sent_list) cur_train_vc_data = adv_simi_sample_with_prob_v1_1( config.T_FEVER_TRAIN_JSONL, train_data_with_candidate_sample_list, train_selection_dict, tokenized=True) if do_analysis: # Customized analysis output common.save_jsonl( vc_ss_training_sampler.sent_list, analysis_dir / f"E_{i_epoch}_whole_train_sent_{save_tool.get_cur_time_str()}.jsonl" ) common.save_jsonl( train_data_with_candidate_sample_list, analysis_dir / f"E_{i_epoch}_sampled_train_sent_{save_tool.get_cur_time_str()}.jsonl" ) common.save_jsonl( cur_train_vc_data, analysis_dir / f"E_{i_epoch}_train_vc_data_{save_tool.get_cur_time_str()}.jsonl" ) print(f"E{i_epoch} VC_data:", len(cur_train_vc_data)) # This is for sample negative candidate data for ss # After sampling, we decrease the ratio. 
neg_sample_upper_prob = schedule_sample_dict[i_epoch] print("Neg Sampler upper rate:", neg_sample_upper_prob) # print("Rate decreasing") # neg_sample_upper_prob -= decay_r neg_sample_upper_prob = max(0.000, neg_sample_upper_prob) cur_train_ss_data = vc_ss_training_sampler.sample_for_ss( neg_only=neg_only, upper_prob=neg_sample_upper_prob) if i_epoch >= 1: # if epoch num >= 6 we balance pos and neg example for selection # new_ss_data = [] pos_ss_data = [] neg_ss_data = [] for item in cur_train_ss_data: if item['selection_label'] == 'true': pos_ss_data.append(item) elif item['selection_label'] == 'false': neg_ss_data.append(item) ss_sample_size = min(len(pos_ss_data), len(neg_ss_data)) random.shuffle(pos_ss_data) random.shuffle(neg_ss_data) cur_train_ss_data = pos_ss_data[:int( ss_sample_size * 0.5)] + neg_ss_data[:ss_sample_size] random.shuffle(cur_train_ss_data) vc_ss_training_sampler.show_info(cur_train_ss_data) print(f"E{i_epoch} SS_data:", len(cur_train_ss_data)) vc_ss.data_wrangler.assign_task_label(cur_train_ss_data, 'ss') vc_ss.data_wrangler.assign_task_label(cur_train_vc_data, 'vc') vs_ss_train_list = cur_train_ss_data + cur_train_vc_data random.shuffle(vs_ss_train_list) print(f"E{i_epoch} Total ss+vc:", len(vs_ss_train_list)) vc_ss_instance = train_fever_data_reader.read(vs_ss_train_list) train_iter = biterator(vc_ss_instance, shuffle=True, num_epochs=1) for i, batch in tqdm(enumerate(train_iter)): model.train() out = model(batch) if i_epoch >= 1: ratio_ss_for_vc = 0.8 loss = compute_mixing_loss( model, out, batch, criterion, vc_ss_training_sampler, ss_for_vc_prob=ratio_ss_for_vc) # Important change # No decay optimizer.zero_grad() loss.backward() optimizer.step() iteration += 1 # EMA update ema(model.named_parameters()) if i_epoch < 9: mod = 10000 # mod = 100 else: mod = 2000 if iteration % mod == 0: # This is the code for eval: load_ema_to_model(cloned_empty_model, ema) vc_ss.data_wrangler.assign_task_label( complete_upstream_dev_data, 'ss') dev_ss_instance = dev_fever_data_reader.read( complete_upstream_dev_data) eval_ss_iter = biterator(dev_ss_instance, num_epochs=1, shuffle=False) scored_dev_sent_data = hidden_eval_ss( cloned_empty_model, eval_ss_iter, complete_upstream_dev_data) # for vc filtered_dev_list = vc_ss.data_wrangler.sample_sentences_for_vc_with_nei( config.T_FEVER_DEV_JSONL, scored_dev_sent_data, dev_prob_threshold, dev_sample_top_k) dev_selection_dict = paired_selection_score_dict( scored_dev_sent_data) ready_dev_list = select_sent_with_prob_for_eval( config.T_FEVER_DEV_JSONL, filtered_dev_list, dev_selection_dict, tokenized=True) vc_ss.data_wrangler.assign_task_label(ready_dev_list, 'vc') dev_vc_instance = dev_fever_data_reader.read(ready_dev_list) eval_vc_iter = biterator(dev_vc_instance, num_epochs=1, shuffle=False) eval_dev_result_list = hidden_eval_vc(cloned_empty_model, eval_vc_iter, ready_dev_list) # Scoring eval_mode = {'check_sent_id_correct': True, 'standard': True} strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score( eval_dev_result_list, common.load_jsonl(config.T_FEVER_DEV_JSONL), mode=eval_mode, verbose=False) print("Fever Score(Strict/Acc./Precision/Recall/F1):", strict_score, acc_score, pr, rec, f1) print(f"Dev:{strict_score}/{acc_score}") if do_analysis: # Customized analysis output common.save_jsonl( scored_dev_sent_data, analysis_dir / f"E_{i_epoch}_scored_dev_sent_{save_tool.get_cur_time_str()}.jsonl" ) common.save_jsonl( eval_dev_result_list, analysis_dir / f"E_{i_epoch}_eval_vc_output_data_{save_tool.get_cur_time_str()}.jsonl" ) 
need_save = False if strict_score > best_dev: best_dev = strict_score need_save = True if need_save or i_epoch < 7: # save_path = os.path.join( # file_path_prefix, # f'i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_seed({seed})' # ) # torch.save(model.state_dict(), save_path) ema_save_path = os.path.join( file_path_prefix, f'ema_i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_p({pr})_r({rec})_f1({f1})_seed({seed})' ) save_ema_to_file(ema, ema_save_path)
    kw_terms = list(kw_terms)
    kw_terms_total_size = len(kw_terms)

    for start in range(0, kw_terms_total_size, chuck_size):
        print(start, start + chuck_size)
        current_kw_terms = kw_terms[start:start + chuck_size]
        keyword_processor = KeywordProcessor(case_sensitive=True)
        for word in tqdm(current_kw_terms):
            keyword_processor.add_keyword(word)

        for item in tqdm(d_list):
            query = item['question']
            terms = query_get_terms(query, keyword_processor)
            if 'kw_matches' not in item:
                item['kw_matches'] = []
            item['kw_matches'].extend(terms)

        del keyword_processor

    return d_list


if __name__ == '__main__':
    kw_terms = load_terms(config.PDATA_ROOT / "reverse_indexing/terms.txt")
    # d_list = common.load_json(config.DEV_FULLWIKI_FILE)
    d_list = common.load_json(config.TRAIN_FILE)
    d_list = get_kwterm_matching(kw_terms, d_list)
    # common.save_jsonl(d_list, config.RESULT_PATH / "kw_term_match_result/dev_term_match_result.jsonl")
    common.save_jsonl(d_list,
                      config.RESULT_PATH / "kw_term_match_result/train_term_match_result.jsonl")
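# Hedged note on the chunking above: the full term list is added to the KeywordProcessor in
# slices of `chuck_size` and the processor is rebuilt per slice (presumably to bound memory),
# so each question accumulates its matches across chunks in item['kw_matches'].
# A commented usage sketch for the dev split, mirroring the commented-out lines in __main__:
# d_list = common.load_json(config.DEV_FULLWIKI_FILE)
# d_list = get_kwterm_matching(kw_terms, d_list)
# common.save_jsonl(d_list, config.RESULT_PATH / "kw_term_match_result/dev_term_match_result.jsonl")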
def eval_for_remaining(): batch_size = 128 lazy = True # SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-16-11:37:07_simple_nn/i(25000)_epoch(1)_(tra_score:0.8188318831883188|clf_acc:95.67680650034835|pr:0.7394326932693269|rec:0.7282478247824783|f1:0.7337976403219241|loss:0.11368581993118955)" SAVE_PATH = config.PRO_ROOT / "saved_models/saved_sselector/i(57167)_epoch(6)_(tra_score:0.8850885088508851|raw_acc:1.0|pr:0.3834395939593578|rec:0.8276327632763276|f1:0.5240763176570098)_epoch" # SAVE_PATH = config.PRO_ROOT / "saved_models/07-20-01:35:16_simple_nn_startkp_0.4_de_0.05/i(53810)_epoch(4)_(tra_score:0.8577357735773578|raw_acc:1.0|pr:0.671477147714762|rec:0.7866036603660366|f1:0.7244953493898653)_epoch" print("Model From:", SAVE_PATH) # dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl" # dev_upstream_file = config.RESULT_PATH / "doc_retri/cn_util_Jul17_docretri.singularize/dev.jsonl" # dev_upstream_file = config.RESULT_PATH / "doc_retri/docretri.pageview/dev.jsonl" # dev_upstream_file = config.RESULT_PATH / "doc_retri/docretri.pageview/train.jsonl" # # SAVE_RESULT_TARGET_FOLDER.mkdir() incoming_data_file = config.RESULT_PATH / "sent_retri_nn/remaining_training_cache/dev_s.jsonl" incoming_data = common.load_jsonl(incoming_data_file) SAVE_RESULT_TARGET_FOLDER = config.RESULT_PATH / "sent_retri_nn/remaining_training_cache" # out_file_name = "dev_sent.jsonl" out_file_name = "remain_dev_sent.jsonl" # Prepare Data token_indexers = { 'tokens': SingleIdTokenIndexer(namespace='tokens'), # This is the raw tokens 'elmo_chars': ELMoTokenCharactersIndexer( namespace='elmo_characters') # This is the elmo_characters } dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy) # complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL, dev_upstream_file, pred=True) # complete_upstream_dev_data = get_full_list(config.T_FEVER_TRAIN_JSONL, dev_upstream_file, pred=True) print("Dev size:", len(incoming_data)) dev_instances = dev_fever_data_reader.read(incoming_data) # Load Vocabulary dev_biterator = BasicIterator(batch_size=batch_size) vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic") # THis is important vocab.add_token_to_namespace("true", namespace="selection_labels") vocab.add_token_to_namespace("false", namespace="selection_labels") vocab.add_token_to_namespace("hidden", namespace="selection_labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels') # Label value vocab.get_index_to_token_vocabulary('selection_labels') print(vocab.get_token_to_index_vocabulary('selection_labels')) print(vocab.get_vocab_size('tokens')) dev_biterator.index_with(vocab) # exit(0) # Build Model device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0) device_num = -1 if device.type == 'cpu' else 0 model = Model(weight=weight_dict['glove.840B.300d'], vocab_size=vocab.get_vocab_size('tokens'), embedding_dim=300, max_l=300, num_of_class=2) model.load_state_dict(torch.load(SAVE_PATH)) model.display() model.to(device) eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num) complete_upstream_dev_data = hidden_eval(model, eval_iter, incoming_data) common.save_jsonl(complete_upstream_dev_data, SAVE_RESULT_TARGET_FOLDER / out_file_name) total = 0 hit = 0 for item in complete_upstream_dev_data: assert item['selection_label'] == 'true' if item['prob'] >= 0.5: hit += 1 total += 1 print(hit, total, hit / total)
def evaluation():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cpu", action="store_true", help="If set, we only use CPU.")
    parser.add_argument("--model_class_name", type=str, help="Set the model class of the experiment.", required=True)
    parser.add_argument("--model_checkpoint_path", type=str, help="Set the path to the model checkpoint.",
                        required=True)
    parser.add_argument("--output_prediction_path", type=str, default=None,
                        help="Set the path to save the prediction.")
    parser.add_argument("--per_gpu_eval_batch_size", default=16, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--max_length", default=156, type=int, help="Max length of the sequences.")
    parser.add_argument("--eval_data", type=str,
                        help="The evaluation data used in the experiments (comma-separated name:path pairs).")

    args = parser.parse_args()

    if args.cpu:
        args.global_rank = -1
    else:
        args.global_rank = 0

    model_checkpoint_path = args.model_checkpoint_path
    num_labels = 3  # we are doing NLI, so we set num_labels = 3; for other tasks this value can change.
    max_length = args.max_length

    model_class_item = MODEL_CLASSES[args.model_class_name]
    model_name = model_class_item['model_name']
    do_lower_case = model_class_item['do_lower_case'] if 'do_lower_case' in model_class_item else False

    tokenizer = model_class_item['tokenizer'].from_pretrained(model_name,
                                                              cache_dir=str(config.PRO_ROOT / "trans_cache"),
                                                              do_lower_case=do_lower_case)

    model = model_class_item['sequence_classification'].from_pretrained(model_name,
                                                                        cache_dir=str(config.PRO_ROOT / "trans_cache"),
                                                                        num_labels=num_labels)

    model.load_state_dict(torch.load(model_checkpoint_path))

    padding_token_value = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    padding_segement_value = model_class_item["padding_segement_value"]
    padding_att_value = model_class_item["padding_att_value"]
    left_pad = model_class_item['left_pad'] if 'left_pad' in model_class_item else False

    batch_size_per_gpu_eval = args.per_gpu_eval_batch_size

    eval_data_str = args.eval_data
    eval_data_name = []
    eval_data_path = []
    eval_data_list = []

    eval_data_named_path = eval_data_str.split(',')

    for named_path in eval_data_named_path:
        ind = named_path.find(':')
        name = named_path[:ind]
        path = named_path[ind + 1:]  # fixed: was `name[ind + 1:]`, which is always empty.
        if name in registered_path:
            d_list = common.load_jsonl(registered_path[name])
        else:
            d_list = common.load_jsonl(path)
        eval_data_name.append(name)
        eval_data_path.append(path)
        eval_data_list.append(d_list)

    batching_schema = {
        'uid': RawFlintField(),
        'y': LabelFlintField(),
        'input_ids': ArrayIndexFlintField(pad_idx=padding_token_value, left_pad=left_pad),
        'token_type_ids': ArrayIndexFlintField(pad_idx=padding_segement_value, left_pad=left_pad),
        'attention_mask': ArrayIndexFlintField(pad_idx=padding_att_value, left_pad=left_pad),
    }

    data_transformer = NLITransform(model_name, tokenizer, max_length)

    eval_data_loaders = []
    for eval_d_list in eval_data_list:
        d_dataset, d_sampler, d_dataloader = build_eval_dataset_loader_and_sampler(eval_d_list, data_transformer,
                                                                                   batching_schema,
                                                                                   batch_size_per_gpu_eval)
        eval_data_loaders.append(d_dataloader)

    if not args.cpu:
        torch.cuda.set_device(0)
        model.cuda(0)

    r_dict = dict()
    # Eval loop:
    for i in range(len(eval_data_name)):
        cur_eval_data_name = eval_data_name[i]
        cur_eval_data_list = eval_data_list[i]
        cur_eval_dataloader = eval_data_loaders[i]
        # cur_eval_raw_data_list = eval_raw_data_list[i]

        evaluation_dataset(args, cur_eval_dataloader, cur_eval_data_list, model, r_dict,
                           eval_name=cur_eval_data_name)

    # save prediction:
    if args.output_prediction_path is not None:
        cur_results_path = Path(args.output_prediction_path)
        if not cur_results_path.exists():
            cur_results_path.mkdir(parents=True)
        for key, item in r_dict.items():
            common.save_jsonl(item['predictions'], cur_results_path / f"{key}.jsonl")

        # avoid saving too many things
        for key, item in r_dict.items():
            del r_dict[key]['predictions']
        common.save_json(r_dict, cur_results_path / "results_dict.json", indent=2)
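# The --eval_data argument above is parsed as comma-separated "name:path" pairs; a name that
# appears in `registered_path` takes precedence over the supplied path. A hedged example
# (dataset names and file locations are illustrative only):
#   --eval_data snli_dev:path/to/snli/dev.jsonl,mnli_m_dev:path/to/mnli/m_dev.jsonl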
def eval_model_for_downstream(model_saved_path): seed = 12 torch.manual_seed(seed) bert_model_name = 'bert-base-uncased' # lazy = False lazy = True forward_size = 32 # batch_size = 64 batch_size = 128 do_lower_case = True debug_mode = False # est_datasize = 900_000 num_class = 1 # num_train_optimization_steps device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device_num = 0 if torch.cuda.is_available() else -1 n_gpu = torch.cuda.device_count() unk_token_num = {'tokens': 1} # work around for initiating vocabulary. vocab = ExVocabulary(unk_token_num=unk_token_num) vocab.add_token_to_namespace("false", namespace="labels") # 0 vocab.add_token_to_namespace("true", namespace="labels") # 1 vocab.add_token_to_namespace("hidden", namespace="labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels') # Load Dataset train_list = common.load_json(config.TRAIN_FILE) dev_list = common.load_json(config.DEV_FULLWIKI_FILE) dev_fitems_list = common.load_jsonl( config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl") train_fitems_list = common.load_jsonl( config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl") test_fitems_list = common.load_jsonl( config.PDATA_ROOT / "content_selection_forward" / "hotpot_test_p_level_unlabeled.jsonl") if debug_mode: dev_list = dev_list[:10] dev_fitems_list = dev_fitems_list[:296] train_fitems_list = train_fitems_list[:300] eval_frequency = 2 # print(dev_list[-1]['_id']) # exit(0) dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id') train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id') bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case) bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True, example_filter=lambda x: len(x['context']) == 0, max_l=286) bert_encoder = BertModel.from_pretrained(bert_model_name) model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1, act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True) model.load_state_dict(torch.load(model_saved_path)) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # dev_instances = bert_cs_reader.read(dev_fitems_list) train_instance = bert_cs_reader.read(train_fitems_list) test_instances = bert_cs_reader.read(test_fitems_list) biterator = BasicIterator(batch_size=forward_size) biterator.index_with(vocab) # train_iter = biterator(train_instance, num_epochs=1, shuffle=False) # dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False) test_iter = biterator(test_instances, num_epochs=1, shuffle=False) print(len(dev_fitems_list)) print(len(test_fitems_list)) print(len(train_fitems_list)) # cur_dev_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, show_progress=True) # cur_train_eval_results_list = eval_model(model, train_iter, device_num, with_probs=True, show_progress=True) cur_test_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, show_progress=True) common.save_jsonl(cur_test_eval_results_list, "test_p_level_bert_v1_results.jsonl") print("Test write finished.") exit(0) copied_dev_o_dict = copy.deepcopy(dev_o_dict) list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True) # Top_3 cur_results_dict_top3 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=3) upperbound_results_dict_top3 = 
append_gt_downstream_to_get_upperbound_from_doc_retri( cur_results_dict_top3, dev_list) # Top_5 cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5) upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri( cur_results_dict_top5, dev_list) cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10) upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri( cur_results_dict_top10, dev_list) _, metrics_top3 = ext_hotpot_eval.eval(cur_results_dict_top3, dev_list, verbose=False) _, metrics_top3_UB = ext_hotpot_eval.eval(upperbound_results_dict_top3, dev_list, verbose=False) _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False) _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False) _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False) _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False) logging_item = { 'top3': metrics_top3, 'top3_UB': metrics_top3_UB, 'top5': metrics_top5, 'top5_UB': metrics_top5_UB, 'top10': metrics_top10, 'top10_UB': metrics_top10_UB, } print(logging_item) common.save_jsonl(cur_train_eval_results_list, "train_p_level_bert_v1_results.jsonl") common.save_jsonl(cur_dev_eval_results_list, "dev_p_level_bert_v1_results.jsonl")
def analysis_model(model_path): batch_size = 32 lazy = True train_prob_threshold = 0.02 train_sample_top_k = 8 dev_prob_threshold = 0.1 dev_sample_top_k = 5 neg_sample_upper_prob = 0.006 decay_r = 0.002 top_k_doc = 5 dev_doc_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl" complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL, dev_doc_upstream_file, pred=True, top_k=top_k_doc) print("Dev size:", len(complete_upstream_dev_data)) # Prepare Data token_indexers = { 'tokens': SingleIdTokenIndexer(namespace='tokens'), # This is the raw tokens 'elmo_chars': ELMoTokenCharactersIndexer( namespace='elmo_characters') # This is the elmo_characters } # Data Reader dev_fever_data_reader = VCSS_Reader(token_indexers=token_indexers, lazy=lazy, max_l=260) # Load Vocabulary biterator = BasicIterator(batch_size=batch_size) vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic") vocab.add_token_to_namespace('true', namespace='labels') vocab.add_token_to_namespace('false', namespace='labels') vocab.add_token_to_namespace("hidden", namespace="labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels') print(vocab.get_token_to_index_vocabulary('labels')) print(vocab.get_vocab_size('tokens')) biterator.index_with(vocab) # Reader and prepare end # vc_ss_training_sampler = VCSSTrainingSampler(complete_upstream_train_data) # vc_ss_training_sampler.show_info() # Build Model device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0) device_num = -1 if device.type == 'cpu' else 0 model = Model(rnn_size_in=(1024 + 300 + 1, 1024 + 450 + 1), rnn_size_out=(450, 450), weight=weight_dict['glove.840B.300d'], vocab_size=vocab.get_vocab_size('tokens'), mlp_d=900, embedding_dim=300, max_l=300) print("Model Max length:", model.max_l) model.display() model.to(device) cloned_empty_model = copy.deepcopy(model) load_ema_to_model(cloned_empty_model, model_path) vc_ss.data_wrangler.assign_task_label(complete_upstream_dev_data, 'ss') dev_ss_instance = dev_fever_data_reader.read(complete_upstream_dev_data) eval_ss_iter = biterator(dev_ss_instance, num_epochs=1, shuffle=False) scored_dev_sent_data = hidden_eval_ss(cloned_empty_model, eval_ss_iter, complete_upstream_dev_data) common.save_jsonl(scored_dev_sent_data, "dev_scored_sent_data.jsonl") # for vc filtered_dev_list = vc_ss.data_wrangler.sample_sentences_for_vc_with_nei( config.T_FEVER_DEV_JSONL, scored_dev_sent_data, dev_prob_threshold, dev_sample_top_k) common.save_jsonl(filtered_dev_list, "dev_scored_sent_data_after_sample.jsonl") dev_selection_dict = paired_selection_score_dict(scored_dev_sent_data) ready_dev_list = select_sent_with_prob_for_eval(config.T_FEVER_DEV_JSONL, filtered_dev_list, dev_selection_dict, tokenized=True) vc_ss.data_wrangler.assign_task_label(ready_dev_list, 'vc') dev_vc_instance = dev_fever_data_reader.read(ready_dev_list) eval_vc_iter = biterator(dev_vc_instance, num_epochs=1, shuffle=False) eval_dev_result_list = hidden_eval_vc(cloned_empty_model, eval_vc_iter, ready_dev_list) common.save_jsonl(eval_dev_result_list, "dev_nli_results.jsonl") # Scoring eval_mode = {'check_sent_id_correct': True, 'standard': True} strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score( eval_dev_result_list, common.load_jsonl(config.T_FEVER_DEV_JSONL), mode=eval_mode, verbose=False) print("Fever Score(Strict/Acc./Precision/Recall/F1):", strict_score, acc_score, pr, rec, f1) print(f"Dev:{strict_score}/{acc_score}")
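# Illustrative sketch of the threshold-plus-top-k sentence filtering applied above before the
# verification stage (dev_prob_threshold / dev_sample_top_k); the 'prob' and 'sid' field names are
# assumptions for illustration.
def filter_scored_sentences(scored_sents, prob_threshold=0.1, top_k=5):
    kept = [s for s in scored_sents if s['prob'] >= prob_threshold]
    kept.sort(key=lambda s: s['prob'], reverse=True)
    return kept[:top_k]


# e.g. filter_scored_sentences([{'sid': 'Page_0', 'prob': 0.92}, {'sid': 'Page_3', 'prob': 0.05}])
#      keeps only the first sentence at the default 0.1 threshold.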
def hidden_eval_fever_adv_v1(): batch_size = 64 lazy = True dev_prob_threshold = 0.5 SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-20-22:28:24_mesim_wn_450_adv_sample_v1_|t_prob:0.35|top_k:8/i(46000)_epoch(7)_dev(0.6405140514051405)_loss(1.0761665150348825)_seed(12)" dev_upstream_sent_list = common.load_jsonl( config.RESULT_PATH / "sent_retri_nn/2018_07_20_15:17:59_r/dev_sent.jsonl") # Prepare Data token_indexers = { 'tokens': SingleIdTokenIndexer(namespace='tokens'), # This is the raw tokens 'elmo_chars': ELMoTokenCharactersIndexer( namespace='elmo_characters') # This is the elmo_characters } p_dict = wn_persistent_api.persistence_load() upstream_dev_list = score_converter_scaled(config.T_FEVER_DEV_JSONL, dev_upstream_sent_list, scale_prob=dev_prob_threshold, delete_prob=False) dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=360) complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, upstream_dev_list) dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data) # Load Vocabulary biterator = BasicIterator(batch_size=batch_size) vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic") vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels') print(vocab.get_token_to_index_vocabulary('labels')) print(vocab.get_vocab_size('tokens')) biterator.index_with(vocab) # Build Model device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0) device_num = -1 if device.type == 'cpu' else 0 model = Model( rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size, 1024 + 450), rnn_size_out=(450, 450), weight=weight_dict['glove.840B.300d'], vocab_size=vocab.get_vocab_size('tokens'), mlp_d=900, embedding_dim=300, max_l=300) print("Model Max length:", model.max_l) model.load_state_dict(torch.load(SAVE_PATH)) model.display() model.to(device) eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num) builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data) eval_mode = {'check_sent_id_correct': True, 'standard': True} common.save_jsonl( builded_dev_data, config.RESULT_PATH / "nli_results" / "pipeline_results_1.jsonl") c_scorer.delete_label(builded_dev_data) print( c_scorer.fever_score(builded_dev_data, common.load_jsonl(config.FEVER_DEV_JSONL), mode=eval_mode))
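# A hedged sketch of the prediction format written to nli_results/pipeline_results_1.jsonl above;
# the exact field set ('id', 'predicted_label', 'predicted_evidence') is an assumption based on the
# FEVER-style scoring interface used in this file, not a confirmed spec.
import json


def write_fever_predictions(items, out_path):
    with open(out_path, 'w', encoding='utf-8') as out_f:
        for it in items:
            out_f.write(json.dumps({
                'id': it['id'],
                'predicted_label': it['predicted_label'],
                'predicted_evidence': it['predicted_evidence'],  # list of [page_id, sentence_index] pairs
            }) + '\n')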
def merge_results_with_haonao_module(term_retrieval_top_k=3, match_filtering_k=2, haonan_topk=10, tag='dev', save=False): if tag == 'dev': d_list = common.load_jsonl(config.FEVER_DEV) task_name = 'shared_task_dev' elif tag == 'train': d_list = common.load_jsonl(config.FEVER_TRAIN) task_name = 'train' elif tag == 'test': d_list = common.load_jsonl(config.FEVER_TEST) task_name = 'shared_task_test' else: raise ValueError(f"Tag:{tag} not supported.") # r_list = common.load_jsonl(config.RESULT_PATH / f'doc_retri_results/fever_results/standard_term_based_results/' # f'fever_term_based_retri_results_{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl') r_list = common.load_jsonl( config.RESULT_PATH / f'doc_retri_results/fever_results/standard_term_based_results/' f'fever_term_based_retri_results_{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl' ) old_result_list = common.load_jsonl( config.RESULT_PATH / f"doc_retri_results/fever_results/haonans_results/dr_{tag}.jsonl") item_resorting(old_result_list, top_k=haonan_topk) old_result_dict = list_dict_data_tool.list_to_dict(old_result_list, 'id') for i, item in enumerate(r_list): predicted_docids = item['predicted_docids'] modified_docids = [] for docid in predicted_docids: docid = docid.replace(' ', '_') docid = reverse_convert_brc(docid) modified_docids.append(docid) item['predicted_docids'] = modified_docids # item['predicted_docids'] = [] merged_result_list = [] for item in tqdm(r_list): cur_id = int(item['id']) old_retrieval_doc = old_result_dict[cur_id]['predicted_docids'] new_retrieval_doc = item['predicted_docids'] m_predicted_docids = set.union(set(old_retrieval_doc), set(new_retrieval_doc)) # print(m_predicted_docids) m_predicted_docids = [ docid for docid in m_predicted_docids if not docid.startswith('List_of_') ] item['predicted_docids'] = list(m_predicted_docids) # print(item['predicted_docids']) mode = {'standard': False, 'check_doc_id_correct': True} if tag != 'test': fever_scorer.fever_score_analysis(r_list, d_list, mode=mode, max_evidence=None) if save: print("Saved to:") common.save_jsonl( r_list, config.RESULT_PATH / f"doc_retri_results/fever_results/merged_doc_results/m_doc_{tag}.jsonl" ) # States information. len_list = [] for rset in r_list: len_list.append(len(rset['predicted_docids'])) print(collections.Counter(len_list).most_common(10000)) print(np.mean(len_list)) print(np.std(len_list)) print(np.max(len_list)) print(np.min(len_list))
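# Minimal sketch of the merging step above: union the doc ids from the term-based run and the
# upstream (haonan) run, then drop "List_of_" pages. Purely illustrative helper, not the repo's code.
def merge_docids(term_based_docids, upstream_docids):
    merged = set(term_based_docids) | set(upstream_docids)
    return [docid for docid in merged if not docid.startswith('List_of_')]


# e.g. merge_docids(['Barack_Obama', 'List_of_presidents'], ['Barack_Obama', 'White_House'])
#      -> ['Barack_Obama', 'White_House'] (set order may vary)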
del item['evidence'] if __name__ == '__main__': # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_15-11-11_r/dev_sent.jsonl" # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_15-52-19_r/dev_sent.jsonl" IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_20_15-17-59_r/dev_sent.jsonl" # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_20_15-17-59_r/train_sent.jsonl" dev_sent_result_list = common.load_jsonl(IN_FILE) dev_results_list = score_converter_scaled(config.T_FEVER_DEV_JSONL, dev_sent_result_list, scale_prob=0.1) # dev_results_list = score_converter_scaled(config.T_FEVER_TRAIN_JSONL, dev_sent_result_list, scale_prob=0.1) common.save_jsonl( dev_results_list, config.RESULT_PATH / "sent_retri_nn/2018_07_20_15-17-59_r/dev_scale(0.1).jsonl") # for item in dev_results_list: # print(item['scored_sentids']) # common.save_jsonl(dev_results_list, "/Users/Eason/RA/FunEver/results/sent_retri_nn/2018_07_17_16-34-19_r/dev_scale(0.1).jsonl") # eval_mode = {'check_doc_id_correct': True, 'check_sent_id_correct': True, 'standard': True} eval_mode = {'check_sent_id_correct': True, 'standard': True} # c_scorer.delete_label(dev_results_list) strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score( dev_results_list, common.load_jsonl(config.FEVER_DEV_UNLABELED_JSONL), mode=eval_mode, verbose=False) print("Fever Score(Strict/Acc./Precision/Recall/F1):", strict_score, acc_score, pr, rec, f1)
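# An illustrative re-implementation of the "strict score" idea checked above via c_scorer.fever_score:
# the label must match and, for verifiable claims, at least one complete gold evidence group must be
# covered by the predicted evidence. This is a sketch for clarity, not the repo's scorer (which also
# caps how many predicted evidence items are counted).
def strict_correct(pred_label, pred_evidence, gold_label, gold_evidence_groups):
    if pred_label != gold_label:
        return False
    if gold_label == 'NOT ENOUGH INFO':  # NEI claims carry no evidence requirement
        return True
    pred_set = {tuple(ev) for ev in pred_evidence}
    return any(all(tuple(ev) in pred_set for ev in group) for group in gold_evidence_groups)


# e.g. strict_correct('SUPPORTS', [('Page_A', 0)], 'SUPPORTS', [[('Page_A', 0)], [('Page_B', 2)]]) -> True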
def eval_m_on_sselection(model_path): # This method is created on 25 Nov 2018 09:32 to use the claim verifier model to do scoring for sentence selection. batch_size = 32 lazy = True top_k_doc = 5 save_file_name = "/home/easonnie/projects/FunEver/results/sent_retri_nn/bert_verification_for_selection_probing_11_25_2018/dev_sent_scores.txt" dev_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl" complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL, dev_upstream_file, pred=True, top_k=top_k_doc) debug = None bert_type_name = "bert-large-uncased" bert_servant = BertServant(bert_type_name=bert_type_name) # train_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/train_doc.jsonl" # train_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=180) # dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=False) dev_fever_data_reader = BertSSReader(bert_servant, lazy=lazy, max_l=80) print("Dev size:", len(complete_upstream_dev_data)) # dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data) if debug is not None: complete_upstream_dev_data = complete_upstream_dev_data[:debug] dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data) # Load Vocabulary biterator = BasicIterator(batch_size=batch_size) unk_token_num = {'tokens': 2600} # work around for initiating vocabulary. vocab = ExVocabulary(unk_token_num=unk_token_num) vocab.add_token_to_namespace('SUPPORTS', namespace='labels') vocab.add_token_to_namespace('REFUTES', namespace='labels') vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels') vocab.add_token_to_namespace('true', namespace='labels') vocab.add_token_to_namespace('false', namespace='labels') vocab.add_token_to_namespace("hidden", namespace="labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels') print(vocab) biterator.index_with(vocab) # Build Model device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0) device_num = -1 if device.type == 'cpu' else 0 bert_servant.bert_model.to(device) # Init model here model = Model( bert_servant, bert_batch_size=1, rnn_size_in=(1024 + 2, 1024 + 2 + 300), # probs + task indicator. rnn_size_out=(300, 300), max_l=250, mlp_d=300, num_of_class=3, drop_r=0.5, activation_type='gelu') model.load_state_dict(torch.load(model_path)) model.to(device) eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1) dev_scored_data = hidden_eval_on_sselection(model, eval_iter, complete_upstream_dev_data) common.save_jsonl(dev_scored_data, save_file_name)
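# A generic sketch of the scoring loop hidden behind hidden_eval_on_sselection above: run the model
# in eval mode without gradients, collect one probability per example, and attach it back onto the
# raw item list so downstream sampling can read it. The toy model and field names are placeholders.
import torch


def score_items(model, batches, items, prob_field='prob'):
    model.eval()
    probs = []
    with torch.no_grad():
        for batch in batches:               # each batch: a dict holding an input tensor
            logits = model(batch['input'])  # (batch_size, 1) logits for a binary selector
            probs.extend(torch.sigmoid(logits).view(-1).tolist())
    assert len(probs) == len(items)
    for item, p in zip(items, probs):
        item[prob_field] = p
    return items


# usage with a toy selector:
#   score_items(torch.nn.Linear(4, 1), [{'input': torch.randn(3, 4)}], [{'sid': i} for i in range(3)])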
def eval_and_save_v2(model_path, is_ema, saving_dir, save_train_data=True, prob_thresholds=0.5): # This method was modified on 21 NOV 2018 # for evaluating balanced trained selection model with different threshold value. # It will then be used for later verification. # Evaluate and Save all the sentence pairs results to be used for downstream verificaion # 03 Oct 2018 03:56:40. seed = 12 batch_size = 128 lazy = True torch.manual_seed(seed) keep_neg_sample_prob = 1 top_k_doc = 5 # sample_prob_decay = 0.05 dev_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl" train_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/train_doc.jsonl" # Prepare Data token_indexers = { 'tokens': SingleIdTokenIndexer(namespace='tokens'), # This is the raw tokens 'elmo_chars': ELMoTokenCharactersIndexer( namespace='elmo_characters') # This is the elmo_characters } train_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=180) dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=180) complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL, dev_upstream_file, pred=True, top_k=top_k_doc) complete_upstream_train_data = get_full_list(config.T_FEVER_TRAIN_JSONL, train_upstream_file, pred=False, top_k=top_k_doc) print("Dev size:", len(complete_upstream_dev_data)) print("Train size:", len(complete_upstream_train_data)) dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data) train_instances = train_fever_data_reader.read( complete_upstream_train_data) # Load Vocabulary biterator = BasicIterator(batch_size=batch_size) dev_biterator = BasicIterator(batch_size=batch_size) vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic") # THis is important vocab.add_token_to_namespace("true", namespace="selection_labels") vocab.add_token_to_namespace("false", namespace="selection_labels") vocab.add_token_to_namespace("hidden", namespace="selection_labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels') # Label value vocab.get_index_to_token_vocabulary('selection_labels') print(vocab.get_token_to_index_vocabulary('selection_labels')) print(vocab.get_vocab_size('tokens')) biterator.index_with(vocab) dev_biterator.index_with(vocab) # exit(0) # Build Model device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0) device_num = -1 if device.type == 'cpu' else 0 model = Model(weight=weight_dict['glove.840B.300d'], vocab_size=vocab.get_vocab_size('tokens'), embedding_dim=300, max_l=160, num_of_class=2) if not is_ema: model.load_state_dict(torch.load(model_path)) else: load_ema_to_model(model, model_path) model.display() model.to(device) dev_actual_list = common.load_jsonl(config.T_FEVER_DEV_JSONL) train_actual_list = common.load_jsonl(config.T_FEVER_TRAIN_JSONL) eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1) train_iter = biterator(train_instances, shuffle=False, num_epochs=1) complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data) if save_train_data: complete_upstream_train_data = hidden_eval( model, train_iter, complete_upstream_train_data) common.save_jsonl(complete_upstream_train_data, Path(str(saving_dir)) / "train_sent_scores.jsonl") common.save_jsonl(complete_upstream_dev_data, Path(str(saving_dir)) / "dev_sent_pred_scores.jsonl") if not isinstance(prob_thresholds, list): prob_thresholds = [prob_thresholds] for scal_prob in 
prob_thresholds: print("Eval Dev Data prob_threshold:", scal_prob) dev_results_list = score_converter_v1(config.T_FEVER_DEV_JSONL, complete_upstream_dev_data, sent_retri_top_k=5, sent_retri_scal_prob=scal_prob) # This is only a wrapper for the simi_sampler eval_mode = {'check_sent_id_correct': True, 'standard': True} for a, b in zip(dev_actual_list, dev_results_list): b['predicted_label'] = a['label'] strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score( dev_results_list, dev_actual_list, mode=eval_mode, verbose=False) tracking_score = strict_score print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/") print("Strict score:", strict_score) print(f"Eval Tracking score:", f"{tracking_score}") if save_train_data: print("Build Train Data") train_results_list = score_converter_v1( config.T_FEVER_TRAIN_JSONL, complete_upstream_train_data, sent_retri_top_k=5, sent_retri_scal_prob=scal_prob) # This is only a wrapper for the simi_sampler eval_mode = {'check_sent_id_correct': True, 'standard': True} for a, b in zip(train_actual_list, train_results_list): b['predicted_label'] = a['label'] strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score( train_results_list, train_actual_list, mode=eval_mode, verbose=False) tracking_score = strict_score print(f"Train(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/") print("Strict score:", strict_score) print(f"Eval Tracking score:", f"{tracking_score}")
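# Sketch of the threshold-sweep pattern implemented above: evaluate the dev predictions at each
# candidate probability cutoff and keep the cutoff with the best strict score. `evaluate_at` stands
# in for the score_converter_v1 + c_scorer.fever_score pair and is an assumed callable.
def sweep_thresholds(thresholds, evaluate_at):
    best = None
    for t in thresholds:
        strict = evaluate_at(t)  # strict score for this cutoff
        print(f"prob_threshold={t}: strict={strict}")
        if best is None or strict > best[1]:
            best = (t, strict)
    return best


# e.g. sweep_thresholds([0.1, 0.35, 0.5], lambda t: 1.0 - abs(t - 0.35)) -> (0.35, 1.0)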
def train(local_rank, args): # debug = False # print("GPU:", gpu) # world_size = args.world_size args.global_rank = args.node_rank * args.gpus_per_node + local_rank args.local_rank = local_rank # args.warmup_steps = 20 debug_count = 1000 num_epoch = args.epochs actual_train_batch_size = args.world_size * args.per_gpu_train_batch_size * args.gradient_accumulation_steps args.actual_train_batch_size = actual_train_batch_size set_seed(args.seed) num_labels = 3 # we are doing NLI so we set num_labels = 3, for other task we can change this value. max_length = args.max_length model_class_item = MODEL_CLASSES[args.model_class_name] model_name = model_class_item['model_name'] do_lower_case = model_class_item[ 'do_lower_case'] if 'do_lower_case' in model_class_item else False tokenizer = model_class_item['tokenizer'].from_pretrained( model_name, cache_dir=str(config.PRO_ROOT / "trans_cache"), do_lower_case=do_lower_case) model = model_class_item['sequence_classification'].from_pretrained( model_name, cache_dir=str(config.PRO_ROOT / "trans_cache"), num_labels=num_labels) padding_token_value = tokenizer.convert_tokens_to_ids( [tokenizer.pad_token])[0] padding_segement_value = model_class_item["padding_segement_value"] padding_att_value = model_class_item["padding_att_value"] left_pad = model_class_item[ 'left_pad'] if 'left_pad' in model_class_item else False batch_size_per_gpu_train = args.per_gpu_train_batch_size batch_size_per_gpu_eval = args.per_gpu_eval_batch_size if not args.cpu and not args.single_gpu: dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=args.global_rank) train_data_str = args.train_data train_data_weights_str = args.train_weights eval_data_str = args.eval_data train_data_name = [] train_data_path = [] train_data_list = [] train_data_weights = [] eval_data_name = [] eval_data_path = [] eval_data_list = [] train_data_named_path = train_data_str.split(',') weights_str = train_data_weights_str.split( ',') if train_data_weights_str is not None else None eval_data_named_path = eval_data_str.split(',') for named_path in train_data_named_path: ind = named_path.find(':') name = named_path[:ind] path = named_path[ind + 1:] if name in registered_path: d_list = common.load_jsonl(registered_path[name]) else: d_list = common.load_jsonl(path) train_data_name.append(name) train_data_path.append(path) train_data_list.append(d_list) if weights_str is not None: for weights in weights_str: train_data_weights.append(float(weights)) else: for i in range(len(train_data_list)): train_data_weights.append(1) for named_path in eval_data_named_path: ind = named_path.find(':') name = named_path[:ind] path = named_path[ind + 1:] if name in registered_path: d_list = common.load_jsonl(registered_path[name]) else: d_list = common.load_jsonl(path) eval_data_name.append(name) eval_data_path.append(path) eval_data_list.append(d_list) assert len(train_data_weights) == len(train_data_list) batching_schema = { 'uid': RawFlintField(), 'y': LabelFlintField(), 'input_ids': ArrayIndexFlintField(pad_idx=padding_token_value, left_pad=left_pad), 'token_type_ids': ArrayIndexFlintField(pad_idx=padding_segement_value, left_pad=left_pad), 'attention_mask': ArrayIndexFlintField(pad_idx=padding_att_value, left_pad=left_pad), } data_transformer = NLITransform(model_name, tokenizer, max_length) # data_transformer = NLITransform(model_name, tokenizer, max_length, with_element=True) eval_data_loaders = [] for eval_d_list in eval_data_list: d_dataset, d_sampler, d_dataloader = 
build_eval_dataset_loader_and_sampler( eval_d_list, data_transformer, batching_schema, batch_size_per_gpu_eval) eval_data_loaders.append(d_dataloader) # Estimate the training size: training_list = [] for i in range(len(train_data_list)): print("Build Training Data ...") train_d_list = train_data_list[i] train_d_name = train_data_name[i] train_d_weight = train_data_weights[i] cur_train_list = sample_data_list( train_d_list, train_d_weight ) # change later # we can apply different sample strategy here. print( f"Data Name:{train_d_name}; Weight: {train_d_weight}; " f"Original Size: {len(train_d_list)}; Sampled Size: {len(cur_train_list)}" ) training_list.extend(cur_train_list) estimated_training_size = len(training_list) print("Estimated training size:", estimated_training_size) # Estimate the training size ends: # t_total = estimated_training_size // args.gradient_accumulation_steps * num_epoch t_total = estimated_training_size * num_epoch // args.actual_train_batch_size if args.warmup_steps <= 0: # set the warmup steps to 0.1 * total step if the given warmup step is -1. args.warmup_steps = int(t_total * 0.1) if not args.cpu: torch.cuda.set_device(args.local_rank) model.cuda(args.local_rank) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if not args.cpu and not args.single_gpu: model = nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) args_dict = dict(vars(args)) file_path_prefix = '.' if args.global_rank in [-1, 0]: print("Total Steps:", t_total) args.total_step = t_total print("Warmup Steps:", args.warmup_steps) print("Actual Training Batch Size:", actual_train_batch_size) print("Arguments", pp.pprint(args)) # Let build the logger and log everything before the start of the first training epoch. if args.global_rank in [ -1, 0 ]: # only do logging if we use cpu or global_rank=0 if not args.debug_mode: file_path_prefix, date = save_tool.gen_file_prefix( f"{args.experiment_name}") # # # Create Log File # Save the source code. 
script_name = os.path.basename(__file__) with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it: out_f.write(it.read()) out_f.flush() # Save option file common.save_json(args_dict, os.path.join(file_path_prefix, "args.json")) checkpoints_path = Path(file_path_prefix) / "checkpoints" if not checkpoints_path.exists(): checkpoints_path.mkdir() prediction_path = Path(file_path_prefix) / "predictions" if not prediction_path.exists(): prediction_path.mkdir() global_step = 0 # print(f"Global Rank:{args.global_rank} ### ", 'Init!') for epoch in tqdm(range(num_epoch), desc="Epoch", disable=args.global_rank not in [-1, 0]): # Let's build up training dataset for this epoch training_list = [] for i in range(len(train_data_list)): print("Build Training Data ...") train_d_list = train_data_list[i] train_d_name = train_data_name[i] train_d_weight = train_data_weights[i] cur_train_list = sample_data_list( train_d_list, train_d_weight ) # change later # we can apply different sample strategy here. print( f"Data Name:{train_d_name}; Weight: {train_d_weight}; " f"Original Size: {len(train_d_list)}; Sampled Size: {len(cur_train_list)}" ) training_list.extend(cur_train_list) random.shuffle(training_list) train_dataset = NLIDataset(training_list, data_transformer) train_sampler = SequentialSampler(train_dataset) if not args.cpu and not args.single_gpu: print("Use distributed sampler.") train_sampler = DistributedSampler(train_dataset, args.world_size, args.global_rank, shuffle=True) train_dataloader = DataLoader( dataset=train_dataset, batch_size=batch_size_per_gpu_train, shuffle=False, # num_workers=0, pin_memory=True, sampler=train_sampler, collate_fn=BaseBatchBuilder(batching_schema)) # # training build finished. print(debug_node_info(args), "epoch: ", epoch) if not args.cpu and not args.single_gpu: train_sampler.set_epoch( epoch ) # setup the epoch to ensure random sampling at each epoch for forward_step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", disable=args.global_rank not in [-1, 0]), 0): model.train() batch = move_to_device(batch, local_rank) # print(batch['input_ids'], batch['y']) if args.model_class_name in ["distilbert", "bart-large"]: outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['y']) else: outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'], token_type_ids=batch['token_type_ids'], labels=batch['y']) loss, logits = outputs[:2] # print(debug_node_info(args), loss, logits, batch['uid']) # print(debug_node_info(args), loss, batch['uid']) # Accumulated loss if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps # if this forward step need model updates # handle fp16 if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Gradient clip: if max_grad_norm < 0 if (forward_step + 1) % args.gradient_accumulation_steps == 0: if args.max_grad_norm > 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.global_rank in [ -1, 0 ] and args.eval_frequency > 0 and global_step % args.eval_frequency == 0: r_dict = dict() # Eval loop: for i in range(len(eval_data_name)): cur_eval_data_name = eval_data_name[i] cur_eval_data_list = eval_data_list[i] cur_eval_dataloader = 
eval_data_loaders[i] # cur_eval_raw_data_list = eval_raw_data_list[i] evaluation_dataset(args, cur_eval_dataloader, cur_eval_data_list, model, r_dict, eval_name=cur_eval_data_name) # saving checkpoints current_checkpoint_filename = \ f'e({epoch})|i({global_step})' for i in range(len(eval_data_name)): cur_eval_data_name = eval_data_name[i] current_checkpoint_filename += \ f'|{cur_eval_data_name}#({round(r_dict[cur_eval_data_name]["acc"], 4)})' if not args.debug_mode: # save model: model_output_dir = checkpoints_path / current_checkpoint_filename if not model_output_dir.exists(): model_output_dir.mkdir() model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training torch.save(model_to_save.state_dict(), str(model_output_dir / "model.pt")) torch.save(optimizer.state_dict(), str(model_output_dir / "optimizer.pt")) torch.save(scheduler.state_dict(), str(model_output_dir / "scheduler.pt")) # save prediction: if not args.debug_mode and args.save_prediction: cur_results_path = prediction_path / current_checkpoint_filename if not cur_results_path.exists(): cur_results_path.mkdir(parents=True) for key, item in r_dict.items(): common.save_jsonl( item['predictions'], cur_results_path / f"{key}.jsonl") # avoid saving too many things for key, item in r_dict.items(): del r_dict[key]['predictions'] common.save_json(r_dict, cur_results_path / "results_dict.json", indent=2) # End of epoch evaluation. if args.global_rank in [-1, 0]: r_dict = dict() # Eval loop: for i in range(len(eval_data_name)): cur_eval_data_name = eval_data_name[i] cur_eval_data_list = eval_data_list[i] cur_eval_dataloader = eval_data_loaders[i] # cur_eval_raw_data_list = eval_raw_data_list[i] evaluation_dataset(args, cur_eval_dataloader, cur_eval_data_list, model, r_dict, eval_name=cur_eval_data_name) # saving checkpoints current_checkpoint_filename = \ f'e({epoch})|i({global_step})' for i in range(len(eval_data_name)): cur_eval_data_name = eval_data_name[i] current_checkpoint_filename += \ f'|{cur_eval_data_name}#({round(r_dict[cur_eval_data_name]["acc"], 4)})' if not args.debug_mode: # save model: model_output_dir = checkpoints_path / current_checkpoint_filename if not model_output_dir.exists(): model_output_dir.mkdir() model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training torch.save(model_to_save.state_dict(), str(model_output_dir / "model.pt")) torch.save(optimizer.state_dict(), str(model_output_dir / "optimizer.pt")) torch.save(scheduler.state_dict(), str(model_output_dir / "scheduler.pt")) # save prediction: if not args.debug_mode and args.save_prediction: cur_results_path = prediction_path / current_checkpoint_filename if not cur_results_path.exists(): cur_results_path.mkdir(parents=True) for key, item in r_dict.items(): common.save_jsonl(item['predictions'], cur_results_path / f"{key}.jsonl") # avoid saving too many things for key, item in r_dict.items(): del r_dict[key]['predictions'] common.save_json(r_dict, cur_results_path / "results_dict.json", indent=2)
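# A condensed sketch of the core update pattern in train() above: scale the loss for gradient
# accumulation, clip gradients, then step the optimizer and a linear-warmup schedule once every
# `grad_accum` forward steps. The toy model, data, and step counts are placeholders, and the
# schedule is a hand-rolled stand-in for get_linear_schedule_with_warmup.
import torch
import torch.nn as nn


def linear_warmup_lambda(warmup_steps, total_steps):
    def fn(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))
    return fn


if __name__ == '__main__':
    toy_model = nn.Linear(8, 3)
    optimizer = torch.optim.AdamW(toy_model.parameters(), lr=1e-3, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, linear_warmup_lambda(warmup_steps=1, total_steps=5))
    grad_accum, max_grad_norm = 4, 1.0

    for forward_step in range(20):
        x, y = torch.randn(4, 8), torch.randint(0, 3, (4,))
        loss = nn.functional.cross_entropy(toy_model(x), y) / grad_accum  # scale loss when accumulating
        loss.backward()
        if (forward_step + 1) % grad_accum == 0:  # update only every grad_accum forward steps
            torch.nn.utils.clip_grad_norm_(toy_model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()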
def eval_model_for_downstream_ablation(model_saved_path, doc_top_k=2, tag='dev'): print(f"Run doc_top_k:{doc_top_k}") bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert' seed = 12 torch.manual_seed(seed) bert_model_name = 'bert-base-uncased' # lazy = False lazy = True # forward_size = 256 forward_size = 256 # batch_size = 64 batch_size = 128 do_lower_case = True document_top_k = doc_top_k debug_mode = False # est_datasize = 900_000 num_class = 1 # num_train_optimization_steps device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device_num = 0 if torch.cuda.is_available() else -1 n_gpu = torch.cuda.device_count() unk_token_num = {'tokens': 1} # work around for initiating vocabulary. vocab = ExVocabulary(unk_token_num=unk_token_num) vocab.add_token_to_namespace("false", namespace="labels") # 0 vocab.add_token_to_namespace("true", namespace="labels") # 1 vocab.add_token_to_namespace("hidden", namespace="labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels') # Load Dataset train_list = common.load_json(config.TRAIN_FILE) dev_list = common.load_json(config.DEV_FULLWIKI_FILE) test_list = common.load_json(config.TEST_FULLWIKI_FILE) # Load train eval results list # cur_train_eval_results_list = common.load_jsonl( # config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/" # "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl") cur_dev_eval_results_list = common.load_jsonl( config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/" "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl" ) # cur_test_eval_results_list = common.load_jsonl( # config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/" # "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/test_p_level_bert_v1_results.jsonl") # if tag == 'train': # train_fitems = get_sentence_pair(document_top_k, train_list, cur_train_eval_results_list, is_training=True, # debug_mode=debug_mode) if tag == 'dev': dev_fitems = get_sentence_pair(document_top_k, dev_list, cur_dev_eval_results_list, is_training=False, debug_mode=debug_mode) # elif tag == 'test': # test_fitems = get_sentence_pair(document_top_k, test_list, cur_test_eval_results_list, is_training=False, # debug_mode=debug_mode) if debug_mode: eval_frequency = 2 # dev_list = dev_list[:10] # dev_fitems_list = dev_fitems_list[:296] # train_fitems_list = train_fitems_list[:300] # print(dev_list[-1]['_id']) # exit(0) dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id') train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id') bert_tokenizer = BertTokenizer.from_pretrained( bert_model_name, do_lower_case=do_lower_case, cache_dir=bert_pretrain_path) bert_cs_reader = BertContentSelectionReader( bert_tokenizer, lazy, is_paired=True, example_filter=lambda x: len(x['context']) == 0, max_l=128, element_fieldname='element') bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path) model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1, act_type='tanh', use_pretrained_pooler=True, 
use_sigmoid=True) model.load_state_dict(torch.load(model_saved_path)) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # if tag == 'train': train_instance = bert_cs_reader.read(train_fitems) elif tag == 'dev': dev_instances = bert_cs_reader.read(dev_fitems) elif tag == 'test': test_instances = bert_cs_reader.read(test_fitems) biterator = BasicIterator(batch_size=forward_size) biterator.index_with(vocab) if tag == 'train': train_iter = biterator(train_instance, num_epochs=1, shuffle=False) print(len(train_fitems)) elif tag == 'dev': dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False) print(len(dev_fitems)) elif tag == 'test': test_iter = biterator(test_instances, num_epochs=1, shuffle=False) print(len(test_fitems)) print("Forward size:", forward_size) if tag == 'train': cur_train_eval_results_list_out = eval_model(model, train_iter, device_num, with_probs=True, show_progress=True) common.save_jsonl( cur_train_eval_results_list_out, config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl" ) elif tag == 'dev': cur_dev_eval_results_list_out = eval_model(model, dev_iter, device_num, with_probs=True, show_progress=True) common.save_jsonl( cur_dev_eval_results_list_out, f"hotpot_s_level_{tag}_results_top_k_doc_{document_top_k}.jsonl") elif tag == 'test': cur_test_eval_results_list_out = eval_model(model, test_iter, device_num, with_probs=True, show_progress=True) common.save_jsonl( cur_test_eval_results_list_out, config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/test_s_level_bert_v1_results.jsonl" ) if tag == 'train' or tag == 'test': exit(0) copied_dev_o_dict = copy.deepcopy(dev_o_dict) list_dict_data_tool.append_subfield_from_list_to_dict( cur_dev_eval_results_list_out, copied_dev_o_dict, 'qid', 'fid', check=True) # 0.5 cur_results_dict_v05 = select_top_k_and_to_results_dict( copied_dev_o_dict, top_k=5, score_field_name='prob', filter_value=0.5, result_field='sp') cur_results_dict_v02 = select_top_k_and_to_results_dict( copied_dev_o_dict, top_k=5, score_field_name='prob', filter_value=0.2, result_field='sp') _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list, verbose=False) _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False) logging_item = { 'v02': metrics_v2, 'v05': metrics_v5, } print(logging_item) f1 = metrics_v5['sp_f1'] em = metrics_v5['sp_em'] pr = metrics_v5['sp_prec'] rec = metrics_v5['sp_recall'] common.save_json( logging_item, f"top_k_doc:{document_top_k}_em:{em}_pr:{pr}_rec:{rec}_f1:{f1}")
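# An illustrative sketch of the sentence-pair construction step (get_sentence_pair above): for each
# question, take the top-k documents from the paragraph-level results and emit one forward item per
# candidate sentence. The field names and the "docid(sent_idx)" fid convention are assumptions for
# illustration, not the repo's exact schema.
def build_sentence_items(qid, question_text, ranked_docids, doc_sentences, doc_top_k=2):
    items = []
    for docid in ranked_docids[:doc_top_k]:
        for sent_idx, sent_text in enumerate(doc_sentences.get(docid, [])):
            items.append({
                'qid': qid,
                'fid': f"{docid}({sent_idx})",  # assumed forward-item id convention
                'query': question_text,
                'context': sent_text,
            })
    return items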
def model_go_with_old_data(): seed = 12 torch.manual_seed(seed) # bert_model_name = 'bert-large-uncased' bert_model_name = 'bert-base-uncased' experiment_name = 'fever_v1_nli' lazy = False # lazy = True forward_size = 16 # batch_size = 64 # batch_size = 192 batch_size = 32 gradient_accumulate_step = int(batch_size / forward_size) warmup_proportion = 0.1 learning_rate = 5e-5 num_train_epochs = 3 eval_frequency = 2000 do_lower_case = True pair_order = 'cq' # debug_mode = True debug_mode = False # est_datasize = 900_000 num_class = 3 # num_train_optimization_steps train_sent_filtering_prob = 0.35 dev_sent_filtering_prob = 0.1 # dev_sent_results_file = config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/i(5000)|e(0)|s01(0.9170917091709171)|s05(0.8842384238423843)|seed(12)_dev_sent_results.json" # train_sent_results_file = config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/train_sent_results.jsonl" from utest.utest_format_converter_for_old_sent.tool import format_convert dev_sent_results_file = format_convert( config.PRO_ROOT / "results/doc_retri_results/fever_results/sent_results/old_sent_data_by_NSMN/4-15-dev_sent_pred_scores_old_format.jsonl" ) train_sent_results_file = format_convert( config.PRO_ROOT / "results/doc_retri_results/fever_results/sent_results/old_sent_data_by_NSMN/train_sent_scores_old_format.jsonl" ) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device_num = 0 if torch.cuda.is_available() else -1 n_gpu = torch.cuda.device_count() unk_token_num = {'tokens': 1} # work around for initiating vocabulary. vocab = ExVocabulary(unk_token_num=unk_token_num) vocab.add_token_to_namespace('SUPPORTS', namespace='labels') vocab.add_token_to_namespace('REFUTES', namespace='labels') vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels') vocab.add_token_to_namespace("hidden", namespace="labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels') # Load Dataset # train_fitems_list = get_inference_pair('train', True, train_sent_results_file, debug_mode, train_sent_filtering_prob) dev_debug_num = 2481 if debug_mode else None dev_fitems_list, dev_list = get_inference_pair('dev', False, dev_sent_results_file, dev_debug_num, dev_sent_filtering_prob) # = common.load_jsonl(config.FEVER_DEV) if debug_mode: dev_list = dev_list[:50] eval_frequency = 1 # print(dev_list[-1]['_id']) # exit(0) # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio) train_debug_num = 2971 if debug_mode else None train_fitems_list, _ = get_inference_pair('train', True, train_sent_results_file, train_debug_num, train_sent_filtering_prob) est_datasize = len(train_fitems_list) # dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id') # print(dev_o_dict) bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case) bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64, example_filter=None, max_l=364, pair_order=pair_order) bert_encoder = BertModel.from_pretrained(bert_model_name) model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1, act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False) # param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in 
param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \ num_train_epochs if debug_mode: num_train_optimization_steps = 100 print("Estimated training size", est_datasize) print("Number of optimization steps:", num_train_optimization_steps) optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=warmup_proportion, t_total=num_train_optimization_steps) dev_instances = bert_cs_reader.read(dev_fitems_list) biterator = BasicIterator(batch_size=forward_size) biterator.index_with(vocab) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) forbackward_step = 0 update_step = 0 logging_agent = save_tool.ScoreLogger({}) file_path_prefix = '.' if not debug_mode: file_path_prefix, date = save_tool.gen_file_prefix( f"{experiment_name}") # # # Create Log File # Save the source code. script_name = os.path.basename(__file__) with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it: out_f.write(it.read()) out_f.flush() # # # Log File end for epoch_i in range(num_train_epochs): print("Epoch:", epoch_i) train_fitems_list, _ = get_inference_pair('train', True, train_sent_results_file, train_debug_num, train_sent_filtering_prob) random.shuffle(train_fitems_list) train_instance = bert_cs_reader.read(train_fitems_list) train_iter = biterator(train_instance, num_epochs=1, shuffle=True) for batch in tqdm(train_iter): model.train() batch = move_to_device(batch, device_num) paired_sequence = batch['paired_sequence'] paired_segments_ids = batch['paired_segments_ids'] labels_ids = batch['label'] att_mask, _ = torch_util.get_length_and_mask(paired_sequence) s1_span = batch['bert_s1_span'] s2_span = batch['bert_s2_span'] loss = model( paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask, mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN, labels=labels_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if gradient_accumulate_step > 1: loss = loss / gradient_accumulate_step loss.backward() forbackward_step += 1 if forbackward_step % gradient_accumulate_step == 0: optimizer.step() optimizer.zero_grad() update_step += 1 if update_step % eval_frequency == 0: print("Update steps:", update_step) dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False) cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True) results_dict = list_dict_data_tool.list_to_dict( cur_eval_results_list, 'oid') copied_dev_list = copy.deepcopy(dev_list) list_dict_data_tool.append_item_from_dict_to_list( copied_dev_list, results_dict, 'id', 'predicted_label') mode = {'standard': True} strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score( copied_dev_list, dev_fitems_list, mode=mode, max_evidence=5) logging_item = { 'ss': strict_score, 'ac': acc_score, 'pr': pr, 'rec': rec, 'f1': f1, } save_file_name = f'i({update_step})|e({epoch_i})' \ f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \ f'|seed({seed})' common.save_jsonl( copied_dev_list, Path(file_path_prefix) / f"{save_file_name}_dev_nli_results.json") # print(save_file_name) logging_agent.incorporate_results({}, save_file_name, logging_item) logging_agent.logging_to_file( Path(file_path_prefix) / "log.json") model_to_save = model.module if hasattr( model, 'module') else model output_model_file = Path(file_path_prefix) / save_file_name torch.save(model_to_save.state_dict(), str(output_model_file))
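# Sketch of the weight-decay parameter grouping used when building the optimizer above: every
# parameter gets weight decay except biases and LayerNorm weights, matched by substring on the
# parameter name. ToyEncoder is a placeholder standing in for the BERT encoder.
import torch.nn as nn


class ToyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(16, 16)
        self.LayerNorm = nn.LayerNorm(16)  # named like BERT's submodule so the markers below match
        self.classifier = nn.Linear(16, 3)


def build_grouped_parameters(model, weight_decay=0.01):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    return [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]


# usage: optimizer = torch.optim.AdamW(build_grouped_parameters(ToyEncoder()), lr=5e-5)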
def eval_model_for_downstream(model_saved_path): bert_model_name = 'bert-base-uncased' lazy = True # lazy = True forward_size = 64 # batch_size = 64 batch_size = 128 do_lower_case = True debug_mode = False max_l = 264 # est_datasize = 900_000 num_class = 1 # num_train_optimization_steps tag = 'test' device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device_num = 0 if torch.cuda.is_available() else -1 n_gpu = torch.cuda.device_count() unk_token_num = {'tokens': 1} # work around for initiating vocabulary. vocab = ExVocabulary(unk_token_num=unk_token_num) vocab.add_token_to_namespace("false", namespace="labels") # 0 vocab.add_token_to_namespace("true", namespace="labels") # 1 vocab.add_token_to_namespace("hidden", namespace="labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels') # Load Dataset # train_ruleterm_doc_results = common.load_jsonl( # config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl") # dev_ruleterm_doc_results = train_ruleterm_doc_results if tag == 'dev': dev_ruleterm_doc_results = common.load_jsonl( config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl" ) dev_list = common.load_jsonl(config.FEVER_DEV) dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair( 'dev', dev_ruleterm_doc_results, is_training=False, debug=debug_mode, ignore_non_verifiable=False) elif tag == 'train': dev_ruleterm_doc_results = common.load_jsonl( config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl" ) dev_list = common.load_jsonl(config.FEVER_TRAIN) dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair( 'train', dev_ruleterm_doc_results, is_training=True, debug=debug_mode, ignore_non_verifiable=False) elif tag == 'test': dev_ruleterm_doc_results = common.load_jsonl( config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_test.jsonl" ) dev_list = common.load_jsonl(config.FEVER_TEST) dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair( 'test', dev_ruleterm_doc_results, is_training=False, debug=debug_mode, ignore_non_verifiable=False) else: raise NotImplemented() # dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair('train', dev_ruleterm_doc_results, # is_training=True, debug=debug_mode, # ignore_non_verifiable=False) # Just to show the information fever_p_level_sampler.down_sample_neg(dev_fitems, None) if debug_mode: dev_list = dev_list[:100] eval_frequency = 2 # print(dev_list[-1]['_id']) # exit(0) # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio) dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id') # print(dev_o_dict) bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case) bert_cs_reader = BertContentSelectionReader( bert_tokenizer, lazy, is_paired=True, example_filter=lambda x: len(x['context']) == 0, max_l=max_l, element_fieldname='element') bert_encoder = BertModel.from_pretrained(bert_model_name) model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1, act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True) model.load_state_dict(torch.load(model_saved_path)) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # if debug_mode: num_train_optimization_steps = 100 dev_instances = bert_cs_reader.read(dev_fitems) biterator = BasicIterator(batch_size=forward_size) biterator.index_with(vocab) dev_iter = biterator(dev_instances, 
num_epochs=1, shuffle=False) cur_eval_results_list = eval_model(model, dev_iter, device_num, make_int=True, with_probs=True, show_progress=True) common.save_jsonl(cur_eval_results_list, f"fever_p_level_{tag}_results.jsonl") if tag == 'test': exit(0) # common.save_jsonl(cur_eval_results_list, "fever_p_level_train_results_1.jsonl") copied_dev_o_dict = copy.deepcopy(dev_o_dict) copied_dev_d_list = copy.deepcopy(dev_list) list_dict_data_tool.append_subfield_from_list_to_dict( cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True) cur_results_dict_th0_5 = select_top_k_and_to_results_dict( copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.5) list_dict_data_tool.append_item_from_dict_to_list_hotpot_style( copied_dev_d_list, cur_results_dict_th0_5, 'id', 'predicted_docids') # mode = {'standard': False, 'check_doc_id_correct': True} strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list, dev_list, max_evidence=5) score_05 = { 'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1, } list_dict_data_tool.append_subfield_from_list_to_dict( cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True) cur_results_dict_th0_2 = select_top_k_and_to_results_dict( copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.2) list_dict_data_tool.append_item_from_dict_to_list_hotpot_style( copied_dev_d_list, cur_results_dict_th0_2, 'id', 'predicted_docids') # mode = {'standard': False, 'check_doc_id_correct': True} strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list, dev_list, max_evidence=5) score_02 = { 'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1, } list_dict_data_tool.append_subfield_from_list_to_dict( cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True) cur_results_dict_th0_1 = select_top_k_and_to_results_dict( copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.1) list_dict_data_tool.append_item_from_dict_to_list_hotpot_style( copied_dev_d_list, cur_results_dict_th0_1, 'id', 'predicted_docids') # mode = {'standard': False, 'check_doc_id_correct': True} strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list, dev_list, max_evidence=5) score_01 = { 'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1, } list_dict_data_tool.append_subfield_from_list_to_dict( cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True) cur_results_dict_th00_1 = select_top_k_and_to_results_dict( copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.01) list_dict_data_tool.append_item_from_dict_to_list_hotpot_style( copied_dev_d_list, cur_results_dict_th00_1, 'id', 'predicted_docids') # mode = {'standard': False, 'check_doc_id_correct': True} strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list, dev_list, max_evidence=5) score_001 = { 'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1, } list_dict_data_tool.append_subfield_from_list_to_dict( cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True) cur_results_dict_th000_5 = select_top_k_and_to_results_dict( copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.005) list_dict_data_tool.append_item_from_dict_to_list_hotpot_style( copied_dev_d_list, cur_results_dict_th000_5, 'id', 'predicted_docids') # mode = {'standard': False, 'check_doc_id_correct': True} strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list, dev_list, max_evidence=5) score_0005 = { 'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1, } logging_item = { 'score_0005': score_0005, 'score_001': 
score_001, 'score_01': score_01, 'score_02': score_02, 'score_05': score_05, } print(json.dumps(logging_item, indent=2))
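# An illustrative document-level scorer in the spirit of fever_scorer.fever_doc_only used above
# (not the repo's implementation): cap the predicted doc ids at max_evidence, then compute
# precision/recall/F1 against the gold evidence pages.
def doc_prf(pred_docids, gold_docids, max_evidence=5):
    pred = list(dict.fromkeys(pred_docids))[:max_evidence]  # de-duplicate, keep order, cap
    gold = set(gold_docids)
    hit = sum(1 for docid in pred if docid in gold)
    pr = hit / len(pred) if pred else 1.0
    rec = hit / len(gold) if gold else 1.0
    f1 = 2 * pr * rec / (pr + rec) if pr + rec > 0 else 0.0
    return pr, rec, f1


# e.g. doc_prf(['A', 'B', 'C'], ['A', 'D']) -> (0.333..., 0.5, 0.4)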