def p_eval():
    """Score previously-saved paragraph-level (doc retrieval) dev results.

    Loads the FEVER dev set plus a saved p-level result file, selects the
    top-5 paragraphs per claim with probability >= 0.005, and prints the
    document-only FEVER scores (strict score / precision / recall / F1).
    """
    dev_list = common.load_jsonl(config.FEVER_DEV)
    # common.save_jsonl(cur_eval_results_list, f"fever_p_level_{tag}_results.jsonl")
    p_level_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_dev_results.jsonl"
    )

    # Work on deep copies so the gold `dev_list` stays pristine for scoring.
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    scored_o_dict = copy.deepcopy(dev_o_dict)
    scored_d_list = copy.deepcopy(dev_list)

    list_dict_data_tool.append_subfield_from_list_to_dict(p_level_results, scored_o_dict,
                                                          'qid', 'fid', check=True)
    # Top-5 paragraphs per claim, kept only if prob >= 0.005.
    selected_docids = select_top_k_and_to_results_dict(scored_o_dict,
                                                       score_field_name='prob',
                                                       top_k=5, filter_value=0.005)
    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(scored_d_list,
                                                                   selected_docids,
                                                                   'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(scored_d_list, dev_list,
                                                            max_evidence=5)
    print({'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1})
def get_nli_pair(tag, is_training, sent_level_results_list, debug=None, sent_top_k=5, sent_filter_value=0.05):
    """Build NLI forward items from sentence-level retrieval results.

    Args:
        tag: dataset split, one of 'dev' / 'train' / 'test'.
        is_training: forwarded to the NLI item builder.
        sent_level_results_list: upstream sentence-level scored results.
        debug: truthy -> restrict to the first 100 claims.
        sent_top_k: max sentences kept per claim.
        sent_filter_value: minimum sentence probability to keep.

    Returns:
        (forward_items, d_list) — the NLI items and the (possibly truncated)
        claim list annotated with 'predicted_evidence' / 'selected_scored_results'.

    Raises:
        ValueError: for an unknown tag.
    """
    split_files = {
        'dev': config.FEVER_DEV,
        'train': config.FEVER_TRAIN,
        'test': config.FEVER_TEST,
    }
    if tag not in split_files:
        raise ValueError(f"Tag:{tag} not supported.")
    d_list = common.load_jsonl(split_files[tag])

    if debug:
        d_list = d_list[:100]

    # sent_dict = list_dict_data_tool.list_to_dict(sent_level_results_list):
    d_dict = list_dict_data_tool.list_to_dict(d_list, 'id')

    if debug:
        # Keep only sentence results belonging to the truncated claim set.
        kept_ids = set(claim['id'] for claim in d_list)
        sent_level_results_list = [r for r in sent_level_results_list
                                   if r["qid"] in kept_ids]

    list_dict_data_tool.append_subfield_from_list_to_dict(sent_level_results_list, d_dict,
                                                          'qid', 'fid', check=True)
    filtered_sent_dict = select_top_k_and_to_results_dict(d_dict,
                                                          score_field_name='prob',
                                                          top_k=sent_top_k,
                                                          filter_value=sent_filter_value,
                                                          result_field='predicted_evidence')
    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        d_list, filtered_sent_dict, 'id',
        ['predicted_evidence', 'selected_scored_results'])

    fever_db_cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_nli_forward_item(d_list, is_training=is_training,
                                           db_cursor=fever_db_cursor)
    return forward_items, d_list
def get_sentence_forward_pair(tag, ruleterm_doc_results, is_training, debug=False,
                              ignore_non_verifiable=False, top_k=5, filter_value=0.005):
    """Build sentence-level forward items from paragraph-level retrieval results.

    Args:
        tag: dataset split, one of 'dev' / 'train' / 'test'.
        ruleterm_doc_results: upstream (rule/term-based) document retrieval results.
        is_training: forwarded to the sentence item builder.
        debug: if True, restrict both claims and upstream results to 100 items.
        ignore_non_verifiable: forwarded to the sentence item builder.
        top_k: max documents kept per claim.
        filter_value: minimum document probability to keep.

    Returns:
        List of sentence-level forward items.

    Raises:
        ValueError: for an unknown tag.
    """
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        d_list = d_list[:100]
        ruleterm_doc_results = ruleterm_doc_results[:100]

    # ruleterm_doc_results_dict = list_dict_data_tool.list_to_dict(ruleterm_doc_results, 'id')
    d_o_dict = list_dict_data_tool.list_to_dict(d_list, 'id')
    # Deep-copy so the attached upstream results don't pollute the gold data.
    copied_d_o_dict = copy.deepcopy(d_o_dict)
    # copied_d_list = copy.deepcopy(d_list)
    list_dict_data_tool.append_subfield_from_list_to_dict(ruleterm_doc_results, copied_d_o_dict,
                                                          'qid', 'fid', check=True)

    cur_results_dict_filtered = select_top_k_and_to_results_dict(
        copied_d_o_dict, score_field_name='prob', top_k=top_k, filter_value=filter_value)

    # Fix: pass the DB path explicitly — the sibling get_nli_pair uses
    # fever_db.get_cursor(config.FEVER_DB); the original bare get_cursor()
    # relied on a default that may not exist.
    db_cursor = fever_db.get_cursor(config.FEVER_DB)
    fitems = build_full_wiki_sentence_forward_item(cur_results_dict_filtered, d_list,
                                                   is_training, db_cursor,
                                                   ignore_non_verifiable)
    return fitems
def _eval_doc_retrieval_at_threshold(cur_eval_results_list, copied_dev_o_dict,
                                     copied_dev_d_list, dev_list, filter_value):
    """Score top-5 document retrieval at one probability threshold.

    Repeats the exact per-threshold sequence of the original inline code:
    attach model results to the claim dict, select top-5 docs with
    prob >= filter_value, write them onto the claim list, and run the
    document-only FEVER scorer against the gold dev_list.

    Returns:
        dict with keys 'ss' (strict score), 'pr', 'rec', 'f1'.
    """
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)
    cur_results_dict = select_top_k_and_to_results_dict(
        copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=filter_value)
    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list, dev_list,
                                                            max_evidence=5)
    return {'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1}


def eval_model_for_downstream(model_saved_path):
    """Run a saved paragraph-level retrieval model over one FEVER split.

    Loads the checkpoint at `model_saved_path`, scores paragraphs for the
    split selected by the hard-coded `tag`, saves the raw results to
    fever_p_level_{tag}_results.jsonl, and (for labeled splits) prints
    doc-retrieval scores at five probability thresholds.
    """
    bert_model_name = 'bert-base-uncased'
    lazy = True
    forward_size = 64
    do_lower_case = True
    debug_mode = False
    max_l = 264
    num_class = 1
    tag = 'test'  # which split to evaluate: 'dev' / 'train' / 'test'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load upstream (rule/term) document results and build forward items.
    if tag == 'dev':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl"
        )
        dev_list = common.load_jsonl(config.FEVER_DEV)
        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'dev', dev_ruleterm_doc_results, is_training=False, debug=debug_mode,
            ignore_non_verifiable=False)
    elif tag == 'train':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl"
        )
        dev_list = common.load_jsonl(config.FEVER_TRAIN)
        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'train', dev_ruleterm_doc_results, is_training=True, debug=debug_mode,
            ignore_non_verifiable=False)
    elif tag == 'test':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_test.jsonl"
        )
        dev_list = common.load_jsonl(config.FEVER_TEST)
        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'test', dev_ruleterm_doc_results, is_training=False, debug=debug_mode,
            ignore_non_verifiable=False)
    else:
        # Bug fix: the original `raise NotImplemented()` raised a TypeError
        # (NotImplemented is a sentinel value, not a callable exception).
        raise NotImplementedError(f"Tag:{tag} not supported.")

    # Just to show the information (prints sampling stats as a side effect).
    fever_p_level_sampler.down_sample_neg(dev_fitems, None)

    if debug_mode:
        dev_list = dev_list[:100]

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer, lazy, is_paired=True,
        example_filter=lambda x: len(x['context']) == 0, max_l=max_l,
        element_fieldname='element')
    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1, act_type='tanh',
                                            use_pretrained_pooler=True, use_sigmoid=True)
    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    dev_instances = bert_cs_reader.read(dev_fitems)
    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
    cur_eval_results_list = eval_model(model, dev_iter, device_num, make_int=True,
                                       with_probs=True, show_progress=True)
    common.save_jsonl(cur_eval_results_list, f"fever_p_level_{tag}_results.jsonl")

    if tag == 'test':
        # Test split has no gold labels to score against; stop here
        # (original behavior: terminate the process).
        exit(0)

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    copied_dev_d_list = copy.deepcopy(dev_list)

    # Score at each threshold in the original's exact order; note each call
    # re-appends results onto the same copied dicts, as the original did.
    thresholds = [('score_05', 0.5), ('score_02', 0.2), ('score_01', 0.1),
                  ('score_001', 0.01), ('score_0005', 0.005)]
    scores = {}
    for name, filter_value in thresholds:
        scores[name] = _eval_doc_retrieval_at_threshold(
            cur_eval_results_list, copied_dev_o_dict, copied_dev_d_list,
            dev_list, filter_value)

    logging_item = {
        'score_0005': scores['score_0005'],
        'score_001': scores['score_001'],
        'score_01': scores['score_01'],
        'score_02': scores['score_02'],
        'score_05': scores['score_05'],
    }
    print(json.dumps(logging_item, indent=2))
def model_go():
    """Train the FEVER paragraph-level (document retrieval) BERT model.

    Builds forward items from rule/term doc-retrieval results, trains a
    sigmoid BERT classifier with gradient accumulation, and every
    `eval_frequency` update steps evaluates doc retrieval on dev at two
    probability thresholds (0.5 and 0.2), logging and checkpointing the model.
    """
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 64      # per-forward-pass micro-batch size
    # batch_size = 64
    batch_size = 128       # effective batch size via gradient accumulation
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 5000  # evaluate every N optimizer update steps
    do_lower_case = True
    ignore_non_verifiable = True
    experiment_name = f'fever_v0_plevel_retri_(ignore_non_verifiable:{ignore_non_verifiable})'
    debug_mode = False
    max_l = 264
    # est_datasize = 900_000
    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_ruleterm_doc_results = common.load_jsonl(
        config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl"
    )
    dev_ruleterm_doc_results = common.load_jsonl(
        config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl"
    )
    # train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_jsonl(config.FEVER_DEV)

    train_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
        'train', train_ruleterm_doc_results, is_training=True, debug=debug_mode,
        ignore_non_verifiable=True)
    dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
        'dev', dev_ruleterm_doc_results, is_training=False, debug=debug_mode,
        ignore_non_verifiable=False)

    # Just to show the information
    fever_p_level_sampler.down_sample_neg(train_fitems, None)
    fever_p_level_sampler.down_sample_neg(dev_fitems, None)

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(train_fitems)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer, lazy, is_paired=True,
        example_filter=lambda x: len(x['context']) == 0, max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1, act_type='tanh',
                                            use_pretrained_pooler=True, use_sigmoid=True)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    # BertAdam weight-decay groups: no decay on biases / LayerNorm params.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
        num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0  # counts forward/backward micro-steps
    update_step = 0       # counts optimizer updates (after accumulation)

    logging_agent = save_tool.ScoreLogger({})

    if not debug_mode:
        # # # Create Log File
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, \
                open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        random.shuffle(train_fitems)
        train_instance = bert_cs_reader.read(train_fitems)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(
                paired_sequence, token_type_ids=paired_segments_ids,
                attention_mask=att_mask,
                mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter, device_num,
                                                       make_int=True, with_probs=True)

                    # Score dev doc retrieval at threshold 0.5, then 0.2,
                    # on deep copies so the gold dev data stays untouched.
                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    copied_dev_d_list = copy.deepcopy(dev_list)
                    list_dict_data_tool.append_subfield_from_list_to_dict(
                        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)
                    cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict, score_field_name='prob',
                        top_k=5, filter_value=0.5)
                    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
                        copied_dev_d_list, cur_results_dict_th0_5, 'id', 'predicted_docids')
                    # mode = {'standard': False, 'check_doc_id_correct': True}
                    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(
                        copied_dev_d_list, dev_list, max_evidence=5)

                    score_05 = {
                        'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1,
                    }

                    list_dict_data_tool.append_subfield_from_list_to_dict(
                        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)
                    cur_results_dict_th0_2 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict, score_field_name='prob',
                        top_k=5, filter_value=0.2)
                    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
                        copied_dev_d_list, cur_results_dict_th0_2, 'id', 'predicted_docids')
                    # mode = {'standard': False, 'check_doc_id_correct': True}
                    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(
                        copied_dev_d_list, dev_list, max_evidence=5)

                    score_02 = {
                        'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1,
                    }

                    logging_item = {
                        'score_02': score_02,
                        'score_05': score_05,
                    }

                    print(logging_item)

                    s02_ss_score = score_02['ss']
                    s05_ss_score = score_05['ss']

                    if not debug_mode:
                        # Checkpoint filename encodes step, epoch, both scores, seed.
                        save_file_name = f'i({update_step})|e({epoch_i})' \
                            f'|v02_ofever({s02_ss_score})' \
                            f'|v05_ofever({s05_ss_score})|seed({seed})'

                        # print(save_file_name)
                        logging_agent.incorporate_results({}, save_file_name, logging_item)
                        logging_agent.logging_to_file(
                            Path(file_path_prefix) / "log.json")

                        # Unwrap DataParallel before saving the state dict.
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = Path(
                            file_path_prefix) / save_file_name
                        torch.save(model_to_save.state_dict(), str(output_model_file))
def eval_fever_procedure(biterator, dev_instances, model, device_num, ema_device_num,
                         dev_list, dev_o_dict, debug_mode, logging_agent, update_step,
                         epoch_i, file_path_prefix, do_ema, ema, seed):
    """Evaluate doc retrieval on dev for the raw model and (optionally) its EMA copy.

    For each model, scores top-5 document retrieval at probability thresholds
    0.5 and 0.2, prints the results, and — unless in debug mode — logs them
    and saves a checkpoint named after step/epoch/scores/seed.
    """
    print("Eval FEVER!")
    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

    cur_eval_results_list = eval_model(model, dev_iter, device_num, make_int=True,
                                       with_probs=True)

    # Work on deep copies so the gold dev data stays untouched for scoring.
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    copied_dev_d_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)
    cur_results_dict_th0_5 = fever_sampler_utils.select_top_k_and_to_results_dict(
        copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.5)
    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th0_5, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list, dev_list,
                                                            max_evidence=5)

    score_05 = {
        'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1,
    }

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)
    cur_results_dict_th0_2 = fever_sampler_utils.select_top_k_and_to_results_dict(
        copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.2)
    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th0_2, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list, dev_list,
                                                            max_evidence=5)

    score_02 = {
        'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1,
    }

    # NOTE(review): the 'step:' key (with trailing colon) looks like a typo,
    # but it is a runtime string that may be consumed downstream — left as-is.
    logging_item = {
        'step:': update_step,
        'epoch': epoch_i,
        'score_02': score_02,
        'score_05': score_05,
        'time': str(datetime.datetime.now())
    }
    print(logging_item)

    s02_ss_score = score_02['ss']
    s05_ss_score = score_05['ss']

    if not debug_mode:
        # Checkpoint filename encodes step, epoch, both scores, seed.
        save_file_name = f'i({update_step})|e({epoch_i})' \
            f'|v02_ofever({s02_ss_score})' \
            f'|v05_ofever({s05_ss_score})|seed({seed})'

        # print(save_file_name)
        logging_agent.incorporate_results({}, save_file_name, logging_item)
        logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

        # Unwrap DataParallel before saving the state dict.
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = Path(file_path_prefix) / save_file_name
        torch.save(model_to_save.state_dict(), str(output_model_file))

    # Repeat the same evaluation with the exponential-moving-average model.
    if do_ema and ema is not None:
        ema_model = ema.get_inference_model()
        master_device_num = ema_device_num
        ema_inference_device_ids = get_ema_gpu_id_list(
            master_device_num=master_device_num)
        ema_model = ema_model.to(master_device_num)
        ema_model = torch.nn.DataParallel(ema_model,
                                          device_ids=ema_inference_device_ids)
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(ema_model, dev_iter, master_device_num,
                                           make_int=True, with_probs=True)

        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
        copied_dev_d_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)
        cur_results_dict_th0_5 = fever_sampler_utils.select_top_k_and_to_results_dict(
            copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.5)
        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
            copied_dev_d_list, cur_results_dict_th0_5, 'id', 'predicted_docids')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(
            copied_dev_d_list, dev_list, max_evidence=5)

        score_05 = {
            'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1,
        }

        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)
        cur_results_dict_th0_2 = fever_sampler_utils.select_top_k_and_to_results_dict(
            copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.2)
        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
            copied_dev_d_list, cur_results_dict_th0_2, 'id', 'predicted_docids')
        strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(
            copied_dev_d_list, dev_list, max_evidence=5)

        score_02 = {
            'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1,
        }

        logging_item = {
            'label': 'ema',
            'step:': update_step,
            'epoch': epoch_i,
            'score_02': score_02,
            'score_05': score_05,
            'time': str(datetime.datetime.now())
        }
        print(logging_item)

        s02_ss_score = score_02['ss']
        s05_ss_score = score_05['ss']

        if not debug_mode:
            save_file_name = f'i({update_step})|e({epoch_i})' \
                f'|v02_ofever({s02_ss_score})' \
                f'|v05_ofever({s05_ss_score})|seed({seed})'

            # print(save_file_name)
            logging_agent.incorporate_results({}, save_file_name, logging_item)
            logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

            model_to_save = ema_model.module if hasattr(
                ema_model, 'module') else ema_model
            output_model_file = Path(file_path_prefix) / save_file_name
            torch.save(model_to_save.state_dict(), str(output_model_file))
def eval_model_for_downstream_ablation(model_saved_path, top_k_doc):
    """Ablation over `top_k_doc`: evaluate a saved sentence-level model.

    Builds sentence forward items from saved paragraph-level results keeping
    the top `top_k_doc` documents per claim, runs the checkpoint at
    `model_saved_path` over the split selected by the hard-coded `tag`,
    saves raw results, and (for 'dev'/'train') prints sentence-level scores.
    """
    bert_model_name = 'bert-base-uncased'
    lazy = True
    # lazy = True
    forward_size = 128
    # batch_size = 64
    # batch_size = 128
    do_lower_case = True
    debug_mode = False
    max_l = 128
    # est_datasize = 900_000
    tag = 'dev'  # which split to evaluate: 'dev' / 'train' / 'test'
    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset: upstream paragraph-level results from a fixed checkpoint run.
    train_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_train_results.jsonl"
    )
    dev_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_dev_results.jsonl"
    )
    test_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_test_results.jsonl"
    )

    train_list = common.load_jsonl(config.FEVER_TRAIN)
    dev_list = common.load_jsonl(config.FEVER_DEV)
    test_list = common.load_jsonl(config.FEVER_TEST)
    # dev_list = common.load_jsonl(config.FEVER_DEV)

    # filter_value=0 keeps all docs; only top_k_doc controls the ablation.
    if tag == 'dev':
        dev_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'dev', dev_upstream_doc_results, is_training=False,
            debug=debug_mode, ignore_non_verifiable=False,
            top_k=top_k_doc, filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(dev_fitems, None)
    elif tag == 'train':
        train_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'train', train_upstream_doc_results, is_training=True,
            debug=debug_mode, ignore_non_verifiable=False,
            top_k=top_k_doc, filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(train_fitems, None)
    elif tag == 'test':
        test_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'test', test_upstream_doc_results, is_training=False,
            debug=debug_mode, ignore_non_verifiable=False,
            top_k=top_k_doc, filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(test_fitems, None)

    # Just to show the information
    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    test_o_dict = list_dict_data_tool.list_to_dict(test_list, 'id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer, lazy, is_paired=True,
        example_filter=lambda x: len(x['context']) == 0, max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class,
                                            num_of_pooling_layer=1, act_type='tanh',
                                            use_pretrained_pooler=True, use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, dev_iter, device_num, make_int=True,
                                           with_probs=True, show_progress=True)
        common.save_jsonl(
            cur_eval_results_list,
            f"fever_s_level_{tag}_results_top_k_doc_{top_k_doc}.jsonl")

        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
        copied_dev_d_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

        # NOTE(review): despite the `th0_5` / `score_05` names, the filter
        # value here is 0.2 — names kept to preserve the original code.
        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
            copied_dev_o_dict, score_field_name='prob',
            top_k=5, filter_value=0.2,
            result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
            copied_dev_d_list, cur_results_dict_th0_5, 'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(
            copied_dev_d_list, dev_list, max_evidence=5)

        score_05 = {
            'top_k_doc': top_k_doc,
            'ss': strict_score, 'pr': pr, 'rec': rec, 'f1': f1,
        }
        print("Top_k doc:", top_k_doc)
        print(score_05)
        common.save_json(
            score_05,
            f"top_k_doc:{top_k_doc}_ss:{strict_score}_pr:{pr}_rec:{rec}_f1:{f1}"
        )

    elif tag == 'test':
        test_instances = bert_cs_reader.read(test_fitems)
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, test_iter, device_num, make_int=True,
                                           with_probs=True, show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"fever_s_level_{tag}_results.jsonl")
        # Scoring is disabled for 'test' (no gold evidence available).
        # copied_test_o_dict = copy.deepcopy(test_o_dict)
        # copied_test_d_list = copy.deepcopy(test_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_test_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_test_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_test_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        # copied_train_o_dict = copy.deepcopy(train_o_dict)
        # copied_train_d_list = copy.deepcopy(train_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_train_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_train_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_train_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        #
        # mode = {'standard': False, 'check_doc_id_correct': True}
        # strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(copied_train_d_list, train_list,
        #                                                          max_evidence=5)
        # score_05 = {
        #     'ss': strict_score,
        #     'pr': pr, 'rec': rec, 'f1': f1,
        # }
        #
        # print(score_05)

    elif tag == 'train':
        train_instances = bert_cs_reader.read(train_fitems)
        train_iter = biterator(train_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, train_iter, device_num, make_int=True,
                                           with_probs=True, show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"fever_s_level_{tag}_results.jsonl")

        copied_train_o_dict = copy.deepcopy(train_o_dict)
        copied_train_d_list = copy.deepcopy(train_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_train_o_dict, 'qid', 'fid', check=True)

        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
            copied_train_o_dict, score_field_name='prob',
            top_k=5, filter_value=0.5,
            result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
            copied_train_d_list, cur_results_dict_th0_5, 'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(
            copied_train_d_list, train_list, max_evidence=5)

        score_05 = {
            'ss': strict_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        print(score_05)