def experiment_test_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5

    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH /
        "doc_retri_results/term_based_methods_results/hotpot_tf_idf_test.jsonl")
    g_score_dict = dict()
    load_from_file(
        g_score_dict,
        config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    # We need to pass None as the ground-truth data for the unlabeled test set.
    doc_retri_pred_dict = init_results_v8(
        data_list, None, terms_based_results_list, g_score_dict,
        match_filtering_k=match_filtering_k,
        term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    common.save_json(
        doc_retri_pred_dict,
        "hotpot_test_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(
        doc_retri_pred_dict, multihop_retrieval_top_k=multihop_retrieval_top_k)

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    common.save_json(new_doc_retri_pred_dict, "hotpot_test_doc_retrieval_v8.json")
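# A minimal, self-contained sketch of the retrieval-size statistics that
# `experiment_test_full_wiki` (and `experiment_train_full_wiki` below) print
# inline before and after multi-hop filtering. `report_set_size_stats` is a
# hypothetical helper, not part of the repo; it only factors out the repeated
# Counter/mean/std block.
import collections

import numpy as np


def report_set_size_stats(sp_doc_dict, title):
    """Print size statistics for a mapping of question id -> retrieved-doc set."""
    len_list = [len(rset) for rset in sp_doc_dict.values()]
    print(title)
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))


# Toy usage:
# report_set_size_stats({'q1': {'doc_a', 'doc_b'}, 'q2': {'doc_c'}},
#                       "Results without filtering:")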
def logging_to_file(self, filename):
    if Path(filename).is_file():
        old_logging_list = common.load_json(filename)
        current_saved_key = set()

        for item in self.logging_item_list:
            current_saved_key.add(item['k'])

        # Every previously logged item must still be present in the current list.
        for item in old_logging_list:
            if item['k'] not in current_saved_key:
                raise ValueError("Previously logged item cannot be found!")

    common.save_json(self.logging_item_list, filename, indent=2, sort_keys=True)
def build_clean_lemma2tags(conceptnet_en_path, dump_path, num_parallels=20):
    # Build a dict mapping a cleaned lemma string to its list of POS tags.
    def _process(_conceptnet_id_list):
        nlp = spacy.load("en", disable=["parser", "ner", "textcat"])
        new_lemma_conceptnetid = collections.defaultdict(list)
        for _concept in tqdm(_conceptnet_id_list):
            _proc_concept = _concept.split("/")[3]
            _proc_concept = " ".join(_proc_concept.split("_"))
            doc = nlp(_proc_concept)
            _proc_concept = " ".join([token.lemma_ for token in doc])
            _clean_concept = clean_phrase(_proc_concept)
            if _clean_concept not in new_lemma_conceptnetid:
                _attr_list = [[token.tag_, ] for token in nlp(_clean_concept)]
                new_lemma_conceptnetid[_clean_concept] = _attr_list
        return new_lemma_conceptnetid

    concept_set = set()
    for _row in conceptnet_dump_iter(conceptnet_en_path):
        for _concept in _row[2:4]:
            if _concept not in concept_set:
                concept_set.add(_concept)
    concept_list = list(concept_set)

    multi_dict = multiprocessing_map(
        func=_process,
        dict_args_list=[{"_conceptnet_id_list": _data}
                        for _data in split_to_lists(concept_list, num_parallels)],
        num_parallels=num_parallels)

    final_dict = {}
    for _dict in multi_dict:
        final_dict.update(_dict)
    save_json(final_dict, dump_path)
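# A self-contained sketch of the per-concept transformation inside `_process`,
# shown on one toy ConceptNet URI. It assumes spaCy v2 with the English model
# installed (matching the `spacy.load("en", ...)` call above); the repo's
# `clean_phrase` is replaced here by a no-op stand-in.
import spacy

nlp = spacy.load("en", disable=["parser", "ner", "textcat"])

concept = "/c/en/apple_trees"
surface = " ".join(concept.split("/")[3].split("_"))   # -> "apple trees"
lemma = " ".join(tok.lemma_ for tok in nlp(surface))   # e.g. "apple tree"
tags = [[tok.tag_] for tok in nlp(lemma)]              # e.g. [['NN'], ['NN']]
print(lemma, tags)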
def train(local_rank, args):
    # debug = False
    # print("GPU:", gpu)
    # world_size = args.world_size
    args.global_rank = args.node_rank * args.gpus_per_node + local_rank
    args.local_rank = local_rank
    # args.warmup_steps = 20

    debug_count = 1000
    num_epoch = args.epochs

    actual_train_batch_size = args.world_size * args.per_gpu_train_batch_size * args.gradient_accumulation_steps
    args.actual_train_batch_size = actual_train_batch_size

    set_seed(args.seed)
    num_labels = 3  # We are doing NLI, so num_labels = 3; change this value for other tasks.

    max_length = args.max_length

    model_class_item = MODEL_CLASSES[args.model_class_name]
    model_name = model_class_item['model_name']
    do_lower_case = model_class_item['do_lower_case'] if 'do_lower_case' in model_class_item else False

    tokenizer = model_class_item['tokenizer'].from_pretrained(model_name,
                                                              cache_dir=str(config.PRO_ROOT / "trans_cache"),
                                                              do_lower_case=do_lower_case)

    model = model_class_item['sequence_classification'].from_pretrained(model_name,
                                                                        cache_dir=str(config.PRO_ROOT / "trans_cache"),
                                                                        num_labels=num_labels)

    padding_token_value = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    padding_segement_value = model_class_item["padding_segement_value"]
    padding_att_value = model_class_item["padding_att_value"]
    left_pad = model_class_item['left_pad'] if 'left_pad' in model_class_item else False

    batch_size_per_gpu_train = args.per_gpu_train_batch_size
    batch_size_per_gpu_eval = args.per_gpu_eval_batch_size

    if not args.cpu and not args.single_gpu:
        dist.init_process_group(backend='nccl',
                                init_method='env://',
                                world_size=args.world_size,
                                rank=args.global_rank)

    train_data_str = args.train_data
    train_data_weights_str = args.train_weights
    eval_data_str = args.eval_data

    train_data_name = []
    train_data_path = []
    train_data_list = []
    train_data_weights = []

    eval_data_name = []
    eval_data_path = []
    eval_data_list = []

    train_data_named_path = train_data_str.split(',')
    weights_str = train_data_weights_str.split(',') if train_data_weights_str is not None else None

    eval_data_named_path = eval_data_str.split(',')

    for named_path in train_data_named_path:
        ind = named_path.find(':')
        name = named_path[:ind]
        path = named_path[ind + 1:]  # fixed: was `name[ind + 1:]`, which always yields an empty string
        if name in registered_path:
            d_list = common.load_jsonl(registered_path[name])
        else:
            d_list = common.load_jsonl(path)

        train_data_name.append(name)
        train_data_path.append(path)
        train_data_list.append(d_list)

    if weights_str is not None:
        for weights in weights_str:
            train_data_weights.append(float(weights))
    else:
        for i in range(len(train_data_list)):
            train_data_weights.append(1)

    for named_path in eval_data_named_path:
        ind = named_path.find(':')
        name = named_path[:ind]
        path = named_path[ind + 1:]  # fixed: was `name[ind + 1:]`
        if name in registered_path:
            d_list = common.load_jsonl(registered_path[name])
        else:
            d_list = common.load_jsonl(path)
        eval_data_name.append(name)
        eval_data_path.append(path)
        eval_data_list.append(d_list)

    assert len(train_data_weights) == len(train_data_list)

    batching_schema = {
        'uid': RawFlintField(),
        'y': LabelFlintField(),
        'input_ids': ArrayIndexFlintField(pad_idx=padding_token_value, left_pad=left_pad),
        'token_type_ids': ArrayIndexFlintField(pad_idx=padding_segement_value, left_pad=left_pad),
        'attention_mask': ArrayIndexFlintField(pad_idx=padding_att_value, left_pad=left_pad),
    }

    data_transformer = NLITransform(model_name, tokenizer, max_length)
    # data_transformer = NLITransform(model_name, tokenizer, max_length, with_element=True)

    eval_data_loaders = []
    for eval_d_list in eval_data_list:
        d_dataset, d_sampler, d_dataloader = build_eval_dataset_loader_and_sampler(eval_d_list,
                                                                                   data_transformer,
                                                                                   batching_schema,
                                                                                   batch_size_per_gpu_eval)
        eval_data_loaders.append(d_dataloader)

    # Estimate the training size:
    training_list = []
    for i in range(len(train_data_list)):
        print("Build Training Data ...")
        train_d_list = train_data_list[i]
        train_d_name = train_data_name[i]
        train_d_weight = train_data_weights[i]
        cur_train_list = sample_data_list(train_d_list, train_d_weight)  # we can apply a different sampling strategy here later.
        print(f"Data Name:{train_d_name}; Weight: {train_d_weight}; "
              f"Original Size: {len(train_d_list)}; Sampled Size: {len(cur_train_list)}")
        training_list.extend(cur_train_list)
    estimated_training_size = len(training_list)
    print("Estimated training size:", estimated_training_size)
    # Estimate the training size ends.

    # t_total = estimated_training_size // args.gradient_accumulation_steps * num_epoch
    t_total = estimated_training_size * num_epoch // args.actual_train_batch_size

    if args.warmup_steps <= 0:
        # Set warmup steps to 0.1 * total steps if the given warmup step is non-positive.
        args.warmup_steps = int(t_total * 0.1)

    if not args.cpu:
        torch.cuda.set_device(args.local_rank)
        model.cuda(args.local_rank)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    if not args.cpu and not args.single_gpu:
        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[local_rank],
                                                    output_device=local_rank,
                                                    find_unused_parameters=True)

    args_dict = dict(vars(args))
    file_path_prefix = '.'
    if args.global_rank in [-1, 0]:
        print("Total Steps:", t_total)
        args.total_step = t_total
        print("Warmup Steps:", args.warmup_steps)
        print("Actual Training Batch Size:", actual_train_batch_size)
        print("Arguments", pp.pprint(args))

    # Let's build the logger and log everything before the start of the first training epoch.
    if args.global_rank in [-1, 0]:  # only do logging if we use cpu or global_rank=0
        if not args.debug_mode:
            file_path_prefix, date = save_tool.gen_file_prefix(f"{args.experiment_name}")
            # # # Create Log File
            # Save the source code.
            script_name = os.path.basename(__file__)
            with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
                out_f.write(it.read())
                out_f.flush()

            # Save option file
            common.save_json(args_dict, os.path.join(file_path_prefix, "args.json"))

            checkpoints_path = Path(file_path_prefix) / "checkpoints"
            if not checkpoints_path.exists():
                checkpoints_path.mkdir()
            prediction_path = Path(file_path_prefix) / "predictions"
            if not prediction_path.exists():
                prediction_path.mkdir()

    global_step = 0
    # print(f"Global Rank:{args.global_rank} ### ", 'Init!')

    for epoch in tqdm(range(num_epoch), desc="Epoch", disable=args.global_rank not in [-1, 0]):
        # Let's build up the training dataset for this epoch.
        training_list = []
        for i in range(len(train_data_list)):
            print("Build Training Data ...")
            train_d_list = train_data_list[i]
            train_d_name = train_data_name[i]
            train_d_weight = train_data_weights[i]
            cur_train_list = sample_data_list(train_d_list, train_d_weight)  # we can apply a different sampling strategy here later.
            print(f"Data Name:{train_d_name}; Weight: {train_d_weight}; "
                  f"Original Size: {len(train_d_list)}; Sampled Size: {len(cur_train_list)}")
            training_list.extend(cur_train_list)

        random.shuffle(training_list)
        train_dataset = NLIDataset(training_list, data_transformer)

        train_sampler = SequentialSampler(train_dataset)
        if not args.cpu and not args.single_gpu:
            print("Use distributed sampler.")
            train_sampler = DistributedSampler(train_dataset, args.world_size, args.global_rank,
                                               shuffle=True)

        train_dataloader = DataLoader(dataset=train_dataset,
                                      batch_size=batch_size_per_gpu_train,
                                      shuffle=False,
                                      num_workers=0,
                                      pin_memory=True,
                                      sampler=train_sampler,
                                      collate_fn=BaseBatchBuilder(batching_schema))
        # training build finished.

        print(debug_node_info(args), "epoch: ", epoch)

        if not args.cpu and not args.single_gpu:
            train_sampler.set_epoch(epoch)  # setup the epoch to ensure random sampling at each epoch

        for forward_step, batch in enumerate(tqdm(train_dataloader, desc="Iteration",
                                                  disable=args.global_rank not in [-1, 0]), 0):
            model.train()

            batch = move_to_device(batch, local_rank)
            # print(batch['input_ids'], batch['y'])
            if args.model_class_name in ["distilbert", "bart-large"]:
                outputs = model(batch['input_ids'],
                                attention_mask=batch['attention_mask'],
                                labels=batch['y'])
            else:
                outputs = model(batch['input_ids'],
                                attention_mask=batch['attention_mask'],
                                token_type_ids=batch['token_type_ids'],
                                labels=batch['y'])
            loss, logits = outputs[:2]
            # print(debug_node_info(args), loss, logits, batch['uid'])
            # print(debug_node_info(args), loss, batch['uid'])

            # Accumulated loss
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            # handle fp16
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Only update the model when this forward step completes an accumulation cycle.
            if (forward_step + 1) % args.gradient_accumulation_steps == 0:
                # Gradient clipping (skipped if max_grad_norm <= 0):
                if args.max_grad_norm > 0:
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()

                global_step += 1

                if args.global_rank in [-1, 0] and args.eval_frequency > 0 \
                        and global_step % args.eval_frequency == 0:
                    r_dict = dict()
                    # Eval loop:
                    for i in range(len(eval_data_name)):
                        cur_eval_data_name = eval_data_name[i]
                        cur_eval_data_list = eval_data_list[i]
                        cur_eval_dataloader = eval_data_loaders[i]
                        # cur_eval_raw_data_list = eval_raw_data_list[i]

                        evaluation_dataset(args, cur_eval_dataloader, cur_eval_data_list, model, r_dict,
                                           eval_name=cur_eval_data_name)

                    # saving checkpoints
                    current_checkpoint_filename = \
                        f'e({epoch})|i({global_step})'

                    for i in range(len(eval_data_name)):
                        cur_eval_data_name = eval_data_name[i]
                        current_checkpoint_filename += \
                            f'|{cur_eval_data_name}#({round(r_dict[cur_eval_data_name]["acc"], 4)})'

                    if not args.debug_mode:
                        # save model:
                        model_output_dir = checkpoints_path / current_checkpoint_filename
                        if not model_output_dir.exists():
                            model_output_dir.mkdir()
                        model_to_save = (model.module if hasattr(model, "module") else model)  # Take care of distributed/parallel training
                        torch.save(model_to_save.state_dict(), str(model_output_dir / "model.pt"))
                        torch.save(optimizer.state_dict(), str(model_output_dir / "optimizer.pt"))
                        torch.save(scheduler.state_dict(), str(model_output_dir / "scheduler.pt"))

                    # save prediction:
                    if not args.debug_mode and args.save_prediction:
                        cur_results_path = prediction_path / current_checkpoint_filename
                        if not cur_results_path.exists():
                            cur_results_path.mkdir(parents=True)
                        for key, item in r_dict.items():
                            common.save_jsonl(item['predictions'], cur_results_path / f"{key}.jsonl")

                        # avoid saving too many things
                        for key, item in r_dict.items():
                            del r_dict[key]['predictions']
                        common.save_json(r_dict, cur_results_path / "results_dict.json", indent=2)

        # End of epoch evaluation.
        if args.global_rank in [-1, 0]:
            r_dict = dict()
            # Eval loop:
            for i in range(len(eval_data_name)):
                cur_eval_data_name = eval_data_name[i]
                cur_eval_data_list = eval_data_list[i]
                cur_eval_dataloader = eval_data_loaders[i]
                # cur_eval_raw_data_list = eval_raw_data_list[i]

                evaluation_dataset(args, cur_eval_dataloader, cur_eval_data_list, model, r_dict,
                                   eval_name=cur_eval_data_name)

            # saving checkpoints
            current_checkpoint_filename = \
                f'e({epoch})|i({global_step})'

            for i in range(len(eval_data_name)):
                cur_eval_data_name = eval_data_name[i]
                current_checkpoint_filename += \
                    f'|{cur_eval_data_name}#({round(r_dict[cur_eval_data_name]["acc"], 4)})'

            if not args.debug_mode:
                # save model:
                model_output_dir = checkpoints_path / current_checkpoint_filename
                if not model_output_dir.exists():
                    model_output_dir.mkdir()
                model_to_save = (model.module if hasattr(model, "module") else model)  # Take care of distributed/parallel training
                torch.save(model_to_save.state_dict(), str(model_output_dir / "model.pt"))
                torch.save(optimizer.state_dict(), str(model_output_dir / "optimizer.pt"))
                torch.save(scheduler.state_dict(), str(model_output_dir / "scheduler.pt"))

            # save prediction:
            if not args.debug_mode and args.save_prediction:
                cur_results_path = prediction_path / current_checkpoint_filename
                if not cur_results_path.exists():
                    cur_results_path.mkdir(parents=True)
                for key, item in r_dict.items():
                    common.save_jsonl(item['predictions'], cur_results_path / f"{key}.jsonl")

                # avoid saving too many things
                for key, item in r_dict.items():
                    del r_dict[key]['predictions']
                common.save_json(r_dict, cur_results_path / "results_dict.json", indent=2)
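# A minimal launch sketch for `train(local_rank, args)` above, assuming a
# single-node setup; the repo's actual entry point may differ. The env-var
# setup matches the `init_method='env://'` process-group init used in `train`,
# and `mp.spawn` passes the process index as the first positional argument,
# which `train` reads as `local_rank`. `args` is assumed to carry `node_rank`,
# `gpus_per_node`, and `world_size`, as read inside `train`.
import os

import torch.multiprocessing as mp


def launch(args):
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    # Spawn one training process per local GPU.
    mp.spawn(train, args=(args,), nprocs=args.gpus_per_node)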
def eval_model(model_path, data_file=None, filter_value=0.5):
    seed = 12
    torch.manual_seed(seed)
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    bert_model_name = "bert-base-uncased"
    lazy = False
    forward_size = 16
    batch_size = 32
    do_lower_case = True

    debug = False

    max_pre_context_length = 320
    max_query_length = 64
    doc_stride = 128
    qa_num_of_layer = 2
    s_filter_value = filter_value
    s_top_k = 5
    tag = 'dev'

    print("Potential total length:", max_pre_context_length + max_query_length + 3)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)

    # Load Dataset.
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    test_list = common.load_json(config.TEST_FULLWIKI_FILE)
    train_list = common.load_json(config.TRAIN_FILE)

    if data_file is None:
        dev_sentence_level_results = common.load_jsonl(
            config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/dev_s_level_bert_v1_results.jsonl")
    else:
        dev_sentence_level_results = common.load_jsonl(data_file)

    test_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/test_s_level_bert_v1_results.jsonl")
    train_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl")

    dev_fitem_dict, dev_fitem_list, dev_sp_results_dict = get_qa_item_with_upstream_sentence(
        dev_list, dev_sentence_level_results, is_training=False,
        tokenizer=tokenizer,
        max_context_length=max_pre_context_length, max_query_length=max_query_length,
        filter_value=s_filter_value, doc_stride=doc_stride,
        top_k=s_top_k, debug_mode=debug)

    test_fitem_dict, test_fitem_list, test_sp_results_dict = get_qa_item_with_upstream_sentence(
        test_list, test_sentence_level_results, is_training=False,
        tokenizer=tokenizer,
        max_context_length=max_pre_context_length, max_query_length=max_query_length,
        filter_value=s_filter_value, doc_stride=doc_stride,
        top_k=s_top_k, debug_mode=debug)

    # train_fitem_dict, train_fitem_list, _ = get_qa_item_with_upstream_sentence(
    #     train_list, train_sentence_level_results, is_training=True,
    #     tokenizer=tokenizer,
    #     max_context_length=max_pre_context_length, max_query_length=max_query_length,
    #     filter_value=s_filter_value, doc_stride=doc_stride,
    #     top_k=s_top_k, debug_mode=debug)

    if debug:
        dev_list = dev_list[:100]

    span_pred_reader = BertPairedSpanPredReader(bert_tokenizer=tokenizer, lazy=lazy, example_filter=None)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    model = BertSpan(bert_encoder, qa_num_of_layer)
    model.load_state_dict(torch.load(model_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    iterator = BasicIterator(batch_size=batch_size)

    if tag == 'dev':
        dev_instances = span_pred_reader.read(dev_fitem_list)
        # test_instances = span_pred_reader.read(test_fitem_list)
        eval_iter = iterator(dev_instances, num_epochs=1, shuffle=False)
        # eval_iter = iterator(test_instances, num_epochs=1, shuffle=False)
        cur_eitem_list, cur_eval_dict = span_eval(model, eval_iter, do_lower_case, dev_fitem_dict,
                                                  device_num, show_progress=True, pred_no_answer=True)
        # cur_eitem_list, cur_eval_dict = span_eval(model, eval_iter, do_lower_case, test_fitem_dict,
        #                                           device_num, show_progress=True)

        cur_results_dict = dict()
        cur_results_dict['answer'] = cur_eval_dict
        cur_results_dict['sp'] = dev_sp_results_dict
        # cur_results_dict['sp'] = test_sp_results_dict

        # common.save_json(cur_results_dict, f"{tag}_qa_sp_results_{filter_value}_doctopk_5.json")
        cur_results_dict['p_answer'] = cur_eval_dict

        _, metrics = ext_hotpot_eval.eval(cur_results_dict, dev_list, verbose=False)
        # _, metrics = ext_hotpot_eval.eval(cur_results_dict, test_list, verbose=False)

        logging_item = {
            'score': metrics,
        }

        print(data_file)
        print(logging_item)

    elif tag == 'test':
        # dev_instances = span_pred_reader.read(dev_fitem_list)
        test_instances = span_pred_reader.read(test_fitem_list)

        # eval_iter = iterator(dev_instances, num_epochs=1, shuffle=False)
        eval_iter = iterator(test_instances, num_epochs=1, shuffle=False)

        # cur_eitem_list, cur_eval_dict = span_eval(model, eval_iter, do_lower_case, dev_fitem_dict,
        #                                           device_num, show_progress=True)
        cur_eitem_list, cur_eval_dict = span_eval(model, eval_iter, do_lower_case, test_fitem_dict,
                                                  device_num, show_progress=True)

        cur_results_dict = dict()
        cur_results_dict['answer'] = cur_eval_dict
        # cur_results_dict['sp'] = dev_sp_results_dict
        cur_results_dict['sp'] = test_sp_results_dict

        common.save_json(cur_results_dict, f"{tag}_qa_sp_results.json")
        cur_results_dict['p_answer'] = cur_eval_dict

        # _, metrics = ext_hotpot_eval.eval(cur_results_dict, dev_list, verbose=False)
        _, metrics = ext_hotpot_eval.eval(cur_results_dict, test_list, verbose=False)

        logging_item = {
            'score': metrics,
        }

        print(logging_item)
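# A hypothetical driver for the QA evaluation above: sweep the sentence-level
# filter threshold for a single checkpoint. The checkpoint path is a
# placeholder; passing data_file=None falls back to the default dev
# sentence-level results hard-coded in `eval_model`.
if __name__ == '__main__':
    checkpoint = "path/to/qa_model.pt"  # hypothetical path
    for fv in [0.2, 0.35, 0.5]:
        eval_model(checkpoint, data_file=None, filter_value=fv)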
def evaluation():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cpu", action="store_true", help="If set, we only use CPU.")
    parser.add_argument("--model_class_name", type=str, required=True,
                        help="Set the model class of the experiment.")
    parser.add_argument("--model_checkpoint_path", type=str, required=True,
                        help='Set the path to the model checkpoint.')
    parser.add_argument("--output_prediction_path", type=str, default=None,
                        help='Set the path to save the prediction.')
    parser.add_argument("--per_gpu_eval_batch_size", default=16, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--max_length", default=156, type=int,
                        help="Max length of the sequences.")
    parser.add_argument("--eval_data", type=str,
                        help="The evaluation data used in the experiments.")

    args = parser.parse_args()

    if args.cpu:
        args.global_rank = -1
    else:
        args.global_rank = 0

    model_checkpoint_path = args.model_checkpoint_path
    num_labels = 3  # We are doing NLI, so num_labels = 3; change this value for other tasks.

    max_length = args.max_length

    model_class_item = MODEL_CLASSES[args.model_class_name]
    model_name = model_class_item['model_name']
    do_lower_case = model_class_item['do_lower_case'] if 'do_lower_case' in model_class_item else False

    tokenizer = model_class_item['tokenizer'].from_pretrained(model_name,
                                                              cache_dir=str(config.PRO_ROOT / "trans_cache"),
                                                              do_lower_case=do_lower_case)

    model = model_class_item['sequence_classification'].from_pretrained(model_name,
                                                                        cache_dir=str(config.PRO_ROOT / "trans_cache"),
                                                                        num_labels=num_labels)

    model.load_state_dict(torch.load(model_checkpoint_path))

    padding_token_value = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    padding_segement_value = model_class_item["padding_segement_value"]
    padding_att_value = model_class_item["padding_att_value"]
    left_pad = model_class_item['left_pad'] if 'left_pad' in model_class_item else False

    batch_size_per_gpu_eval = args.per_gpu_eval_batch_size

    eval_data_str = args.eval_data
    eval_data_name = []
    eval_data_path = []
    eval_data_list = []

    eval_data_named_path = eval_data_str.split(',')

    for named_path in eval_data_named_path:
        ind = named_path.find(':')
        name = named_path[:ind]
        path = named_path[ind + 1:]  # fixed: was `name[ind + 1:]`
        if name in registered_path:
            d_list = common.load_jsonl(registered_path[name])
        else:
            d_list = common.load_jsonl(path)
        eval_data_name.append(name)
        eval_data_path.append(path)
        eval_data_list.append(d_list)

    batching_schema = {
        'uid': RawFlintField(),
        'y': LabelFlintField(),
        'input_ids': ArrayIndexFlintField(pad_idx=padding_token_value, left_pad=left_pad),
        'token_type_ids': ArrayIndexFlintField(pad_idx=padding_segement_value, left_pad=left_pad),
        'attention_mask': ArrayIndexFlintField(pad_idx=padding_att_value, left_pad=left_pad),
    }

    data_transformer = NLITransform(model_name, tokenizer, max_length)

    eval_data_loaders = []
    for eval_d_list in eval_data_list:
        d_dataset, d_sampler, d_dataloader = build_eval_dataset_loader_and_sampler(eval_d_list,
                                                                                   data_transformer,
                                                                                   batching_schema,
                                                                                   batch_size_per_gpu_eval)
        eval_data_loaders.append(d_dataloader)

    if not args.cpu:
        torch.cuda.set_device(0)
        model.cuda(0)

    r_dict = dict()
    # Eval loop:
    for i in range(len(eval_data_name)):
        cur_eval_data_name = eval_data_name[i]
        cur_eval_data_list = eval_data_list[i]
        cur_eval_dataloader = eval_data_loaders[i]
        # cur_eval_raw_data_list = eval_raw_data_list[i]

        evaluation_dataset(args, cur_eval_dataloader, cur_eval_data_list, model, r_dict,
                           eval_name=cur_eval_data_name)

    # save prediction:
    if args.output_prediction_path is not None:
        cur_results_path = Path(args.output_prediction_path)
        if not cur_results_path.exists():
            cur_results_path.mkdir(parents=True)
        for key, item in r_dict.items():
            common.save_jsonl(item['predictions'], cur_results_path / f"{key}.jsonl")

        # avoid saving too many things
        for key, item in r_dict.items():
            del r_dict[key]['predictions']
        common.save_json(r_dict, cur_results_path / "results_dict.json", indent=2)
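# The `--train_data`/`--eval_data` arguments use a comma-separated
# `name:path` convention, parsed identically in `train` and `evaluation`
# above. A self-contained version of that parsing (with the slicing fixed as
# in the loops above); `parse_named_paths` is an illustrative helper, not a
# repo function.
def parse_named_paths(data_str):
    """Split 'name1:path1,name2:path2' into [(name, path), ...]."""
    pairs = []
    for named_path in data_str.split(','):
        ind = named_path.find(':')
        name, path = named_path[:ind], named_path[ind + 1:]
        pairs.append((name, path))
    return pairs


assert parse_named_paths("snli_dev:data/snli/dev.jsonl") == [("snli_dev", "data/snli/dev.jsonl")]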
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_class", default="roberta", type=str,
                        help="model class, one of [bert, roberta]")
    parser.add_argument("--dataset", type=str, default="wn18rr")
    parser.add_argument("--num_workers", default=12, type=int)
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " +
                             ", ".join(ALL_MODELS))
    parser.add_argument("--data_dir", default=None, type=str)
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--neg_weights", default=None, type=str)

    # extra parameters for prediction
    parser.add_argument("--no_verbose", action="store_true")
    parser.add_argument("--collect_prediction", action="store_true")
    parser.add_argument("--prediction_part", default="0,1", type=str)

    # Other parameters
    define_hparams_training(parser)
    args = parser.parse_args()

    data_dir = args.data_dir or kgbert_data_dir

    # setup
    setup_prerequisite(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    if args.model_class == "roberta":
        config_class = RobertaConfig
        tokenizer_class = RobertaTokenizer
        model_class = RobertaForSequenceClassification
    elif args.model_class == "bert":
        config_class = BertConfig
        tokenizer_class = BertTokenizer
        model_class = BertForSequenceClassification
    else:
        raise KeyError(args.model_class)

    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=2)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_name_or_path, config=config)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Dataset
    neg_weights = [1., 1., 0.] if args.neg_weights is None else [
        float(_e) for _e in args.neg_weights.split(",")]
    assert len(neg_weights) == 3 and sum(neg_weights) > 0

    train_dataset = LinkPredictionDataset(args.dataset, "train", None, data_dir,
                                          args.model_class, tokenizer, args.do_lower_case,
                                          args.max_seq_length, neg_times=5, neg_weights=neg_weights)
    dev_dataset = LinkPredictionDataset(args.dataset, "dev", None, data_dir,
                                        args.model_class, tokenizer, args.do_lower_case,
                                        args.max_seq_length)
    test_dataset = LinkPredictionDataset(args.dataset, "test", None, data_dir,
                                         args.model_class, tokenizer, args.do_lower_case,
                                         args.max_seq_length)

    if args.do_train:
        train(args, train_dataset, model, tokenizer, eval_dataset=dev_dataset)

    if args.do_train and (args.do_eval or args.do_prediction):
        # load the best model
        model = model_class.from_pretrained(args.output_dir, config=config)
        model.to(args.device)

    if not args.do_train and args.do_eval:
        pass

    if args.fp16:
        model = setup_eval_model_for_fp16(args, model)

    dataset_list = [train_dataset, dev_dataset, test_dataset]

    if not args.do_train and args.do_prediction:
        path_template = join(args.output_dir, "tuple_ranks_{},{}.json")

        part_param = args.prediction_part.split(",")
        part_param = [int(_e) for _e in part_param]
        assert len(part_param) == 2 and part_param[1] > part_param[0] >= 0
        cur_part_idx, num_parts = part_param

        if args.collect_prediction:
            tuple_ranks_list = []
            for _idx in range(num_parts):
                tuple_ranks_list.append(load_json(path_template.format(_idx, num_parts)))
            tuple_ranks = combine_from_lists(tuple_ranks_list, ordered=True)
            output_str = calculate_metrics_for_link_prediction(tuple_ranks)
            with open(join(args.output_dir, "link_prediction_metrics.txt"), "w", encoding="utf-8") as fp:
                fp.write(output_str)
        else:
            test_raw_examples = test_dataset.raw_examples
            # part
            tgt_raw_examples = [_ex for _idx, _ex in enumerate(test_raw_examples)
                                if _idx % num_parts == cur_part_idx]
            # evaluate(args, test_dataset, model, tokenizer, None, "test_")
            tuple_ranks = predict(args, tgt_raw_examples, dataset_list, model,
                                  verbose=(not args.no_verbose))
            calculate_metrics_for_link_prediction(tuple_ranks, verbose=True)
            save_json(tuple_ranks, path_template.format(cur_part_idx, num_parts))
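# A toy illustration of the prediction sharding in `main`: part `i` of `n`
# takes every test example whose index is congruent to `i` mod `n`, and
# `--collect_prediction` later concatenates the per-part outputs
# (`combine_from_lists(..., ordered=True)` is assumed to restore the original
# order). The round-robin split below mirrors the
# `_idx % num_parts == cur_part_idx` filter.
examples = list(range(10))
num_parts = 3
shards = [[ex for idx, ex in enumerate(examples) if idx % num_parts == part]
          for part in range(num_parts)]
assert shards == [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]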
def eval_model_for_downstream_ablation(model_saved_path, doc_top_k=2, tag='dev'):
    print(f"Run doc_top_k:{doc_top_k}")
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    # lazy = False
    lazy = True
    forward_size = 256
    # batch_size = 64
    batch_size = 128
    do_lower_case = True
    document_top_k = doc_top_k

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    test_list = common.load_json(config.TEST_FULLWIKI_FILE)

    # Load train eval results list
    # cur_train_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #     "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")
    cur_dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
        "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl")
    # cur_test_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #     "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/test_p_level_bert_v1_results.jsonl")

    # if tag == 'train':
    #     train_fitems = get_sentence_pair(document_top_k, train_list, cur_train_eval_results_list,
    #                                      is_training=True, debug_mode=debug_mode)
    if tag == 'dev':
        dev_fitems = get_sentence_pair(document_top_k, dev_list, cur_dev_eval_results_list,
                                       is_training=False, debug_mode=debug_mode)
    # elif tag == 'test':
    #     test_fitems = get_sentence_pair(document_top_k, test_list, cur_test_eval_results_list,
    #                                     is_training=False, debug_mode=debug_mode)

    if debug_mode:
        eval_frequency = 2
        # dev_list = dev_list[:10]
        # dev_fitems_list = dev_fitems_list[:296]
        # train_fitems_list = train_fitems_list[:300]
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id')

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=128,
                                                element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # fixed: the `if tag == 'train':` line was fused into the preceding comment,
    # which left a dangling `elif`.
    if tag == 'train':
        train_instance = bert_cs_reader.read(train_fitems)
    elif tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)
    elif tag == 'test':
        test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'train':
        train_iter = biterator(train_instance, num_epochs=1, shuffle=False)
        print(len(train_fitems))
    elif tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        print(len(dev_fitems))
    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        print(len(test_fitems))

    print("Forward size:", forward_size)

    if tag == 'train':
        cur_train_eval_results_list_out = eval_model(model, train_iter, device_num,
                                                     with_probs=True, show_progress=True)
        common.save_jsonl(
            cur_train_eval_results_list_out,
            config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl")
    elif tag == 'dev':
        cur_dev_eval_results_list_out = eval_model(model, dev_iter, device_num,
                                                   with_probs=True, show_progress=True)
        common.save_jsonl(cur_dev_eval_results_list_out,
                          f"hotpot_s_level_{tag}_results_top_k_doc_{document_top_k}.jsonl")
    elif tag == 'test':
        cur_test_eval_results_list_out = eval_model(model, test_iter, device_num,
                                                    with_probs=True, show_progress=True)
        common.save_jsonl(
            cur_test_eval_results_list_out,
            config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/test_s_level_bert_v1_results.jsonl")

    if tag == 'train' or tag == 'test':
        exit(0)

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list_out, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # thresholds 0.5 and 0.2:
    cur_results_dict_v05 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                            score_field_name='prob',
                                                            filter_value=0.5,
                                                            result_field='sp')
    cur_results_dict_v02 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                            score_field_name='prob',
                                                            filter_value=0.2,
                                                            result_field='sp')

    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list, verbose=False)
    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False)

    logging_item = {
        'v02': metrics_v2,
        'v05': metrics_v5,
    }

    print(logging_item)

    f1 = metrics_v5['sp_f1']
    em = metrics_v5['sp_em']
    pr = metrics_v5['sp_prec']
    rec = metrics_v5['sp_recall']
    common.save_json(logging_item,
                     f"top_k_doc:{document_top_k}_em:{em}_pr:{pr}_rec:{rec}_f1:{f1}")
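# An illustrative (not the repo's) implementation of the select-then-threshold
# step that `select_top_k_and_to_results_dict` performs per question above:
# rank candidate sentences by `prob`, keep at most `top_k`, and drop anything
# below `filter_value`.
def select_top_k(scored_items, top_k=5, filter_value=0.5):
    """scored_items: list of (sentence_id, prob). Returns the kept sentence ids."""
    ranked = sorted(scored_items, key=lambda x: x[1], reverse=True)
    return [sid for sid, prob in ranked[:top_k] if prob >= filter_value]


assert select_top_k([('s1', 0.9), ('s2', 0.4), ('s3', 0.7)], top_k=2) == ['s1', 's3']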
def eval_model_for_downstream_ablation(model_saved_path, top_k_doc):
    bert_model_name = 'bert-base-uncased'
    lazy = True
    forward_size = 128
    # batch_size = 64
    # batch_size = 128
    do_lower_case = True
    debug_mode = False
    max_l = 128
    # est_datasize = 900_000

    tag = 'dev'
    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_train_results.jsonl")
    dev_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_dev_results.jsonl")
    test_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_test_results.jsonl")

    train_list = common.load_jsonl(config.FEVER_TRAIN)
    dev_list = common.load_jsonl(config.FEVER_DEV)
    test_list = common.load_jsonl(config.FEVER_TEST)

    if tag == 'dev':
        dev_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'dev', dev_upstream_doc_results, is_training=False,
            debug=debug_mode, ignore_non_verifiable=False,
            top_k=top_k_doc, filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(dev_fitems, None)
    elif tag == 'train':
        train_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'train', train_upstream_doc_results, is_training=True,
            debug=debug_mode, ignore_non_verifiable=False,
            top_k=top_k_doc, filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(train_fitems, None)
    elif tag == 'test':
        test_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'test', test_upstream_doc_results, is_training=False,
            debug=debug_mode, ignore_non_verifiable=False,
            top_k=top_k_doc, filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(test_fitems, None)

    # Just to show the information
    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    test_o_dict = list_dict_data_tool.list_to_dict(test_list, 'id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=max_l,
                                                element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # fixed: `biterator = BasicIterator(...)` was fused into the preceding
    # comment, which would leave `biterator` undefined.
    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, dev_iter, device_num, make_int=True,
                                           with_probs=True, show_progress=True)
        common.save_jsonl(cur_eval_results_list,
                          f"fever_s_level_{tag}_results_top_k_doc_{top_k_doc}.jsonl")

        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
        copied_dev_d_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_dev_o_dict,
                                                              'qid', 'fid', check=True)

        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                                  score_field_name='prob',
                                                                  top_k=5, filter_value=0.2,
                                                                  result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_dev_d_list,
                                                                       cur_results_dict_th0_5,
                                                                       'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(copied_dev_d_list, dev_list,
                                                                 max_evidence=5)
        score_05 = {
            'top_k_doc': top_k_doc,
            'ss': strict_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        print("Top_k doc:", top_k_doc)
        print(score_05)
        common.save_json(score_05,
                         f"top_k_doc:{top_k_doc}_ss:{strict_score}_pr:{pr}_rec:{rec}_f1:{f1}")

    elif tag == 'test':
        test_instances = bert_cs_reader.read(test_fitems)
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, test_iter, device_num, make_int=True,
                                           with_probs=True, show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"fever_s_level_{tag}_results.jsonl")

        # copied_test_o_dict = copy.deepcopy(test_o_dict)
        # copied_test_d_list = copy.deepcopy(test_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_test_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_test_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_test_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}

        # copied_train_o_dict = copy.deepcopy(train_o_dict)
        # copied_train_d_list = copy.deepcopy(train_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_train_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_train_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_train_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        #
        # mode = {'standard': False, 'check_doc_id_correct': True}
        # strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(copied_train_d_list, train_list,
        #                                                          max_evidence=5)
        # score_05 = {
        #     'ss': strict_score,
        #     'pr': pr, 'rec': rec, 'f1': f1,
        # }
        #
        # print(score_05)

    elif tag == 'train':
        train_instances = bert_cs_reader.read(train_fitems)
        train_iter = biterator(train_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, train_iter, device_num, make_int=True,
                                           with_probs=True, show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"fever_s_level_{tag}_results.jsonl")

        copied_train_o_dict = copy.deepcopy(train_o_dict)
        copied_train_d_list = copy.deepcopy(train_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_train_o_dict,
                                                              'qid', 'fid', check=True)

        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_train_o_dict,
                                                                  score_field_name='prob',
                                                                  top_k=5, filter_value=0.5,
                                                                  result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_train_d_list,
                                                                       cur_results_dict_th0_5,
                                                                       'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(copied_train_d_list, train_list,
                                                                 max_evidence=5)
        score_05 = {
            'ss': strict_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        print(score_05)
def experiment_train_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5
    multihop_strict_mode = True
    debug_mode = None

    # data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    data_list = common.load_json(config.TRAIN_FILE)
    if debug_mode is not None:
        data_list = data_list[:debug_mode]

    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH /
        "doc_retri_results/term_based_methods_results/hotpot_tf_idf_train.jsonl")
    g_score_dict = dict()
    load_from_file(
        g_score_dict,
        config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    doc_retri_pred_dict = init_results_v8(
        data_list, data_list, terms_based_results_list, g_score_dict,
        match_filtering_k=match_filtering_k,
        term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # common.save_json(doc_retri_pred_dict, f"hotpot_doc_retrieval_v8_before_multihop_filtering_{debug_mode}.json")
    common.save_json(doc_retri_pred_dict,
                     "hotpot_train_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(
        doc_retri_pred_dict,
        multihop_retrieval_top_k=multihop_retrieval_top_k,
        strict_mode=multihop_strict_mode)

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    # common.save_json(new_doc_retri_pred_dict, f"hotpot_doc_retrieval_v8_{debug_mode}.json")
    common.save_json(new_doc_retri_pred_dict, "hotpot_train_doc_retrieval_v8.json")
def model_eval_ablation(model_path, filter_value=0.2, top_k_sent=5):
    bert_model_name = 'bert-base-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    lazy = False
    forward_size = 32
    do_lower_case = True
    pair_order = 'cq'
    debug_mode = False
    maxout_model = False
    num_class = 3
    tag = 'dev'
    exp = 'no_re_train'
    print("Filter value:", filter_value)
    print("top_k_sent:", top_k_sent)
    train_sent_filtering_prob = 0.2
    dev_sent_filtering_prob = filter_value
    test_sent_filtering_prob = 0.2

    # Dataset and upstream sentence-level results.
    dev_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl")
    # train_sent_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl")
    test_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_test_results.jsonl")

    dev_fitems, dev_list = get_nli_pair('dev', is_training=False,
                                        sent_level_results_list=dev_sent_results_list, debug=debug_mode,
                                        sent_top_k=top_k_sent, sent_filter_value=dev_sent_filtering_prob)
    # train_fitems, train_list = get_nli_pair('train', is_training=True,
    #                                         sent_level_results_list=train_sent_results_list, debug=debug_mode,
    #                                         sent_top_k=5, sent_filter_value=train_sent_filtering_prob)
    test_fitems, test_list = get_nli_pair('test', is_training=False,
                                          sent_level_results_list=test_sent_results_list, debug=debug_mode,
                                          sent_top_k=top_k_sent, sent_filter_value=test_sent_filtering_prob)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1
    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    if debug_mode:
        dev_list = dev_list[:100]
        # train_list = train_list[:100]
        test_list = test_list[:100]
        eval_frequency = 2

    # est_datasize = len(train_fitems)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=384, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    if not maxout_model:
        model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                                act_type='tanh', use_pretrained_pooler=True,
                                                use_sigmoid=False)
    else:
        model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class, act_type="gelu",
                                      num_of_out_layers=2)

    model.load_state_dict(torch.load(model_path))

    dev_instances = bert_cs_reader.read(dev_fitems)
    # train_instances = bert_cs_reader.read(train_fitems)
    test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)
        common.save_jsonl(cur_eval_results_list,
                          f"nli_{tag}_label_results_th{dev_sent_filtering_prob}_{exp}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_dev_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_dev_list,
                          f"nli_{tag}_cp_results_th{dev_sent_filtering_prob}_{exp}.jsonl")
        mode = {'standard': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                        mode=mode, max_evidence=5)
        logging_item = {
            'ss': strict_score, 'ac': acc_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        print(logging_item)
        common.save_json(
            logging_item,
            f"nli_th{dev_sent_filtering_prob}_{exp}_ss:{strict_score}_ac:{acc_score}_pr:{pr}_rec:{rec}_f1:{f1}.jsonl")

    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        cur_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)
        common.save_jsonl(cur_eval_results_list,
                          f"nli_{tag}_label_results_th{test_sent_filtering_prob}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_test_list = copy.deepcopy(test_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_test_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_test_list,
                          f"nli_{tag}_cp_results_th{test_sent_filtering_prob}.jsonl")
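# A hypothetical ablation driver for `model_eval_ablation` above: sweep the
# sentence filtering threshold for a fixed NLI checkpoint. The checkpoint path
# is a placeholder.
if __name__ == '__main__':
    nli_checkpoint = "path/to/nli_model.pt"  # hypothetical path
    for fv in [0.1, 0.2, 0.35, 0.5]:
        model_eval_ablation(nli_checkpoint, filter_value=fv, top_k_sent=5)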