def precompute_forward_items_and_cache():
    # 3 places need to switch from dev to train !!!
    is_training = False
    doc_results = common.load_json(
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_train_doc_retrieval_v8_before_multihop_filtering.json")
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_dev_doc_retrieval_v8_before_multihop_filtering.json")
        config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_test_doc_retrieval_v8_before_multihop_filtering.json")
    doc_results = results_multihop_filtering(doc_results, multihop_retrieval_top_k=3, strict_mode=True)

    # db_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_DB)
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    # data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    # data_list = common.load_json(config.TRAIN_FILE)
    append_baseline_context(doc_results, data_list)

    fitem_list = build_full_wiki_document_forward_item(doc_results, data_list, is_training, t_db_cursor, True)
    print(len(fitem_list))
    common.save_jsonl(fitem_list,
                      config.PDATA_ROOT / "content_selection_forward" / "hotpot_test_p_level_unlabeled.jsonl")
def results_analysis():
    doc_results = common.load_json(
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_train_doc_retrieval_v8_before_multihop_filtering.json")
        config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_dev_doc_retrieval_v8_before_multihop_filtering.json")
    doc_results = results_multihop_filtering(doc_results, multihop_retrieval_top_k=3, strict_mode=True)

    # terms_based_results_list = common.load_jsonl(
    #     config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")
    data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    # data_list = common.load_json(config.TRAIN_FILE)
    append_baseline_context(doc_results, data_list)

    len_list = []
    for rset in doc_results['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    ext_hotpot_eval.eval(doc_results, data_list)
def toy_init_results():
    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print(len(dev_fullwiki_list))

    # Load rindex file
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.score_db['default-tf-idf'] = dict()
    load_from_file(abs_rindexdb.score_db['default-tf-idf'],
                   config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # Load rindex finished

    saved_items = []
    for item in tqdm(dev_fullwiki_list):
        saved_tfidf_item = dict()
        question = item['question']
        qid = item['_id']

        doc_list = get_top_ranked_tf_idf_doc(question, abs_rindexdb, top_k=50)
        saved_tfidf_item['question'] = question
        saved_tfidf_item['qid'] = qid
        saved_tfidf_item['doc_list'] = doc_list

        saved_items.append(saved_tfidf_item)

    common.save_jsonl(saved_items,
                      config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")
def batchnize_dataset(self, data, batch_size=None, shuffle=True):
    batches = []
    max_span_len = self.config["max_span_len"]
    dataset = load_json(data)
    if shuffle:
        random.shuffle(dataset)
        dataset.sort(key=lambda record: len(record["words"]))

    prev_seq_len = len(dataset[0]["words"])
    batch_words, batch_chars, batch_tags = [], [], []

    for record in dataset:
        seq_len = len(record["words"])
        if len(batch_words) == batch_size or prev_seq_len != seq_len:
            batches.append(self.make_each_batch(batch_words, batch_chars, max_span_len, batch_tags))
            batch_words, batch_chars, batch_tags = [], [], []
            prev_seq_len = seq_len
        batch_words.append(record["words"])
        batch_chars.append(record["chars"])
        batch_tags.append(record["tags"])

    if len(batch_words) > 0:
        batches.append(self.make_each_batch(batch_words, batch_chars, max_span_len, batch_tags))
    if shuffle:
        random.shuffle(batches)
    for batch in batches:
        yield batch
def model_perf_binned(dataset_name, task_name, data_file, model_prediction_file,
                      split_type='quantile', bin_num=5, verbose=True):
    d_list = common.load_jsonl(data_file)
    collected_data_dict = list_dict_data_tool.list_to_dict(d_list, key_fields='uid')
    model_prediction_dict = common.load_json(model_prediction_file)

    bined_item = build_entropy_bins(collected_data_dict, bin_num, type=split_type)
    bined_item_results = calculate_per_bin_results_simplify(bined_item, model_prediction_dict,
                                                            task_name=task_name)

    if verbose:
        print('-' * 60)
        print('Data:', dataset_name)
        for model_name, range_items in bined_item_results.items():
            print('Model: {:20s}'.format(model_name))
            print('\t'.join(['{:18s}'.format('Entropy Range'),
                             '{:15s}'.format('# of Example'),
                             '{:10s}'.format('JSD'),
                             '{:10s}'.format('KL'),
                             '{:10s}'.format('Old Acc.'),
                             '{:10s}'.format('New Acc.')]))
            for range_value, model_item in range_items['bin_results'].items():
                print('\t'.join(['{:5f}-{:5f}'.format(range_value[0], range_value[1]),
                                 '{:15s}'.format(format_number(model_item['total_count'])),
                                 '{:10s}'.format(format_number(model_item['average JS div'])),
                                 '{:10s}'.format(format_number(model_item['average KL div'])),
                                 '{:10s}'.format(format_number(model_item['o_acc'])),
                                 '{:10s}'.format(format_number(model_item['m_acc'])),
                                 ]))
        print('-' * 60)
    return bined_item_results
def show_nli_binned_plot(y_axis_value):
    dataset_name = 'Natural Language Inference'
    task_name = 'uncertainty_nli'
    snli_data_file = config.CHAOSNLI_SNLI
    mnli_data_file = config.CHAOSNLI_MNLI
    model_pred_file = config.MODEL_PRED_NLI

    d_list_snli = common.load_jsonl(snli_data_file)
    d_list_mnli = common.load_jsonl(mnli_data_file)

    collected_data_dict = {}
    collected_data_dict_snli = list_dict_data_tool.list_to_dict(d_list_snli, key_fields='uid')
    collected_data_dict_mnli = list_dict_data_tool.list_to_dict(d_list_mnli, key_fields='uid')
    collected_data_dict.update(collected_data_dict_snli)
    collected_data_dict.update(collected_data_dict_mnli)

    model_prediction_dict = common.load_json(model_pred_file)

    bin_num = 5
    split_type = 'quantile'
    column_name = 'ChaosNLI-(S+M)'

    bined_item = build_entropy_bins(collected_data_dict, bin_num, type=split_type)
    bined_item_results = calculate_per_bin_results_simplify(bined_item, model_prediction_dict,
                                                            task_name=task_name)
    plot_histogram(bined_item_results, y_axis_value, column_name)
def get_sp_position_count():
    train_list = common.load_json(config.TRAIN_FILE)
    c = Counter()
    for item in train_list:
        sp_position_analysis(item, c)
    print(c)
def experiment_test_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5

    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_test.jsonl")
    g_score_dict = dict()
    load_from_file(g_score_dict,
                   config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    # We need to pass None as the ground-truth data here (test set has no gold labels).
    doc_retri_pred_dict = init_results_v8(data_list, None, terms_based_results_list, g_score_dict,
                                          match_filtering_k=match_filtering_k,
                                          term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    common.save_json(doc_retri_pred_dict, "hotpot_test_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(doc_retri_pred_dict,
                                                         multihop_retrieval_top_k=multihop_retrieval_top_k)
    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    common.save_json(new_doc_retri_pred_dict, "hotpot_test_doc_retrieval_v8.json")
def logging_to_file(self, filename):
    if Path(filename).is_file():
        old_logging_list = common.load_json(filename)
        current_saved_key = set()

        for item in self.logging_item_list:
            current_saved_key.add(item['k'])

        for item in old_logging_list:
            if item['k'] not in current_saved_key:
                raise ValueError("Previously logged item can not be found!")

    common.save_json(self.logging_item_list, filename, indent=2, sort_keys=True)
def batchnize_dataset(self, data, data_name=None, batch_size=None, shuffle=True):
    max_span_len = self.config["max_span_len"]
    if data_name == "train":
        max_n_spans = self.config["max_n_spans"]
    else:
        if self.config["max_n_spans"] > 0:
            max_n_spans = 1000000
        else:
            max_n_spans = 0

    dataset = load_json(data)
    for instance_id, record in enumerate(dataset):
        record["instance_id"] = instance_id

    if shuffle:
        random.shuffle(dataset)
        dataset.sort(key=lambda record: len(record["words"]))

    batches = []
    batch_words, batch_chars, batch_tags, batch_ids = [], [], [], []
    prev_seq_len = len(dataset[0]["words"])

    for record in dataset:
        seq_len = len(record["words"])
        if len(batch_words) == batch_size or prev_seq_len != seq_len:
            batches.append(self.make_each_batch_for_targets(batch_words, batch_chars, batch_ids,
                                                            max_span_len, max_n_spans, batch_tags))
            batch_words, batch_chars, batch_tags, batch_ids = [], [], [], []
            prev_seq_len = seq_len
        batch_words.append(record["words"])
        batch_chars.append(record["chars"])
        batch_tags.append(record["tags"])
        batch_ids.append(record["instance_id"])

    if len(batch_words) > 0:
        batches.append(self.make_each_batch_for_targets(batch_words, batch_chars, batch_ids,
                                                        max_span_len, max_n_spans, batch_tags))
    if shuffle:
        random.shuffle(batches)
    for batch in batches:
        yield batch
def get_sample_data(size=-1):
    qa_gt_s = common.load_json(config.FEVER_DATA_ROOT / "qa_aug" / "squad_train_turker_groundtruth.json")
    # print(len(qa_gt_s))
    qa_aug_rnei = common.load_json(
        config.FEVER_DATA_ROOT / "qa_aug" / "squad_train_refutes_bytype_3x_claim_stoch_answspan_stoch.json")
    # print(len(qa_aug_rnei))
    random.shuffle(qa_aug_rnei)

    for item in qa_aug_rnei:
        sv = random.random()
        if sv > 0.5:
            item['label'] = "REFUTES"
        else:
            item['label'] = "NOT ENOUGH INFO"

    balanced_aug_data = qa_gt_s + qa_aug_rnei[:len(qa_gt_s) * 2]
    print("Total balanced size:", len(balanced_aug_data))
    random.shuffle(balanced_aug_data)

    if size != -1:
        return balanced_aug_data[:size]
    else:
        return balanced_aug_data
def load_dataset(self, filename, keep_number=False, lowercase=True):
    dataset = []
    for record in load_json(filename):
        words = [word_convert(word, keep_number=keep_number, lowercase=lowercase)
                 for word in record["words"]]
        dataset.append({"sent_id": record["sent_id"],
                        "words": words,
                        "tags": record["spans"]})
    return dataset
def get_train_sentence_pair(top_k, is_training, debug=False, cur_train_eval_results_list=None):
    train_list = common.load_json(config.TRAIN_FILE)

    if cur_train_eval_results_list is None:
        cur_train_eval_results_list = common.load_jsonl(
            config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
                              "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")

    if debug:
        train_list = train_list[:100]
        id_set = set([item['_id'] for item in train_list])
        cur_train_eval_results_list = [item for item in cur_train_eval_results_list if item['qid'] in id_set]

    return get_sentence_pair(top_k, train_list, cur_train_eval_results_list, is_training)
def full_wiki_baseline_upperbound():
    dev_fullwiki = common.load_json(config.DEV_FULLWIKI_FILE)
    # dev_fullwiki = common.load_json(config.DEV_DISTRACTOR_FILE)

    upperbound_pred_file = dict()
    upperbound_pred_file['sp'] = dict()
    upperbound_pred_file['sp_doc'] = dict()
    upperbound_pred_file['p_answer'] = dict()
    # print(dev_fullwiki)

    for item in dev_fullwiki:
        qid = item['_id']
        answer = item['answer']
        contexts = item['context']
        supporting_facts = item['supporting_facts']
        # supporting_doc = set([fact[0] for fact in item['supporting_facts']])
        # retrieved_doc_dict = set([context[0] for context in contexts])

        retrieved_doc_dict = dict()
        for doc_title, context_sents in contexts:
            if doc_title not in retrieved_doc_dict:
                retrieved_doc_dict[doc_title] = dict()
            for i, sent in enumerate(context_sents):
                retrieved_doc_dict[doc_title][i] = sent

        upperbound_pred_doc = []
        upperbound_pred_sp = []
        found_answer = False

        for sp_doc, sp_fact_line_num in supporting_facts:
            if sp_doc in retrieved_doc_dict and sp_fact_line_num in retrieved_doc_dict[sp_doc]:
                upperbound_pred_doc.append(sp_doc)
                upperbound_pred_sp.append([sp_doc, sp_fact_line_num])
                if answer in retrieved_doc_dict[sp_doc][sp_fact_line_num]:
                    found_answer = True

        p_answer = answer if found_answer else ""
        upperbound_pred_file['sp'][qid] = upperbound_pred_sp
        upperbound_pred_file['sp_doc'][qid] = upperbound_pred_doc
        upperbound_pred_file['p_answer'][qid] = p_answer

        if all([gt_fact in upperbound_pred_sp for gt_fact in supporting_facts]):
            # If we find all the evidence, add the additional yes/no answer.
            upperbound_pred_file['p_answer'][qid] = answer

    ext_hotpot_eval.eval(upperbound_pred_file, dev_fullwiki)
def term_based_doc_retri(hotpot_set):
    fullwiki_list = common.load_json(hotpot_set)
    print("{} questions".format(len(fullwiki_list)))

    retri_list = []
    for item in tqdm(fullwiki_list):
        saved_tfidf_item = dict()
        question = item['question']
        qid = item['_id']

        doc_list = lucene_retri_doc(question, top_k=50)
        saved_tfidf_item['question'] = question
        saved_tfidf_item['qid'] = qid
        saved_tfidf_item['doc_list'] = doc_list

        retri_list.append(saved_tfidf_item)

    return retri_list
def eval_hotpot_s():
    cur_dev_eval_results_list_out = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpot_p_level_effects/hotpot_s_level_dev_results_top_k_doc_100.jsonl")
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list_out, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # 0.5
    cur_results_dict_v05 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                            score_field_name='prob',
                                                            filter_value=0.5,
                                                            result_field='sp')

    # cur_results_dict_v02 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
    #                                                         score_field_name='prob',
    #                                                         filter_value=0.2,
    #                                                         result_field='sp')

    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list, verbose=False)
    # _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False)

    logging_item = {
        # 'v02': metrics_v2,
        'v05': metrics_v5,
    }

    print(logging_item)

    f1 = metrics_v5['sp_f1']
    em = metrics_v5['sp_em']
    pr = metrics_v5['sp_prec']
    rec = metrics_v5['sp_recall']
    print(em, pr, rec, f1)
def inspect_upstream_eval():
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/dev_s_level_bert_v1_results.jsonl")
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(dev_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # 0.5
    # cur_results_dict_v05 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
    #                                                         score_field_name='prob',
    #                                                         filter_value=0.5,
    #                                                         result_field='sp')

    cur_results_dict_v02 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
                                                            score_field_name='prob',
                                                            filter_value=0.2,
                                                            result_field='sp')

    # _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list, verbose=False)
    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False)

    v02_sp_f1 = metrics_v2['sp_f1']
    v02_sp_recall = metrics_v2['sp_recall']
    v02_sp_prec = metrics_v2['sp_prec']

    # The v05 metrics are only available when the 0.5-threshold branch above is re-enabled.
    # v05_sp_f1 = metrics_v5['sp_f1']
    # v05_sp_recall = metrics_v5['sp_recall']
    # v05_sp_prec = metrics_v5['sp_prec']

    logging_item = {
        'label': 'ema',
        'v02': metrics_v2,
        # 'v05': metrics_v5,
    }

    print(logging_item)
def load_and_eval():
    top_k = 50
    value_thrsehold = None
    tf_idf_dev_results = common.load_jsonl(
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")

    doc_pred_dict = {'sp_doc': dict()}

    for item in tqdm(tf_idf_dev_results):
        sorted_scored_list = sorted(item['doc_list'], key=lambda x: x[0], reverse=True)
        pred_list = [docid for _, docid in sorted_scored_list[:top_k]]
        # print(sorted_scored_list)
        qid = item['qid']
        doc_pred_dict['sp_doc'][qid] = pred_list
        # break

    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    ext_hotpot_eval.eval(doc_pred_dict, dev_fullwiki_list)
def eval_p_level():
    cur_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl")

    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # Top_5
    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
    print(metrics_top5)
def train_knn_epoch(self, batches, name):
    loss_total = 0.
    num_batches = 0
    start_time = time.time()

    train_sents = load_json(self.cfg["train_set"])
    if self.cfg["knn_sampling"] == "random":
        train_sent_ids = [sent_id for sent_id in range(len(train_sents))]
    else:
        train_sent_ids = None

    for batch in batches:
        num_batches += 1
        if num_batches % 100 == 0:
            print("%d" % num_batches, flush=True, end=" ")

        # Setup a batch
        batch = self._add_neighbor_instances_to_batch(batch, train_sents, train_sent_ids, is_train=True)
        # Convert a batch to the input format
        feed_dict = self._get_feed_dict(batch, is_train=True,
                                        keep_prob=self.cfg["keep_prob"],
                                        lr=self.cfg["lr"])
        # Train a model
        _, train_loss = self.sess.run([self.train_op, self.loss], feed_dict)

        if math.isnan(train_loss):
            self.logger.info("\n\n\nNAN: Index: %d\n" % num_batches)
            exit()

        loss_total += train_loss

    avg_loss = loss_total / num_batches
    self.logger.info("-- Time: %f seconds" % (time.time() - start_time))
    self.logger.info("-- Averaged loss: %f(%f/%d)" % (avg_loss, loss_total, num_batches))
    return avg_loss, loss_total
def _initialize_config(self):
    # create folders and logger
    os.makedirs(self.cfg["checkpoint_path"], exist_ok=True)
    os.makedirs(os.path.join(self.cfg["summary_path"]), exist_ok=True)
    self.logger = get_logger(os.path.join(self.cfg["checkpoint_path"], "log.txt"))

    # load dictionary
    dict_data = load_json(self.cfg["vocab"])
    self.word_dict = dict_data["word_dict"]
    self.char_dict = dict_data["char_dict"]
    self.tag_dict = dict_data["tag_dict"]
    del dict_data

    self.word_vocab_size = len(self.word_dict)
    self.char_vocab_size = len(self.char_dict)
    self.tag_vocab_size = len(self.tag_dict)

    self.rev_word_dict = dict([(idx, word) for word, idx in self.word_dict.items()])
    self.rev_char_dict = dict([(idx, char) for char, idx in self.char_dict.items()])
    self.rev_tag_dict = dict([(idx, tag) for tag, idx in self.tag_dict.items()])
def evaluate_knn_epoch(self, batches, name):
    correct = 0
    p_total = 0
    num_batches = 0
    start_time = time.time()

    train_sents = load_json(self.cfg["train_set"])
    if self.cfg["knn_sampling"] == "random":
        train_sent_ids = [sent_id for sent_id in range(len(train_sents))]
    else:
        train_sent_ids = None

    for batch in batches:
        num_batches += 1
        if num_batches % 100 == 0:
            print("%d" % num_batches, flush=True, end=" ")

        # Setup a batch
        batch = self._add_neighbor_instances_to_batch(batch, train_sents, train_sent_ids, is_train=False)
        # Convert a batch to the input format
        feed_dict = self._get_feed_dict(batch)
        # Classify spans
        predicted_tags = self.sess.run([self.predicts], feed_dict)[0]

        crr_i, p_total_i = count_gold_and_system_outputs(batch["tags"], predicted_tags, NULL_LABEL_ID)
        correct += crr_i
        p_total += p_total_i

    p, r, f = f_score(correct, p_total, self.n_gold_spans)
    self.logger.info("-- Time: %f seconds" % (time.time() - start_time))
    self.logger.info(
        "-- {} set\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})".format(
            name, f, p, correct, p_total, r, correct, self.n_gold_spans))
    return f, p, r, correct, p_total, self.n_gold_spans
def model_perf(dataset_name, task_name, data_file, model_prediction_file):
    d_list = common.load_jsonl(data_file)
    collected_data_dict = list_dict_data_tool.list_to_dict(d_list, key_fields='uid')
    model_prediction_dict = common.load_json(model_prediction_file)
    results_dict, all_correct_set = calculate_divergence_bwt_model_human_simplify(collected_data_dict,
                                                                                  model_prediction_dict,
                                                                                  task_name)
    print('-' * 60)
    print('Data:', dataset_name)
    print("All Correct Count:", len(all_correct_set))
    print('\t'.join(['{:20s}'.format('Model Name'),
                     '{:10s}'.format('JSD'),
                     '{:10s}'.format('KL'),
                     '{:10s}'.format('Old Acc.'),
                     '{:10s}'.format('New Acc.')]))
    for model_name, model_item in results_dict.items():
        print('\t'.join(['{:20s}'.format(model_name),
                         '{:10s}'.format(format_number(model_item['average JS div'])),
                         '{:10s}'.format(format_number(model_item['average KL div'])),
                         '{:10s}'.format(format_number(model_item['o_acc'])),
                         '{:10s}'.format(format_number(model_item['m_acc'])),
                         ]))
    print('-' * 60)
def inspect_sampler_squad_examples():
    bert_model_name = "bert-base-uncased"
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    do_lower_case = True
    max_pre_context_length = 315
    max_query_length = 64
    doc_stride = 128
    debug = True

    tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)
    squad_train_v2 = common.load_json(config.SQUAD_TRAIN_2_0)

    train_eitem_list = preprocessing_squad(squad_train_v2)
    train_fitem_dict, train_fitem_list = eitems_to_fitems(train_eitem_list, tokenizer, is_training=False,
                                                          max_tokens_for_doc=max_pre_context_length,
                                                          doc_stride=doc_stride,
                                                          debug=debug)
    print(len(train_fitem_list))
def batchnize_dataset(self, data, batch_size=None, shuffle=True):
    max_span_len = self.config["max_span_len"]
    max_n_spans = None
    dataset = load_json(data)
    if shuffle:
        random.shuffle(dataset)

    batch_words, batch_chars, batch_tags = [], [], []

    for record in dataset:
        if len(batch_words) == batch_size:
            yield self.make_each_batch(batch_words, batch_chars, max_span_len, max_n_spans, batch_tags)
            batch_words, batch_chars, batch_tags = [], [], []
        batch_words.append(record["words"])
        batch_chars.append(record["chars"])
        batch_tags.append(record["tags"])

    if len(batch_words) > 0:
        yield self.make_each_batch(batch_words, batch_chars, max_span_len, max_n_spans, batch_tags)
def eval_model_for_downstream(model_saved_path):
    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    # lazy = False
    lazy = True
    forward_size = 32
    # batch_size = 64
    batch_size = 128
    do_lower_case = True

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")
    test_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_test_p_level_unlabeled.jsonl")

    if debug_mode:
        dev_list = dev_list[:10]
        dev_fitems_list = dev_fitems_list[:296]
        train_fitems_list = train_fitems_list[:300]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id')

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=286)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)
    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # dev_instances = bert_cs_reader.read(dev_fitems_list)
    train_instance = bert_cs_reader.read(train_fitems_list)
    test_instances = bert_cs_reader.read(test_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    # train_iter = biterator(train_instance, num_epochs=1, shuffle=False)
    # dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
    test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

    print(len(dev_fitems_list))
    print(len(test_fitems_list))
    print(len(train_fitems_list))

    # cur_dev_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, show_progress=True)
    # cur_train_eval_results_list = eval_model(model, train_iter, device_num, with_probs=True, show_progress=True)
    cur_test_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, show_progress=True)

    common.save_jsonl(cur_test_eval_results_list, "test_p_level_bert_v1_results.jsonl")
    print("Test write finished.")
    exit(0)

    # NOTE: the code below is unreachable after the exit(0) above; it is kept for dev/train evaluation
    # and requires the commented-out dev/train eval calls above to be re-enabled.
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # Top_3
    cur_results_dict_top3 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=3)
    upperbound_results_dict_top3 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top3, dev_list)

    # Top_5
    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top5, dev_list)

    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top10, dev_list)

    _, metrics_top3 = ext_hotpot_eval.eval(cur_results_dict_top3, dev_list, verbose=False)
    _, metrics_top3_UB = ext_hotpot_eval.eval(upperbound_results_dict_top3, dev_list, verbose=False)

    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)

    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)

    logging_item = {
        'top3': metrics_top3,
        'top3_UB': metrics_top3_UB,
        'top5': metrics_top5,
        'top5_UB': metrics_top5_UB,
        'top10': metrics_top10,
        'top10_UB': metrics_top10_UB,
    }

    print(logging_item)

    common.save_jsonl(cur_train_eval_results_list, "train_p_level_bert_v1_results.jsonl")
    common.save_jsonl(cur_dev_eval_results_list, "dev_p_level_bert_v1_results.jsonl")
def model_go():
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    experiment_name = 'hotpot_v0_cs'
    lazy = False
    # lazy = True
    forward_size = 16
    # batch_size = 64
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 5000
    pos_ratio = 0.2
    do_lower_case = True

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")

    if debug_mode:
        dev_list = dev_list[:10]
        dev_fitems_list = dev_fitems_list[:296]
        train_fitems_list = train_fitems_list[:300]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(sampled_train_list)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=286)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    # # # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)

        sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        train_instance = bert_cs_reader.read(sampled_train_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True)
                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list,
                                                                          copied_dev_o_dict,
                                                                          'qid', 'fid', check=True)
                    # Top_5
                    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
                    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
                        cur_results_dict_top5, dev_list)

                    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
                    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
                        cur_results_dict_top10, dev_list)

                    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
                    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)

                    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
                    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)

                    # top5_doc_f1, top5_UB_sp_f1, top10_doc_f1, top10_Ub_sp_f1
                    # top5_doc_f1 = metrics_top5['doc_f1']
                    # top5_UB_sp_f1 = metrics_top5_UB['sp_f1']
                    # top10_doc_f1 = metrics_top10['doc_f1']
                    # top10_Ub_sp_f1 = metrics_top10_UB['sp_f1']

                    top5_doc_recall = metrics_top5['doc_recall']
                    top5_UB_sp_recall = metrics_top5_UB['sp_recall']
                    top10_doc_recall = metrics_top10['doc_recall']
                    top10_Ub_sp_recall = metrics_top10_UB['sp_recall']

                    logging_item = {
                        'top5': metrics_top5,
                        'top5_UB': metrics_top5_UB,
                        'top10': metrics_top10,
                        'top10_UB': metrics_top10_UB,
                    }

                    # print(logging_item)
                    save_file_name = f'i({update_step})|e({epoch_i})' \
                                     f'|t5_doc_recall({top5_doc_recall})|t5_sp_recall({top5_UB_sp_recall})' \
                                     f'|t10_doc_recall({top10_doc_recall})|t5_sp_recall({top10_Ub_sp_recall})|seed({seed})'

                    # print(save_file_name)
                    logging_agent.incorporate_results({}, save_file_name, logging_item)
                    logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = Path(file_path_prefix) / save_file_name
                    torch.save(model_to_save.state_dict(), str(output_model_file))
def model_go():
    for some_params in [0]:
        # bert_model_name = 'bert-large-uncased'
        seed = 6
        bert_model_name = 'bert-base-uncased'
        lazy = False
        forward_size = 16
        batch_size = 32
        gradient_accumulate_step = int(batch_size / forward_size)
        warmup_proportion = 0.1
        learning_rate = 5e-5
        num_train_epochs = 4
        do_ema = False
        dev_prob_threshold = 0.1
        train_prob_threshold = 0.35
        debug_mode = False

        experiment_name = f"fever_nli_bert_maxout_l4_on_fulldata"
        # experiment_name = f"bert_fever_nli_baseline_on_fulldata_aug_the_same_gt_mrate({some_params})"
        # experiment_name = f"bert_fever_nli_baseline_on_10p_aug_the_same_gt_mrate({some_params})"

        # data_aug = True
        data_aug = False
        data_aug_file = config.FEVER_DATA_ROOT / "qa_aug/squad_train_turker_groundtruth.json"
        # data_aug_size = int(21_015 * some_params)  # 10p
        data_aug_size = int(208_346 * some_params)

        # training_file = config.FEVER_DATA_ROOT / "fever_1.0/train_10.jsonl"
        training_file = config.FEVER_DATA_ROOT / "fever_1.0/train.jsonl"

        train_sample_top_k = 8

        # est_datasize = 208_346  # full
        # est_datasize = 14_544
        # est_datasize = 21_015 + data_aug_size  # 10p
        est_datasize = 208_346 + data_aug_size

        num_class = 3
        # num_train_optimization_steps

        torch.manual_seed(seed)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()

        unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
        vocab = ExVocabulary(unk_token_num=unk_token_num)

        vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
        vocab.add_token_to_namespace('REFUTES', namespace='labels')
        vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
        vocab.add_token_to_namespace("hidden", namespace="labels")
        vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')
        # Finished build vocabulary.

        # Load standardized sentence file
        dev_upstream_sent_list = common.load_jsonl(
            config.FEVER_DATA_ROOT / "upstream_sentence_selection_Feb16/dev_sent_pred_scores.jsonl")

        dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique(
            config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl",
            dev_upstream_sent_list,
            prob_threshold=dev_prob_threshold, top_n=5)

        dev_data_list = fever_nli_sampler.select_sent_with_prob_for_eval(
            config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl",
            dev_sent_after_threshold_filter,
            None, tokenized=True)

        # print(dev_data_list[0])
        # exit(0)

        train_upstream_sent_list = common.load_jsonl(
            config.FEVER_DATA_ROOT / "upstream_sentence_selection_Feb16/train_sent_scores.jsonl")
        # Finished loading standardized sentence file.

        bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
        bert_fever_reader = BertReaderFeverNLI(bert_tokenizer, lazy=lazy)

        dev_instances = bert_fever_reader.read(dev_data_list)

        biterator = BasicIterator(batch_size=forward_size)
        biterator.index_with(vocab)

        # print(list(mnli_dev_instances))

        # Load training model
        # model_clf = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_class)
        bert_encoder = BertModel.from_pretrained(bert_model_name)
        model_clf = BertPairMaxOutMatcher(bert_encoder, num_of_class=3)

        ema_tracker = None
        ema_model_copy = None
        if do_ema and ema_tracker is None:
            ema_tracker = EMA(model_clf.named_parameters(), on_cpu=True)
            ema_model_copy = copy.deepcopy(model_clf)

        model_clf.to(device)

        param_optimizer = list(model_clf.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                       num_train_epochs
        print(num_train_optimization_steps)

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=num_train_optimization_steps)

        # optimizer = optim.Adam(optimizer_grouped_parameters, lr=learning_rate)

        # # # Create Log File
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

        model_clf.train()

        if n_gpu > 1:
            model_clf = nn.DataParallel(model_clf)

        forbackward_step = 0
        update_step = 0
        eval_iter_num = 2_000  # Change this to real evaluation.
        best_fever_score = -1

        for n_epoch in range(num_train_epochs):
            print("Resampling...")
            train_sent_after_threshold_filter = \
                fever_ss_sampler.threshold_sampler_insure_unique(training_file,
                                                                 train_upstream_sent_list,
                                                                 train_prob_threshold,
                                                                 top_n=train_sample_top_k)
            #
            train_data_list = fever_nli_sampler.adv_simi_sample_with_prob_v1_1(
                training_file,
                train_sent_after_threshold_filter,
                None,
                tokenized=True)

            aug_d_list = []
            if data_aug:
                aug_d_list = common.load_json(data_aug_file)
                random.shuffle(aug_d_list)
                aug_d_list = aug_d_list[:data_aug_size]

            train_data_list = train_data_list + aug_d_list
            random.shuffle(train_data_list)
            print("Sample data length:", len(train_data_list))
            sampled_train_instances = bert_fever_reader.read(train_data_list)
            #
            train_iter = biterator(sampled_train_instances, shuffle=True, num_epochs=1)

            for i, batch in enumerate(tqdm(train_iter)):
                paired_sequence = batch['paired_sequence']
                paired_segments_ids = batch['paired_segments_ids']
                labels_ids = batch['label']
                att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
                s1_span = batch['bert_s1_span']
                s2_span = batch['bert_s2_span']

                paired_sequence = paired_sequence.to(device)
                paired_segments_ids = paired_segments_ids.to(device)
                labels_ids = labels_ids.to(device)
                att_mask = att_mask.to(device)
                s1_span = s1_span.to(device)
                s2_span = s2_span.to(device)

                loss = model_clf(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                                 s1_span=s1_span, s2_span=s2_span,
                                 mode=BertPairMaxOutMatcher.ForwardMode.TRAIN,
                                 labels=labels_ids)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.

                if gradient_accumulate_step > 1:
                    loss = loss / gradient_accumulate_step

                loss.backward()
                forbackward_step += 1

                if forbackward_step % gradient_accumulate_step == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    update_step += 1

                    if do_ema and ema_tracker is not None:
                        # if model_clf is DataParallel, then we use model_clf.module
                        model_to_track = model_clf.module if hasattr(model_clf, 'module') else model_clf
                        ema_tracker(model_to_track.named_parameters())  # Whenever we do an update, also do the EMA update.

                    if update_step % eval_iter_num == 0:
                        print("Update steps:", update_step)
                        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                        if do_ema and ema_model_copy is not None and ema_tracker is not None:
                            print("EMA evaluation.")
                            EMA.load_ema_to_model(ema_model_copy, ema_tracker)
                            ema_model_copy.to(device)
                            if n_gpu > 1:
                                ema_model_copy = nn.DataParallel(ema_model_copy)
                            dev_data_list = hidden_eval(ema_model_copy, dev_iter, dev_data_list, device)
                        else:
                            dev_data_list = hidden_eval(model_clf, dev_iter, dev_data_list, device)

                        eval_mode = {'check_sent_id_correct': True, 'standard': True}
                        fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(
                            dev_data_list,
                            common.load_jsonl(config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl"),
                            mode=eval_mode,
                            verbose=False)
                        print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score, label_score, pr, rec, f1)

                        print(f"Dev:{fever_score}/{label_score}")

                        if best_fever_score < fever_score:
                            print("New Best FScore")
                            best_fever_score = fever_score

                            save_path = os.path.join(
                                file_path_prefix,
                                f'i({update_step})_epoch({n_epoch})_dev({fever_score})_lacc({label_score})_seed({seed})')
                            model_to_save = model_clf.module if hasattr(model_clf, 'module') else model_clf
                            output_model_file = os.path.join(file_path_prefix, save_path)
                            torch.save(model_to_save.state_dict(), output_model_file)

            # End-of-epoch evaluation (same procedure as the in-loop evaluation above).
            print("Update steps:", update_step)
            dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

            if do_ema and ema_model_copy is not None and ema_tracker is not None:
                print("EMA evaluation.")
                EMA.load_ema_to_model(ema_model_copy, ema_tracker)
                ema_model_copy.to(device)
                if n_gpu > 1:
                    ema_model_copy = nn.DataParallel(ema_model_copy)
                dev_data_list = hidden_eval(ema_model_copy, dev_iter, dev_data_list, device)
            else:
                dev_data_list = hidden_eval(model_clf, dev_iter, dev_data_list, device)

            eval_mode = {'check_sent_id_correct': True, 'standard': True}
            fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(
                dev_data_list,
                common.load_jsonl(config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl"),
                mode=eval_mode,
                verbose=False)
            print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score, label_score, pr, rec, f1)

            print(f"Dev:{fever_score}/{label_score}")

            if best_fever_score < fever_score:
                print("New Best FScore")
                best_fever_score = fever_score

                save_path = os.path.join(
                    file_path_prefix,
                    f'i({update_step})_epoch({n_epoch})_dev({fever_score})_lacc({label_score})_seed({seed})')
                model_to_save = model_clf.module if hasattr(model_clf, 'module') else model_clf
                output_model_file = os.path.join(file_path_prefix, save_path)
                torch.save(model_to_save.state_dict(), output_model_file)
        if p_item['label'] == 'false' and not p_item['in_sp_doc']:
            p_v = np.random.rand()
            if p_v < selection_prob:
                r_list.append(p_item)
        elif p_item['label'] == 'false' and p_item['in_sp_doc']:
            p_v = np.random.rand()
            if p_v < same_doc_prob:
                r_list.append(p_item)
        else:
            r_list.append(p_item)

    return r_list


if __name__ == '__main__':
    train_list = common.load_json(config.TRAIN_FILE)
    # train_list = common.load_json(config.DEV_FULLWIKI_FILE)
    # train_list = common.load_json(config.DEV_DISTRACTOR_FILE)
    # print(len(train_list))
    train_sent_data_list = build_sent_match_data_from_distractor_list(train_list, is_training=True)
    print(len(train_sent_data_list))
    train_sent_data_list = downsample_negative_examples(train_sent_data_list, 0.1, 1)
    print(len(train_sent_data_list))

    neg = 0
    pos = 0
    in_sp_doc = 0
    for p_item in train_sent_data_list:
        if p_item['label'] == 'true':
            pos += 1
def doc_retrie_v5_reimpl_tf_idf_upperbound():
    top_k = 10
    dev_fullwiki = common.load_json(config.DEV_FULLWIKI_FILE)
    pred_dev = common.load_json(
        # config.RESULT_PATH / "doc_retri_results/doc_raw_matching_with_disamb_with_hyperlinked_v5_file.json")
        # config.RESULT_PATH / "doc_retri_results/doc_raw_matching_file.json")
        config.RESULT_PATH / "doc_retri_results/doc_retrieval_debug_v6/doc_raw_matching_with_disamb_withiout_hyperlinked_v6_file_debug_4.json")
        # config.RESULT_PATH / "doc_retri_results/doc_raw_matching_with_disamb_withiout_hyperlinked_v5_file.json")

    tf_idf_dev_results = common.load_jsonl(
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")

    tf_idf_scored_dict = dict()
    for item in tf_idf_dev_results:
        sorted_scored_list = sorted(item['doc_list'], key=lambda x: x[0], reverse=True)
        pred_list = [docid for _, docid in sorted_scored_list[:top_k]]
        qid = item['qid']
        tf_idf_scored_dict[qid] = pred_list

    pred_v5_sp_doc = pred_dev['sp_doc']

    # dev_fullwiki = common.load_json(config.DEV_DISTRACTOR_FILE)
    upperbound_pred_file = dict()
    upperbound_pred_file['sp'] = dict()
    upperbound_pred_file['sp_doc'] = dict()
    upperbound_pred_file['p_answer'] = dict()
    # print(dev_fullwiki)

    for item in dev_fullwiki:
        qid = item['_id']
        answer = item['answer']
        contexts = item['context']
        supporting_facts = item['supporting_facts']
        tf_idf_docs = tf_idf_scored_dict[qid]
        v5_retrieved_doc = pred_v5_sp_doc[qid]
        # print(v5_retrieved_doc)

        supporting_doc = set([fact[0] for fact in item['supporting_facts']])
        # retrieved_doc_dict = set([context[0] for context in contexts])

        retrieved_doc_dict = dict()
        for doc_title, context_sents in contexts:
            if doc_title not in retrieved_doc_dict:
                retrieved_doc_dict[doc_title] = dict()
            for i, sent in enumerate(context_sents):
                retrieved_doc_dict[doc_title][i] = sent

        upperbound_pred_doc = []
        upperbound_pred_sp = []
        found_answer = False

        for sp_doc in tf_idf_docs:
            if sp_doc in supporting_doc:
                upperbound_pred_doc.append(sp_doc)
                for gt_sp_doc, sp_fact_line_num in supporting_facts:
                    if gt_sp_doc == sp_doc:
                        upperbound_pred_sp.append([sp_doc, sp_fact_line_num])
                        # if answer in retrieved_doc_dict[sp_doc][sp_fact_line_num]:
                        found_answer = True

        for sp_doc in v5_retrieved_doc:
            if sp_doc not in upperbound_pred_doc:
                if sp_doc in supporting_doc:
                    upperbound_pred_doc.append(sp_doc)
                    for gt_sp_doc, sp_fact_line_num in supporting_facts:
                        if gt_sp_doc == sp_doc:
                            upperbound_pred_sp.append([sp_doc, sp_fact_line_num])
                            # if answer in retrieved_doc_dict[sp_doc][sp_fact_line_num]:
                            found_answer = True

        # upperbound_pred_sp.append([sp_doc, sp_fact_line_num])
        # if answer in retrieved_doc_dict[sp_doc][sp_fact_line_num]:
        #     found_answer = True

        p_answer = answer if found_answer else ""
        upperbound_pred_file['sp'][qid] = upperbound_pred_sp
        upperbound_pred_file['sp_doc'][qid] = upperbound_pred_doc
        upperbound_pred_file['p_answer'][qid] = p_answer

        if all([gt_fact in upperbound_pred_sp for gt_fact in supporting_facts]):
            # If we find all the evidence, add the additional yes/no answer.
            upperbound_pred_file['p_answer'][qid] = answer

    ext_hotpot_eval.eval(upperbound_pred_file, dev_fullwiki)