def full_eval_model(model, data_iter, criterion, dev_data_list):
    """Evaluate `model` on `data_iter`, write predicted labels back into
    `dev_data_list` in place, and report the FEVER score.

    :param model: classifier called as ``model(batch)``; expected to return
        per-class logits (3 classes, see `id2label` below).
    :param data_iter: iterable of batches; each batch is a mapping holding at
        least 'label' (tensor) and 'pid' (sequence of example ids).
    :param criterion: loss function applied as ``criterion(out, y)``.
    :param dev_data_list: list of FEVER items aligned 1:1, in order, with the
        examples produced by `data_iter`; mutated in place
        ('predicted_label' is set on every item).
    :return: ``(strict_score, avg_loss)`` tuple.
    """
    # Label-index convention of the classifier head:
    # SUPPORTS <-> 0, REFUTES <-> 1, NOT ENOUGH INFO <-> 2
    id2label = {0: "SUPPORTS", 1: "REFUTES", 2: "NOT ENOUGH INFO"}

    print("Evaluating ...")
    model.eval()

    n_correct = loss = 0
    total_size = 0  # fixed: was misspelled `totoal_size`

    y_pred_list = []
    y_true_list = []
    y_id_list = []

    with torch.no_grad():  # Important fixing: no autograd bookkeeping during eval.
        for batch in data_iter:  # batch index was unused; enumerate dropped
            out = model(batch)
            y = batch['label']

            y_id_list.extend(list(batch['pid']))

            # Compute argmax once (the original evaluated torch.max twice).
            pred = torch.max(out, 1)[1].view(y.size())
            n_correct += (pred == y).sum().item()
            y_pred_list.extend(pred.tolist())
            y_true_list.extend(y.tolist())

            # Accumulate the *sum* of per-example losses so the average below
            # is exact even when the final batch is smaller.
            loss += criterion(out, y).item() * y.size(0)
            total_size += y.size(0)

    # Sanity checks: predictions must align 1:1 with the eval items.
    assert len(y_id_list) == len(dev_data_list)
    assert len(y_pred_list) == len(dev_data_list)
    assert len(y_true_list) == len(dev_data_list)

    for i in range(len(dev_data_list)):
        assert str(y_id_list[i]) == str(dev_data_list[i]['id'])  # Matching id
        dev_data_list[i]['predicted_label'] = id2label[y_pred_list[i]]

        # Reset to NEI when upstream retrieval produced no evidence sentences.
        if len(dev_data_list[i]['predicted_sentids']) == 0:
            dev_data_list[i]['predicted_label'] = "NOT ENOUGH INFO"
        # dev_data_list[i]['predicted_evidence'] = convert_evidence2scoring_format(dev_data_list[i]['predicted_sentids'])

    print('n_correct:', n_correct)
    print('total_size:', total_size)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
        dev_data_list, dev_data_list, mode=eval_mode, verbose=False)
    print("Fever Score(Strict/Acc./Precision/Recall/F1):",
          strict_score, acc_score, pr, rec, f1)

    # Unused `avg_acc` computation removed.
    avg_loss = loss / total_size
    return strict_score, avg_loss
def used_func_for_fast_key_word_matching_expanded_kw(): """ Added on July 1. :return: """ # Load tokenizer path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*') drqa_yixin.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09) tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner']) # keyword_processor = KeywordProcessor(case_sensitive=True) id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl") id_dict_key_word_expand(id_to_key_dict, create_new_key_word_dict=False) # exit(-2) # Write this in a for loop to keep track of the progress build_flashtext_processor_wit(keyword_processor, id_to_key_dict) # Load data for predicting d_list = load_data(config.FEVER_DEV_JSONL) sample_answer(d_list, tok, keyword_p=keyword_processor) # save the the results for evaluating out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl" save_intermidiate_results(d_list, out_filename=out_fname) # Evaluating # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:20:54_r/dev.jsonl' # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:08:06_r/dev.jsonl' # d_list = load_data(out_fname) eval_mode = {'check_doc_id_correct': True, 'standard': False} # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log")) print(fever_score(d_list, d_list, mode=eval_mode, verbose=False))
def conduct_search(r_list, eval_list):
    """Exhaustively search ensembles of NLI result lists.

    Tries every combination of 1..4 members of `r_list`, merges their NLI
    predictions, scores the merge against `eval_list` with the standard FEVER
    scorer, and prints a banner whenever the best accuracy or the best strict
    accuracy improves.

    :param r_list: sequence of ``(index, nli_result_list)`` pairs.
    :param eval_list: gold list handed to ``c_scorer.fever_score``.
    :return: None; progress is reported via print.
    """
    best_strict_score = -1
    best_acc_score = -1
    # Fixed: an unused `max = len(r_list)` assignment shadowing the builtin
    # `max` was removed.

    for count in range(1, 5):  # ensemble sizes 1..4
        for combo_list in itertools.combinations(r_list, count):
            test_list = [x[1] for x in combo_list]
            test_ind = [x[0] for x in combo_list]

            nli_results = merge_nli_results(test_list)
            eval_mode = {'standard': True}
            delete_unused_evidence(nli_results)
            strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
                nli_results, eval_list, mode=eval_mode, verbose=False)

            if best_acc_score < acc_score:
                best_acc_score = acc_score
                print('-' * 50)
                print("Best Acc:", best_acc_score)
                print("Best Acc Ind:", test_ind)
                print('-' * 50)

            if best_strict_score < strict_score:
                best_strict_score = strict_score
                print('-' * 50)
                print("Best sAcc:", strict_score)
                print("Best sAcc Ind:", test_ind)
                print('-' * 50)
def used_func_for_fast_key_word_matching():
    """
    Document retrieval on the FEVER dev set via fast (flashtext) keyword
    matching over the raw id->keywords dictionary, then score the result.
    :return: None; the doc-retrieval FEVER score is printed.
    """
    # Load tokenizer (CoreNLP via the drqa wrapper).
    path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])
    keyword_processor = KeywordProcessor(case_sensitive=True)
    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")

    # Write this in a for loop (rather than add_keywords_from_dict) to keep
    # track of the progress with tqdm.
    for clean_name, keywords in tqdm(id_to_key_dict.items()):
        if not isinstance(keywords, list):
            raise AttributeError("Value of key {} should be a list".format(clean_name))

        for keyword in keywords:
            # Matching `keyword` in a claim yields the document id `clean_name`.
            keyword_processor.add_keyword(keyword, clean_name)

    # Load data for predicting
    d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer(d_list, tok, keyword_p=keyword_processor)

    # Save the results for evaluating (timestamped output directory).
    out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating: reload the just-saved file and score it.
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_06_29_17:41:14_r/dev.jsonl'
    d_list = load_data(out_fname)
    # Doc-level scoring only; skip the strict (standard) FEVER check.
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode))
def spectrum_eval_manual_check():
    """Sweep the sentence-retrieval probability threshold and score a saved
    ESIM+ELMo verification model on the FEVER dev set at each threshold.

    For each scaling probability the upstream sentence scores are re-filtered,
    the dev set is rebuilt and re-scored; results are printed per threshold.
    """
    batch_size = 64
    lazy = True
    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-17-12:10:35_mesim_elmo/i(34800)_epoch(5)_dev(0.5563056305630563)_loss(1.6648460462434564)_seed(12)"
    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_15:52:19_r/dev_sent.jsonl"
    IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16:34:19_r/dev_sent.jsonl"
    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16-34-19_r/dev_sent.jsonl"

    # Upstream neural sentence-retrieval scores for the dev set.
    dev_sent_result_lsit = common.load_jsonl(IN_FILE)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # Presumably reserves 'hidden' as a placeholder label at index -2 —
    # confirm against the vocab implementation.
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0
    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    # Threshold spectrum to sweep.
    for sc_prob in [0.5, 0.7, 0.8, 0.9, 0.95, 0.98]:
        # Keep sentences whose upstream score clears `sc_prob`.
        upstream_dev_list = score_converter_scaled(config.T_FEVER_DEV_JSONL,
                                                   dev_sent_result_lsit,
                                                   scale_prob=sc_prob,
                                                   delete_prob=False)

        dev_fever_data_reader = BasicReader(token_indexers=token_indexers, lazy=lazy)
        complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, upstream_dev_list)
        dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)
        eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
        builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

        print("------------------------------------")
        print("Scaling_prob:", sc_prob)
        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        print(c_scorer.fever_score(builded_dev_data, config.T_FEVER_DEV_JSONL, mode=eval_mode))
        # del upstream_dev_list
        # del complete_upstream_dev_data
        # Free the per-threshold reader/instances before the next iteration.
        del dev_fever_data_reader
        del dev_instances
        print("------------------------------------")
def used_func_for_fast_key_word_matching_prioritized_kw_resample():
    """
    Re-sample (re-rank) previously saved document-retrieval results by
    keyword priority and print the FEVER doc-retrieval score.
    (Originally added on July 1; the full keyword-matching pipeline below was
    later commented out in favour of reloading saved results.)
    :return: None; the evaluation result is printed.
    """
    # Load tokenizer
    # path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    # drqa_yixin.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
    # tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    # doc_tokens, doc_lemmas = parse_doc_id('Hourglass_-LRB-James_Taylor_album-RRB-', tok)
    # print(doc_tokens)
    # print(doc_lemmas)
    # print(get_words_inside_parenthese(doc_tokens))
    # print(get_words_inside_parenthese(doc_lemmas))
    # claim_t = ['album']
    # claim_l = ['album']
    # print(check_inside_paretheses_overlap(doc_tokens, doc_lemmas, claim_t, claim_l))
    # exit(-1)
    #
    # keyword_processor = KeywordProcessor(case_sensitive=True)
    #
    # id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl", filtering=True)
    #
    # exact_match_rule_dict = set_priority(id_to_key_dict, priority=5.0)
    # print(len(exact_match_rule_dict))
    #
    # noisy_key_dict = id_dict_key_word_expand(id_to_key_dict, create_new_key_word_dict=True)
    # noisy_parenthese_rule_dict = set_priority(noisy_key_dict, priority=1.0)
    # print("Noisy_Parenthese_Rule_Dict:", len(noisy_parenthese_rule_dict))
    # exit(-2)

    # Write this in a for loop to keep track of the progress
    # build_flashtext_processor_with_prioritized_kw_dict(keyword_processor, exact_match_rule_dict)
    # build_flashtext_processor_with_prioritized_kw_dict(keyword_processor, noisy_parenthese_rule_dict)

    # Load data for predicting
    # d_list = load_data(config.FEVER_TRAIN_JSONL)
    # d_list = load_data(config.FEVER_DEV_JSONL)
    # sample_answer_with_priority(d_list, tok, keyword_processor, top_k=5)

    # Save the results for evaluating
    # out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "train.jsonl"
    # out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    out_fname = '/Users/Eason/RA/FunEver/results/sent_retri/2018_07_05_17:17:50_r/dev.jsonl'
    d_list = load_data(out_fname)
    # Re-rank the saved candidates, keeping the top 5 per claim.
    resample_answer_with_priority(d_list, top_k=5)
    # save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:20:54_r/dev.jsonl'
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:08:06_r/dev.jsonl'
    # d_list = load_data(out_fname)
    # Doc-level scoring only; skip the strict (standard) FEVER check.
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode, verbose=False))
def hidden_eval_fever():
    """Score a saved ESIM+WordNet+ELMo verification model on the FEVER dev
    set, using the 2018_07_05 sentence-retrieval upstream file.

    NOTE(review): another function with this same name appears later in this
    source; if both live in one module the later definition shadows this one
    — confirm which is intended.
    """
    batch_size = 64
    lazy = True
    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-18-21:07:28_m_esim_wn_elmo_sample_fixed/i(57000)_epoch(8)_dev(0.5755075507550755)_loss(1.7175163737963839)_seed(12)"
    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # WordNet persistence dict feeds the WN features of the reader.
    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=360)
    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    # dev_biterator = BasicIterator(batch_size=batch_size * 2)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # Presumably reserves 'hidden' as a placeholder label at index -2 —
    # confirm against the vocab implementation.
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0
    # Input size includes ELMo (1024) + GloVe (300) + WN feature width.
    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size, 1024 + 300),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)
    print("Model Max length:", model.max_l)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    # Drop gold labels before scoring so only predictions are used.
    for item in builded_dev_data:
        del item['label']

    print(c_scorer.fever_score(builded_dev_data, common.load_jsonl(config.T_FEVER_DEV_JSONL), mode=eval_mode))
def error_analysis(out_fname):
    """Score a saved document-retrieval result file and write a per-claim
    error-analysis log next to it.

    :param out_fname: path to a saved retrieval jsonl file; the log is
        written to ``<parent dir>/analysis.log``.
    :return: None; the score tuple is printed.
    """
    retrieved = load_data(out_fname)
    # Doc-level scoring only; the strict (standard) FEVER check is skipped.
    scoring_mode = {'check_doc_id_correct': True, 'standard': False}
    analysis_log = Path(out_fname).parent / "analysis.log"
    score = fever_score(retrieved,
                        retrieved,
                        mode=scoring_mode,
                        verbose=True,
                        error_analysis_file=analysis_log)
    print(score)
def prepare_data_only_page_view(tokenized_file, eval_file, doc_retrieval_output_file):
    """Run first-stage document retrieval using only the page-view method.

    Retrieves up to 100 candidate documents per claim, re-sorts the items,
    reports doc-retrieval scores at several evidence cut-offs, and saves the
    result.

    :param tokenized_file: tokenized claims input file.
    :param eval_file: gold jsonl file used for scoring.
    :param doc_retrieval_output_file: destination jsonl for the output.
    :return: None; scores are printed, results are saved to disk.
    """
    doc_retrieval_method = 'pageview'
    print("Method:", doc_retrieval_method)

    retri_object = HAONAN_DOCRETRI_OBJECT()
    retrieved_list = first_doc_retrieval(retri_object,
                                         tokenized_file,
                                         method=doc_retrieval_method,
                                         top_k=100)
    gold_list = common.load_jsonl(eval_file)
    disamb.item_resorting(retrieved_list)

    print("Evaluating 1st Doc Retrieval")
    # Doc-level scoring only; the strict (standard) FEVER check is skipped.
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    print(c_scorer.fever_score(retrieved_list, gold_list, mode=eval_mode, verbose=False))
    print("Max_doc_num_5:", c_scorer.fever_doc_only(retrieved_list, gold_list, max_evidence=5))
    print("Max_doc_num_10:", c_scorer.fever_doc_only(retrieved_list, gold_list, max_evidence=10))
    print("Max_doc_num_15:", c_scorer.fever_doc_only(retrieved_list, gold_list, max_evidence=15))
    print("Max_doc_num_20:", c_scorer.fever_doc_only(retrieved_list, gold_list, max_evidence=20))

    # First document retrieval ends here.
    common.save_jsonl(retrieved_list, doc_retrieval_output_file)
def hidden_eval_fever():
    """Score a saved ESIM+ELMo verification model (BasicReader variant, no
    WordNet features) on the FEVER dev set.

    NOTE(review): a function with this same name appears earlier in this
    source; if both live in one module this later definition shadows the
    earlier one — confirm which is intended.
    """
    batch_size = 64
    lazy = True
    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-08-19:04:33_mesim_elmo/i(39700)_epoch(6)_dev(0.5251525152515252)_loss(1.5931938096682707)_seed(12)"
    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = BasicReader(token_indexers=token_indexers, lazy=lazy)
    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # Presumably reserves 'hidden' as a placeholder label at index -2 —
    # confirm against the vocab implementation.
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0
    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    print(c_scorer.fever_score(builded_dev_data, config.T_FEVER_DEV_JSONL, mode=eval_mode))
def if_idf_select_sentence():
    """Select evidence sentences per claim with an online TF-IDF ranker.

    Loads saved document-retrieval results, gathers every sentence of the
    retrieved documents, ranks them against the claim with a bigram TF-IDF
    ranker, keeps the top-5 sentence ids per claim, scores the result, and
    saves it to a timestamped result directory.
    :return: None; the sentence-retrieval FEVER score is printed.
    """
    db_cursor = fever_db.get_cursor()
    loaded_path = "/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/dev.jsonl"
    d_list = load_data(loaded_path)
    # d_list = load_data("/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/train.jsonl")

    # Ranker configuration is loop-invariant; build it once (the original
    # re-created the namedtuple class and instance for every item).
    Args = namedtuple('Args', 'ngram hash_size num_workers')
    args = Args(2, int(8192), 4)

    for item in tqdm(d_list):
        p_docids = item['predicted_docids']
        cleaned_claim = ' '.join(easy_tokenize(item['claim']))
        # print(cleaned_claim)

        # Collect every sentence (and its id) from the retrieved documents.
        current_sent_list = []
        current_id_list = []
        for doc_id in p_docids:
            r_list, id_list = fever_db.get_all_sent_by_doc_id(db_cursor, doc_id)
            current_sent_list.extend(r_list)
            current_id_list.extend(id_list)

        # TF-IDF over just this claim's candidate sentences.
        ranker = OnlineTfidfDocRanker(args, args.hash_size, args.ngram, current_sent_list)
        selected_index, selected_score = ranker.closest_docs(cleaned_claim, k=5)

        selected_sent_id = []
        for ind in selected_index:
            current_selected = current_id_list[ind]  # fixed `curent_selected` typo
            # Sentence ids are stored as "<doc_id>(-.-)<line_number>".
            doc_id, ln = current_selected.split('(-.-)')
            # ln = int(ln)
            # selected_sent_id.append([doc_id, ln])
            selected_sent_id.append(doc_id + c_scorer.SENT_LINE + ln)
        item['predicted_sentids'] = selected_sent_id

    # Sentence-level scoring only; skip the strict (standard) FEVER check.
    eval_mode = {'check_sent_id_correct': True, 'standard': False}
    print(c_scorer.fever_score(d_list, d_list, mode=eval_mode, verbose=False))

    out_fname = config.RESULT_PATH / "sent_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname, last_loaded_path=loaded_path)
def hidden_eval_fever_adv_v1():
    """Score a saved adversarially-sampled ESIM+WordNet model on the FEVER
    dev set, save the NLI pipeline results, and print the FEVER score.
    """
    batch_size = 64
    lazy = True
    # Upstream sentence scores below this probability are filtered out.
    dev_prob_threshold = 0.5
    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-20-22:28:24_mesim_wn_450_adv_sample_v1_|t_prob:0.35|top_k:8/i(46000)_epoch(7)_dev(0.6405140514051405)_loss(1.0761665150348825)_seed(12)"

    dev_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH / "sent_retri_nn/2018_07_20_15:17:59_r/dev_sent.jsonl")

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # WordNet persistence dict feeds the WN features of the reader.
    p_dict = wn_persistent_api.persistence_load()

    upstream_dev_list = score_converter_scaled(config.T_FEVER_DEV_JSONL,
                                               dev_upstream_sent_list,
                                               scale_prob=dev_prob_threshold,
                                               delete_prob=False)

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=360)
    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, upstream_dev_list)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # Presumably reserves 'hidden' as a placeholder label at index -2 —
    # confirm against the vocab implementation.
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0
    # Input size includes ELMo (1024) + GloVe (300) + WN feature width.
    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size, 1024 + 450),
                  rnn_size_out=(450, 450),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  mlp_d=900,
                  embedding_dim=300, max_l=300)
    print("Model Max length:", model.max_l)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    # Persist the pipeline predictions, then strip gold labels before scoring.
    common.save_jsonl(builded_dev_data, config.RESULT_PATH / "nli_results" / "pipeline_results_1.jsonl")
    c_scorer.delete_label(builded_dev_data)
    print(c_scorer.fever_score(builded_dev_data, common.load_jsonl(config.FEVER_DEV_JSONL), mode=eval_mode))
def analysis_model(model_path):
    """Run the full dev-set analysis pipeline for a joint vc/ss (verification
    + sentence-selection) model loaded from EMA weights.

    Scores sentences ('ss' task), samples/filters them for verification,
    runs verification ('vc' task), saves every intermediate artifact as
    jsonl in the working directory, and prints the final FEVER score.

    :param model_path: path to saved EMA weights loaded via load_ema_to_model.
    """
    batch_size = 32
    lazy = True
    # NOTE(review): the four constants below (train_* / neg_sample_upper_prob
    # / decay_r) are assigned but never used in this function.
    train_prob_threshold = 0.02
    train_sample_top_k = 8
    dev_prob_threshold = 0.1
    dev_sample_top_k = 5
    neg_sample_upper_prob = 0.006
    decay_r = 0.002
    top_k_doc = 5

    dev_doc_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl"

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL,
                                               dev_doc_upstream_file,
                                               pred=True,
                                               top_k=top_k_doc)
    print("Dev size:", len(complete_upstream_dev_data))

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # Data Reader
    dev_fever_data_reader = VCSS_Reader(token_indexers=token_indexers, lazy=lazy, max_l=260)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # 'true'/'false' are the selection labels; 'hidden' is mapped to index -2
    # (presumably a non-predicted placeholder — confirm with vocab code).
    vocab.add_token_to_namespace('true', namespace='labels')
    vocab.add_token_to_namespace('false', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    # Reader and prepare end

    # vc_ss_training_sampler = VCSSTrainingSampler(complete_upstream_train_data)
    # vc_ss_training_sampler.show_info()

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0
    model = Model(rnn_size_in=(1024 + 300 + 1, 1024 + 450 + 1),
                  rnn_size_out=(450, 450),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  mlp_d=900,
                  embedding_dim=300, max_l=300)
    print("Model Max length:", model.max_l)
    model.display()
    model.to(device)

    # Load the EMA weights into a separate copy of the model.
    cloned_empty_model = copy.deepcopy(model)
    load_ema_to_model(cloned_empty_model, model_path)

    # --- Sentence selection ('ss') pass over the dev set ---
    vc_ss.data_wrangler.assign_task_label(complete_upstream_dev_data, 'ss')
    dev_ss_instance = dev_fever_data_reader.read(complete_upstream_dev_data)
    eval_ss_iter = biterator(dev_ss_instance, num_epochs=1, shuffle=False)
    scored_dev_sent_data = hidden_eval_ss(cloned_empty_model, eval_ss_iter, complete_upstream_dev_data)

    common.save_jsonl(scored_dev_sent_data, "dev_scored_sent_data.jsonl")

    # --- Verification ('vc') pass on the sampled sentences ---
    # for vc
    filtered_dev_list = vc_ss.data_wrangler.sample_sentences_for_vc_with_nei(
        config.T_FEVER_DEV_JSONL, scored_dev_sent_data, dev_prob_threshold, dev_sample_top_k)

    common.save_jsonl(filtered_dev_list, "dev_scored_sent_data_after_sample.jsonl")

    dev_selection_dict = paired_selection_score_dict(scored_dev_sent_data)
    ready_dev_list = select_sent_with_prob_for_eval(config.T_FEVER_DEV_JSONL,
                                                    filtered_dev_list,
                                                    dev_selection_dict,
                                                    tokenized=True)

    vc_ss.data_wrangler.assign_task_label(ready_dev_list, 'vc')
    dev_vc_instance = dev_fever_data_reader.read(ready_dev_list)
    eval_vc_iter = biterator(dev_vc_instance, num_epochs=1, shuffle=False)
    eval_dev_result_list = hidden_eval_vc(cloned_empty_model, eval_vc_iter, ready_dev_list)

    common.save_jsonl(eval_dev_result_list, "dev_nli_results.jsonl")

    # Scoring
    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
        eval_dev_result_list,
        common.load_jsonl(config.T_FEVER_DEV_JSONL),
        mode=eval_mode,
        verbose=False)
    print("Fever Score(Strict/Acc./Precision/Recall/F1):", strict_score, acc_score, pr, rec, f1)
    print(f"Dev:{strict_score}/{acc_score}")
def train_fever_std_ema_v1(resume_model=None, do_analysis=False):
    """
    Jointly train verification (vc) and sentence selection (ss) in one model.
    Created on 26 Nov 2018 08:50.

    Training alternates per epoch between re-sampling ss/vc training data
    (driven by the EMA copy of the model) and optimizing a mixed ss+vc loss.
    Checkpoints store the EMA weights, named with the dev scores.

    :param resume_model: optional path to a state_dict to resume from.
    :param do_analysis: when True, save intermediate sampling/eval artifacts
        into an "analysis_aux" directory under the run folder.
    :return: None; checkpoints and logs are written to disk.
    """
    num_epoch = 200
    seed = 12  # only used in checkpoint file names below
    batch_size = 32
    lazy = True
    train_prob_threshold = 0.02
    train_sample_top_k = 8
    dev_prob_threshold = 0.1
    dev_sample_top_k = 5
    top_k_doc = 5

    # Per-epoch upper probability for negative ss sampling (default 0.1).
    schedule_sample_dict = defaultdict(lambda: 0.1)
    ratio_ss_for_vc = 0.2

    schedule_sample_dict.update({
        0: 0.1, 1: 0.1,  # 200k + 400K
        2: 0.1, 3: 0.1,  # 200k + 200k ~ 200k + 100k
        4: 0.1, 5: 0.1,  # 200k + 100k
        6: 0.1  # 20k + 20k
    })

    # Eval at beginning of the training.
    eval_full_epoch = 1
    eval_nei_epoches = [2, 3, 4, 5, 6, 7]

    neg_only = False
    debug = False

    experiment_name = f"vc_ss_v17_ratio_ss_for_vc:{ratio_ss_for_vc}|t_prob:{train_prob_threshold}|top_k:{train_sample_top_k}_scheduled_neg_sampler"
    # resume_model = None

    print("Do EMA:")
    print("Dev prob threshold:", dev_prob_threshold)
    print("Train prob threshold:", train_prob_threshold)
    print("Train sample top k:", train_sample_top_k)

    # Get upstream sentence document retrieval data
    dev_doc_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl"
    train_doc_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/train_doc.jsonl"

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL,
                                               dev_doc_upstream_file,
                                               pred=True,
                                               top_k=top_k_doc)
    complete_upstream_train_data = get_full_list(config.T_FEVER_TRAIN_JSONL,
                                                 train_doc_upstream_file,
                                                 pred=False,
                                                 top_k=top_k_doc)

    if debug:
        complete_upstream_dev_data = complete_upstream_dev_data[:1000]
        complete_upstream_train_data = complete_upstream_train_data[:1000]

    print("Dev size:", len(complete_upstream_dev_data))
    print("Train size:", len(complete_upstream_train_data))

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # Data Reader
    dev_fever_data_reader = VCSS_Reader(token_indexers=token_indexers, lazy=lazy, max_l=260)
    train_fever_data_reader = VCSS_Reader(token_indexers=token_indexers, lazy=lazy, max_l=260)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # 'true'/'false' are the selection labels; 'hidden' is mapped to index -2
    # (presumably a non-predicted placeholder — confirm with vocab code).
    vocab.add_token_to_namespace('true', namespace='labels')
    vocab.add_token_to_namespace('false', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    # Reader and prepare end

    vc_ss_training_sampler = VCSSTrainingSampler(complete_upstream_train_data)
    vc_ss_training_sampler.show_info()

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0
    model = Model(rnn_size_in=(1024 + 300 + 1, 1024 + 450 + 1),
                  rnn_size_out=(450, 450),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  mlp_d=900,
                  embedding_dim=300, max_l=300,
                  num_of_class=4)
    print("Model Max length:", model.max_l)
    if resume_model is not None:
        model.load_state_dict(torch.load(resume_model))
    model.display()
    model.to(device)

    # A separate copy that receives the EMA weights for evaluation.
    cloned_empty_model = copy.deepcopy(model)
    ema: EMA = EMA(parameters=model.named_parameters())

    # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code of this script into the run folder.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()

    analysis_dir = None
    if do_analysis:
        analysis_dir = Path(file_path_prefix) / "analysis_aux"
        analysis_dir.mkdir()
    # Save source code end.

    # Starting parameter setup
    best_dev = -1
    iteration = 0

    start_lr = 0.0001
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=start_lr)
    criterion = nn.CrossEntropyLoss()
    # parameter setup end

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # Re-sample candidate vc training data from the ss results; this is
        # redone after each epoch.
        if i_epoch == eval_full_epoch:  # only eval at 1
            print("We now need to eval the whole training set.")
            print("Be patient and hope good luck!")
            load_ema_to_model(cloned_empty_model, ema)
            eval_sent_for_sampler(cloned_empty_model, token_indexers, vocab, vc_ss_training_sampler)

        elif i_epoch in eval_nei_epoches:  # at 2, 3, 4 eval for NEI
            print("We now need to eval the NEI training set.")
            print("Be patient and hope good luck!")
            load_ema_to_model(cloned_empty_model, ema)
            eval_sent_for_sampler(cloned_empty_model, token_indexers, vocab, vc_ss_training_sampler,
                                  nei_only=True)

        train_data_with_candidate_sample_list = vc_ss.data_wrangler.sample_sentences_for_vc_with_nei(
            config.T_FEVER_TRAIN_JSONL, vc_ss_training_sampler.sent_list,
            train_prob_threshold, train_sample_top_k)
        # We initialize the prob for each sentence so the sampler can work,
        # but we will need to run the model on dev data for it to be useful.

        train_selection_dict = paired_selection_score_dict(vc_ss_training_sampler.sent_list)

        cur_train_vc_data = adv_simi_sample_with_prob_v1_1(config.T_FEVER_TRAIN_JSONL,
                                                           train_data_with_candidate_sample_list,
                                                           train_selection_dict,
                                                           tokenized=True)

        if do_analysis:
            # Customized analysis output
            common.save_jsonl(
                vc_ss_training_sampler.sent_list,
                analysis_dir / f"E_{i_epoch}_whole_train_sent_{save_tool.get_cur_time_str()}.jsonl")
            common.save_jsonl(
                train_data_with_candidate_sample_list,
                analysis_dir / f"E_{i_epoch}_sampled_train_sent_{save_tool.get_cur_time_str()}.jsonl")
            common.save_jsonl(
                cur_train_vc_data,
                analysis_dir / f"E_{i_epoch}_train_vc_data_{save_tool.get_cur_time_str()}.jsonl")

        print(f"E{i_epoch} VC_data:", len(cur_train_vc_data))

        # Sample negative candidate data for ss according to the schedule.
        neg_sample_upper_prob = schedule_sample_dict[i_epoch]
        print("Neg Sampler upper rate:", neg_sample_upper_prob)
        # print("Rate decreasing")
        # neg_sample_upper_prob -= decay_r
        neg_sample_upper_prob = max(0.000, neg_sample_upper_prob)

        cur_train_ss_data = vc_ss_training_sampler.sample_for_ss(neg_only=neg_only,
                                                                 upper_prob=neg_sample_upper_prob)

        # From epoch 1 on, balance positive vs. negative selection examples.
        # (The original inline comment said "epoch num >= 6" — the code uses 1.)
        if i_epoch >= 1:
            # new_ss_data = []
            pos_ss_data = []
            neg_ss_data = []
            for item in cur_train_ss_data:
                if item['selection_label'] == 'true':
                    pos_ss_data.append(item)
                elif item['selection_label'] == 'false':
                    neg_ss_data.append(item)

            ss_sample_size = min(len(pos_ss_data), len(neg_ss_data))
            random.shuffle(pos_ss_data)
            random.shuffle(neg_ss_data)
            # Keep pos:neg roughly 1:2 (half of the balanced size vs. full).
            cur_train_ss_data = pos_ss_data[:int(ss_sample_size * 0.5)] + neg_ss_data[:ss_sample_size]

        random.shuffle(cur_train_ss_data)
        vc_ss_training_sampler.show_info(cur_train_ss_data)
        print(f"E{i_epoch} SS_data:", len(cur_train_ss_data))

        # Tag each item with its task so the mixed loss can dispatch.
        vc_ss.data_wrangler.assign_task_label(cur_train_ss_data, 'ss')
        vc_ss.data_wrangler.assign_task_label(cur_train_vc_data, 'vc')

        vs_ss_train_list = cur_train_ss_data + cur_train_vc_data
        random.shuffle(vs_ss_train_list)
        print(f"E{i_epoch} Total ss+vc:", len(vs_ss_train_list))

        vc_ss_instance = train_fever_data_reader.read(vs_ss_train_list)
        train_iter = biterator(vc_ss_instance, shuffle=True, num_epochs=1)

        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)

            if i_epoch >= 1:
                ratio_ss_for_vc = 0.8

            loss = compute_mixing_loss(model, out, batch, criterion, vc_ss_training_sampler,
                                       ss_for_vc_prob=ratio_ss_for_vc)  # Important change

            # No decay
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iteration += 1

            # EMA update after every optimizer step.
            ema(model.named_parameters())

            # Evaluate less often early on, more often later.
            if i_epoch < 9:
                mod = 10000
                # mod = 100
            else:
                mod = 2000

            if iteration % mod == 0:
                # This is the code for eval:
                load_ema_to_model(cloned_empty_model, ema)

                # ss pass over the dev set.
                vc_ss.data_wrangler.assign_task_label(complete_upstream_dev_data, 'ss')
                dev_ss_instance = dev_fever_data_reader.read(complete_upstream_dev_data)
                eval_ss_iter = biterator(dev_ss_instance, num_epochs=1, shuffle=False)
                scored_dev_sent_data = hidden_eval_ss(cloned_empty_model, eval_ss_iter,
                                                      complete_upstream_dev_data)

                # for vc
                filtered_dev_list = vc_ss.data_wrangler.sample_sentences_for_vc_with_nei(
                    config.T_FEVER_DEV_JSONL, scored_dev_sent_data,
                    dev_prob_threshold, dev_sample_top_k)

                dev_selection_dict = paired_selection_score_dict(scored_dev_sent_data)
                ready_dev_list = select_sent_with_prob_for_eval(config.T_FEVER_DEV_JSONL,
                                                                filtered_dev_list,
                                                                dev_selection_dict,
                                                                tokenized=True)

                vc_ss.data_wrangler.assign_task_label(ready_dev_list, 'vc')
                dev_vc_instance = dev_fever_data_reader.read(ready_dev_list)
                eval_vc_iter = biterator(dev_vc_instance, num_epochs=1, shuffle=False)
                eval_dev_result_list = hidden_eval_vc(cloned_empty_model, eval_vc_iter, ready_dev_list)

                # Scoring
                eval_mode = {'check_sent_id_correct': True, 'standard': True}
                strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
                    eval_dev_result_list,
                    common.load_jsonl(config.T_FEVER_DEV_JSONL),
                    mode=eval_mode,
                    verbose=False)
                print("Fever Score(Strict/Acc./Precision/Recall/F1):",
                      strict_score, acc_score, pr, rec, f1)
                print(f"Dev:{strict_score}/{acc_score}")

                if do_analysis:
                    # Customized analysis output
                    common.save_jsonl(
                        scored_dev_sent_data,
                        analysis_dir / f"E_{i_epoch}_scored_dev_sent_{save_tool.get_cur_time_str()}.jsonl")
                    common.save_jsonl(
                        eval_dev_result_list,
                        analysis_dir / f"E_{i_epoch}_eval_vc_output_data_{save_tool.get_cur_time_str()}.jsonl")

                need_save = False
                if strict_score > best_dev:
                    best_dev = strict_score
                    need_save = True

                # Always save during the first 7 epochs, later only on improvement.
                if need_save or i_epoch < 7:
                    # save_path = os.path.join(
                    #     file_path_prefix,
                    #     f'i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_seed({seed})'
                    # )
                    # torch.save(model.state_dict(), save_path)

                    # Only EMA weights are checkpointed.
                    ema_save_path = os.path.join(
                        file_path_prefix,
                        f'ema_i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_p({pr})_r({rec})_f1({f1})_seed({seed})'
                    )
                    save_ema_to_file(ema, ema_save_path)
def eval_and_save_v2(model_path, is_ema, saving_dir, save_train_data=True, prob_thresholds=0.5):
    """Load a trained sentence-selection model, score dev (and optionally train)
    sentence pairs, save the scored pairs, and report FEVER metrics at one or
    more probability thresholds.

    :param model_path: path to saved weights (raw ``state_dict`` or EMA file).
    :param is_ema: if True, load weights via ``load_ema_to_model`` instead of
        ``model.load_state_dict``.
    :param saving_dir: directory where the scored jsonl files are written.
    :param save_train_data: also score/save the training-side sentence pairs.
    :param prob_thresholds: a single threshold or a list of thresholds to sweep
        when converting sentence scores into retrieval results.
    """
    # This method was modified on 21 NOV 2018
    # for evaluating balanced trained selection model with different threshold value.
    # It will then be used for later verification.
    # Evaluate and Save all the sentence pairs results to be used for downstream verificaion
    # 03 Oct 2018 03:56:40.
    seed = 12
    batch_size = 128
    lazy = True
    torch.manual_seed(seed)
    keep_neg_sample_prob = 1  # no resampling during eval; kept for the record
    top_k_doc = 5
    # sample_prob_decay = 0.05

    dev_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl"
    train_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/train_doc.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    train_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=180)
    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=180)

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL, dev_upstream_file,
                                               pred=True, top_k=top_k_doc)
    complete_upstream_train_data = get_full_list(config.T_FEVER_TRAIN_JSONL, train_upstream_file,
                                                 pred=False, top_k=top_k_doc)
    print("Dev size:", len(complete_upstream_dev_data))
    print("Train size:", len(complete_upstream_train_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)
    train_instances = train_fever_data_reader.read(complete_upstream_train_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # THis is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    vocab.get_index_to_token_vocabulary('selection_labels')
    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=160, num_of_class=2)

    if not is_ema:
        model.load_state_dict(torch.load(model_path))
    else:
        load_ema_to_model(model, model_path)

    model.display()
    model.to(device)

    dev_actual_list = common.load_jsonl(config.T_FEVER_DEV_JSONL)
    train_actual_list = common.load_jsonl(config.T_FEVER_TRAIN_JSONL)

    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1)
    train_iter = biterator(train_instances, shuffle=False, num_epochs=1)

    complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)
    if save_train_data:
        complete_upstream_train_data = hidden_eval(model, train_iter, complete_upstream_train_data)
        common.save_jsonl(complete_upstream_train_data,
                          Path(str(saving_dir)) / "train_sent_scores.jsonl")
    # Dev-side scores are always written out.
    common.save_jsonl(complete_upstream_dev_data,
                      Path(str(saving_dir)) / "dev_sent_pred_scores.jsonl")

    # Normalize the threshold argument so a scalar behaves like a 1-element sweep.
    if not isinstance(prob_thresholds, list):
        prob_thresholds = [prob_thresholds]

    for scal_prob in prob_thresholds:
        print("Eval Dev Data prob_threshold:", scal_prob)
        dev_results_list = score_converter_v1(config.T_FEVER_DEV_JSONL,
                                              complete_upstream_dev_data,
                                              sent_retri_top_k=5,
                                              sent_retri_scal_prob=scal_prob)
        # This is only a wrapper for the simi_sampler
        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        # Copy gold labels in as predictions so the scorer measures retrieval only.
        for a, b in zip(dev_actual_list, dev_results_list):
            b['predicted_label'] = a['label']
        strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
            dev_results_list, dev_actual_list, mode=eval_mode, verbose=False)
        tracking_score = strict_score
        print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/")
        print("Strict score:", strict_score)
        print(f"Eval Tracking score:", f"{tracking_score}")

    if save_train_data:
        print("Build Train Data")
        # FIX: the original referenced an undefined name ``prob_threshold`` here
        # (guaranteed NameError).  Use the first requested threshold for the
        # training-side conversion, mirroring the dev sweep above.
        train_results_list = score_converter_v1(config.T_FEVER_TRAIN_JSONL,
                                                complete_upstream_train_data,
                                                sent_retri_top_k=5,
                                                sent_retri_scal_prob=prob_thresholds[0])
        # This is only a wrapper for the simi_sampler
        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        for a, b in zip(train_actual_list, train_results_list):
            b['predicted_label'] = a['label']
        strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
            train_results_list, train_actual_list, mode=eval_mode, verbose=False)
        tracking_score = strict_score
        print(f"Train(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/")
        print("Strict score:", strict_score)
        print(f"Eval Tracking score:", f"{tracking_score}")
def full_eval_model_hesm(hesm_model, model, dataloader, criterion, dev_data_list):
    """Evaluate a HESM verification model on the dev set.

    Writes ``predicted_label`` (and, when the model emits per-sentence outputs,
    ``multi_predicted_label``) into each item of ``dev_data_list`` in place,
    forcing NOT ENOUGH INFO when no evidence was retrieved, then scores with
    the FEVER scorer.

    :param hesm_model: wrapper whose ``step(batch)`` returns
        ``(loss, logits, per_sentence_logits_or_None)``.
    :param model: the underlying nn.Module (only ``eval()`` is called on it).
    :param criterion: unused; kept for signature compatibility with
        ``full_eval_model``.
    :param dev_data_list: dev items aligned 1:1 with the dataloader's examples.
    :return: ``(strict_score, avg_loss)`` — FEVER strict score and the
        per-example average loss.
    """
    # Label index convention: SUPPORTS -> 0, REFUTES -> 1, NOT ENOUGH INFO -> 2
    id2label = {
        0: "SUPPORTS",
        1: "REFUTES",
        2: "NOT ENOUGH INFO"
    }

    print("Evaluating ...")
    model.eval()
    n_correct = 0
    total_size = 0
    loss = 0

    y_pred_list = []
    y_pred_mult_list = []
    y_true_list = []
    y_id_list = []

    with torch.no_grad():  # Important fixing.
        for batch in dataloader:
            curloss, out, multiout = hesm_model.step(batch)
            y = batch['labels'].cuda()
            y_id_list.extend(list(batch['pid']))
            max_index = torch.max(out, 1)[1]
            n_correct += (max_index.view(y.size()) == y).sum().item()
            total_size += y.size(0)
            y_pred_list.extend(max_index.view(y.size()).tolist())
            y_true_list.extend(y.tolist())
            # FIX: weight the per-batch mean loss by the batch size and detach it
            # to a Python float, so that ``loss / total_size`` below is the true
            # per-example average (consistent with ``full_eval_model``).  The
            # original summed per-batch means, which is wrong whenever the last
            # batch is smaller, and accumulated a tensor.
            loss += curloss.mean().item() * y.size(0)
            if multiout is not None:
                # Per-sentence predictions: transpose from [sentence][example]
                # to one tuple of sentence labels per example.
                cur_s_label = []
                for sout in multiout:
                    cur_s_label.append(torch.max(sout, 1)[1].tolist())
                cur_s_label = list(zip(*cur_s_label))
                y_pred_mult_list.extend(cur_s_label)

    assert len(y_id_list) == len(dev_data_list)
    assert len(y_pred_list) == len(dev_data_list)
    assert len(y_true_list) == len(dev_data_list)

    for i in range(len(dev_data_list)):
        assert str(y_id_list[i]) == str(dev_data_list[i]['id'])  # Matching id
        dev_data_list[i]['predicted_label'] = id2label[y_pred_list[i]]
        if len(y_pred_mult_list) > 0:
            dev_data_list[i]['multi_predicted_label'] = [id2label[x] for x in y_pred_mult_list[i]]
        # No retrieved evidence -> the only defensible prediction is NEI.
        if len(dev_data_list[i]['predicted_sentids']) == 0:
            dev_data_list[i]['predicted_label'] = "NOT ENOUGH INFO"

    print('n_correct:', n_correct)
    print('total_size:', total_size)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(dev_data_list, dev_data_list,
                                                                mode=eval_mode, verbose=False)
    print("Fever Score(Strict/Acc./Precision/Recall/F1):", strict_score, acc_score, pr, rec, f1)

    avg_loss = loss / total_size
    return strict_score, avg_loss
def debug_fever():
    """Debug harness for the sentence-selection pipeline: build the dev data,
    vocabulary and model, then run the evaluation/checkpoint path twice on an
    UNTRAINED model (no training loop is executed here).

    Useful for smoke-testing the eval + score-conversion + save machinery.
    """
    num_epoch = 8          # NOTE(review): unused in this debug path — no training loop below
    seed = 12
    batch_size = 128
    experiment_name = "simple_nn"
    lazy = True
    torch.manual_seed(seed)
    keep_neg_sample_prob = 0.6   # NOTE(review): unused here; relevant only in the real training script
    sample_prob_decay = 0.1      # NOTE(review): unused here

    dev_upstream_file = config.RESULT_PATH / "doc_retri/cn_util_Jul17_docretri.singularize/dev.jsonl"
    train_upstream_file = config.RESULT_PATH / "doc_retri/cn_util_Jul17_docretri.singularize/train.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    train_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=300)
    # dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=False)
    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=300)

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL, dev_upstream_file, pred=True)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # THis is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    # Map "hidden" to index -2 so it never collides with the real labels.
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    vocab.get_index_to_token_vocabulary('selection_labels')
    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=280, num_of_class=2)
    model.display()
    model.to(device)

    # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0
    i_epoch = 0

    start_lr = 0.0002
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=start_lr)
    criterion = nn.CrossEntropyLoss()  # NOTE(review): built but never used — no training step here

    # First pass through the checkpoint-on-improvement path.
    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    dev_results_list = score_converter_v0(config.T_FEVER_DEV_JSONL, complete_upstream_dev_data)
    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(dev_results_list, config.T_FEVER_DEV_JSONL,
                                                                mode=eval_mode, verbose=False)
    total = len(dev_results_list)
    # The scorer stores the hit count back into eval_mode under this key.
    hit = eval_mode['check_sent_id_correct_hits']
    tracking_score = hit / total  # sentence-retrieval hit rate drives checkpointing

    print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/")
    print("Strict score:", strict_score)
    print(f"Eval Tracking score:", f"{tracking_score}")

    need_save = False
    if tracking_score > best_dev:
        best_dev = tracking_score
        need_save = True

    if need_save:
        save_path = os.path.join(
            file_path_prefix,
            f'i({iteration})_epoch({i_epoch})_'
            f'(tra_score:{tracking_score}|raw_acc:{acc_score}|pr:{pr}|rec:{rec}|f1:{f1})'
        )
        torch.save(model.state_dict(), save_path)

    # Second pass: the end-of-epoch evaluation path (same machinery, "_epoch" suffix).
    print("Epoch Evaluation...")
    eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    dev_results_list = score_converter_v0(config.T_FEVER_DEV_JSONL, complete_upstream_dev_data)
    eval_mode = {'check_sent_id_correct': True, 'standard': True}
    strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(dev_results_list, config.T_FEVER_DEV_JSONL,
                                                                mode=eval_mode, verbose=False)
    total = len(dev_results_list)
    hit = eval_mode['check_sent_id_correct_hits']
    tracking_score = hit / total

    print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/")
    print("Strict score:", strict_score)
    print(f"Eval Tracking score:", f"{tracking_score}")

    if tracking_score > best_dev:
        best_dev = tracking_score
        save_path = os.path.join(
            file_path_prefix,
            f'i({iteration})_epoch({i_epoch})_'
            f'(tra_score:{tracking_score}|raw_acc:{acc_score}|pr:{pr}|rec:{rec}|f1:{f1})_epoch'
        )
        torch.save(model.state_dict(), save_path)
def utest_score_ground_truth():
    """Sanity check: run the ground-truth unit test over dev data, then score
    the list against itself (should yield near-perfect FEVER numbers)."""
    dev_list = load_data(config.FEVER_DEV_JSONL)
    utest_for_ground_truth(dev_list)
    mode = {'check_sent_id_correct': True, 'standard': True}
    result = c_scorer.fever_score(dev_list, dev_list, mode=mode, verbose=False)
    print(result)
def check_acc(in_path):
    """Load a retrieval result file and print its sentence-id accuracy
    (no standard FEVER scoring)."""
    items = load_data(in_path)
    mode = {'check_sent_id_correct': True, 'standard': False}
    scores = c_scorer.fever_score(items, items, mode=mode, verbose=False)
    print(scores)
def train_fever_hesm(model_name = "albert-base-v2"):
    """Train a HESM (transformer-based) sentence-selection model on FEVER with
    annealed negative sampling, gradient accumulation and a linear-warmup
    schedule; evaluates and checkpoints twice per epoch on improvement.

    :param model_name: HuggingFace model identifier used both for the encoder
        and for the HESMUtil tokenization wrapper.
    """
    seed = 12
    torch.manual_seed(seed)
    num_epoch = 4
    batch_size = 64
    # parameters for annealed sampling
    keep_neg_sample_prob = 1
    sample_prob_decay = 0.015
    min_keep_neg_sample_prob = 0.02
    experiment_name = "simple_nn_startkp_{}_de_{}".format(keep_neg_sample_prob, sample_prob_decay)
    resume_model = None

    dev_upstream_file = config.RESULT_PATH / "pipeline_r_aaai_doc_exec/2019_10_07_10:14:16_r/doc_retr_2_shared_task_dev.jsonl"
    train_upstream_file = config.RESULT_PATH / "pipeline_r_aaai_doc/2019_10_27_16:48:33_r/doc_retr_2_train.jsonl"

    complete_upstream_dev_data = get_hyperlink_evidence_list(config.T_FEVER_DEV_JSONL, dev_upstream_file, pred=True)
    print("Dev size:", len(complete_upstream_dev_data))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels = 2,                 # binary: sentence is / is not evidence
        output_attentions = False,
        output_hidden_states = False,
    )
    if torch.cuda.device_count() > 1:
        print("More than 1 gpu device found...")
        model = nn.DataParallel(model)
    model.to(device)

    start_lr = 2e-5
    optimizer = AdamW(model.parameters(),
                      lr = start_lr,
                      eps = 1e-8
                      )
    if resume_model is not None:
        print("Resume From:", resume_model)
        load_model(resume_model, model, optimizer)

    # Create Log File
    file_path_prefix, _ = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0
    criterion = nn.CrossEntropyLoss()
    hesm_model = HESMUtil(model, model_name=model_name)
    display(model)

    for i_epoch in range(num_epoch):
        print("Get first evidence for training...")
        complete_upstream_train_data = get_hyperlink_evidence_list(config.T_FEVER_TRAIN_JSONL, train_upstream_file, pred=False)
        print("Resampling...")
        print("Sample Prob.:", keep_neg_sample_prob)
        # Drop a fraction of negatives; decay the keep probability each epoch,
        # clamped at min_keep_neg_sample_prob.  Per-epoch seed avoids
        # resampling the same subset.
        filtered_train_data = post_filter_v2(complete_upstream_train_data,
                                             keep_prob=keep_neg_sample_prob, seed=12 + i_epoch)
        keep_neg_sample_prob -= sample_prob_decay
        if keep_neg_sample_prob <= min_keep_neg_sample_prob:
            keep_neg_sample_prob = min_keep_neg_sample_prob
        print("Sampled length:", len(filtered_train_data))

        sent_list, label_list, pid_list = hesm_model.read(filtered_train_data)
        train_dataset = HESMDataset({'text': sent_list, 'labels': label_list, 'pid': pid_list})
        train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

        if i_epoch == 0:
            # Schedule length is fixed from the first epoch's loader size;
            # later epochs may have fewer batches after heavier filtering.
            steps_per_epoch = len(train_dataloader)
            total_steps = steps_per_epoch * num_epoch
            scheduler = get_linear_schedule_with_warmup(optimizer,
                                                        num_warmup_steps = 0,
                                                        num_training_steps = total_steps)

        accumulation_steps = 2  # accumulate gradients for increasing `batch_size` by a factor of `accumulation_steps`
        save_epoch = 0.5        # evaluate and save every `save_epoch` epochs

        optimizer.zero_grad()
        for i, batch in tqdm(enumerate(train_dataloader)):
            model.train()
            # NOTE(review): step() is unpacked into two values here but three
            # in full_eval_model_hesm — confirm HESMUtil.step's train/eval
            # return shapes.  The returned loss is discarded and recomputed
            # from `criterion` below.
            loss, out = hesm_model.step(batch)
            y = batch['labels'].cuda()
            loss = criterion(out, y)
            loss = loss / accumulation_steps  # normalize for accumulation
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            if (i+1) % accumulation_steps == 0:  # Wait for several backward steps
                optimizer.step()                 # Now we can do an optimizer step
                scheduler.step()
                optimizer.zero_grad()
                iteration += 1

                # NOTE(review): mod is a float (steps_per_epoch * 0.5); the
                # `% mod == 0` test relies on it dividing iteration exactly.
                mod = steps_per_epoch * save_epoch
                if iteration % mod == 0:
                    sent_list, label_list, pid_list = hesm_model.read(complete_upstream_dev_data)
                    eval_dataset = HESMDataset({'text': sent_list, 'labels': label_list, 'pid': pid_list})
                    eval_dataloader = DataLoader(
                        eval_dataset,
                        sampler = SequentialSampler(eval_dataset),
                        batch_size = batch_size
                    )
                    complete_upstream_dev_data = hidden_eval_hesm(hesm_model, model, eval_dataloader,
                                                                  complete_upstream_dev_data)
                    dev_results_list = score_converter(config.T_FEVER_DEV_JSONL,
                                                       complete_upstream_dev_data,
                                                       dev_upstream_file)
                    eval_mode = {'check_sent_id_correct': True, 'standard': True}
                    strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
                        dev_results_list,
                        common.load_jsonl(config.T_FEVER_DEV_JSONL),
                        mode=eval_mode, verbose=False)
                    total = len(dev_results_list)
                    # Scorer writes the hit count back into eval_mode.
                    hit = eval_mode['check_sent_id_correct_hits']
                    tracking_score = hit / total  # retrieval hit rate drives checkpointing

                    print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}/")
                    print("Strict score:", strict_score)
                    print(f"Eval Tracking score:", f"{tracking_score}")

                    need_save = False
                    if tracking_score > best_dev:
                        best_dev = tracking_score
                        need_save = True

                    if need_save:
                        save_path = os.path.join(
                            file_path_prefix,
                            f'i({iteration})_epoch({i_epoch})_'
                            f'(tra_score:{tracking_score}|raw_acc:{acc_score}|pr:{pr}|rec:{rec}|f1:{f1})'
                        )
                        save_model(save_path, model, optimizer)
def eval(self, d_list):
    """Score ``d_list`` against itself for document-retrieval accuracy only
    (no standard FEVER label scoring)."""
    mode = {'check_doc_id_correct': True, 'standard': False}
    return fever_score(d_list, d_list, mode=mode, verbose=False)
def train_fever():
    """Train the NSMN sentence-selection model on FEVER with annealed negative
    sampling; periodically evaluate on dev and checkpoint whenever the
    tracking score (sentence-retrieval hit rate) improves.
    """
    num_epoch = 8
    seed = 12
    batch_size = 128
    experiment_name = "simple_nn"
    lazy = True
    torch.manual_seed(seed)
    # Annealed sampling: keep 50% of negatives at first, decay 10% per epoch.
    keep_neg_sample_prob = 0.5
    sample_prob_decay = 0.1

    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"
    train_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/train.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    train_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)
    # dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=False)
    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy)

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL, dev_upstream_file, pred=True)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # THis is important
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    # Map "hidden" to index -2 so it never collides with the real labels.
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value
    vocab.get_index_to_token_vocabulary('selection_labels')
    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300, num_of_class=2)
    model.display()
    model.to(device)

    # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0

    start_lr = 0.0002
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # Resampling
        complete_upstream_train_data = get_full_list(config.T_FEVER_TRAIN_JSONL, train_upstream_file, pred=False)
        filtered_train_data = post_filter(complete_upstream_train_data, keep_prob=keep_neg_sample_prob,
                                          seed=12 + i_epoch)
        # Change the seed to avoid duplicate sample...
        keep_neg_sample_prob -= sample_prob_decay

        print("Sampled_length:", len(filtered_train_data))
        sampled_train_instances = train_fever_data_reader.read(filtered_train_data)

        train_iter = biterator(sampled_train_instances, shuffle=True, num_epochs=1, cuda_device=device_num)
        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)
            y = batch['selection_label']

            loss = criterion(out, y)

            # No decay
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iteration += 1

            # Evaluate more often in later epochs.
            if i_epoch <= 4:
                mod = 25000
            else:
                mod = 10000

            if iteration % mod == 0:
                eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1,
                                          cuda_device=device_num)
                # NOTE(review): three values are unpacked here, but the
                # full_eval_model defined earlier in this file returns only
                # (strict_score, avg_loss) — confirm which overload/version is
                # in scope at this call site.
                dev_score, dev_loss, complete_upstream_dev_data = full_eval_model(
                    model, eval_iter, criterion, complete_upstream_dev_data)

                dev_results_list = score_converter_v0(config.T_FEVER_DEV_JSONL, complete_upstream_dev_data)
                eval_mode = {'check_sent_id_correct': True, 'standard': True}
                strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(dev_results_list,
                                                                            config.T_FEVER_DEV_JSONL,
                                                                            mode=eval_mode,
                                                                            verbose=False)
                total = len(dev_results_list)
                # Scorer writes the hit count back into eval_mode.
                hit = eval_mode['check_sent_id_correct_hits']
                tracking_score = hit / total  # retrieval hit rate drives checkpointing

                print(f"Dev(clf_acc/pr/rec/f1/loss):{dev_score}/{pr}/{rec}/{f1}/{dev_loss}")
                print(f"Tracking score:", f"{tracking_score}")

                need_save = False
                if tracking_score > best_dev:
                    best_dev = tracking_score
                    need_save = True

                if need_save:
                    save_path = os.path.join(
                        file_path_prefix,
                        f'i({iteration})_epoch({i_epoch})_'
                        f'(tra_score:{tracking_score}|clf_acc:{dev_score}|pr:{pr}|rec:{rec}|f1:{f1}|loss:{dev_loss})'
                    )
                    torch.save(model.state_dict(), save_path)
common.save_jsonl( dev_results_list, config.RESULT_PATH / "sent_retri_nn/2018_07_20_15-17-59_r/dev_scale(0.1).jsonl") # for item in dev_results_list: # print(item['scored_sentids']) # common.save_jsonl(dev_results_list, "/Users/Eason/RA/FunEver/results/sent_retri_nn/2018_07_17_16-34-19_r/dev_scale(0.1).jsonl") # common.save_jsonl(dev_results_list, "/Users/Eason/RA/FunEver/results/sent_retri_nn/2018_07_17_16-34-19_r/dev_scale(0.1).jsonl") # eval_mode = {'check_doc_id_correct': True, 'check_sent_id_correct': True, 'standard': True} eval_mode = {'check_sent_id_correct': True, 'standard': True} # c_scorer.delete_label(dev_results_list) strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score( dev_results_list, common.load_jsonl(config.FEVER_DEV_UNLABELED_JSONL), mode=eval_mode, verbose=False) print(strict_score, acc_score, pr, rec, f1) # total = len(dev_results_list) # hit = eval_mode['check_sent_id_correct_hits'] # tracking_score = hit / total # # print(f"Dev(fever_score/pr/rec/f1):{strict_score}/{pr}/{rec}/{f1}") # print(f"Tracking score:", f"{tracking_score}") # eval_mode = {'check_sent_id_correct': True, 'standard': True} # delete_gold_label(dev_results_list) # strict_score, acc_score, pr, rec, f1, error_list = c_scorer.fever_score_analysis(dev_results_list, # common.load_jsonl(config.T_FEVER_DEV_JSONL), # mode=eval_mode, verbose=False)
def train_fever_ema_v1(resume_model=None):
    """
    This method is training script for bert+nsmn model.

    Trains a verification (3-class) model on top of BERT-served features using
    threshold-sampled sentence evidence; evaluates on dev periodically and
    checkpoints on strict-score improvement.  The EMA machinery is present but
    commented out in this version.

    :param resume_model: path to a checkpoint to resume from (currently unused).
    :return: None
    """
    num_epoch = 200
    seed = 12
    batch_size = 32
    lazy = True
    dev_prob_threshold = 0.02
    train_prob_threshold = 0.02
    train_sample_top_k = 8

    experiment_name = f"bert_nsmn_ema_lr1|t_prob:{train_prob_threshold}|top_k:{train_sample_top_k}"

    bert_type_name = "bert-large-uncased"
    bert_servant = BertServant(bert_type_name=bert_type_name)
    # print("Do EMA:")

    print("Dev prob threshold:", dev_prob_threshold)
    print("Train prob threshold:", train_prob_threshold)
    print("Train sample top k:", train_sample_top_k)

    dev_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH / "sent_retri_nn/balanced_sentence_selection_results/dev_sent_pred_scores.jsonl"
    )
    train_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH / "sent_retri_nn/balanced_sentence_selection_results/train_sent_scores.jsonl"
    )

    # Prepare Data
    # 22 Nov 2018 03:16
    # Remove this because everything can be handled by Bert Servant.

    print("Building Prob Dicts...")
    # FIX(perf): the original re-loaded the exact same two jsonl files a second
    # time here; reuse the lists loaded above instead.
    train_sent_list = train_upstream_sent_list
    dev_sent_list = dev_upstream_sent_list

    selection_dict = paired_selection_score_dict(train_sent_list)
    selection_dict = paired_selection_score_dict(dev_sent_list, selection_dict)

    upstream_dev_list = threshold_sampler_insure_unique(
        config.T_FEVER_DEV_JSONL,
        dev_upstream_sent_list,
        prob_threshold=dev_prob_threshold,
        top_n=5)

    dev_fever_data_reader = BertReader(bert_servant, lazy=lazy, max_l=60)
    train_fever_data_reader = BertReader(bert_servant, lazy=lazy, max_l=60)

    complete_upstream_dev_data = select_sent_with_prob_for_eval(
        config.T_FEVER_DEV_JSONL, upstream_dev_list, selection_dict, tokenized=True)

    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary, if we are using bert, we don't need anything here.
    biterator = BasicIterator(batch_size=batch_size)

    unk_token_num = {'tokens': 2600}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    print(vocab)

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    bert_servant.bert_model.to(device)

    # Init model here
    model = Model(
        bert_servant,
        bert_batch_size=1,
        rnn_size_in=(1024 + 2, 1024 + 2 + 300),  # probs + task indicator.
        rnn_size_out=(300, 300),
        max_l=250,
        mlp_d=300,
        num_of_class=3,
        drop_r=0.5,
        activation_type='gelu')

    model.to(device)

    # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0

    # FIX: the original commented out this line (`# start_lr = 0.0001`) but
    # still passed `lr=start_lr` to Adam below, which raises NameError on the
    # first call.  Restore the definition.
    start_lr = 0.0001
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # Resampling: draw candidate sentences above the threshold, then build
        # the (claim, evidence) training pairs for this epoch.
        train_data_with_candidate_sample_list = \
            threshold_sampler_insure_unique(config.T_FEVER_TRAIN_JSONL,
                                            train_upstream_sent_list,
                                            train_prob_threshold,
                                            top_n=train_sample_top_k)
        complete_upstream_train_data = adv_simi_sample_with_prob_v1_1(
            config.T_FEVER_TRAIN_JSONL,
            train_data_with_candidate_sample_list,
            selection_dict,
            tokenized=True)

        random.shuffle(complete_upstream_train_data)
        print("Sample data length:", len(complete_upstream_train_data))
        sampled_train_instances = train_fever_data_reader.read(complete_upstream_train_data)

        train_iter = biterator(sampled_train_instances, shuffle=True, num_epochs=1)
        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)
            y = batch['label'].to(next(model.parameters()).device)

            loss = criterion(out, y)

            # No decay
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iteration += 1

            # EMA update
            # ema(model.named_parameters())

            # Evaluate rarely early on, then more frequently.
            if i_epoch < 15:
                mod = 20000
                # mod = 500
            else:
                mod = 2000

            if iteration % mod == 0:
                eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1)
                complete_upstream_dev_data = hidden_eval(model, eval_iter,
                                                         complete_upstream_dev_data)

                eval_mode = {'check_sent_id_correct': True, 'standard': True}
                strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
                    complete_upstream_dev_data,
                    common.load_jsonl(config.T_FEVER_DEV_JSONL),
                    mode=eval_mode,
                    verbose=False)
                print("Fever Score(Strict/Acc./Precision/Recall/F1):",
                      strict_score, acc_score, pr, rec, f1)
                print(f"Dev:{strict_score}/{acc_score}")

                # EMA saving
                # eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
                # load_ema_to_model(cloned_empty_model, ema)
                # complete_upstream_dev_data = hidden_eval(cloned_empty_model, eval_iter, complete_upstream_dev_data)
                #
                # eval_mode = {'check_sent_id_correct': True, 'standard': True}
                # strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(complete_upstream_dev_data,
                #                                                             common.load_jsonl(config.T_FEVER_DEV_JSONL),
                #                                                             mode=eval_mode,
                #                                                             verbose=False)
                # print("Fever Score EMA(Strict/Acc./Precision/Recall/F1):", strict_score, acc_score, pr, rec, f1)
                #
                # print(f"Dev EMA:{strict_score}/{acc_score}")

                need_save = False
                if strict_score > best_dev:
                    best_dev = strict_score
                    need_save = True

                if need_save:
                    save_path = os.path.join(
                        file_path_prefix,
                        f'i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_seed({seed})'
                    )
                    torch.save(model.state_dict(), save_path)
def train_fever_v2():
    """Train the sentence-selector NN for FEVER (v2 training script).

    train_fever_v1 is the old training script; train_fever_v2 is the new one
    created on 02 Oct 2018 11:40:24.  Here we keep the negative and positive
    portion of training samples consistent across epochs.

    Side effects: creates a log/result directory, copies this source file into
    it, and writes EMA model snapshots there after each evaluation.  Reads all
    data paths from the module-level ``config``.  Returns nothing.
    """
    # --- Hyper-parameters / experiment switches ---
    num_epoch = 10
    seed = 12
    batch_size = 128
    lazy = True
    torch.manual_seed(seed)
    keep_neg_sample_prob = 1  # 1 => keep every negative sample (no down-sampling)
    top_k_doc = 5  # number of upstream retrieved docs per claim

    experiment_name = f"simple_nn_remain_{keep_neg_sample_prob}"
    # sample_prob_decay = 0.05

    dev_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/dev_doc.jsonl"
    train_upstream_file = config.RESULT_PATH / "doc_retri/std_upstream_data_using_pageview/train_doc.jsonl"

    # --- Prepare Data ---
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    train_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=180)
    # dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=False)
    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=180)

    complete_upstream_dev_data = get_full_list(config.T_FEVER_DEV_JSONL, dev_upstream_file, pred=True,
                                               top_k=top_k_doc)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # --- Load Vocabulary ---
    biterator = BasicIterator(batch_size=batch_size)
    dev_biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important: register the selection labels and force "hidden" to a
    # fixed (negative) index so it is never predicted as a real class.
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')
    # Label value (return value intentionally discarded; call kept as in original)
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    dev_biterator.index_with(vocab)

    # exit(0)
    # --- Build Model ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0  # AllenNLP convention: -1 == CPU

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=160, num_of_class=2)

    model.display()
    model.to(device)
    # A structurally identical copy used purely as a container for the EMA
    # (shadow) weights at evaluation time; never trained directly.
    cloned_empty_model = copy.deepcopy(model)
    ema: EMA = EMA(parameters=model.named_parameters())

    # --- Create Log File ---
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code for reproducibility.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0

    start_lr = 0.0002
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    dev_actual_list = common.load_jsonl(config.T_FEVER_DEV_JSONL)

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # Resampling: rebuild the (doc -> sentence candidate) training list every epoch.
        complete_upstream_train_data = get_full_list(config.T_FEVER_TRAIN_JSONL, train_upstream_file, pred=False,
                                                     top_k=top_k_doc)
        print("Sample Prob.:", keep_neg_sample_prob)
        filtered_train_data = post_filter(complete_upstream_train_data, keep_prob=keep_neg_sample_prob,
                                          seed=12 + i_epoch)
        # Change the seed per epoch to avoid duplicate sampling...
        # keep_neg_sample_prob -= sample_prob_decay
        # if keep_neg_sample_prob <= 0:
        #     keep_neg_sample_prob = 0.005
        print("Sampled_length:", len(filtered_train_data))
        sampled_train_instances = train_fever_data_reader.read(filtered_train_data)

        train_iter = biterator(sampled_train_instances, shuffle=True, num_epochs=1, cuda_device=device_num)
        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)
            y = batch['selection_label']

            loss = criterion(out, y)

            # No weight decay.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Update EMA shadow weights after every optimizer step.
            ema(model.named_parameters())
            iteration += 1

            # NOTE(review): both branches assign the same value (8000); the
            # branch structure looks like a leftover from tuning — confirm
            # whether a different late-epoch interval was intended.
            if i_epoch <= 5:
                mod = 8000
            else:
                mod = 8000

            if iteration % mod == 0:
                # Periodic dev evaluation using the EMA weights only.
                eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
                load_ema_to_model(cloned_empty_model, ema)
                # complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)
                # Only eval EMA
                complete_upstream_dev_data = hidden_eval(cloned_empty_model, eval_iter,
                                                         complete_upstream_dev_data)

                dev_results_list = score_converter_v1(config.T_FEVER_DEV_JSONL,
                                                      complete_upstream_dev_data,
                                                      sent_retri_top_k=5,
                                                      sent_retri_scal_prob=0.5)
                # score_converter_v1 is only a wrapper for the simi_sampler.

                eval_mode = {'check_sent_id_correct': True, 'standard': True}
                # Copy gold labels in so the scorer measures sentence retrieval
                # only (label prediction is made trivially correct).
                for a, b in zip(dev_actual_list, dev_results_list):
                    b['predicted_label'] = a['label']
                strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(dev_results_list, dev_actual_list,
                                                                            mode=eval_mode, verbose=False)
                tracking_score = strict_score
                print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}")
                print("Strict score:", strict_score)
                print(f"Eval Tracking score:", f"{tracking_score}")

                # NOTE(review): best-score gating is commented out, so every
                # periodic evaluation produces a checkpoint.
                # need_save = False
                # if tracking_score > best_dev:
                #     best_dev = tracking_score
                need_save = True

                if need_save:
                    save_path = os.path.join(
                        file_path_prefix,
                        f'i({iteration})_epoch({i_epoch})_'
                        f'(tra_score:{tracking_score}|raw_acc:{acc_score}|pr:{pr}|rec:{rec}|f1:{f1})_ema')
                    save_ema_to_file(ema, save_path)
                    # torch.save(model.state_dict(), save_path)

        # End-of-epoch evaluation, same EMA-only procedure as above.
        print("Epoch Evaluation...")
        eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
        load_ema_to_model(cloned_empty_model, ema)
        # complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)
        complete_upstream_dev_data = hidden_eval(cloned_empty_model, eval_iter, complete_upstream_dev_data)

        dev_results_list = score_converter_v1(config.T_FEVER_DEV_JSONL,
                                              complete_upstream_dev_data,
                                              sent_retri_top_k=5,
                                              sent_retri_scal_prob=0.5)

        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        for a, b in zip(dev_actual_list, dev_results_list):
            b['predicted_label'] = a['label']
        strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(dev_results_list, dev_actual_list,
                                                                    mode=eval_mode, verbose=False)
        tracking_score = strict_score
        print(f"Dev(raw_acc/pr/rec/f1):{acc_score}/{pr}/{rec}/{f1}")
        print("Strict score:", strict_score)
        print(f"Eval Tracking score:", f"{tracking_score}")

        if tracking_score > best_dev:
            best_dev = tracking_score

            # NOTE(review): reconstructed from flattened source — it is
            # ambiguous whether this epoch-end save runs only on improvement
            # (as written here) or unconditionally; confirm against history.
            save_path = os.path.join(
                file_path_prefix,
                f'i({iteration})_epoch({i_epoch})_'
                f'(tra_score:{tracking_score}|raw_acc:{acc_score}|pr:{pr}|rec:{rec}|f1:{f1})_epoch_ema')
            save_ema_to_file(ema, save_path)
def train_fever_std_ema_v1(resume_model=None, wn_feature=False):
    """
    This method is the new training script for training FEVER NLI with span
    and probability score, keeping an exponential moving average (EMA) of the
    model weights and evaluating/saving only the EMA copy.

    :param resume_model: Optional path to a saved state_dict to resume from.
    :param wn_feature: NOTE(review): not referenced anywhere in this body —
        presumably consumed by an earlier revision; confirm before removing.
    :return: None.  Side effects: creates a log directory, copies this source
        file into it, and writes EMA checkpoints there.
    """
    # --- Hyper-parameters ---
    num_epoch = 200
    # NOTE(review): `seed` is only interpolated into checkpoint filenames here;
    # no torch.manual_seed(seed) call is visible in this function.
    seed = 12
    batch_size = 32
    lazy = True
    dev_prob_threshold = 0.1  # min upstream sentence prob kept for dev
    train_prob_threshold = 0.1  # min upstream sentence prob kept for train
    train_sample_top_k = 8  # candidate sentences sampled per training claim
    experiment_name = f"nsmn_sent_wise_std_ema_lr1|t_prob:{train_prob_threshold}|top_k:{train_sample_top_k}"
    # resume_model = None

    print("Do EMA:")
    print("Dev prob threshold:", dev_prob_threshold)
    print("Train prob threshold:", train_prob_threshold)
    print("Train sample top k:", train_sample_top_k)

    dev_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/dev_sent_pred_scores.jsonl")
    train_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/train_sent_scores.jsonl")

    # --- Prepare Data ---
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # NOTE(review): these two loads duplicate dev/train_upstream_sent_list
    # above (same files read twice); kept as in original.
    print("Building Prob Dicts...")
    train_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/train_sent_scores.jsonl")
    dev_sent_list = common.load_jsonl(
        config.RESULT_PATH /
        "sent_retri_nn/balanced_sentence_selection_results/dev_sent_pred_scores.jsonl")

    # (claim, sentence) -> selection probability, built over train then dev.
    selection_dict = paired_selection_score_dict(train_sent_list)
    selection_dict = paired_selection_score_dict(dev_sent_list, selection_dict)

    upstream_dev_list = threshold_sampler_insure_unique(config.T_FEVER_DEV_JSONL, dev_upstream_sent_list,
                                                        prob_threshold=dev_prob_threshold, top_n=5)

    # Specify ablation=None to keep wordnet and number embeddings.
    # NOTE(review): `p_dict` is not defined in this function — presumably a
    # module-level WordNet pairing dict; verify it is loaded before this runs.
    dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=320,
                                         ablation=None)
    train_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=320,
                                           shuffle_sentences=False, ablation=None)

    complete_upstream_dev_data = select_sent_with_prob_for_eval(config.T_FEVER_DEV_JSONL, upstream_dev_list,
                                                                selection_dict, tokenized=True)

    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # --- Load Vocabulary ---
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # Pin "hidden" to a fixed negative index so it is never a real NLI label.
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # --- Build Model ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0  # AllenNLP convention: -1 == CPU

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                               1024 + 450 + dev_fever_data_reader.wn_feature_size),
                  rnn_size_out=(450, 450),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  mlp_d=900,
                  embedding_dim=300,
                  max_l=300,
                  use_extra_lex_feature=False,
                  max_span_l=100)

    print("Model Max length:", model.max_l)
    if resume_model is not None:
        model.load_state_dict(torch.load(resume_model))
    model.display()
    model.to(device)

    # Untrained structural clone used only to host EMA weights for evaluation.
    cloned_empty_model = copy.deepcopy(model)
    ema: EMA = EMA(parameters=model.named_parameters())

    # --- Create Log File ---
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code for reproducibility.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0

    start_lr = 0.0001
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # Resampling: rebuild the candidate sentence set for training each epoch.
        train_data_with_candidate_sample_list = \
            threshold_sampler_insure_unique(config.T_FEVER_TRAIN_JSONL, train_upstream_sent_list,
                                            train_prob_threshold,
                                            top_n=train_sample_top_k)

        complete_upstream_train_data = adv_simi_sample_with_prob_v1_1(config.T_FEVER_TRAIN_JSONL,
                                                                      train_data_with_candidate_sample_list,
                                                                      selection_dict,
                                                                      tokenized=True)

        print("Sample data length:", len(complete_upstream_train_data))
        sampled_train_instances = train_fever_data_reader.read(complete_upstream_train_data)

        train_iter = biterator(sampled_train_instances, shuffle=True, num_epochs=1, cuda_device=device_num)
        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)
            y = batch['label']

            loss = criterion(out, y)

            # No weight decay.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iteration += 1

            # EMA update after every optimizer step.
            ema(model.named_parameters())

            # Evaluate less often early on, more often in late epochs.
            if i_epoch < 15:
                mod = 10000
                # mod = 10
            else:
                mod = 2000

            if iteration % mod == 0:
                # (An equivalent evaluation of the raw, non-EMA model was
                # commented out here in the original; only the EMA copy is
                # evaluated and saved.)
                # EMA saving
                eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
                load_ema_to_model(cloned_empty_model, ema)
                complete_upstream_dev_data = hidden_eval(cloned_empty_model, eval_iter,
                                                         complete_upstream_dev_data)

                eval_mode = {'check_sent_id_correct': True, 'standard': True}
                strict_score, acc_score, pr, rec, f1 = c_scorer.fever_score(
                    complete_upstream_dev_data,
                    common.load_jsonl(config.T_FEVER_DEV_JSONL),
                    mode=eval_mode,
                    verbose=False)
                print("Fever Score EMA(Strict/Acc./Precision/Recall/F1):", strict_score, acc_score, pr, rec, f1)
                print(f"Dev EMA:{strict_score}/{acc_score}")

                # Checkpoint only when the strict dev score improves.
                need_save = False
                if strict_score > best_dev:
                    best_dev = strict_score
                    need_save = True

                if need_save:
                    # save_path = os.path.join(
                    #     file_path_prefix,
                    #     f'i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_seed({seed})'
                    # )
                    # torch.save(model.state_dict(), save_path)

                    ema_save_path = os.path.join(
                        file_path_prefix,
                        f'ema_i({iteration})_epoch({i_epoch})_dev({strict_score})_lacc({acc_score})_seed({seed})')
                    save_ema_to_file(ema, ema_save_path)
def pipeline(in_file, eval_file=None, model_path_dict=default_model_path_dict, steps=default_steps):
    """Run the full FEVER prediction pipeline over a raw claims file.

    Steps: 1) tokenize claims; 2) first document retrieval; 3) first sentence
    selection (optionally a 3-model ensemble); 4) second (multi-hop) document
    retrieval; 5) second sentence selection; 6) NLI — here realised by merging
    five pre-computed ensemble prediction files — followed by evidence
    post-processing and submission-file writing.  Each step's ``steps[...]['do']``
    flag allows skipping it and reusing a previously saved ``out_file``.

    :param in_file: The raw input file (a Path; ``.stem`` is used for naming outputs).
    :param eval_file: Optional gold jsonl; when given, intermediate steps are scored.
    :param model_path_dict: Paths to the trained selector/NLI model checkpoints.
    :param steps: Per-step config dict with 'do'/'out_file' (and 'ensemble') keys.
    :return: None.  Side effects: writes all intermediate and final files into a
        timestamped directory under PIPELINE_DIR.
    """
    # --- Pipeline thresholds / switches ---
    sentence_retri_1_scale_prob = 0.5
    sentence_retri_2_scale_prob = 0.9
    sent_retri_1_top_k = 5
    sent_retri_2_top_k = 1

    sent_prob_for_2doc = 0.1
    sent_topk_for_2doc = 5
    enhance_retri_1_scale_prob = -1  # -1 => accept every first-round sentence
    build_submission = True

    doc_retrieval_method = 'word_freq'

    haonan_docretri_object = HAONAN_DOCRETRI_OBJECT()

    if not PIPELINE_DIR.exists():
        PIPELINE_DIR.mkdir()

    # Fresh timestamped run dir, or reuse the dir of a previous tokenization.
    if steps['s1.tokenizing']['do']:
        time_stamp = utils.get_current_time_str()
        current_pipeline_dir = PIPELINE_DIR / f"{time_stamp}_r"
    else:
        current_pipeline_dir = steps['s1.tokenizing']['out_file'].parent

    print("Current Result Root:", current_pipeline_dir)

    if not current_pipeline_dir.exists():
        current_pipeline_dir.mkdir()

    eval_list = common.load_jsonl(eval_file) if eval_file is not None else None

    in_file_stem = in_file.stem
    tokenized_file = current_pipeline_dir / f"t_{in_file_stem}.jsonl"

    # Save this source file into the run directory for reproducibility.
    script_name = os.path.basename(__file__)
    with open(os.path.join(str(current_pipeline_dir), script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()

    # --- Step 1: Tokenizing ---
    print("Step 1. Tokenizing.")
    if steps['s1.tokenizing']['do']:
        tokenized_claim(in_file, tokenized_file)
        # Auto Saved
        print("Tokenized file saved to:", tokenized_file)
    else:
        tokenized_file = steps['s1.tokenizing']['out_file']
        print("Use preprocessed file:", tokenized_file)
    # Tokenizing End.

    # --- Step 2: First Document retrieval ---
    print("Step 2. First Document Retrieval")
    if steps['s2.1doc_retri']['do']:
        doc_retrieval_result_list = first_doc_retrieval(haonan_docretri_object, tokenized_file,
                                                        method=doc_retrieval_method)
        doc_retrieval_file_1 = current_pipeline_dir / f"doc_retr_1_{in_file_stem}.jsonl"
        common.save_jsonl(doc_retrieval_result_list, doc_retrieval_file_1)
        print("First Document Retrieval file saved to:", doc_retrieval_file_1)
    else:
        doc_retrieval_file_1 = steps['s2.1doc_retri']['out_file']
        doc_retrieval_result_list = common.load_jsonl(doc_retrieval_file_1)
        print("Use preprocessed file:", doc_retrieval_file_1)

    if eval_list is not None:
        print("Evaluating 1st Doc Retrieval")
        eval_mode = {'check_doc_id_correct': True, 'standard': False}
        print(c_scorer.fever_score(doc_retrieval_result_list, eval_list, mode=eval_mode, verbose=False))
    # First Document retrieval End.

    # --- Step 3: First Sentence Selection ---
    print("Step 3. First Sentence Selection")
    if steps['s3.1sen_select']['do']:
        dev_sent_list_1_e0 = simple_nnmodel.pipeline_first_sent_selection(tokenized_file, doc_retrieval_file_1,
                                                                          model_path_dict['sselector'])
        dev_sent_file_1_e0 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}.jsonl"
        common.save_jsonl(dev_sent_list_1_e0, dev_sent_file_1_e0)

        # (A manual override that loaded e0 scores from a hard-coded path was
        # commented out here in the original.)

        if steps['s3.1sen_select']['ensemble']:
            # Score with two additional selector checkpoints and merge all three.
            print("Ensemble!")
            dev_sent_list_1_e1 = simple_nnmodel.pipeline_first_sent_selection(tokenized_file, doc_retrieval_file_1,
                                                                              model_path_dict['sselector_1'])
            dev_sent_file_1_e1 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_e1.jsonl"
            common.save_jsonl(dev_sent_list_1_e1, dev_sent_file_1_e1)
            # exit(0)
            # dev_sent_list_1_e1 = common.load_jsonl(dev_sent_file_1_e1)

            dev_sent_list_1_e2 = simple_nnmodel.pipeline_first_sent_selection(tokenized_file, doc_retrieval_file_1,
                                                                              model_path_dict['sselector_2'])
            dev_sent_file_1_e2 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_e2.jsonl"
            common.save_jsonl(dev_sent_list_1_e2, dev_sent_file_1_e2)
            # exit(0)
            # dev_sent_list_1_e2 = common.load_jsonl(dev_sent_file_1_e2)

            dev_sent_list_1 = merge_sent_results([dev_sent_list_1_e0, dev_sent_list_1_e1, dev_sent_list_1_e2])
            dev_sent_file_1 = current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_ensembled.jsonl"
            common.save_jsonl(dev_sent_list_1, dev_sent_file_1)
            # exit(0)
        else:
            dev_sent_list_1 = dev_sent_list_1_e0
            dev_sent_file_1 = dev_sent_file_1_e0

        # Merging two results
        print("First Sentence Selection file saved to:", dev_sent_file_1)
    else:
        dev_sent_file_1 = steps['s3.1sen_select']['out_file']
        dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
        print("Use preprocessed file:", dev_sent_file_1)
        # exit(0)

    if eval_list is not None:
        print("Evaluating 1st Sentence Selection")
        # (Alternative sampling / second-round-merge variants were commented
        # out here in the original.)
        sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique(tokenized_file, dev_sent_list_1,
                                                                                  sentence_retri_1_scale_prob,
                                                                                  top_n=sent_retri_1_top_k)
        eval_mode = {'check_sent_id_correct': True, 'standard': False}
        # for a, b in zip(eval_list, sent_select_results_list_1):
        #     b['predicted_label'] = a['label']
        print(c_scorer.fever_score(sent_select_results_list_1, eval_list, mode=eval_mode, verbose=False))

    # --- Step 4: Second Document Retrieval (multi-hop) ---
    print("Step 4. Second Document Retrieval")
    if steps['s4.2doc_retri']['do']:
        dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
        filtered_dev_instance_1_for_doc2 = simi_sampler.threshold_sampler_insure_unique(tokenized_file,
                                                                                        dev_sent_list_1,
                                                                                        sent_prob_for_2doc,
                                                                                        top_n=sent_topk_for_2doc)
        filtered_dev_instance_1_for_doc2_file = \
            current_pipeline_dir / f"dev_sent_score_1_{in_file_stem}_scaled_for_doc2.jsonl"
        common.save_jsonl(filtered_dev_instance_1_for_doc2, filtered_dev_instance_1_for_doc2_file)

        dev_sent_1_result = simi_sampler.threshold_sampler_insure_unique(doc_retrieval_file_1,  # Remember this name
                                                                         dev_sent_list_1,
                                                                         sentence_retri_1_scale_prob,
                                                                         top_n=sent_topk_for_2doc)

        dev_doc2_list = second_doc_retrieval(haonan_docretri_object, filtered_dev_instance_1_for_doc2_file,
                                             dev_sent_1_result)

        dev_doc2_file = current_pipeline_dir / f"doc_retr_2_{in_file_stem}.jsonl"
        common.save_jsonl(dev_doc2_list, dev_doc2_file)
        print("Second Document Retrieval File saved to:", dev_doc2_file)
    else:
        dev_doc2_file = steps['s4.2doc_retri']['out_file']
        # dev_doc2_list = common.load_jsonl(dev_doc2_file)
        print("Use preprocessed file:", dev_doc2_file)

    # --- Step 5: Second Sentence Selection ---
    print("Step 5. Second Sentence Selection")
    if steps['s5.2sen_select']['do']:
        dev_sent_2_list = get_score_multihop(tokenized_file, dev_doc2_file,
                                             model_path=model_path_dict['sselector'])
        dev_sent_file_2 = current_pipeline_dir / f"dev_sent_score_2_{in_file_stem}.jsonl"
        common.save_jsonl(dev_sent_2_list, dev_sent_file_2)
        # NOTE(review): message says "First" but this is the second selection file.
        print("First Sentence Selection file saved to:", dev_sent_file_2)
    else:
        dev_sent_file_2 = steps['s5.2sen_select']['out_file']

    if eval_list is not None:
        # Evaluate first+second round sentences merged together.
        print("Evaluating 1st Sentence Selection")
        dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
        dev_sent_list_2 = common.load_jsonl(dev_sent_file_2)
        sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique(tokenized_file, dev_sent_list_1,
                                                                                  sentence_retri_1_scale_prob,
                                                                                  top_n=5)
        sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique_merge(sent_select_results_list_1,
                                                                                        dev_sent_list_2,
                                                                                        sentence_retri_2_scale_prob,
                                                                                        top_n=5,
                                                                                        add_n=sent_retri_2_top_k)
        eval_mode = {'check_sent_id_correct': True, 'standard': False}
        # for a, b in zip(eval_list, sent_select_results_list_1):
        #     b['predicted_label'] = a['label']
        print(c_scorer.fever_score(sent_select_results_list_1, eval_list, mode=eval_mode, verbose=False))

    # --- Step 6: NLI ---
    # The original single-model NLI run (guarded by steps['s6.nli']) and
    # several alternative single-checkpoint ensemble invocations were
    # commented out here; the active path below instead loads five
    # pre-computed NLI prediction files and merges them.
    # NOTE(review): these filenames are hard-coded to the shared-task test set
    # (they ignore in_file_stem), so this branch only works for a run dir that
    # already contains nli_r_shared_task_test_withlb_e{0..4}.jsonl — confirm
    # before using this pipeline on other inputs.
    nli_r_e0 = common.load_jsonl(current_pipeline_dir / "nli_r_shared_task_test_withlb_e0.jsonl")
    nli_r_e1 = common.load_jsonl(current_pipeline_dir / "nli_r_shared_task_test_withlb_e1.jsonl")
    nli_r_e2 = common.load_jsonl(current_pipeline_dir / "nli_r_shared_task_test_withlb_e2.jsonl")
    nli_r_e3 = common.load_jsonl(current_pipeline_dir / "nli_r_shared_task_test_withlb_e3.jsonl")
    nli_r_e4 = common.load_jsonl(current_pipeline_dir / "nli_r_shared_task_test_withlb_e4.jsonl")

    nli_results = merge_nli_results([nli_r_e0, nli_r_e1, nli_r_e2, nli_r_e3, nli_r_e4])

    # --- Post-processing: evidence cleanup and enhancement ---
    print("Post Processing enhancement")
    delete_unused_evidence(nli_results)

    print("Deleting Useless Evidence")
    dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
    dev_sent_list_2 = common.load_jsonl(dev_sent_file_2)

    # Append the top second-round sentence as extra evidence, then re-clean.
    print("Appending 1 of second Evidence")
    nli_results = simi_sampler.threshold_sampler_insure_unique_merge(nli_results, dev_sent_list_2,
                                                                     sentence_retri_2_scale_prob, top_n=5,
                                                                     add_n=sent_retri_2_top_k)
    delete_unused_evidence(nli_results)

    # High tolerance enhancement: append essentially all first-round evidence
    # (threshold -1), then clean once more.
    print("Final High Tolerance Enhancement")
    print("Appending all of first Evidence")
    nli_results = simi_sampler.threshold_sampler_insure_unique_merge(nli_results, dev_sent_list_1,
                                                                     enhance_retri_1_scale_prob, top_n=100,
                                                                     add_n=100)
    delete_unused_evidence(nli_results)

    if build_submission:
        output_file = current_pipeline_dir / "predictions.jsonl"
        build_submission_file(nli_results, output_file)