def hidden_eval_fever():
    """Score a saved checkpoint on the FEVER dev data and print the result.

    Builds dev instances from the upstream sentence-retrieval output,
    restores the weights stored at ``SAVE_PATH``, runs ``hidden_eval`` over
    the dev iterator and prints ``c_scorer.fever_score`` for the predictions.

    Takes no arguments and returns ``None``; the score is only printed.
    """
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-18-21:07:28_m_esim_wn_elmo_sample_fixed/i(57000)_epoch(8)_dev(0.5755075507550755)_loss(1.7175163737963839)_seed(12)"

    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"

    # Prepare data: one indexer for raw token ids, one for ELMo characters.
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy,
                                     wn_p_dict=p_dict, max_l=360)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load vocabulary and pretrained embedding weights.
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # Register the 'hidden' label at index -2 in the 'labels' namespace.
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build model; fall back to CPU when CUDA is not available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                               1024 + 300),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    print("Model Max length:", model.max_l)
    # The model still lives on the CPU here, so deserialize the checkpoint to
    # CPU as well.  Without map_location a checkpoint saved from a GPU cannot
    # be loaded on a CPU-only host, which defeats the CPU fallback above.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    # Drop the 'label' field from every item before handing them to the scorer.
    for item in builded_dev_data:
        del item['label']

    print(c_scorer.fever_score(builded_dev_data,
                               common.load_jsonl(config.T_FEVER_DEV_JSONL),
                               mode=eval_mode))
def __init__(self, model_path):
    """Build the CPU inference components and restore weights from *model_path*.

    Args:
        model_path: Path of the checkpoint passed to ``torch.load``; it must
            contain a state dict compatible with the ``Model`` built here.

    Sets ``self.model``, ``self.dev_fever_data_reader``, ``self.device_num``
    and ``self.biterator``.
    """
    # Prepare data: one indexer for raw token ids, one for ELMo characters.
    lazy = False
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=lazy,
                                         wn_p_dict=p_dict, max_l=420)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # Register the 'hidden' label at index -2 in the 'labels' namespace.
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    # Build model.  Inference is pinned to the CPU in this class.
    device = torch.device("cpu")
    device_num = -1 if device.type == 'cpu' else 0

    biterator = BasicIterator(batch_size=16)
    biterator.index_with(vocab)

    model = Model(
        rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                     1024 + 450 + dev_fever_data_reader.wn_feature_size),
        rnn_size_out=(450, 450),
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        mlp_d=900,
        embedding_dim=300, max_l=400)

    model.display()
    model.to(device)
    # Deserialize straight to CPU: a checkpoint written from a GPU would
    # otherwise fail to load on a CPU-only host (or be staged on the GPU for
    # no reason when one is present).
    model.load_state_dict(torch.load(model_path, map_location="cpu"))

    self.model = model
    self.dev_fever_data_reader = dev_fever_data_reader
    self.device_num = device_num
    self.biterator = biterator
def utest_data_loader():
    """Smoke-run the training data pipeline for one pass over sampled data.

    Reads up to 20000 sampled training items, iterates every batch once while
    touching the ``p_wn_feature`` and ``h_wn_feature`` entries, and finally
    hands the WordNet cache dict back to ``wn_persistent_api``.
    """
    batch_size = 32
    lazy = True

    upstream_dir = "sent_retri/2018_07_05_17:17:50_r"
    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"
    train_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/train.jsonl"
    del upstream_dir  # only kept above to make the shared directory obvious

    # Indexers: raw token ids plus ELMo character ids.
    indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }

    wn_cache = wn_persistent_api.persistence_load()

    # Both readers are configured identically.
    reader_kwargs = dict(token_indexers=indexers, lazy=lazy, wn_p_dict=wn_cache)
    train_reader = WNReader(**reader_kwargs)
    dev_reader = WNReader(**reader_kwargs)

    dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_reader.read(dev_data)

    # Vocabulary and iterator.
    batcher = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    batcher.index_with(vocab)

    # Sampled training data, truncated to the first 20000 items.
    sampled = get_sampled_data(config.T_FEVER_TRAIN_JSONL, train_upstream_file)
    train_data = sampled[:20000]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    if device.type == 'cpu':
        device_num = -1
    else:
        device_num = 0

    train_instances = train_reader.read(train_data)
    batches = batcher(train_instances, shuffle=True, num_epochs=1, cuda_device=device_num)

    for _, batch in tqdm(enumerate(batches)):
        # Plain lookups: a batch missing either key raises KeyError here.
        for key in ('p_wn_feature', 'h_wn_feature'):
            batch[key]

    wn_persistent_api.persistence_update(wn_cache)
def hidden_eval_fever_adv_v1():
    """Evaluate the adversarial-sample checkpoint on FEVER dev and print the score.

    Converts the upstream sentence list with ``score_converter_scaled``,
    restores the weights stored at ``SAVE_PATH``, runs ``hidden_eval``, writes
    the predictions to ``nli_results/pipeline_results_1.jsonl`` under
    ``config.RESULT_PATH`` and prints ``c_scorer.fever_score``.

    Takes no arguments and returns ``None``.
    """
    batch_size = 64
    lazy = True
    dev_prob_threshold = 0.5

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-20-22:28:24_mesim_wn_450_adv_sample_v1_|t_prob:0.35|top_k:8/i(46000)_epoch(7)_dev(0.6405140514051405)_loss(1.0761665150348825)_seed(12)"

    dev_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH / "sent_retri_nn/2018_07_20_15:17:59_r/dev_sent.jsonl")

    # Prepare data: one indexer for raw token ids, one for ELMo characters.
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }

    p_dict = wn_persistent_api.persistence_load()

    upstream_dev_list = score_converter_scaled(config.T_FEVER_DEV_JSONL,
                                               dev_upstream_sent_list,
                                               scale_prob=dev_prob_threshold,
                                               delete_prob=False)

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy,
                                     wn_p_dict=p_dict, max_l=360)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, upstream_dev_list)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load vocabulary and pretrained embedding weights.
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # Register the 'hidden' label at index -2 in the 'labels' namespace.
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build model; fall back to CPU when CUDA is not available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(
        rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                     1024 + 450),
        rnn_size_out=(450, 450),
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        mlp_d=900,
        embedding_dim=300, max_l=300)

    print("Model Max length:", model.max_l)
    # The model is still on the CPU at this point, so load the checkpoint to
    # CPU too.  Without map_location a GPU-saved checkpoint cannot be loaded
    # on a CPU-only host, which defeats the CPU fallback above.
    model.load_state_dict(torch.load(SAVE_PATH, map_location="cpu"))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1,
                          cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    # Persist the predictions first; labels are stripped only afterwards.
    common.save_jsonl(
        builded_dev_data,
        config.RESULT_PATH / "nli_results" / "pipeline_results_1.jsonl")
    c_scorer.delete_label(builded_dev_data)
    print(
        c_scorer.fever_score(builded_dev_data,
                             common.load_jsonl(config.FEVER_DEV_JSONL),
                             mode=eval_mode))
def train_fever_v1_advsample():
    """Train the NLI model with per-epoch adversarial resampling.

    Each epoch draws a fresh training sample with ``get_adv_sampled_data``,
    trains on it with Adam and cross-entropy loss, and periodically evaluates
    on the dev instances with ``full_eval_model``.  Whenever the dev score
    improves on the best seen so far, the model's state dict is saved under
    the experiment directory returned by ``save_tool.gen_file_prefix``.

    Takes no arguments and returns ``None``.
    """
    num_epoch = 12
    seed = 12  # only recorded in checkpoint file names within this function
    batch_size = 32
    lazy = True
    dev_prob_threshold = 0.5
    train_prob_threshold = 0.35
    train_sample_top_k = 10
    experiment_name = f"mesim_wn_450_adv_sample_v1_|t_prob:{train_prob_threshold}|top_k:{train_sample_top_k}"

    print("Dev prob threshold:", dev_prob_threshold)
    print("Train prob threshold:", train_prob_threshold)
    print("Train sample top k:", train_sample_top_k)

    dev_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH / "sent_retri_nn/2018_07_20_15:17:59_r/dev_sent.jsonl")
    train_upstream_sent_list = common.load_jsonl(
        config.RESULT_PATH / "sent_retri_nn/2018_07_20_15:17:59_r/train_sent.jsonl")

    # Prepare data: one indexer for raw token ids, one for ELMo characters.
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }

    p_dict = wn_persistent_api.persistence_load()

    upstream_dev_list = score_converter_scaled(config.T_FEVER_DEV_JSONL,
                                               dev_upstream_sent_list,
                                               scale_prob=dev_prob_threshold,
                                               delete_prob=False)

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy,
                                     wn_p_dict=p_dict, max_l=360)
    train_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy,
                                       wn_p_dict=p_dict, max_l=360)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, upstream_dev_list)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load vocabulary.  A single iterator serves both training and dev
    # evaluation.
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # Register the 'hidden' label at index -2 in the 'labels' namespace.
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build model; fall back to CPU when CUDA is not available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(
        rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                     1024 + 450),
        rnn_size_out=(450, 450),
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        mlp_d=900,
        embedding_dim=300, max_l=300)

    print("Model Max length:", model.max_l)
    model.display()
    model.to(device)

    # Create the experiment directory / log prefix.
    file_path_prefix, _ = save_tool.gen_file_prefix(experiment_name)

    # Snapshot this script next to the checkpoints.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, \
            open(__file__, 'r') as it:
        out_f.write(it.read())

    best_dev = -1
    iteration = 0

    start_lr = 0.0002
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    for i_epoch in range(num_epoch):
        # Evaluate every 5000 iterations through epoch 6, every 500 after.
        mod = 5000 if i_epoch <= 6 else 500

        print("Resampling...")
        complete_upstream_train_data = get_adv_sampled_data(
            config.T_FEVER_TRAIN_JSONL,
            train_upstream_sent_list,
            threshold_prob=train_prob_threshold,
            top_n=train_sample_top_k)
        print("Sample data length:", len(complete_upstream_train_data))

        sampled_train_instances = train_fever_data_reader.read(
            complete_upstream_train_data)
        train_iter = biterator(sampled_train_instances, shuffle=True,
                               num_epochs=1, cuda_device=device_num)

        for i, batch in tqdm(enumerate(train_iter)):
            # Re-enter training mode on every step.
            # NOTE(review): presumably full_eval_model switches the model to
            # eval mode - confirm.
            model.train()
            out = model(batch)
            y = batch['label']

            loss = criterion(out, y)

            # No learning-rate decay.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iteration += 1

            if iteration % mod == 0:
                eval_iter = biterator(dev_instances, shuffle=False,
                                      num_epochs=1, cuda_device=device_num)
                dev_score, dev_loss = full_eval_model(
                    model, eval_iter, criterion, complete_upstream_dev_data)

                print(f"Dev:{dev_score}/{dev_loss}")

                # Checkpoint only on a strict improvement of the dev score.
                if dev_score > best_dev:
                    best_dev = dev_score
                    save_path = os.path.join(
                        file_path_prefix,
                        f'i({iteration})_epoch({i_epoch})_dev({dev_score})_loss({dev_loss})_seed({seed})'
                    )
                    torch.save(model.state_dict(), save_path)

    # Save the cached WordNet features.
    # NOTE(review): the nesting of this call could not be recovered from the
    # collapsed source; it is placed at function end, as in
    # utest_data_loader and pipeline_nli_run - confirm.
    wn_persistent_api.persistence_update(p_dict)
def pipeline_nli_run(t_org_file, upstream_dev_data_list, upstream_sent_file_list, model_path):
    """Run the NLI model over upstream pipeline output and return its predictions.

    Args:
        t_org_file: Passed through to ``select_sent_with_prob_for_eval`` as
            the original data file.
        upstream_dev_data_list: Sized collection of upstream items to label;
            its length is printed.
        upstream_sent_file_list: Iterable of jsonl paths; each is loaded and
            folded into one selection-score dict via
            ``paired_selection_score_dict``.
        model_path: Checkpoint path passed to ``torch.load``.

    Returns:
        Whatever ``hidden_eval`` returns for the prepared dev data.
    """
    batch_size = 32
    lazy = True

    print("Size:", len(upstream_dev_data_list))

    print("Building Prob Dicts...")
    selection_dict = dict()
    for upstream_sent_file in upstream_sent_file_list:
        upstream_sent_l = common.load_jsonl(upstream_sent_file)
        selection_dict = paired_selection_score_dict(upstream_sent_l, selection_dict)

    # Prepare data: one indexer for raw token ids, one for ELMo characters.
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters'),
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=lazy,
                                         wn_p_dict=p_dict, max_l=360)

    complete_upstream_dev_data = select_sent_with_prob_for_eval(
        t_org_file, upstream_dev_data_list, selection_dict,
        tokenized=True, pipeline=True)

    complete_upstream_dev_data = append_hidden_label(complete_upstream_dev_data)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load vocabulary and pretrained embedding weights.
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # Register the 'hidden' label at index -2 in the 'labels' namespace.
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build model; fall back to CPU when CUDA is not available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                               1024 + 450),
                  rnn_size_out=(450, 450),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  mlp_d=900,
                  embedding_dim=300, max_l=300)

    print("Model Max length:", model.max_l)
    # The model is still on the CPU here, so deserialize the checkpoint to CPU
    # as well.  Without map_location a GPU-saved checkpoint cannot be loaded
    # on a CPU-only host, which defeats the CPU fallback above.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1,
                          cuda_device=device_num)
    complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    # Hand the WordNet cache dict back to the persistence layer.
    wn_persistent_api.persistence_update(p_dict)

    return complete_upstream_dev_data