# NOTE: assumes the usual file-top imports (os, sys, pickle, torch) and the
# project modules (HyperParams, DivisionResolution, the segmentor models, post_process).
def main(mode, config_file_path, trained_model_path):
    """Run inference on the test split with previously trained segmentor-classifiers."""
    h_params = HyperParams(config_file_path)
    from src.utils.preprocess import get_train_test_split
    from src.utils.preprocess import prepare_data
    torch.manual_seed(h_params.rand_seed)
    model_dir = h_params.models_dir
    all_models = os.listdir(model_dir)
    # TODO: delete this hard-coded date filter
    only_best = [m for m in all_models if m.find("2018-12-15") != -1]
    for model_name in only_best:
        h_params.use_pos = model_name.find("no_POS") == -1
        for mode in [DivisionResolution.ESSAY, DivisionResolution.SENTENCE,
                     DivisionResolution.PARAGRAPH]:
            _, test_files = get_train_test_split(os.path.abspath(
                os.path.join(h_params.data_dir, "train-test-split.csv")))
            test_data, ept_offsets = prepare_data(mode, test_files, h_params.data_dir)
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            SegmentorClassifier = (BiLSTM_Segmentor_Classifier if h_params.use_pos
                                   else BiLSTM_Segmentor_Classifier_no_pos)
            model = SegmentorClassifier(h_params.d_word_embd, h_params.d_pos_embd,
                                        h_params.d_h1, h_params.n_lstm_layers,
                                        h_params.word_voc_size, h_params.pos_voc_size,
                                        h_params.ac_tagset_size, h_params.batch_size,
                                        device, h_params.pretraind_embd_layer_path)
            # load the trained model's state-dict
            checkpoint = torch.load(os.path.join(model_dir, model_name))
            model.load_state_dict(checkpoint['model_state_dict'])
            # move to GPU if available
            if torch.cuda.is_available():
                model.cuda()
            # set evaluation mode
            model.eval()
            # inference over all chosen data
            preds = []
            with torch.no_grad():
                for (indexed_tokens, indexed_POSs, indexed_AC_tags) in test_data:
                    # forward pass returns log-softmax tag scores
                    tag_scores = model((indexed_tokens.to(device),
                                        indexed_POSs.to(device)))
                    preds.append(torch.argmax(tag_scores, dim=1).tolist())
            # post-process the raw predictions
            with open(os.path.join(h_params.vocab_dir, "ac_tag2ix.pcl"), 'rb') as vf:
                ac_tag2ix = pickle.load(vf)
            corrected_tags = post_process(preds, ac_tag2ix)
            # save results
            results_file = os.path.join(h_params.exps_dir,
                                        "{}|{}.results".format(model_name[:-3], mode))
            true_tags = [ac_tags.tolist() for _, _, ac_tags in test_data]
            with open(results_file, 'wt') as f:
                # write the file header
                f.write("\t".join(("# essay_paragraph_token_index", "true AC-tag",
                                   "predicted AC-tag", "post processed AC tag")) + '\n')
                # iterate over results (by the appropriate division)
                for i_seq in range(len(preds)):
                    for i_tok in range(len(preds[i_seq])):
                        e_p_t_index = ept_offsets[i_seq][i_tok]
                        true_tag = true_tags[i_seq][i_tok]
                        predicted_ac_tag = preds[i_seq][i_tok]
                        post_processed_tag = corrected_tags[i_seq][i_tok]
                        f.write("\t".join((str(e_p_t_index), str(true_tag),
                                           str(predicted_ac_tag),
                                           str(post_processed_tag))))
                        f.write('\n')
            sys.stdout.write("finished predictions and saved to {}\n".format(
                os.path.abspath(results_file)))
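# --- Illustrative sketch (not the project's implementation) ----------------------
# The inference script above delegates cleanup of the raw argmax predictions to a
# project-level post_process(preds, ac_tag2ix). For reference, a minimal version
# could repair B/I/O-style inconsistencies such as an I-* tag opening a segment.
# The tag scheme and the repair rule here are assumptions.
def post_process_sketch(preds, ac_tag2ix):
    ix2tag = {ix: tag for tag, ix in ac_tag2ix.items()}
    corrected = []
    for seq in preds:
        tags = [ix2tag[ix] for ix in seq]
        for i, tag in enumerate(tags):
            # a segment must open with B-*; promote an orphan I-* opening
            if tag.startswith("I-") and (i == 0 or tags[i - 1] == "O"):
                tags[i] = "B-" + tag[2:]
        # map back to indices, keeping the original index if a repaired tag is OOV
        corrected.append([ac_tag2ix.get(t, seq[j]) for j, t in enumerate(tags)])
    return corrected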
# NOTE: assumes the usual file-top imports (os, sys, time, torch, torch.nn as nn,
# torch.optim as optim, tqdm, datetime.date) and the project modules.
def main(mode, config_file_path):
    """Train the segmentor-classifier (first stage of the pipeline)."""
    h_params = HyperParams(config_file_path)
    from src.utils.preprocess import get_train_test_split
    from src.utils.preprocess import prepare_data
    torch.manual_seed(h_params.rand_seed)
    training_files, _ = get_train_test_split(os.path.abspath(
        os.path.join("..", "data", "train-test-split.csv")))
    training_data, _ = prepare_data(mode, training_files, data_path=h_params.data_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    SegmentorClassifier = (BiLSTM_Segmentor_Classifier if h_params.use_pos
                           else BiLSTM_Segmentor_Classifier_no_pos)
    model = SegmentorClassifier(h_params.d_word_embd, h_params.d_pos_embd, h_params.d_h1,
                                h_params.n_lstm_layers, h_params.word_voc_size,
                                h_params.pos_voc_size, h_params.ac_tagset_size,
                                h_params.batch_size, device,
                                h_params.pretraind_embd_layer_path)
    # negative log-likelihood loss with an Adam optimizer
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=h_params.learning_rate,
                           weight_decay=h_params.weight_decay)
    # move model and loss to GPU if available
    if torch.cuda.is_available():
        model.cuda()
        loss_function.cuda()
    # display the model's parameters
    for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())
    # display the optimizer's parameters
    print("Optimizer's state_dict:")
    for var_name in optimizer.state_dict():
        print(var_name, "\t", optimizer.state_dict()[var_name])
    # set train mode
    model.train()
    for epoch in range(h_params.n_epochs):
        start_time = time.time()
        acc_loss = 0.0  # accumulated loss per epoch, for display
        for (indexed_tokens, indexed_POSs, indexed_AC_tags) in tqdm(training_data):
            # reset accumulated gradients and the LSTM's hidden state between iterations
            model.zero_grad()
            model.hidden1 = model.init_hidden(model.h1dimension)
            # forward pass
            tag_scores = model((indexed_tokens.to(device), indexed_POSs.to(device)))
            # backprop
            loss = loss_function(tag_scores, indexed_AC_tags.to(device))
            acc_loss += loss.item()
            loss.backward()
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), h_params.clip_threshold)
            # optimizer step
            optimizer.step()
        end_time = time.time()
        # output stats
        sys.stdout.write("===> Epoch[{}/{}]: Loss: {:.4f} , time = {:d}[s]\n".format(
            epoch + 1, h_params.n_epochs, acc_loss, int(end_time - start_time)))
        # periodic checkpoints
        if epoch in [25, 50, 75]:
            try:
                torch.save(
                    {'epoch': epoch,
                     'model_state_dict': model.state_dict(),
                     'optimizer_state_dict': optimizer.state_dict(),
                     'loss': loss},
                    os.path.abspath(os.path.join(
                        h_params.models_dir,
                        "{}_SegClass_mode-{}_ep-{}_{}.pt".format(
                            str(date.today()), mode, epoch,
                            "no_POS" if not h_params.use_pos else ""))))
            except Exception:
                sys.stdout.write('failed to save model in epoch {}\n'.format(epoch))
    # save the final model
    torch.save(
        {'epoch': epoch,
         'model_state_dict': model.state_dict(),
         'optimizer_state_dict': optimizer.state_dict(),
         'loss': loss},
        os.path.abspath(os.path.join(
            h_params.models_dir,
            "{}_SegClass_{}_ep-{}_{}.pt".format(
                str(date.today()), mode, epoch,
                "no_POS" if not h_params.use_pos else ""))))
    # announce end
    sys.stdout.write("finished training\n")
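# --- Hypothetical usage (not in the original file) --------------------------------
# How this trainer might be wired to the command line; the flag names and the
# default config path are illustrative.
if __name__ == "__main__":
    import argparse
    mode_map = {"essay": DivisionResolution.ESSAY,
                "paragraph": DivisionResolution.PARAGRAPH,
                "sentence": DivisionResolution.SENTENCE}
    parser = argparse.ArgumentParser(description="Train the BiLSTM segmentor-classifier")
    parser.add_argument("--mode", choices=mode_map, default="paragraph",
                        help="division resolution for training sequences")
    parser.add_argument("--config", default="params.conf",
                        help="path to the hyper-parameters config file")
    args = parser.parse_args()
    main(mode_map[args.mode], args.config)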
# NOTE: assumes the usual file-top imports (os, sys, time, torch, torch.nn as nn,
# torch.optim as optim, tqdm) and the project modules (HyperParams,
# get_train_test_split, prepare_relations_data, the relation classifier models).
def main(config_file_path, baseline):
    """Train the relation classifier (optionally the feature-based baseline)."""
    # manual random seed
    h_params: HyperParams = HyperParams(config_file_path)
    torch.manual_seed(h_params.rand_seed)
    training_files, _ = get_train_test_split(os.path.abspath(
        os.path.join("..", "data", "train-test-split.csv")))
    # list of (ac_dict, [(ac_id, ac_id), type]) tuples for each essay
    training_data = prepare_relations_data(training_files, h_params.data_dir,
                                           h_params.vocab_dir, save=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # choose the constructed-features-aided model or the bland LSTM - essentially
    # two consecutive LSTM layers followed by two linear layers with ReLU activations
    if baseline:
        RelationClassifier = (BaselineConstructedRelationClassifier
                              if h_params.d_distance_embd != 0
                              else BaselineRelationClassifier)
    else:
        RelationClassifier = (BiLSTMRelationClassifier
                              if h_params.d_distance_embd != 0
                              else BlandRelationClassifier)
    model = RelationClassifier(h_params.d_word_embd, h_params.d_pos_embd, h_params.d_h1,
                               h_params.n_lstm_layers, h_params.word_voc_size,
                               h_params.pos_voc_size, h_params.ac_tagset_size,
                               h_params.batch_size, device,
                               h_params.pretraind_embd_layer_path,
                               h_params.rel_tagset_size, h_params.d_tag_embd,
                               h_params.d_small_embd, h_params.d_distance_embd,
                               h_params.d_h2, h_params.d_h3)
    model.to(device)
    # transfer learning: initialize from a previously trained segmentor's weights
    if h_params.pretrained_segmentor_path:
        checkpoint = torch.load(h_params.pretrained_segmentor_path)
        pre_trained_state_dict = dict(checkpoint['model_state_dict'])
        model_dict = dict(model.state_dict())
        # keep only the parameters that also exist in the new model
        pre_trained_state_dict = {param: value
                                  for param, value in pre_trained_state_dict.items()
                                  if param in model_dict}
        # overwrite the matching parameters in the model dictionary
        for param, value in pre_trained_state_dict.items():
            model_dict[param] = value
        # update the model's state dict
        model.load_state_dict(model_dict)
    # negative log-likelihood loss with an Adam optimizer
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=h_params.learning_rate,
                           weight_decay=h_params.weight_decay)
    # move model and loss to GPU if available
    if torch.cuda.is_available():
        model.cuda()
        loss_function.cuda()
    # display the model's parameters
    for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())
    # display the optimizer's parameters
    print("Optimizer's state_dict:")
    for var_name in optimizer.state_dict():
        print(var_name, "\t", optimizer.state_dict()[var_name])
    # set train mode
    model.train()
    save_name = "{}{}".format(model.__class__.__name__,
                              "_transfer" if h_params.pretrained_segmentor_path else "")
    # NOTE: hard-coded log path
    log = open("/home/yochay/ukp_argmining_rnn/logs/" + save_name + ".log", 'wt')
    for epoch in range(h_params.n_epochs):
        start_time = time.time()
        acc_loss = 0.0  # accumulated loss per epoch, for display
        for (ac_dict, ac_pairs, rel_tags) in tqdm(training_data):
            for i_rel in range(len(ac_pairs)):
                a_id, b_id = ac_pairs[i_rel][0], ac_pairs[i_rel][1]
                try:
                    ac_a, ac_b = ac_dict[a_id], ac_dict[b_id]
                    # reset accumulated gradients and the LSTMs' hidden states
                    # between iterations
                    model.zero_grad()
                    model.hidden1 = model.init_hidden(model.h1dimension)
                    if not baseline:
                        model.hidden2 = model.init_hidden(model.h2dimension)
                    # forward pass
                    tag_scores = model((ac_a, ac_b))
                    # backprop
                    loss = loss_function(tag_scores, rel_tags[i_rel].view(1).to(device))
                    acc_loss += loss.item()
                    loss.backward()
                    # gradient clipping
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   h_params.clip_threshold)
                    # optimizer step
                    optimizer.step()
                except KeyError:
                    # an AC id is missing from the essay's dict - log the badly
                    # preprocessed pair
                    try:
                        log.write("essay {}\tids({},{}) - bad preprocess\n".format(
                            ac_dict[0].essay, a_id, b_id))
                    except Exception:
                        pass
        end_time = time.time()
        # output stats
        sys.stdout.write("===> Epoch[{}/{}]: Loss: {:.4f} , time = {:d}[s]\n".format(
            epoch + 1, h_params.n_epochs, acc_loss, int(end_time - start_time)))
        # periodic checkpoints
        if (epoch + 1) % 25 == 0 or epoch in [2, 4, 9]:
            try:
                torch.save(
                    {'epoch': epoch + 1,
                     'model_state_dict': model.state_dict(),
                     'optimizer_state_dict': optimizer.state_dict(),
                     'loss': loss},
                    os.path.abspath(os.path.join(
                        h_params.models_dir,
                        "{}_ep-{}.pt".format(save_name, epoch + 1))))
            except Exception:
                sys.stdout.write('failed to save model in epoch {}\n'.format(epoch + 1))
    log.close()
    # save the final model
    torch.save(
        {'epoch': epoch + 1,
         'model_state_dict': model.state_dict(),
         'optimizer_state_dict': optimizer.state_dict(),
         'loss': loss},
        os.path.abspath(os.path.join(h_params.models_dir,
                                     "{}_ep-{}.pt".format(save_name, epoch + 1))))
    # announce end
    sys.stdout.write("finished training\n")
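# --- Side note (assumption about intent, illustrative) ----------------------------
# The manual state-dict filtering in the transfer-learning branch above largely
# matches PyTorch's built-in partial loading. A hypothetical helper using it:
def load_pretrained_partial(model, checkpoint_path):
    # strict=False tolerates checkpoint keys the model lacks, and vice versa
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'], strict=False)
    return model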
# check how many paragraphs, sentences, essays and tokens are in the train and test splits
import os

from src.utils.preprocess import get_train_test_split

data_path = os.path.join("..", "data")
train_files, test_files = get_train_test_split(
    os.path.join(data_path, 'train-test-split.csv'))
for ds in (train_files, test_files):
    n_tokens = 0
    n_sents = 0
    n_paragraphs = 0
    n_essays = 0
    len_para = 0
    for essay in ds:
        n_essays += 1
        i_line = 0
        with open(os.path.join(data_path, "processed", essay + ".tsv")) as f:
            for line in f:
                i_line += 1
                if line[:3] == "# p":  # paragraph marker
                    if len_para == 0:  # flag an empty preceding paragraph
                        print("ess:{}\tline:{}".format(essay, i_line))
                    n_paragraphs += 1
                    len_para = 0
                elif line[:3] == "# s":  # sentence marker
                    n_sents += 1
                else:  # token line
                    n_tokens += 1
                    len_para += 1
    # print the totals for this split
    print("essays: {}\tparagraphs: {}\tsentences: {}\ttokens: {}".format(
        n_essays, n_paragraphs, n_sents, n_tokens))
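# --- Assumed file layout (illustrative) -------------------------------------------
# The counters above key off line prefixes in the processed .tsv files; an excerpt
# consistent with those rules might look like the following. The marker wording and
# the token columns are assumptions - only the "# p" / "# s" prefixes are checked:
#
#   # paragraph
#   # sent
#   Cloning<TAB>NN<TAB>O
#   should<TAB>MD<TAB>B-Claim
#   ...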
# NOTE: assumes the usual file-top imports (os, sys, torch) and the project modules
# (HyperParams, get_train_test_split, prepare_relations_data, the relation models).
def main(config_file_path, trained_model_path, use_gold_segmentation):
    """Run the trained relation classifier on the test split and save predictions."""
    # get hyper-parameters
    h_params = HyperParams(config_file_path)
    torch.manual_seed(h_params.rand_seed)
    _, test_files = get_train_test_split(os.path.abspath(
        os.path.join("..", "data", "train-test-split.csv")))
    test_data = prepare_relations_data(files=test_files,
                                       data_dir=os.path.join(h_params.exps_dir,
                                                             "best_results"),
                                       vocab_dir=h_params.vocab_dir, save=True)
    # NOTE: gold_data and the use_gold_segmentation flag are currently unused
    gold_data = prepare_relations_data(files=test_files,
                                       data_dir=os.path.join(h_params.data_dir,
                                                             "processed"),
                                       vocab_dir=h_params.vocab_dir, save=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # pick the architecture matching the checkpoint's file name
    # (str.find returns -1 when the substring is absent)
    if trained_model_path.find("BiLSTMRelationClassifier_transfer") != -1:
        RelationsClassifier = BiLSTMRelationClassifier
    elif trained_model_path.find("BlandRelationClassifier") != -1:
        RelationsClassifier = BlandRelationClassifier
    elif trained_model_path.find("BaselineConstructedRelationClassifier") != -1:
        RelationsClassifier = BaselineConstructedRelationClassifier
    else:
        RelationsClassifier = BaselineRelationClassifier
    model = RelationsClassifier(h_params.d_word_embd, h_params.d_pos_embd, h_params.d_h1,
                                h_params.n_lstm_layers, h_params.word_voc_size,
                                h_params.pos_voc_size, h_params.ac_tagset_size,
                                h_params.batch_size, device,
                                h_params.pretraind_embd_layer_path,
                                h_params.rel_tagset_size, h_params.d_tag_embd,
                                h_params.d_small_embd, h_params.d_distance_embd,
                                h_params.d_h2, h_params.d_h3)
    # load the trained model's state-dict
    checkpoint = torch.load(trained_model_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    # move to GPU if available
    if torch.cuda.is_available():
        model.cuda()
    # set evaluation mode
    model.eval()
    # inference over all chosen data, pair by pair (mirrors the training loop)
    preds = []
    essay_pairs_types = []
    true_rel_tag = []
    with torch.no_grad():
        for (ac_dict, ac_pairs, rel_tags) in test_data:
            for i_rel in range(len(ac_pairs)):
                try:
                    a_id, b_id = ac_pairs[i_rel][0], ac_pairs[i_rel][1]
                    ac_a, ac_b = ac_dict[a_id], ac_dict[b_id]
                    # forward pass returns log-softmax scores
                    tag_scores = model((ac_a, ac_b))
                    preds.append(torch.argmax(tag_scores, dim=1).tolist())
                    essay_pairs_types.append((ac_a.essay, ac_pairs[i_rel],
                                              ac_a.type, ac_b.type))
                    true_rel_tag.append(rel_tags[i_rel])
                except KeyError:
                    pass  # badly preprocessed files - skip the pair
    # save results
    results_file = os.path.join(
        h_params.exps_dir,
        os.path.split(trained_model_path)[-1][:-3] + ".results")
    with open(results_file, 'wt') as f:
        # write the file header
        f.write("\t".join(("#essay", "ac_id_pairs", "ac_a type", "ac_b type",
                           "prediction", "y_true")) + '\n')
        # iterate over results (by the appropriate division)
        for i_pred in range(len(preds)):
            essay, pair, a_type, b_type = essay_pairs_types[i_pred]
            pred = preds[i_pred]
            true = true_rel_tag[i_pred]
            f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(essay, pair, a_type, b_type,
                                                      pred, true))
    sys.stdout.write("finished predictions and saved to {}\n".format(
        os.path.abspath(results_file)))
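# --- Hypothetical follow-up (not in the original file) ----------------------------
# Reading the .results file written above to get a raw accuracy. This assumes the
# prediction and y_true columns are serialized in a comparable format; adjust the
# cell parsing to the real output before relying on the numbers.
def relation_accuracy(results_path):
    correct = total = 0
    with open(results_path) as f:
        next(f)  # skip the header line
        for line in f:
            cells = line.rstrip('\n').split('\t')
            pred, true = cells[4], cells[5]
            total += 1
            correct += pred == true
    return correct / total if total else 0.0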