def train_k_fold(k):
    '''
    2.3 set up Dataloader for batching
    '''
    fold_size = int(len(raw_train_toefl) / k)
    for i in range(k):
        val_indices = [z for z in range(i * fold_size, (i + 1) * fold_size)]
        embedded_train_toefl = [[sentences[j], labels[j]]
                                for j in range(len(sentences)) if j not in val_indices]
        embedded_val_toefl = [[sentences[j], labels[j]]
                              for j in range(len(sentences)) if j in val_indices]
        train_dataset_toefl = TextDataset(
            [example[0] for example in embedded_train_toefl],
            [example[1] for example in embedded_train_toefl])
        val_dataset_toefl = TextDataset(
            [example[0] for example in embedded_val_toefl],
            [example[1] for example in embedded_val_toefl])

        # Data-related hyperparameters
        batch_size = 64
        # Set up a DataLoader for the training and validation sets
        train_dataloader_toefl = DataLoader(dataset=train_dataset_toefl, batch_size=batch_size,
                                            shuffle=True, collate_fn=TextDataset.collate_fn)
        val_dataloader_toefl = DataLoader(dataset=val_dataset_toefl, batch_size=batch_size,
                                          collate_fn=TextDataset.collate_fn)
        clf, crit = train_model(train_dataloader_toefl, val_dataloader_toefl, i)
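# The snippets in this section all rely on a TextDataset whose static collate_fn
# pads variable-length embedded sentences; that class is not shown here. Below is
# a minimal sketch of what the two-argument (embedded sentences, labels) variant
# could look like, under the hypothetical name PaddedTextDataset. It returns
# (padded_text, lengths, labels), the triple the training loops later unpack.
import torch
from torch.utils.data import Dataset


class PaddedTextDataset(Dataset):
    """Hypothetical stand-in for the two-argument TextDataset used above."""

    def __init__(self, embedded_sentences, labels):
        # embedded_sentences: list of (seq_len, embedding_dim) float tensors
        # labels: list of ints
        assert len(embedded_sentences) == len(labels)
        self.embedded_sentences = embedded_sentences
        self.labels = labels

    def __len__(self):
        return len(self.embedded_sentences)

    def __getitem__(self, idx):
        return self.embedded_sentences[idx], self.labels[idx]

    @staticmethod
    def collate_fn(batch):
        # Pad every sentence in the batch to the length of the longest one and
        # return (padded_text, lengths, labels).
        batch = sorted(batch, key=lambda pair: pair[0].shape[0], reverse=True)
        lengths = torch.LongTensor([sent.shape[0] for sent, _ in batch])
        max_len = int(lengths.max())
        embedding_dim = batch[0][0].shape[1]
        padded = torch.zeros(len(batch), max_len, embedding_dim)
        for i, (sent, _) in enumerate(batch):
            padded[i, :sent.shape[0], :] = sent
        labels = torch.LongTensor([label for _, label in batch])
        return padded, lengths, labels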
def predict_vua(rnn_clf):
    preds = {}
    for (embed, txt_sent_id) in embedded_test_vua:
        ex_data = TextDataset([embed], [0])
        ex_dataloader = DataLoader(dataset=ex_data, batch_size=1,
                                   collate_fn=TextDataset.collate_fn)
        pred = predict(ex_dataloader, rnn_clf, using_GPU)
        preds[txt_sent_id] = pred.item()
    return preds
def predict_vua_allpos(RNNseq_model):
    preds = {}
    for (embed, pos_seq, txt_sent_id) in embedded_test_vua:
        ex_data = TextDataset([embed], [pos_seq], [[0 for pos in pos_seq]])
        ex_dataloader = DataLoader(dataset=ex_data, batch_size=1,
                                   collate_fn=TextDataset.collate_fn)
        pred = predict(ex_dataloader, RNNseq_model, using_GPU)
        preds[txt_sent_id] = pred[0][0]
    return preds
def train_k_fold(k):
    clfs = []
    fold_size = int(len(raw_train_vua) / k)
    for i in range(k):
        val_indices = [z for z in range(i * fold_size, (i + 1) * fold_size)]
        embedded_train_vua = [[sentences[j], poss[j], labels[j]]
                              for j in range(len(sentences)) if j not in val_indices]
        embedded_val_vua = [[sentences[j], poss[j], labels[j]]
                            for j in range(len(sentences)) if j in val_indices]
        '''
        2.3 set up Dataloader for batching
        '''
        # Separate the input (embedded_sequence) and labels in the indexed train sets.
        # raw_train_vua: sentence, label_seq, pos_seq
        # embedded_train_vua: embedded_sentence, pos, labels
        train_dataset_vua = TextDataset(
            [example[0] for example in embedded_train_vua],
            [example[1] for example in embedded_train_vua],
            [example[2] for example in embedded_train_vua])
        val_dataset_vua = TextDataset(
            [example[0] for example in embedded_val_vua],
            [example[1] for example in embedded_val_vua],
            [example[2] for example in embedded_val_vua])

        # Data-related hyperparameters
        batch_size = 64
        # Set up a DataLoader for the training and validation sets
        train_dataloader_vua = DataLoader(dataset=train_dataset_vua, batch_size=batch_size,
                                          shuffle=True, collate_fn=TextDataset.collate_fn)
        val_dataloader_vua = DataLoader(dataset=val_dataset_vua, batch_size=batch_size,
                                        collate_fn=TextDataset.collate_fn)
        clf, crit = train_model(train_dataloader_vua, val_dataloader_vua, i)
        clfs.append(clf)
    return clfs
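# The contiguous fold-splitting arithmetic above is repeated across several
# snippets in this section. A small helper, sketched here with hypothetical
# names, isolates that logic; it is an illustration, not part of the original code.
def k_fold_indices(n_examples, k):
    """Yield (train_indices, val_indices) pairs for contiguous k-fold splits.

    The final fold absorbs any remainder when n_examples % k != 0.
    """
    fold_size = n_examples // k
    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size if i < k - 1 else n_examples
        val_indices = list(range(start, end))
        train_indices = [j for j in range(n_examples) if j < start or j >= end]
        yield train_indices, val_indices


# Example use, assuming `sentences` and `labels` as in train_k_fold above:
# for train_idx, val_idx in k_fold_indices(len(sentences), k=10):
#     train_examples = [(sentences[j], labels[j]) for j in train_idx]
#     val_examples = [(sentences[j], labels[j]) for j in val_idx]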
                             for example in raw_train_vua]
labels = [example[2] for example in raw_train_vua]
assert (len(sentences) == len(labels))

'''
2.3 set up Dataloader for batching
'''
# 10 folds takes up too much RAM, just do 1
fold_size = int(len(raw_train_vua) / 10)
embedded_train_vua = [[sentences[i], labels[i]] for i in range(fold_size, len(sentences))]
embedded_val_vua = [[sentences[i], labels[i]] for i in range(fold_size)]
train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua])
val_dataset_vua = TextDataset([example[0] for example in embedded_val_vua],
                              [example[1] for example in embedded_val_vua])

# Data-related hyperparameters
batch_size = 64
# Set up a DataLoader for the training, validation, and test dataset
train_dataloader_vua = DataLoader(dataset=train_dataset_vua, batch_size=batch_size,
                                  shuffle=True, collate_fn=TextDataset.collate_fn)
val_dataloader_vua = DataLoader(dataset=val_dataset_vua, batch_size=batch_size,
                                collate_fn=TextDataset.collate_fn)
# Test set
optimal_accs = []
predictions_all = []
for i in range(10):
    '''
    2.3 set up Dataloader for batching
    '''
    training_sentences = []
    training_labels = []
    training_poss = []
    for j in range(10):
        if j != i:
            training_sentences.extend(ten_folds[j][0])
            training_poss.extend(ten_folds[j][1])
            training_labels.extend(ten_folds[j][2])
    training_dataset_mohx = TextDataset(training_sentences, training_poss, training_labels)
    val_dataset_mohx = TextDataset(ten_folds[i][0], ten_folds[i][1], ten_folds[i][2])

    # Data-related hyperparameters
    batch_size = 10
    # Set up a DataLoader for the training, validation, and test dataset
    train_dataloader_mohx = DataLoader(dataset=training_dataset_mohx, batch_size=batch_size,
                                       shuffle=True, collate_fn=TextDataset.collate_fn)
    val_dataloader_mohx = DataLoader(dataset=val_dataset_mohx, batch_size=batch_size,
                                     shuffle=False, collate_fn=TextDataset.collate_fn)
    """
logging.info("Loaded best validation loss: {}".format(best_val_loss)) logging.info("*" * 50) prev_train_loss = 100 try: while True: for i in range(10): logging.info("10 fold validation turn change...") first_in_fold = True training_sentences = [] training_classes = [] for j in range(10): if j != i: training_sentences.extend(ten_foldsd[j][0]) training_classes.extend(ten_foldsd[j][1]) training_dataset_rcc = TextDataset(training_sentences, training_classes) val_dataset_rcc = TextDataset(ten_foldsd[i][0], ten_foldsd[i][1]) train_dataloader_rcc = DataLoader( dataset=training_dataset_rcc, batch_size=args.batch_size, shuffle=True, collate_fn=TextDataset.collate_fn) val_dataloader_rcc = DataLoader(dataset=val_dataset_rcc, batch_size=args.batch_size, shuffle=False, collate_fn=TextDataset.collate_fn) epoch_base += args.num_epochs for epoch in range(args.num_epochs): logging.info("Starting epoch {}".format(epoch + 1 + epoch_base)) for (example_text, labels) in train_dataloader_rcc:
def trainData():
    device = t.device('cuda:0' if t.cuda.is_available() else 'cpu')
    lang_dataset = TextDataset()
    lang_dataloader = DataLoader(lang_dataset, shuffle=True)

    input_size = lang_dataset.input_lang_words
    hidden_size = 256
    output_size = lang_dataset.output_lang_words
    total_epoch = 20
    use_attn = False

    encoder = convert2Cuda(Encoder(input_size, hidden_size))
    decoder = convert2Cuda(Decoder(hidden_size, output_size, n_layers=2))
    param = list(encoder.parameters()) + list(decoder.parameters())
    optimizer = optim.Adam(param, 1e-3)
    plot_losses = []
    criterion = nn.NLLLoss()

    for epoch in range(total_epoch):
        start = time.time()
        running_loss = 0
        print_loss_total = 0
        total_loss = 0
        for i, data in enumerate(lang_dataloader):
            in_lang, out_lang = data
            in_lang = convert2Cuda(in_lang)
            out_lang = convert2Cuda(out_lang)
            in_lang = Variable(in_lang)
            out_lang = Variable(out_lang)

            # Run the encoder over the source sequence one token at a time.
            encoder_outputs = Variable(t.zeros(MAX_LENGTH, encoder.hidden_size))
            encoder_outputs = convert2Cuda(encoder_outputs)
            encoder_hidden = encoder.initHidden()
            for ei in range(in_lang.size(1)):
                encoder_output, encoder_hidden = encoder(in_lang[:, ei], encoder_hidden)
                encoder_outputs[ei] = encoder_output[0][0]

            # Decode greedily, feeding the previous prediction back in.
            decoder_input = Variable(t.LongTensor([[SOS_token]]))
            decoder_input = convert2Cuda(decoder_input)
            decoder_hidden = encoder_hidden
            loss = 0
            if not use_attn:
                for di in range(out_lang.size(1)):
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                    loss += criterion(decoder_output, out_lang[:, di])
                    topv, topi = decoder_output.data.topk(1)
                    ni = topi[0][0].item()
                    decoder_input = Variable(t.LongTensor([[ni]]))
                    decoder_input = convert2Cuda(decoder_input)
                    if ni == EOS_token:
                        break

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            print_loss_total += loss.item()
            total_loss += loss.item()
            if (i + 1) % 100 == 0:
                # Average over the 100 iterations accumulated since the last reset.
                print('{}/{},Loss:{:.6f}'.format(i + 1, len(lang_dataloader),
                                                 running_loss / 100))
                running_loss = 0
            if (i + 1) % 10 == 0:
                # Record a plot point averaged over the last 10 iterations.
                plot_loss = print_loss_total / 10
                plot_losses.append(plot_loss)
                print_loss_total = 0

        during = time.time() - start
        print('Finish {}/{},Loss:{:.6f}.,Time:{:.0f}s\n'.format(
            epoch + 1, total_epoch, total_loss / len(lang_dataset), during))

    showPlot(plot_losses)
    t.save(encoder.state_dict(), './model/encoder.pth')
    t.save(decoder.state_dict(), './model/decoder.pth')
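# Usage sketch: trainData() writes the encoder/decoder state dicts to ./model/.
# Assuming the same Encoder/Decoder constructors, sizes, and convert2Cuda helper
# are in scope, they could be reloaded for inference like this (hypothetical
# helper name load_trained_models; not part of the original code).
def load_trained_models(input_size, hidden_size, output_size):
    encoder = Encoder(input_size, hidden_size)
    decoder = Decoder(hidden_size, output_size, n_layers=2)
    # Load on CPU first so this also works on machines without a GPU.
    encoder.load_state_dict(t.load('./model/encoder.pth', map_location='cpu'))
    decoder.load_state_dict(t.load('./model/decoder.pth', map_location='cpu'))
    encoder, decoder = convert2Cuda(encoder), convert2Cuda(decoder)
    encoder.eval()
    decoder.eval()
    return encoder, decoder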
Data Embedding
optional: Bert or Glove. Default Glove
"""
word2idx, idx2word = get_word2idx_idx2word(train_vocab)
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
train_embedded_text, train_labels = embed_sentences(train_data, word2idx, glove_embeddings)
test_embedded_text, test_labels = embed_sentences(test_data, word2idx, glove_embeddings)

"""
Produce Dataset & DataLoader
"""
train_dataset = TextDataset(train_embedded_text, train_sample_ms, train_sam_sen_ms, train_labels)
test_dataset = TextDataset(test_embedded_text, test_sample_ms, test_sam_sen_ms, test_labels)

batch_size = 4
train_dataLoader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True,
                              collate_fn=TextDataset.collate_fn)
test_dataLoader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True,
                             collate_fn=TextDataset.collate_fn)

"""
Model loading and training
"""
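# Optional sanity check (not in the original code): pull one batch from the
# DataLoader and inspect what TextDataset.collate_fn produces for this
# four-argument dataset before wiring it into the training loop.
first_batch = next(iter(train_dataLoader))
for position, item in enumerate(first_batch):
    shape = tuple(item.shape) if hasattr(item, 'shape') else len(item)
    print('batch element {}: {} {}'.format(position, type(item).__name__, shape))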
optimal_ps = []
optimal_rs = []
optimal_accs = []
predictions_all = []
for i in range(10):
    '''
    2.3 set up Dataloader for batching
    '''
    training_sentences = []
    training_labels = []
    for j in range(10):
        if j != i:
            training_sentences.extend(ten_folds[j][0])
            training_labels.extend(ten_folds[j][1])
    training_dataset_poetry = TextDataset(training_sentences, training_labels)
    val_dataset_poetry = TextDataset(ten_folds[i][0], ten_folds[i][1])

    # Data-related hyperparameters
    batch_size = 20
    # Set up a DataLoader for the training, validation, and test dataset
    train_dataloader_poetry = DataLoader(dataset=training_dataset_poetry, batch_size=batch_size,
                                         shuffle=True, collate_fn=TextDataset.collate_fn)
    val_dataloader_poetry = DataLoader(dataset=val_dataset_poetry, batch_size=batch_size,
                                       shuffle=False, collate_fn=TextDataset.collate_fn)
    """
    3. Model training
logging.info("*" * 50) prev_train_loss = 999999 try: while True: for i in range(1000): logging.info("1000 fold validation turn change...") first_in_fold = True training_sentences = [] training_labels = [] for j in range(1000): if j != i: training_sentences.extend(thousand_folds[j][0]) training_labels.extend(thousand_folds[j][1]) training_dataset_rcc = TextDataset(training_sentences, training_labels, word2idx, glove_embeddings, elmo) val_dataset_rcc = TextDataset(thousand_folds[i][0], thousand_folds[i][1], word2idx, glove_embeddings, elmo) train_dataloader_rcc = DataLoader( dataset=training_dataset_rcc, batch_size=args.batch_size, shuffle=True, collate_fn=TextDataset.collate_fn) val_dataloader_rcc = DataLoader(dataset=val_dataset_rcc, batch_size=args.batch_size, shuffle=False, collate_fn=TextDataset.collate_fn) epoch_base += args.num_epochs for epoch in range(args.num_epochs):
def train_model():
    optimal_f1s = []
    optimal_ps = []
    optimal_rs = []
    optimal_accs = []
    # predictions_all = []
    for i in tqdm(range(10)):
        '''
        2.3 set up Dataloader for batching
        '''
        training_sentences = []
        training_labels = []
        for j in range(10):
            if j != i:
                training_sentences.extend(ten_folds[j][0])
                training_labels.extend(ten_folds[j][1])
        training_dataset_trofi = TextDataset(training_sentences, training_labels)
        val_dataset_trofi = TextDataset(ten_folds[i][0], ten_folds[i][1])

        # Data-related hyperparameters
        batch_size = 10
        # Set up a DataLoader for the training and validation sets
        train_dataloader_trofi = DataLoader(dataset=training_dataset_trofi, batch_size=batch_size,
                                            shuffle=True, collate_fn=TextDataset.collate_fn)
        val_dataloader_trofi = DataLoader(dataset=val_dataset_trofi, batch_size=batch_size,
                                          shuffle=False, collate_fn=TextDataset.collate_fn)

        """
        3. Model training
        """
        '''
        3.1 set up model, loss criterion, optimizer
        '''
        # Instantiate the model
        # embedding_dim = glove + elmo + suffix indicator
        # dropout1: dropout on input to RNN
        # dropout2: dropout in RNN; only used if num_layers != 1
        # dropout3: dropout on hidden state of RNN to linear layer
        rnn_clf = RNNSequenceClassifier(num_classes=2, embedding_dim=300 + 1024 + 50,
                                        hidden_size=300, num_layers=1, bidir=True,
                                        dropout1=0.2, dropout2=0, dropout3=0)
        # Move the model to the GPU if available
        if using_GPU:
            rnn_clf = rnn_clf.cuda()
        # Set up criterion for calculating loss
        nll_criterion = nn.NLLLoss()
        # Set up an optimizer for updating the parameters of the rnn_clf
        rnn_clf_optimizer = optim.Adam(rnn_clf.parameters(), lr=0.001)
        # Number of epochs (passes through the dataset) to train the model for.
        num_epochs = 15

        '''
        3.2 train model
        '''
        training_loss = []
        val_loss = []
        training_f1 = []
        val_f1 = []
        val_p = []
        val_r = []
        val_acc = []
        # A counter for the number of gradient updates
        num_iter = 0
        train_dataloader = train_dataloader_trofi
        val_dataloader = val_dataloader_trofi
        model_index = 0
        for epoch in range(num_epochs):
            # print("Starting epoch {}".format(epoch + 1))
            for (example_text, example_lengths, labels) in train_dataloader:
                example_text = Variable(example_text)
                example_lengths = Variable(example_lengths)
                labels = Variable(labels)
                if using_GPU:
                    example_text = example_text.cuda()
                    example_lengths = example_lengths.cuda()
                    labels = labels.cuda()
                # predicted shape: (batch_size, 2)
                predicted = rnn_clf(example_text, example_lengths)
                batch_loss = nll_criterion(predicted, labels)
                rnn_clf_optimizer.zero_grad()
                batch_loss.backward()
                rnn_clf_optimizer.step()
                num_iter += 1
                # Calculate validation and training set loss and accuracy every 200 gradient updates
                if num_iter % 200 == 0:
                    avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(
                        val_dataloader, rnn_clf, nll_criterion, using_GPU)
                    val_loss.append(avg_eval_loss)
                    val_f1.append(f1)
                    val_p.append(precision)
                    val_r.append(recall)
                    val_acc.append(eval_accuracy.item())
                    # print("Iteration {}. Validation Loss {}. Validation Accuracy {}. "
                    #       "Validation Precision {}. Validation Recall {}. Validation F1 {}. "
                    #       "Validation class-wise F1 {}.".format(
                    #           num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
                    # filename = f'../models/classification/TroFi_fold_{str(i)}_iter_{str(num_iter)}.pt'
                    # torch.save(rnn_clf, filename)
                    model_index += 1
                    # avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(
                    #     train_dataloader, rnn_clf, nll_criterion, using_GPU)
                    # training_loss.append(avg_eval_loss)
                    # training_f1.append(f1)
                    # print("Iteration {}. Training Loss {}. Training Accuracy {}. "
                    #       "Training Precision {}. Training Recall {}. Training F1 {}. "
                    #       "Training class-wise F1 {}.".format(
                    #           num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))

        """
        additional training!
        """
        # rnn_clf_optimizer = optim.Adam(rnn_clf.parameters(), lr=0.0005)
        # for epoch in range(num_epochs):
        #     print("Starting epoch {}".format(epoch + 1))
        #     for (example_text, example_lengths, labels) in train_dataloader:
        #         example_text = Variable(example_text)
        #         example_lengths = Variable(example_lengths)
        #         labels = Variable(labels)
        #         if using_GPU:
        #             example_text = example_text.cuda()
        #             example_lengths = example_lengths.cuda()
        #             labels = labels.cuda()
        #         # predicted shape: (batch_size, 2)
        #         predicted = rnn_clf(example_text, example_lengths)
        #         batch_loss = nll_criterion(predicted, labels)
        #         rnn_clf_optimizer.zero_grad()
        #         batch_loss.backward()
        #         rnn_clf_optimizer.step()
        #         num_iter += 1
        #         # Calculate validation and training set loss and accuracy
        #         if num_iter % 100 == 0:
        #             avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(
        #                 val_dataloader, rnn_clf, nll_criterion, using_GPU)
        #             val_loss.append(avg_eval_loss)
        #             val_f1.append(f1)
        #             val_p.append(precision)
        #             val_r.append(recall)
        #             val_acc.append(eval_accuracy)
        #             print("Iteration {}. Validation Loss {}. Validation Accuracy {}. "
        #                   "Validation Precision {}. Validation Recall {}. Validation F1 {}. "
        #                   "Validation class-wise F1 {}.".format(
        #                       num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
        #             model_index += 1
        # print("Training done for fold {}".format(i))

        """
        3.3 plot the training process: MET F1 and losses for validation and training dataset
        """
        # plt.figure(0)
        # plt.title('F1 for TroFi dataset on fold ' + str(i))
        # plt.xlabel('iteration (unit:200)')
        # plt.ylabel('F1')
        # plt.plot(val_f1, 'g')
        # plt.plot(val_p, 'r')
        # plt.plot(val_r, 'b')
        # plt.plot(val_acc, 'c')
        # plt.plot(training_f1, 'b')
        # plt.legend(['Validation F1', 'Validation precision', 'Validation recall',
        #             'Validation accuracy', 'Training F1'], loc='upper right')
        # plt.show()
        # plt.figure(1)
        # plt.title('Loss for TroFi dataset on fold ' + str(i))
        # plt.xlabel('iteration (unit:200)')
        # plt.ylabel('Loss')
        # plt.plot(val_loss, 'g')
        # plt.plot(training_loss, 'b')
        # plt.legend(['Validation loss', 'Training loss'], loc='upper right')
        # plt.show()

        """
        store the best f1
        """
        # print('val_f1: ', val_f1)
        idx = 0
        if math.isnan(max(val_f1)):
            optimal_f1s.append(max(val_f1[6:]))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])
        else:
            optimal_f1s.append(max(val_f1))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])
        # filename = '../models/LSTMSuffixElmoAtt_TroFi_fold_' + str(i) + '_epoch_' + str(idx) + '.pt'
        # temp_model = torch.load(filename)
        # print('best model: ', filename)
        # predictions_all.extend(test(val_dataloader_TroFi, temp_model, using_GPU))

    return np.mean(np.array(optimal_ps)), np.mean(np.array(optimal_rs)), \
        np.mean(np.array(optimal_f1s)), np.mean(np.array(optimal_accs))
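# The classification loops above call an evaluate() helper that is not shown in
# this section. The sketch below, assuming the model returns (batch_size, 2)
# log-probabilities, reproduces the tuple shape the callers unpack:
# (avg_loss, accuracy, precision, recall, f1, per_class_f1). Accuracy is wrapped
# in a tensor only so the existing .item() calls keep working. It uses
# scikit-learn and is an illustration, not the repository's implementation.
import torch
from sklearn.metrics import precision_recall_fscore_support


def evaluate_sketch(dataloader, model, criterion, using_GPU):
    model.eval()
    total_loss, num_batches = 0.0, 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for example_text, example_lengths, labels in dataloader:
            if using_GPU:
                example_text = example_text.cuda()
                example_lengths = example_lengths.cuda()
                labels = labels.cuda()
            # predicted: (batch_size, 2) log-probabilities
            predicted = model(example_text, example_lengths)
            total_loss += criterion(predicted, labels).item()
            num_batches += 1
            all_preds.extend(predicted.argmax(dim=1).cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    model.train()
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0)
    _, _, per_class_f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average=None, zero_division=0)
    correct = sum(int(p == l) for p, l in zip(all_preds, all_labels))
    accuracy = torch.tensor(correct / max(len(all_labels), 1))
    avg_loss = total_loss / max(num_batches, 1)
    return avg_loss, accuracy, precision, recall, f1, per_class_f1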
2.3 set up Dataloader for batching
'''
training_sentences = []
training_labels = []
training_poss = []
training_paraphrase = []
training_para_sets = []
for j in range(10):
    if j != i:
        training_sentences.extend(ten_folds[j][0])
        training_poss.extend(ten_folds[j][1])
        training_labels.extend(ten_folds[j][2])
        training_paraphrase.extend(ten_folds[j][3])
        training_para_sets.extend(ten_folds[j][4])
training_dataset_mohx = TextDataset(training_sentences, training_poss, training_labels,
                                    training_paraphrase, training_para_sets)
val_dataset_mohx = TextDataset(ten_folds[i][0], ten_folds[i][1], ten_folds[i][2],
                               ten_folds[i][3], ten_folds[i][4])

# Data-related hyperparameters
batch_size = 2
# Set up a DataLoader for the training, validation, and test dataset
train_dataloader_mohx = DataLoader(dataset=training_dataset_mohx, batch_size=batch_size,
                                   shuffle=True, collate_fn=TextDataset.collate_fn)
val_dataloader_mohx = DataLoader(dataset=val_dataset_mohx, batch_size=batch_size,
                                 shuffle=False, collate_fn=TextDataset.collate_fn)
def train_model():
    optimal_f1s = []
    optimal_ps = []
    optimal_rs = []
    optimal_accs = []
    for i in tqdm(range(10)):
        '''
        2.3 set up Dataloader for batching
        '''
        training_sentences = []
        training_labels = []
        for j in range(10):
            if j != i:
                training_sentences.extend(ten_folds[j][0])
                training_labels.extend(ten_folds[j][1])
        training_dataset_mohX = TextDataset(training_sentences, training_labels)
        val_dataset_mohX = TextDataset(ten_folds[i][0], ten_folds[i][1])

        # Data-related hyperparameters
        batch_size = 10
        # Set up a DataLoader for the training, validation, and test dataset
        train_dataloader_mohX = DataLoader(dataset=training_dataset_mohX, batch_size=batch_size,
                                           shuffle=True, collate_fn=TextDataset.collate_fn)
        val_dataloader_mohX = DataLoader(dataset=val_dataset_mohX, batch_size=batch_size,
                                         shuffle=True, collate_fn=TextDataset.collate_fn)

        """
        3. Model training
        """
        '''
        3.1 set up model, loss criterion, optimizer
        '''
        # Instantiate the model
        # embedding_dim = glove + elmo + suffix indicator
        # dropout1: dropout on input to RNN
        # dropout2: dropout in RNN; would be used if num_layers != 1
        # dropout3: dropout on hidden state of RNN to linear layer
        rnn_clf = RNNSequenceClassifier(num_classes=2, embedding_dim=300 + 1024 + 50,
                                        hidden_size=300, num_layers=1, bidir=True,
                                        dropout1=0.2, dropout2=0, dropout3=0.2)
        # Move the model to the GPU if available
        if using_GPU:
            rnn_clf = rnn_clf.cuda()
        # Set up criterion for calculating loss
        nll_criterion = nn.NLLLoss()
        # Set up an optimizer for updating the parameters of the rnn_clf
        rnn_clf_optimizer = optim.SGD(rnn_clf.parameters(), lr=0.02, momentum=0.9)
        # Number of epochs (passes through the dataset) to train the model for.
        num_epochs = 30

        '''
        3.2 train model
        '''
        training_loss = []
        val_loss = []
        training_f1 = []
        val_f1 = []
        val_p = []
        val_r = []
        val_acc = []
        # A counter for the number of gradient updates
        num_iter = 0
        train_dataloader = train_dataloader_mohX
        val_dataloader = val_dataloader_mohX
        for epoch in range(num_epochs):
            # print("Starting epoch {}".format(epoch + 1))
            for (example_text, example_lengths, labels) in train_dataloader:
                example_text = Variable(example_text)
                example_lengths = Variable(example_lengths)
                labels = Variable(labels)
                if using_GPU:
                    example_text = example_text.cuda()
                    example_lengths = example_lengths.cuda()
                    labels = labels.cuda()
                # predicted shape: (batch_size, 2)
                predicted = rnn_clf(example_text, example_lengths)
                batch_loss = nll_criterion(predicted, labels)
                rnn_clf_optimizer.zero_grad()
                batch_loss.backward()
                rnn_clf_optimizer.step()
                num_iter += 1
                # Calculate validation and training set loss and accuracy every 200 gradient updates
                if num_iter % 200 == 0:
                    avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(
                        val_dataloader, rnn_clf, nll_criterion, using_GPU)
                    val_loss.append(avg_eval_loss)
                    val_f1.append(f1)
                    val_p.append(precision)
                    val_r.append(recall)
                    val_acc.append(eval_accuracy.item())
                    # print("Iteration {}. Validation Loss {}. Validation Accuracy {}. "
                    #       "Validation Precision {}. Validation Recall {}. Validation F1 {}. "
                    #       "Validation class-wise F1 {}.".format(
                    #           num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
                    # filename = f'../models/classification/MOHX_fold_{str(i)}_iter_{str(num_iter)}.pt'
                    # torch.save(rnn_clf, filename)
                    avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(
                        train_dataloader, rnn_clf, nll_criterion, using_GPU)
                    training_loss.append(avg_eval_loss)
                    training_f1.append(f1)
                    # print("Iteration {}. Training Loss {}. Training Accuracy {}. "
                    #       "Training Precision {}. Training Recall {}. Training F1 {}. "
                    #       "Training class-wise F1 {}.".format(
                    #           num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
        # print("Training done for fold {}".format(i))

        # store the best f1
        idx = 0
        if math.isnan(max(val_f1)):
            optimal_f1s.append(max(val_f1[6:]))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])
        else:
            optimal_f1s.append(max(val_f1))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])

    return np.mean(np.array(optimal_ps)), np.mean(np.array(optimal_rs)), \
        np.mean(np.array(optimal_f1s)), np.mean(np.array(optimal_accs))


# print('F1 on MOH-X by 10-fold = ', optimal_f1s)
# print('F1 on MOH-X = ', np.mean(np.array(optimal_f1s)))
"""
def train_model():
    '''
    2.3 10-fold cross validation
    '''
    # separate the embedded sentences, POS sequences, and labels into separate
    # lists, in order to pass them into the TextDataset as arguments
    sentences = [example[0] for example in embedded_trofi]
    poss = [example[1] for example in embedded_trofi]
    labels = [example[2] for example in embedded_trofi]
    # ten_folds is a list of 10 tuples, each tuple is
    # (list_of_embedded_sentences, list_of_pos_sequences, list_of_corresponding_labels)
    ten_folds = []
    fold_size = int(3737 / 10)
    for i in range(10):
        ten_folds.append((sentences[i * fold_size:(i + 1) * fold_size],
                          poss[i * fold_size:(i + 1) * fold_size],
                          labels[i * fold_size:(i + 1) * fold_size]))
    idx2pos = {0: 'words that are not focus verbs', 1: 'focus verb'}

    optimal_f1s = []
    optimal_ps = []
    optimal_rs = []
    optimal_accs = []
    predictions_all = []
    for i in tqdm(range(10)):
        '''
        2.3 set up Dataloader for batching
        '''
        training_sentences = []
        training_labels = []
        training_poss = []
        for j in range(10):
            if j != i:
                training_sentences.extend(ten_folds[j][0])
                training_poss.extend(ten_folds[j][1])
                training_labels.extend(ten_folds[j][2])
        training_dataset_trofi = TextDataset(training_sentences, training_poss, training_labels)
        val_dataset_trofi = TextDataset(ten_folds[i][0], ten_folds[i][1], ten_folds[i][2])

        # Data-related hyperparameters
        batch_size = 10
        # Set up a DataLoader for the training, validation, and test dataset
        train_dataloader_trofi = DataLoader(dataset=training_dataset_trofi, batch_size=batch_size,
                                            shuffle=True, collate_fn=TextDataset.collate_fn)
        val_dataloader_trofi = DataLoader(dataset=val_dataset_trofi, batch_size=batch_size,
                                          shuffle=False, collate_fn=TextDataset.collate_fn)

        """
        3. Model training
        """
        '''
        3.1 set up model, loss criterion, optimizer
        '''
        # Instantiate the model
        # embedding_dim = glove + elmo + suffix indicator
        # dropout1: dropout on input to RNN
        # dropout2: dropout in RNN; only used if num_layers != 1
        # dropout3: dropout on hidden state of RNN to linear layer
        RNNseq_model = RNNSequenceModel(num_classes=2, embedding_dim=300 + 1024, hidden_size=300,
                                        num_layers=1, bidir=True,
                                        dropout1=0.5, dropout2=0, dropout3=0.2)
        # Move the model to the GPU if available
        if using_GPU:
            RNNseq_model = RNNseq_model.cuda()
        # Set up criterion for calculating loss
        loss_criterion = nn.NLLLoss()
        # Set up an optimizer for updating the parameters of the rnn_clf
        rnn_optimizer = optim.Adam(RNNseq_model.parameters(), lr=0.001)
        # Number of epochs (passes through the dataset) to train the model for.
        num_epochs = 10

        '''
        3.2 train model
        '''
        train_loss = []
        val_loss = []
        performance_matrix = None
        val_f1 = []
        val_p = []
        val_r = []
        val_acc = []
        train_f1 = []
        # A counter for the number of gradient updates
        num_iter = 0
        model_index = 0
        comparable = []
        for epoch in range(num_epochs):
            # print("Starting epoch {}".format(epoch + 1))
            for (__, example_text, example_lengths, labels) in train_dataloader_trofi:
                example_text = Variable(example_text)
                example_lengths = Variable(example_lengths)
                labels = Variable(labels)
                if using_GPU:
                    example_text = example_text.cuda()
                    example_lengths = example_lengths.cuda()
                    labels = labels.cuda()
                # predicted shape: (batch_size, seq_len, 2)
                predicted = RNNseq_model(example_text, example_lengths)
                batch_loss = loss_criterion(predicted.view(-1, 2), labels.view(-1))
                rnn_optimizer.zero_grad()
                batch_loss.backward()
                rnn_optimizer.step()
                num_iter += 1
                # Calculate validation and training set loss and accuracy every 200 gradient updates
                if num_iter % 200 == 0:
                    avg_eval_loss, performance_matrix = evaluate(
                        idx2pos, val_dataloader_trofi, RNNseq_model, loss_criterion, using_GPU)
                    val_loss.append(avg_eval_loss)
                    val_p.append(performance_matrix[1][0])
                    val_r.append(performance_matrix[1][1])
                    val_f1.append(performance_matrix[1][2])
                    val_acc.append(performance_matrix[1][3])
                    # print("Iteration {}. Validation Loss {}.".format(num_iter, avg_eval_loss))
                    # avg_eval_loss, performance_matrix = evaluate(idx2pos, train_dataloader_trofi,
                    #                                              RNNseq_model, loss_criterion, using_GPU)
                    # train_loss.append(avg_eval_loss)
                    # train_f1.append(performance_matrix[1][1])
                    # print("Iteration {}. Training Loss {}.".format(num_iter, avg_eval_loss))
        # print("Training done for fold {}".format(i))

        """
        3.3 plot the training process: MET F1 and losses for validation and training dataset
        """
        # plt.figure(0)
        # plt.title('F1 for TroFi dataset on fold ' + str(i))
        # plt.xlabel('iteration (unit:200)')
        # plt.ylabel('F1')
        # plt.plot(val_f1, 'g')
        # plt.plot(train_f1, 'b')
        # plt.legend(['Validation F1', 'Training F1'], loc='upper right')
        # plt.show()
        # plt.figure(1)
        # plt.title('Loss for TroFi dataset on fold ' + str(i))
        # plt.xlabel('iteration (unit:200)')
        # plt.ylabel('Loss')
        # plt.plot(val_loss, 'g')
        # plt.plot(train_loss, 'b')
        # plt.legend(['Validation loss', 'Training loss'], loc='upper right')
        # plt.show()

        """
        store the best f1
        """
        # print('val_f1: ', val_f1)
        idx = 0
        if math.isnan(max(val_f1)):
            optimal_f1s.append(max(val_f1[6:]))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])
        else:
            optimal_f1s.append(max(val_f1))
            idx = val_f1.index(optimal_f1s[-1])
            optimal_ps.append(val_p[idx])
            optimal_rs.append(val_r[idx])
            optimal_accs.append(val_acc[idx])

    """
    print out the performance
    plot the performance on each fold
    """
    # print('F1 on TroFi by 10-fold = ', optimal_f1s)
    # print('Precision on TroFi = ', np.mean(np.array(optimal_ps)))
    # print('Recall on TroFi = ', np.mean(np.array(optimal_rs)))
    # print('F1 on TroFi = ', np.mean(np.array(optimal_f1s)))
    # print('Accuracy on TroFi = ', np.mean(np.array(optimal_accs)))
    return optimal_f1s, np.mean(np.array(optimal_ps)), np.mean(np.array(optimal_rs)), \
        np.mean(np.array(optimal_f1s)), np.mean(np.array(optimal_accs))
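# Usage sketch (not in the original code): report the fold-level and averaged
# metrics returned by the train_model() defined above, mirroring the
# commented-out prints at the end of the function.
if __name__ == '__main__':
    fold_f1s, mean_p, mean_r, mean_f1, mean_acc = train_model()
    print('F1 on TroFi by 10-fold = ', fold_f1s)
    print('Precision on TroFi = ', mean_p)
    print('Recall on TroFi = ', mean_r)
    print('F1 on TroFi = ', mean_f1)
    print('Accuracy on TroFi = ', mean_acc)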