def train(model, data_loaders, word_vocab, wordpiece_vocab, hierarchy, epoch_start=1):
    logger.info("Training model.")

    # Set up a new Bert Client for encoding the wordpieces
    if cf.EMBEDDING_MODEL == "bert":
        bc = BertClient()
    else:
        bc = None

    modelEvaluator = ModelEvaluator(model, data_loaders['dev'], word_vocab, wordpiece_vocab, hierarchy, bc)

    #optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=cf.LEARNING_RATE, momentum=0.9)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=cf.LEARNING_RATE)

    model.cuda()

    num_batches = len(data_loaders["train"])
    progress_bar = ProgressBar(num_batches=num_batches, max_epochs=cf.MAX_EPOCHS, logger=logger)
    avg_loss_list = []

    # Train the model
    for epoch in range(epoch_start, cf.MAX_EPOCHS + 1):
        epoch_start_time = time.time()
        epoch_losses = []

        if cf.TASK == "end_to_end":
            if cf.BATCH_SIZE != 10:
                logger.warning("Batch size must currently be set to 10 for the end-to-end model.")

            for (i, (batch_x, batch_y, batch_z, _, batch_tx, batch_ty, _)) in enumerate(data_loaders["train"]):
                # Skip incomplete batches (this can happen at the end of the dataset).
                if len(batch_x) < cf.BATCH_SIZE:
                    continue

                batch_y = batch_y.float().to(device)
                batch_z = batch_z.float().to(device)

                model.zero_grad()
                model.train()

                if cf.EMBEDDING_MODEL == "bert":
                    # 1. Convert batch_x from wordpiece ids into wordpieces
                    wordpieces = batch_to_wordpieces(batch_x, wordpiece_vocab)

                    # 2. Encode the wordpieces into Bert vectors
                    bert_embs = wordpieces_to_bert_embs(wordpieces, bc).to(device)

                    # 3. Feed these Bert vectors to our model
                    y_hat = model(bert_embs)
                    loss = model.calculate_loss(y_hat, batch_x, batch_y, batch_z)

                elif cf.EMBEDDING_MODEL in ['random', 'glove', 'word2vec']:
                    # Feed the token ids to the model directly; it looks up its own embeddings.
                    batch_tx_cuda = batch_tx.long().to(device)
                    batch_ty = batch_ty.float().to(device)
                    y_hat = model(batch_tx_cuda)
                    loss = model.calculate_loss(y_hat, batch_tx, batch_ty, batch_z)

                # 4. Backpropagate
                loss.backward()
                optimizer.step()
                epoch_losses.append(loss.item())

                # 5. Draw the progress bar
                progress_bar.draw_bar(i, epoch, epoch_start_time)

        elif cf.TASK == "mention_level":
            for (i, (batch_xl, batch_xr, batch_xa, batch_xm, batch_y)) in enumerate(data_loaders["train"]):
                # 1. Convert each context window from wordpiece ids into wordpieces
                #    (batch_xa is currently unused; None is passed to the model in its place)
                wordpieces_l = batch_to_wordpieces(batch_xl, wordpiece_vocab)
                wordpieces_r = batch_to_wordpieces(batch_xr, wordpiece_vocab)
                wordpieces_m = batch_to_wordpieces(batch_xm, wordpiece_vocab)

                # 2. Encode the wordpieces into Bert vectors
                bert_embs_l = wordpieces_to_bert_embs(wordpieces_l, bc).to(device)
                bert_embs_r = wordpieces_to_bert_embs(wordpieces_r, bc).to(device)
                bert_embs_m = wordpieces_to_bert_embs(wordpieces_m, bc).to(device)

                batch_y = batch_y.float().to(device)

                # 3. Feed these Bert vectors to our model
                model.zero_grad()
                model.train()
                y_hat = model(bert_embs_l, bert_embs_r, None, bert_embs_m)
                loss = model.calculate_loss(y_hat, batch_y)

                # 4. Backpropagate
                loss.backward()
                optimizer.step()
                epoch_losses.append(loss.item())

                # 5. Draw the progress bar
                progress_bar.draw_bar(i, epoch, epoch_start_time)

        avg_loss = sum(epoch_losses) / float(len(epoch_losses))
        avg_loss_list.append(avg_loss)

        progress_bar.draw_completed_epoch(avg_loss, avg_loss_list, epoch, epoch_start_time)
        modelEvaluator.evaluate_every_n_epochs(1, epoch)
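
# The loop above relies on batch_to_wordpieces() and wordpieces_to_bert_embs(),
# which are defined elsewhere in this repo. The sketch below only illustrates
# what they might look like: the ix_to_token attribute on the vocab object and
# the _sketch-suffixed names are assumptions, not the repo's actual
# implementation. It assumes a bert-serving-server instance configured for
# per-token pooling, accessed through bert-serving-client.
import torch
from bert_serving.client import BertClient

def batch_to_wordpieces_sketch(batch_x, wordpiece_vocab):
    # Map each row of wordpiece ids back to its string tokens, dropping padding (id 0).
    return [[wordpiece_vocab.ix_to_token[ix] for ix in row if ix != 0]
            for row in batch_x.tolist()]

def wordpieces_to_bert_embs_sketch(wordpieces, bc):
    # Encode the already-tokenised wordpieces with the BERT server and return a
    # float tensor of shape (batch, max_seq_len, 768).
    return torch.from_numpy(bc.encode(wordpieces, is_tokenized=True))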
def main():
    # Save the current configuration alongside the model.
    with open("models/%s/params.txt" % cf.MODEL_NAME, "w") as f:
        f.write("\n".join(["%s : %s" % (k, cf.__dict__[k]) for k in cf.__dict__]))

    progress_bar = ProgressBar()
    (data_iterators, word_embeddings, char_embeddings,
     word_to_ix, ix_to_word, wtag_to_ix, ix_to_wtag,
     char_to_ix, ix_to_char, ctag_to_ix, ix_to_ctag) = load_data()

    logger.info("Building model...")

    # Choose the model class based on the configured granularity.
    if cf.GRANULARITY == CHAR_LEVEL:
        model_class = CharLSTMTagger
    elif cf.GRANULARITY == WORD_LEVEL:
        model_class = WordLSTMTagger
    elif cf.GRANULARITY == CHAR_AND_WORD_LEVEL:
        model_class = CombinedLSTMTagger
    if cf.GRANULARITY == WORD_LEVEL and cf.WORD_LEVEL_WITH_FLAGGER:
        model_class = WordTaggerWithFlagger
    if cf.GRANULARITY == WORD_LEVEL and cf.EMBEDDING_MODEL == "Bert":
        model_class = FeedForwardBert

    model = model_class(
        cf.MODEL_TYPE,
        cf.WORD_EMBEDDING_DIM,
        cf.CHAR_EMBEDDING_DIM,
        cf.HIDDEN_DIM,
        len(char_to_ix),
        len(ix_to_word),
        len(wtag_to_ix) if cf.GRANULARITY == WORD_LEVEL else len(ctag_to_ix),
        cf.BATCH_SIZE,
        cf.MAX_WORD_LENGTH,
        cf.MAX_SENT_LENGTH,
        word_embeddings,
        char_embeddings)

    epoch_start = 1
    # To resume training from a saved checkpoint, load its state dict and set
    # epoch_start accordingly, e.g.:
    #model.load_state_dict(torch.load('models/%s/model_trained/epoch_90' % cf.MODEL_NAME))
    #epoch_start = 90

    # Only optimise parameters with requires_grad=True, so the pretrained word
    # embeddings aren't modified during training.
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=cf.LEARNING_RATE, momentum=0.9)
    model.cuda()

    num_batches = len(data_iterators["train"])
    avg_loss_list = []     # Loss history: one average loss per epoch
    best_f1 = [0.0, -1]    # [best F1 score, epoch at which it was achieved]

    for epoch in range(epoch_start, cf.MAX_EPOCHS + 1):
        epoch_start_time = time.time()
        epoch_losses = []

        for (i, (batch_w, batch_x, batch_y, batch_f)) in enumerate(data_iterators["train"]):
            # Skip incomplete batches (this can happen at the end of the dataset).
            if len(batch_w) != cf.BATCH_SIZE:
                logger.warn("A batch did not have the correct number of sentences.")
                continue
            if len(batch_x) != cf.BATCH_SIZE:
                logger.warn("A batch did not have the correct number of words.")
                continue

            batch_w = batch_w.to(device)
            batch_x = batch_x.to(device)
            if cf.WORD_LEVEL_WITH_FLAGGER:
                batch_f = batch_f.to(device)

            # Step 1. PyTorch accumulates gradients, so clear them out before each batch.
            model.zero_grad()

            # Step 2. Compute the unpadded length of every sequence in the batch.
            batch_x_lengths = [int((x != 0).sum()) for x in batch_x]
            batch_w_lengths = [int((w != 0).sum()) for w in batch_w]

            # Step 3. Run the forward pass and compute the loss.
            model.train()
            if cf.WORD_LEVEL_WITH_FLAGGER:
                tag_scores, tag_scores_f = model(batch_f, batch_x, batch_w_lengths, batch_x_lengths)
                loss = model.calculate_loss(tag_scores, tag_scores_f, batch_y, batch_f)
            else:
                tag_scores = model(batch_w, batch_x, batch_w_lengths, batch_x_lengths)
                loss = model.calculate_loss(tag_scores, batch_y)

            # Step 4. Backpropagate.
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())

            progress_bar.draw_bar(i, epoch, num_batches, cf.MAX_EPOCHS, epoch_start_time)

        avg_loss = sum(epoch_losses) / float(len(epoch_losses))
        avg_loss_list.append(avg_loss)
        progress_bar.draw_completed_epoch(avg_loss, avg_loss_list, epoch, cf.MAX_EPOCHS, epoch_start_time)

        # Evaluate every 10 epochs, saving the model whenever the F1 score improves
        # and stopping early if it has not improved for 50 epochs.
        if epoch % 10 == 0 or epoch == cf.MAX_EPOCHS:
            f1 = evaluate_model(model, data_iterators["test"], word_to_ix, ix_to_word,
                                wtag_to_ix, ix_to_wtag, char_to_ix, ix_to_char,
                                ctag_to_ix, ix_to_ctag, epoch, print_output=True)
            if f1 > best_f1[0]:
                best_f1 = [f1, epoch]
                logger.info("New best F1 score achieved!")
                logger.info("Saving model...")
                model_filename = "models/%s/model_trained/epoch_%d" % (cf.MODEL_NAME, epoch)
                torch.save(model.state_dict(), model_filename)
                logger.info("Model saved to %s." % model_filename)
            elif epoch - best_f1[1] >= 50:
                logger.info("No improvement to F1 score in past 50 epochs. Stopping early.")
                logger.info("Best F1 Score: %.4f" % best_f1[0])
                return
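
# Resuming from a saved checkpoint (as hinted at by the commented-out
# load_state_dict call above) could look like the sketch below. The
# resume_from_checkpoint() helper and its arguments are illustrative names,
# not part of this repo.
import torch

def resume_from_checkpoint(model, checkpoint_path, saved_epoch):
    # Load the saved weights in place and return the epoch to resume from.
    model.load_state_dict(torch.load(checkpoint_path))
    return saved_epoch + 1

# Example usage (hypothetical path):
# epoch_start = resume_from_checkpoint(model, "models/%s/model_trained/epoch_90" % cf.MODEL_NAME, 90)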
def train(model, data_loader_train, data_loader_dev, dataset_dev, ground_truth_triples, epoch_start=1):
    logger.info("Training model.")

    modelEvaluator = ModelEvaluator(model, data_loader_dev, dataset_dev, ground_truth_triples, cf)

    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=cf.LEARNING_RATE)
    model.cuda()

    num_batches = len(data_loader_train)
    progress_bar = ProgressBar(num_batches=num_batches, max_epochs=cf.MAX_EPOCHS, logger=logger)
    avg_loss_list = []

    # Train the model
    for epoch in range(epoch_start, cf.MAX_EPOCHS + 1):
        epoch_start_time = time.time()
        epoch_losses = []

        for (i, (batch_idx, batch_doc_idx, batch_d, batch_h, batch_r, batch_t, batch_y)) in enumerate(data_loader_train):
            # 1. Move each component of the batch onto the GPU
            batch_d = batch_d.to(device)
            batch_h = batch_h.to(device)
            batch_r = batch_r.to(device)
            batch_t = batch_t.to(device)
            batch_y = batch_y.float().to(device)

            # 2. Run the forward pass
            model.zero_grad()
            model.train()
            y_hat = model(batch_d, batch_h, batch_r, batch_t)

            # 3. Calculate the loss via BCE
            loss = model.calculate_loss(y_hat, batch_y)

            # 4. Backpropagate
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())

            # 5. Draw the progress bar
            progress_bar.draw_bar(i, epoch, epoch_start_time)

        avg_loss = sum(epoch_losses) / float(len(epoch_losses))
        avg_loss_list.append(avg_loss)

        progress_bar.draw_completed_epoch(avg_loss, avg_loss_list, epoch, epoch_start_time)
        modelEvaluator.evaluate_every_n_epochs(1, epoch)
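
# Step 3 above calculates the loss via BCE inside model.calculate_loss(). A
# minimal sketch of what that method might do, assuming y_hat contains raw
# logits; the actual implementation lives on the model class in this repo and
# calculate_loss_sketch is an illustrative name only.
import torch.nn as nn

bce_with_logits = nn.BCEWithLogitsLoss()

def calculate_loss_sketch(y_hat, batch_y):
    # Binary cross-entropy over the (batch, num_labels) score matrix.
    return bce_with_logits(y_hat, batch_y)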
def train(model, data_loaders, word_vocab, wordpiece_vocab, hierarchy, ground_truth_triples, epoch_start=1):
    logger.info("Training model.")

    # Set up a new Bert Client for encoding the wordpieces
    bc = BertClient()

    modelEvaluator = ModelEvaluator(model, data_loaders['dev'], word_vocab, wordpiece_vocab, hierarchy, ground_truth_triples, cf)

    #optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=cf.LEARNING_RATE, momentum=0.9)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=cf.LEARNING_RATE)

    model.cuda()
    logger.info("Learning rate: %s" % cf.LEARNING_RATE)

    num_batches = len(data_loaders["train"])
    max_epochs = 1000
    progress_bar = ProgressBar(num_batches=num_batches, max_epochs=max_epochs, logger=logger)
    avg_loss_list = []

    # Train the model
    for epoch in range(epoch_start, max_epochs + 1):
        epoch_start_time = time.time()
        epoch_losses = []

        for (i, (batch_x, batch_y, batch_z, _, batch_tx, _, _)) in enumerate(data_loaders["train"]):
            # Skip incomplete batches (this can happen at the end of the dataset).
            if len(batch_x) < cf.BATCH_SIZE:
                continue

            # 1. Convert wordpiece ids into wordpiece tokens and encode them into Bert vectors
            wordpieces = batch_to_wordpieces(batch_x, wordpiece_vocab)
            wordpiece_embs = wordpieces_to_bert_embs(wordpieces, bc).to(device)

            batch_y = batch_y.float().to(device)
            batch_z = batch_z.float().to(device)

            # 2. Create sinusoidal positional embeddings and concatenate them to the Bert embeddings
            if cf.POSITIONAL_EMB_DIM > 0:
                sin_embs = SinusoidalPositionalEmbedding(
                    embedding_dim=cf.POSITIONAL_EMB_DIM, padding_idx=0, left_pad=True)
                sin_embs = sin_embs(torch.ones([batch_x.size()[0], batch_x.size()[1]])).to(device)
                joined_embs = torch.cat((wordpiece_embs, sin_embs), dim=2)
            else:
                joined_embs = wordpiece_embs

            # 3. Feed these vectors to our model
            model.zero_grad()
            model.train()
            y_hat = model(joined_embs)
            loss = model.calculate_loss(y_hat, batch_x, batch_y, batch_z)

            # 4. Backpropagate
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())

            # 5. Draw the progress bar
            progress_bar.draw_bar(i, epoch, epoch_start_time)

        avg_loss = sum(epoch_losses) / float(len(epoch_losses))
        avg_loss_list.append(avg_loss)

        progress_bar.draw_completed_epoch(avg_loss, avg_loss_list, epoch, epoch_start_time)
        modelEvaluator.evaluate_every_n_epochs(1, epoch)
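
# SinusoidalPositionalEmbedding above appears to be the fairseq-style module.
# The sketch below shows the same idea in plain PyTorch: a fixed sine/cosine
# position encoding of shape (batch, seq_len, emb_dim), which the loop above
# concatenates to the BERT embeddings. The function name is illustrative only,
# and the fixed table could equally be built once outside the batch loop
# rather than on every iteration.
import math
import torch

def sinusoidal_positions_sketch(batch_size, seq_len, emb_dim):
    # position: (seq_len, 1); div_term: one frequency per pair of dimensions.
    position = torch.arange(seq_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, emb_dim, 2, dtype=torch.float)
                         * (-math.log(10000.0) / emb_dim))
    table = torch.zeros(seq_len, emb_dim)
    table[:, 0::2] = torch.sin(position * div_term)
    table[:, 1::2] = torch.cos(position * div_term[: emb_dim // 2])
    # Broadcast the same table across the batch: (batch, seq_len, emb_dim).
    return table.unsqueeze(0).expand(batch_size, -1, -1)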
def main():
    progress_bar = ProgressBar()
    data_iterator, glove_embeddings, word_to_ix, ix_to_word = load_data()

    logger.info("Building model...")
    model = LSTMTagger(cf.EMBEDDING_DIM, cf.HIDDEN_DIM, len(word_to_ix), cf.BATCH_SIZE, cf.MAX_SENT_LENGTH, glove_embeddings)

    # Only optimise parameters with requires_grad=True, so the pretrained GloVe
    # embeddings aren't modified during training.
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.1)
    model.cuda()

    num_batches = len(data_iterator)
    loss_list = []  # A place to store the loss history

    for epoch in range(1, cf.MAX_EPOCHS + 1):
        epoch_start_time = time.time()

        for (i, (batch_x, batch_y)) in enumerate(data_iterator):
            # Ignore the batch if it is not the same size as the others (happens at the end sometimes)
            if len(batch_x) != cf.BATCH_SIZE:
                continue
            batch_x = batch_x.to(device)

            # Step 1. PyTorch accumulates gradients, so clear them out before each batch.
            model.zero_grad()

            # Also clear out the hidden state of the LSTM, detaching it from its
            # history on the last instance.
            model.hidden = model.init_hidden()

            # Step 2. Compute the length of every sequence in the batch.
            batch_x_lengths = [len(x) for x in batch_x]

            # Step 3. Run the forward pass and compute the loss.
            tag_scores = model(batch_x, batch_x_lengths)
            loss = modified_loss(tag_scores, batch_y, batch_x_lengths, word_to_ix)

            loss.backward()
            optimizer.step()

            progress_bar.draw_bar(i, epoch, num_batches, cf.MAX_EPOCHS, epoch_start_time)

        progress_bar.draw_completed_epoch(loss, loss_list, epoch, cf.MAX_EPOCHS, epoch_start_time)
        loss_list.append(loss.item())

        # Check for early stopping every 10 epochs: stop if the average loss over
        # the last 10 epochs is no better than the 10 epochs before that.
        if epoch % 10 == 0:
            avg_loss = sum(loss_list[epoch - 10:]) / 10
            logger.info("Average loss over past 10 epochs: %.6f" % avg_loss)
            if epoch >= 20:
                prev_avg_loss = sum(loss_list[epoch - 20:epoch - 10]) / 10
                if avg_loss >= prev_avg_loss:
                    logger.info("Average loss has not improved over past 10 epochs. Stopping early.")
                    evaluate_model(model, ix_to_word)
                    break

        if epoch == 1 or epoch % 10 == 0 or epoch == cf.MAX_EPOCHS:
            evaluate_model(model, ix_to_word)
            logger.info("Saving model...")
            torch.save(model.state_dict(), "asset/model_trained")
            logger.info("Model saved to %s." % "asset/model_trained")
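
# These training scripts are presumably run directly; a standard entry-point
# guard (not shown in the excerpt above) would look like this:
if __name__ == "__main__":
    main()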