def main(train_file, dev_file, vocab_file, target_dir, max_length=50,
         hidden_size=300, dropout=0.2, num_classes=2, epochs=1, batch_size=256,
         lr=0.0005, patience=5, max_grad_norm=10.0, gpu_index=0, checkpoint=None):
    # device = torch.device("cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the model checkpoints are saved
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = LCQMC_Dataset(train_file, vocab_file, max_length)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = LCQMC_Dataset(dev_file, vocab_file, max_length)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    # embeddings = load_embeddings(embeddings_file)
    model = ESIM(hidden_size,
                 dropout=dropout,
                 num_labels=num_classes,
                 device=device).to(device)
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, auc = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
          .format(valid_loss, (valid_accuracy * 100), auc))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer,
                                                       criterion, epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(model, dev_loader,
                                                                     criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            torch.save({"epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses},
                       os.path.join(target_dir, "best.pth.tar"))
        # Save the model at each epoch.
        torch.save({"epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses},
                   os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
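

# A minimal, hypothetical entry-point sketch for the function above; the LCQMC
# file paths and output directory are illustrative placeholders, not paths from
# the original project.
if __name__ == "__main__":
    main(train_file="data/LCQMC_train.csv",
         dev_file="data/LCQMC_dev.csv",
         vocab_file="data/vocab.txt",
         target_dir="output/",
         epochs=10,
         batch_size=256)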
        y = y.to(device)
        y_hat = net(sent1.to(device), sent2.to(device),
                    mask1.to(device), mask2.to(device))
        l = crition(y_hat, y)
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        train_l_sum += l.item()  # accumulate a float, not the graph-holding tensor
        batch_num += 1
        n += y.shape[0]
        train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
        if batch_num % 500 == 0:
            val_acc, val_loss = val_test(net, dev_iter, crition)
            if val_acc >= best_acc:
                best_acc = val_acc
                torch.save(net.state_dict(), args.model_path)
            print('batch: %d, train_loss: %.3f train_acc: %.4f val_loss: %.3f val_acc: %.4f time: %.3f'
                  % (batch_num, train_l_sum / batch_num, train_acc_sum / n,
                     val_loss, val_acc, time.time() - start))


# # Load the test data
# def load_test_data(args):
#     text = data.Field(sequential=True, use_vocab=True, lower=True,
#                       fix_length=args.max_len, include_lengths=True,
#                       batch_first=True)
#     fields = [('sentence1', text), ('sentence2', text)]
#     # Read the data
#     test = []
#     with open('./data/snli.test', 'r') as f:
#         for line in f.readlines():
#             sents = line.strip().split('|||')
#             test.append(data.Example.fromlist([sents[0], sents[1]], fields))
def main():
    device = args.device
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the model checkpoints are saved
    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)
    # -------------------- Data loading ------------------- #
    print("Loading data......")
    train_loader, dev_loader, test_loader, SEN1, SEN2 = load_data(args.batch_size, device)
    embedding = SEN1.vectors
    vocab_size = len(embedding)
    print("vocab_size:", vocab_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    model = ESIM(args.hidden_size,
                 embedding=embedding,
                 dropout=args.dropout,
                 num_labels=args.num_classes,
                 device=device).to(device)
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=args.lr)
    # The scheduler is stepped with the validation accuracy below, so it must
    # track a metric that should increase ("max"), not decrease.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.1,
                                                           patience=10)
    best_score = 0.0
    if args.ckp:
        checkpoint = torch.load(os.path.join(args.target_dir, args.ckp))
        best_score = checkpoint["best_score"]
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
    _, valid_loss, valid_accuracy = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%"
          .format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device), 20 * "=")
    patience_counter = 0
    for epoch in range(args.num_epoch):
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer,
                                                       criterion, args.max_grad_norm, device)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(model, dev_loader,
                                                          criterion, device)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            print("Saving new best model...")
            best_score = epoch_accuracy
            patience_counter = 0
            torch.save({"model": model.state_dict(),
                        "best_score": best_score,
                        "optimizer": optimizer.state_dict()},
                       os.path.join(args.target_dir, "best.pth.tar"))
        if patience_counter >= 5:
            print("-> Early stopping: patience limit reached, stopping...")
            break

    # # -------------------- Testing ------------------- #
    # print(20 * "=", " Testing ", 20 * "=")
    # if args.ckp:
    #     checkpoint = torch.load(os.path.join(args.target_dir, args.ckp))
    #     best_score = checkpoint["best_score"]
    #     model.load_state_dict(checkpoint["model"])
    #     optimizer.load_state_dict(checkpoint["optimizer"])
    # print("best_score:", best_score)
    # all_labels = test(model, test_loader, device)
    # print(all_labels[:10])
    # target_label = [id2label[id] for id in all_labels]
    # print(target_label[:10])
    # with open(os.path.join(args.target_dir, 'result.txt'), 'w+') as f:
    #     for label in target_label:
    #         f.write(label + '\n')

    del train_loader
    del dev_loader
    del test_loader
    del SEN1
    del SEN2
    del embedding
                (epoch + 1, epoch_mins, epoch_secs))
    f_log.write('\tTrain Loss: %.3f | Train Acc: %.2f %%\n'
                % (train_loss, train_acc * 100))
    f_log.write('\t Val. Loss: %.3f | Val. Acc: %.2f %%\n'
                % (valid_loss, valid_acc * 100))
    print('Epoch: %d | Epoch Time: %dm %ds' % (epoch + 1, epoch_mins, epoch_secs))
    print('\tTrain Loss: %.3f | Train Acc: %.2f %%' % (train_loss, train_acc * 100))
    print('\t Val. Loss: %.3f | Val. Acc: %.2f %%' % (valid_loss, valid_acc * 100))
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), './saved_model/esim.pt')
        print("New model saved!")
        f_log.write("New model saved!\n")
    f_log.flush()

f_log.close()

model.load_state_dict(torch.load('./saved_model/esim.pt'))
model.eval()

f_valid = open("data/test-set.data", "r", encoding='utf-8')
f_res = open('prediction.txt', 'w')
for i, rowlist in enumerate(f_valid):
    rowlist = rowlist[:-1].split('\t')
    input_sent = []
    for sent in rowlist[:2]:
def model_train_validate_test(train_df, dev_df, test_df, embeddings_file,
                              vocab_file, target_dir, mode, num_labels=2,
                              max_length=50, hidden_size=200, dropout=0.2,
                              epochs=50, batch_size=256, lr=0.0005, patience=5,
                              max_grad_norm=10.0, gpu_index=0,
                              if_save_model=False, checkpoint=None):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the model checkpoints are saved
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = My_Dataset(train_df, vocab_file, max_length, mode)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = My_Dataset(dev_df, vocab_file, max_length, mode)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    if embeddings_file is not None:
        embeddings = load_embeddings(embeddings_file)
    else:
        embeddings = None
    model = ESIM(hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_labels=num_labels,
                 device=device).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print(f'{total_trainable_params:,} training parameters.')
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, _ = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%"
          .format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer,
                                                       criterion, epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, _ = validate(model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            if if_save_model:
                torch.save({"epoch": epoch,
                            "model": model.state_dict(),
                            "best_score": best_score,
                            "epochs_count": epochs_count,
                            "train_losses": train_losses,
                            "valid_losses": valid_losses},
                           os.path.join(target_dir, "best.pth.tar"))
                print("Model saved successfully!\n")
            print("* Test for epoch {}:".format(epoch))
            _, _, test_accuracy, predictions = validate(model, test_loader, criterion)
            print("Test accuracy: {:.4f}%\n".format(test_accuracy))
            test_prediction = pd.DataFrame({'prediction': predictions})
            test_prediction.to_csv(os.path.join(target_dir, "test_prediction.csv"),
                                   index=False)
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
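

# A hedged usage sketch for model_train_validate_test; the CSV paths, the 'mode'
# value, and the embeddings/vocab file names are illustrative placeholders only.
if __name__ == "__main__":
    import pandas as pd

    train_df = pd.read_csv("data/train.csv")
    dev_df = pd.read_csv("data/dev.csv")
    test_df = pd.read_csv("data/test.csv")
    model_train_validate_test(train_df, dev_df, test_df,
                              embeddings_file="data/embeddings.txt",
                              vocab_file="data/vocab.txt",
                              target_dir="output/",
                              mode="word",
                              if_save_model=True)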
def main(args):
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the model checkpoints are saved
    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = LCQMC_dataset(args.train_file, args.vocab_file, args.max_length,
                               test_flag=False)
    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True)
    print("\t* Loading validation data...")
    dev_data = LCQMC_dataset(args.dev_file, args.vocab_file, args.max_length,
                             test_flag=False)
    dev_loader = DataLoader(dev_data, batch_size=args.batch_size, shuffle=True)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = load_embeddings(args.embed_file)
    model = ESIM(args, embeddings=embeddings).to(args.device)
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()  # Cross-entropy loss
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=args.lr)  # Optimizer
    # Learning rate schedule
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='max',
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if args.checkpoint:
        # Load the checkpoint data so training can resume from it
        checkpoints = torch.load(args.checkpoint)
        start_epoch = checkpoints["epoch"] + 1
        best_score = checkpoints["best_score"]
        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))
        model.load_state_dict(checkpoints["model"])  # Model weights
        optimizer.load_state_dict(checkpoints["optimizer"])
        epochs_count = checkpoints["epochs_count"]
        train_losses = checkpoints["train_losses"]
        valid_losses = checkpoints["valid_losses"]
        # Only when resuming from a checkpoint: compute loss and accuracy before
        # resuming training.
        _, valid_loss, valid_accuracy, auc = validate(model, dev_loader, criterion)
        print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
              .format(valid_loss, (valid_accuracy * 100), auc))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(args.device), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, args.epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer,
                                                       criterion, epoch, args.max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        # Validate on the dev set (not the training set).
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(model, dev_loader,
                                                                     criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best result; the saved entries match those stored in a checkpoint.
            torch.save({"epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses},
                       os.path.join(args.target_dir, "new_best.pth.tar"))
        # Save the model at each epoch (optional).
        torch.save({"epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses},
                   os.path.join(args.target_dir, "new_esim_{}.pth.tar".format(epoch)))
        if patience_counter >= args.patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
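

# A hypothetical argparse wrapper for main(args). The argument names mirror the
# attributes accessed above (train_file, vocab_file, embed_file, device, ...),
# but the defaults are illustrative; any model hyperparameters the ESIM(args, ...)
# constructor reads (e.g. hidden size, dropout) would also need to be added here.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train ESIM on LCQMC")
    parser.add_argument("--train_file", default="data/train.txt")
    parser.add_argument("--dev_file", default="data/dev.txt")
    parser.add_argument("--vocab_file", default="data/vocab.txt")
    parser.add_argument("--embed_file", default="data/embeddings.txt")
    parser.add_argument("--target_dir", default="output/")
    parser.add_argument("--max_length", type=int, default=50)
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--lr", type=float, default=5e-4)
    parser.add_argument("--epochs", type=int, default=20)
    parser.add_argument("--patience", type=int, default=5)
    parser.add_argument("--max_grad_norm", type=float, default=10.0)
    parser.add_argument("--checkpoint", default=None)
    parser.add_argument("--device",
                        default="cuda" if torch.cuda.is_available() else "cpu")
    main(parser.parse_args())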
def main(test_file, pretrained_file, embeddings_file, batch_size=1):
    """
    Test the ESIM model with pretrained weights on some dataset and save the
    representations it produces.

    Args:
        test_file: The path to a file containing preprocessed NLI data.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        embeddings_file: The path to a pickle file containing the pretrained
            embedding weights.
        batch_size: The size of the batches used for testing. Defaults to 1.

    The vocabulary size, embedding dimension, hidden size and number of classes
    are read from the checkpoint and must match the values used during training.
    """
    debug_file = open('test_debug.txt', 'w')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")
    print(20 * "=", " Preparing for generating representations ", 20 * "=")
    checkpoint = torch.load(pretrained_file)
    # Retrieving model parameters from checkpoint.
    vocab_size = checkpoint["model"]["_word_embedding.weight"].size(0)
    embedding_dim = checkpoint["model"]["_word_embedding.weight"].size(1)
    hidden_size = checkpoint["model"]["_projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["_classification.4.weight"].size(0)
    print("\t* Loading the data...")
    with open(test_file, "rb") as pkl:
        test_data = NLIDataset(pickle.load(pkl))
    print(test_data, file=debug_file)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    print("\t* Building model...")
    # Loading the embedding weights separately.
    with open(embeddings_file, "rb") as pkl:
        embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float).to(device)
    # model = ESIM(vocab_size,
    #              embedding_dim,
    #              hidden_size,
    #              num_classes=num_classes,
    #              device=device).to(device)
    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 num_classes=num_classes,
                 device=device).to(device)
    # Custom load_state_dict: copy every parameter from the checkpoint except
    # the word embedding weights (index 0), which were loaded separately above.
    pretrained_dict = checkpoint["model"]
    own_state = model.state_dict()
    for i, (name, param) in enumerate(pretrained_dict.items()):
        if i == 0:
            continue
        if isinstance(param, Parameter):
            # Backwards compatibility for serialized parameters.
            param = param.data
        own_state[name].copy_(param)
    # model.load_state_dict(checkpoint["model"])
    print(20 * "=",
          " Loading the representations from ESIM model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, save_rep = test(model, test_loader)
    print("-> Average batch processing time: {:.4f}s, total test time: {:.4f}s"
          .format(batch_time, total_time))
    file_debug = open('test_save_rep_details.txt', 'w')
    print('len of save_rep is ' + str(len(save_rep)), file=file_debug)
    try:
        print('save_rep sample key is ' + str(list(save_rep.keys())[0]), file=file_debug)
        print('save_rep sample value is ' + str(list(save_rep.values())[0]), file=file_debug)
    except IndexError:
        pass
    # Dump save_rep as a pickle file.
    with open('test_nv_repr.pickle', 'wb') as handle:
        pickle.dump(save_rep, handle, protocol=pickle.HIGHEST_PROTOCOL)
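

# A small sanity-check sketch for the representations dumped above. It assumes
# save_rep is a dict mapping example identifiers to representation vectors,
# which is implied but not guaranteed by the code above.
def inspect_saved_representations(path='test_nv_repr.pickle'):
    with open(path, 'rb') as handle:
        reps = pickle.load(handle)
    print("number of saved representations:", len(reps))
    if reps:
        sample_key = next(iter(reps))
        print("sample key:", sample_key)
        print("sample value type:", type(reps[sample_key]))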
def main(train_q1_file, train_q2_file, train_labels_file,
         dev_q1_file, dev_q2_file, dev_labels_file,
         embeddings_file, target_dir,
         hidden_size=128, dropout=0.5, num_classes=2, epochs=15, batch_size=64,
         lr=0.001, patience=5, max_grad_norm=10.0, gpu_index=0, checkpoint=None):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the model checkpoints are saved
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_q1 = np.load(train_q1_file)
    train_q2 = np.load(train_q2_file)
    train_labels = np.load(train_labels_file)
    # train_labels = label_transformer(train_labels)
    train_data = {"q1": train_q1, "q2": train_q2, "labels": train_labels}
    train_data = QQPDataset(train_data)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_q1 = np.load(dev_q1_file)
    dev_q2 = np.load(dev_q2_file)
    dev_labels = np.load(dev_labels_file)
    # dev_labels = label_transformer(dev_labels)
    dev_data = {"q1": dev_q1, "q2": dev_q2, "labels": dev_labels}
    dev_data = QQPDataset(dev_data)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = torch.tensor(np.load(embeddings_file), dtype=torch.float).to(device)
    model = ESIM(embeddings.shape[0],
                 embeddings.shape[1],
                 hidden_size,
                 embeddings=embeddings,
                 dropout=dropout,
                 num_classes=num_classes,
                 device=device).to(device)
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}..."
              .format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%"
          .format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(device), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer,
                                                       criterion, epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = validate(model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best model. The optimizer is not saved to avoid having
            # a checkpoint file that is too heavy to be shared. To resume
            # training from the best model, use the 'esim_*.pth.tar'
            # checkpoints instead.
            torch.save({"epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses},
                       os.path.join(target_dir, "best.pth.tar"))
        # Save the model at each epoch.
        torch.save({"epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses},
                   os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
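

# A hypothetical invocation of the QQP training entry point above; the .npy
# paths are placeholders for the preprocessed padded index arrays, label arrays,
# and pretrained embedding matrix the function expects.
if __name__ == "__main__":
    main("data/train_q1.npy", "data/train_q2.npy", "data/train_labels.npy",
         "data/dev_q1.npy", "data/dev_q2.npy", "data/dev_labels.npy",
         embeddings_file="data/embeddings.npy",
         target_dir="output/")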