import os
import shutil

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Project-local helpers (config, set_seed, build_word_dict, CommentDataSet,
# mycollate_fn, pre_weight, SentimentModel, train, validate, test) are
# defined elsewhere in the repo.

def main():
    Config = config.get_args()
    set_seed(Config.seed)
    word2ix, ix2word, max_len, avg_len = build_word_dict(Config.train_path)

    train_data = CommentDataSet(Config.train_path, word2ix, ix2word)
    train_loader = DataLoader(train_data, batch_size=16, shuffle=True,
                              num_workers=0, collate_fn=mycollate_fn)
    validation_data = CommentDataSet(Config.validation_path, word2ix, ix2word)
    validation_loader = DataLoader(validation_data, batch_size=16, shuffle=True,
                                   num_workers=0, collate_fn=mycollate_fn)
    test_data = CommentDataSet(Config.test_path, word2ix, ix2word)
    test_loader = DataLoader(test_data, batch_size=16, shuffle=False,
                             num_workers=0, collate_fn=mycollate_fn)

    # initialize the embedding layer from pretrained word2vec vectors
    weight = pre_weight(len(word2ix), Config.pred_word2vec_path,
                        Config.embedding_dim, word2ix, ix2word)
    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=Config.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # learning-rate decay
    criterion = nn.CrossEntropyLoss()

    # TensorBoard produces many log files, so clear the log directory before each run
    if os.path.exists(Config.tensorboard_path):
        shutil.rmtree(Config.tensorboard_path)
    os.mkdir(Config.tensorboard_path)

    for epoch in range(Config.epochs):
        # wrap a fresh progress bar each epoch (rebinding train_loader itself
        # would nest tqdm wrappers on every iteration)
        pbar = tqdm(train_loader)
        pbar.set_description('[%s%04d/%04d %s%f]' %
                             ('Epoch:', epoch + 1, Config.epochs,
                              'lr:', scheduler.get_last_lr()[0]))
        train(epoch, Config.epochs, pbar, device, model, criterion,
              optimizer, scheduler, Config.tensorboard_path)
        validate(epoch, validation_loader, device, model, criterion,
                 Config.tensorboard_path)

    # save the trained model (create the save directory if it does not exist)
    if not os.path.exists('./modelDict/'):
        os.mkdir('./modelDict/')
    torch.save(model.state_dict(), Config.model_save_path)

    # report confusion-matrix statistics on the test set
    confuse_meter = test(test_loader, device, model, criterion)
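The DataLoaders above batch variable-length comment sequences through a custom collate_fn. The project's mycollate_fn is not shown here; a minimal sketch of such a padding collate, assuming each dataset item is a (sequence_tensor, label) pair, might look like this:

import torch
from torch.nn.utils.rnn import pad_sequence

# Hypothetical sketch of a padding collate function; the project's actual
# mycollate_fn may differ.
def mycollate_fn(batch):
    # sort by descending length so the model can pack the padded batch
    batch.sort(key=lambda item: len(item[0]), reverse=True)
    sequences = [item[0] for item in batch]
    labels = torch.tensor([item[1] for item in batch], dtype=torch.long)
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)
    # pad every sequence to the length of the longest one in the batch
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, labels, lengths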
import time

import torch

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()  # disable dropout during evaluation
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label.float())
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
    epoch_mins, epoch_secs = divmod(int(time.time() - start_time), 60)

    # keep the checkpoint with the lowest validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'Sentiment-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tValid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}%')

# reload the best checkpoint and evaluate on the test set
model.load_state_dict(torch.load('Sentiment-model.pt'))
test_loss, test_acc = evaluate(model, test_iter, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
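binary_accuracy is called inside the loop but not defined in this snippet. A minimal sketch, assuming the model emits one raw logit per example and the labels are 0/1:

import torch

# Hypothetical sketch of binary_accuracy; the actual helper may differ.
def binary_accuracy(preds, y):
    # squash logits to probabilities, then round to hard 0/1 predictions
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y.float()).float()
    return correct.sum() / len(correct)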
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from transformers import AdamW, get_cosine_schedule_with_warmup

def preprocess_and_train():
    # read the dataset
    data = pd.read_csv('./training.1600000.processed.noemoticon.csv',
                       encoding='latin-1', header=None)
    data.columns = ('target', 'uid', 'time', 'query', 'user', 'text')

    # create a new dataframe with cleaned text and tweet length
    sent_df = pd.DataFrame(None, columns=('target', 'text'))
    sent_df['target'] = data['target']
    sent_df['text'] = data['text'].apply(preprocess_text)
    sent_df['tweet_size'] = data['text'].apply(lambda x: len(x.split()))

    # sample 200,000 tweets per class (400,000 total) to train on a smaller dataset
    negative = sent_df[(sent_df['tweet_size'] > 10) & (sent_df['target'] == 0)].sample(
        n=200000, random_state=SentConfig.SEED)
    positive = sent_df[(sent_df['tweet_size'] > 10) & (sent_df['target'] == 4)].sample(
        n=200000, random_state=SentConfig.SEED)
    sent_df_sample = pd.concat([negative, positive])

    # split the sample into train, validation, and test sets
    train, test = train_test_split(sent_df_sample, test_size=0.1)
    train, val = train_test_split(train, test_size=0.05)

    # create the necessary dataloaders to get batching from PyTorch
    train_dl = SentimentDL(train)
    val_dl = SentimentDL(val)
    test_dl = SentimentDL(test)
    train_loader = DataLoader(train_dl, batch_size=SentConfig.TRAIN_BATCH_SIZE, shuffle=True)
    validation_loader = DataLoader(val_dl, batch_size=SentConfig.VALID_BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dl, batch_size=SentConfig.VALID_BATCH_SIZE, shuffle=False)

    # select the CUDA device if available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # create the model object
    model = SentimentModel()
    model.to(device)

    # set up the optimizer: do not apply AdamW weight decay to bias and
    # layer-normalization parameters
    # (taken from https://huggingface.co/transformers/training.html;
    # see model.named_parameters() for the full list of parameter names)
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optim = AdamW(optimizer_grouped_parameters, lr=5e-5)

    # learning-rate scheduling: cosine decay with 5% warmup
    num_train_steps = int((len(train_dl) / SentConfig.TRAIN_BATCH_SIZE) * SentConfig.EPOCHS)
    num_warmup_steps = int(0.05 * num_train_steps)
    scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps, num_train_steps)

    # training: keep the checkpoint with the best F1 score on the validation set
    scores = []
    best_f1 = 0
    for epoch in range(SentConfig.EPOCHS):
        _ = train_function(train_loader, model, optim, scheduler, device)
        _, results = evaluation_function(validation_loader, model, device)
        validation_f1 = round(f1_score(results[:, 1], results[:, 0]), 4)
        accuracy = round(accuracy_score(results[:, 1], results[:, 0]), 4)
        scores.append((validation_f1, accuracy))
        print('epoch num:', epoch, 'f1 score:', validation_f1, 'accuracy:', accuracy)
        if validation_f1 > best_f1:
            # save the model if the validation F1 score improved
            torch.save(model.state_dict(), "SentimentModel.bin")
            # update the best F1 score seen so far
            best_f1 = validation_f1

    # plot validation scores per epoch
    scores = np.array(scores)
    fig, ax = plt.subplots(1, 2, figsize=(14, 6))
    ax[0].plot(range(SentConfig.EPOCHS), scores[:, 0], 'r')
    ax[1].plot(range(SentConfig.EPOCHS), scores[:, 1])
    ax[0].set(xlabel='Epoch num', ylabel='F1 Score')
    ax[1].set(xlabel='Epoch num', ylabel='Accuracy')
    ax[0].set_title('validation set f1 score at each epoch')
    ax[1].set_title('validation set accuracy at each epoch')

    # reload the best checkpoint and report metrics on test predictions
    state_dict_ = torch.load('SentimentModel.bin')
    model = SentimentModel()
    model.load_state_dict(state_dict_)
    model.to(device)
    _, results = evaluation_function(test_loader, model, device, inference=True)
    print(classification_report(results[:, 1], results[:, 0]))
    print(round(accuracy_score(results[:, 1], results[:, 0]), 4))
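preprocess_text is applied to every tweet above but defined elsewhere. A minimal sketch of a typical tweet-cleaning step, assuming only handles, URLs, and non-letter characters need stripping:

import re

# Hypothetical sketch of a tweet cleaner; the project's actual
# preprocess_text may do more (or less).
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'@\w+', ' ', text)          # drop @mentions
    text = re.sub(r'https?://\S+', ' ', text)  # drop URLs
    text = re.sub(r'[^a-z\s]', ' ', text)      # keep letters and spaces only
    return ' '.join(text.split())              # collapse repeated whitespace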