def main():
    Config = config.get_args()
    set_seed(Config.seed)
    word2ix, ix2word, max_len, avg_len = build_word_dict(Config.train_path)

    test_data = CommentDataSet(Config.test_path, word2ix, ix2word)
    test_loader = DataLoader(test_data,
                             batch_size=16,
                             shuffle=False,
                             num_workers=0,
                             collate_fn=mycollate_fn)

    # placeholder embedding matrix; the trained embeddings are restored
    # from the checkpoint below
    weight = torch.zeros(len(word2ix), Config.embedding_dim)
    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()
    model.load_state_dict(torch.load(Config.model_save_path), strict=True)  # load the trained model

    confuse_meter = test(test_loader, device, model, criterion)
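# `mycollate_fn` is referenced above but not defined in this snippet. A minimal
# sketch, assuming each dataset item is a (token_ids, label) pair, that index 0
# is the padding token, and that test() expects (padded_batch, labels, lengths):
def mycollate_fn(batch):
    # sort by descending length so the sequences could later be packed
    batch.sort(key=lambda item: len(item[0]), reverse=True)
    seqs, labels = zip(*batch)
    lengths = [len(s) for s in seqs]
    padded = torch.zeros(len(seqs), max(lengths), dtype=torch.long)  # 0 = <pad>
    for i, s in enumerate(seqs):
        padded[i, :lengths[i]] = torch.as_tensor(s, dtype=torch.long)
    return padded, torch.as_tensor(labels, dtype=torch.long), lengths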
def main():
    Config = config.get_args()
    set_seed(Config.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    word2ix, ix2word, max_len, avg_len = build_word_dict(Config.train_path)

    weight = torch.zeros(len(word2ix), Config.embedding_dim)
    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)
    model.load_state_dict(torch.load(Config.model_save_path), strict=True)  # load the trained model

    # Example input, a Chinese movie review (roughly: "The line I can't forget
    # is DuPont smiling at the protagonist and saying 'Sue me' ..."):
    # comment_str = "忘不掉的一句台词,是杜邦公司笑着对男主说:“Sue me”。我记得前段时间某件事,也是同样的说辞,“欢迎来起诉中华有为”。也是同样的跋扈。若干年后,会看到改编的电影吗。"
    result = predict(Config.comment_str, model, device, word2ix)
    print(Config.comment_str, result)
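# `predict` is not shown in this snippet. A minimal sketch, assuming jieba for
# Chinese word segmentation, an '<unk>' entry in word2ix for out-of-vocabulary
# words, and a forward signature model(input_ids) -> (batch, num_classes)
# logits; adapt these to the real model:
import jieba

def predict(comment_str, model, device, word2ix):
    model.eval()
    ids = [word2ix.get(w, word2ix.get('<unk>', 0)) for w in jieba.lcut(comment_str)]
    inputs = torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(0)
    with torch.no_grad():
        logits = model(inputs)
        pred = torch.argmax(logits, dim=1).item()
    return pred  # predicted class index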
import time

def evaluate(model, iterator, criterion):
    # the original snippet started mid-function; the surrounding evaluation
    # loop is reconstructed here, assuming a single-logit model output
    model.eval()
    epoch_loss, epoch_acc = 0.0, 0.0
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label.float())
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
    epoch_mins, epoch_secs = divmod(int(time.time() - start_time), 60)

    # keep the checkpoint with the lowest validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'Sentiment-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tValid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}%')

# evaluate the best checkpoint on the test set
model.load_state_dict(torch.load('Sentiment-model.pt'))
test_loss, test_acc = evaluate(model, test_iter, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
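# `binary_accuracy` is used above but not defined here. A minimal sketch for a
# model that emits one logit per example:
def binary_accuracy(preds, y):
    # round sigmoid outputs to 0/1 and compare against the labels
    rounded = torch.round(torch.sigmoid(preds))
    correct = (rounded == y.float()).float()
    return correct.sum() / len(correct)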
def predict():
    # k-fold test-time ensemble: average the output of the 5 fold models
    data = pd.read_csv(config.TEST_FILE)
    data['Label_encoded'] = 0
    data['Sentiment_encoded'] = 0
    df_test = data
    test_data = CommentData(comments=df_test['Comment'],
                            labels=df_test['Label_encoded'],
                            sentiments=df_test['Sentiment_encoded'])
    test_dataloader = torch.utils.data.DataLoader(
        test_data,
        batch_size=config.TEST_BATCH_SIZE,
        # num_workers=4
    )

    # load the five fold models
    device = torch.device('cuda')
    model_config = BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    models = []
    for fold in range(5):
        model = SentimentModel(model_config, config.OUTPUT_SIZE_SENTIMENT)
        model.to(device)
        # model = nn.DataParallel(model)
        model.load_state_dict(torch.load(f'{config.SAVED_MODEL_PATH}/model_{fold}.bin'))
        model.eval()
        models.append(model)

    # run inference, averaging the raw fold outputs before the softmax
    model_prediction = []
    with torch.no_grad():
        tq0 = tqdm(test_dataloader, total=len(test_dataloader))
        for bi, batch in enumerate(tq0):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)

            # forward(self, ids, mask, type_ids)
            outs = [m(ids=input_ids, mask=attention_mask, type_ids=token_type_ids)
                    for m in models]
            out = torch.stack(outs).mean(dim=0)
            out = torch.softmax(out, dim=1).cpu().numpy()
            model_prediction.extend(np.argmax(out, axis=1))

    sample = pd.read_csv(config.TEST_FILE)
    sample['sentiment_pred'] = model_prediction
    sample.to_csv(config.OUTPUT_PATH + '/pred_sentiment.csv', index=False)
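# `CommentData` is not defined in this snippet. A minimal sketch, assuming a
# BERT tokenizer loaded from config.BERT_PATH and an assumed config.MAX_LEN
# field for the sequence length:
from transformers import BertTokenizer

class CommentData(torch.utils.data.Dataset):
    def __init__(self, comments, labels, sentiments):
        self.comments = comments.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.sentiments = sentiments.reset_index(drop=True)
        self.tokenizer = BertTokenizer.from_pretrained(config.BERT_PATH)

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        enc = self.tokenizer(str(self.comments[idx]),
                             padding='max_length',
                             truncation=True,
                             max_length=config.MAX_LEN,  # assumed config field
                             return_tensors='pt')
        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'token_type_ids': enc['token_type_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
            'sentiment': torch.tensor(self.sentiments[idx], dtype=torch.long),
        }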
def preprocess_and_train():
    # read dataset (Sentiment140)
    data = pd.read_csv('./training.1600000.processed.noemoticon.csv',
                       encoding='latin-1', header=None)
    data.columns = ('target', 'uid', 'time', 'query', 'user', 'text')

    # create new dataframe with cleaned text
    sent_df = pd.DataFrame(None, columns=('target', 'text'))
    sent_df['target'] = data['target']
    sent_df['text'] = data['text'].apply(preprocess_text)
    sent_df['tweet_size'] = data['text'].apply(lambda x: len(x.split()))

    # select a balanced random sample of 400,000 tweets (200,000 per class)
    # from the full dataset, to train on a smaller dataset
    sent_df_sample = pd.concat([
        sent_df[(sent_df['tweet_size'] > 10) & (sent_df['target'] == 0)]
            .sample(n=200000, random_state=SentConfig.SEED),
        sent_df[(sent_df['tweet_size'] > 10) & (sent_df['target'] == 4)]
            .sample(n=200000, random_state=SentConfig.SEED),
    ])

    # split dataset into train, validation, and test sets
    train, test = train_test_split(sent_df_sample, test_size=0.1)
    train, val = train_test_split(train, test_size=0.05)

    # create the dataloaders, to get batching from PyTorch
    train_dl = SentimentDL(train)
    val_dl = SentimentDL(val)
    test_dl = SentimentDL(test)
    train_loader = DataLoader(train_dl, batch_size=SentConfig.TRAIN_BATCH_SIZE, shuffle=True)
    validation_loader = DataLoader(val_dl, batch_size=SentConfig.VALID_BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dl, batch_size=SentConfig.VALID_BATCH_SIZE, shuffle=True)

    # select the cuda device if available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # create model object
    model = SentimentModel()
    model.to(device)

    # set up the optimizer: do not apply weight decay in AdamW to bias and
    # normalization terms (taken from https://huggingface.co/transformers/training.html;
    # more named parameters in model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optim = AdamW(optimizer_grouped_parameters, lr=5e-5)

    # learning-rate scheduling with 5% warmup
    num_train_steps = int((len(train_dl) / SentConfig.TRAIN_BATCH_SIZE) * SentConfig.EPOCHS)
    num_warmup_steps = int(0.05 * num_train_steps)
    scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps, num_train_steps)

    # training: keep the checkpoint with the best F1 score on the validation set
    scores = []
    best_f1 = 0
    for epoch in range(SentConfig.EPOCHS):
        _ = train_function(train_loader, model, optim, scheduler, device)
        _, results = evaluation_function(validation_loader, model, device)
        validation_f1 = round(f1_score(results[:, 1], results[:, 0]), 4)
        accuracy = round(accuracy_score(results[:, 1], results[:, 0]), 4)
        scores.append((validation_f1, accuracy))
        print('epoch num:', epoch, 'f1 score:', validation_f1, 'accuracy:', accuracy)
        if validation_f1 > best_f1:
            # save model if the validation F1 score improved
            torch.save(model.state_dict(), "SentimentModel.bin")
            # update best F1
            best_f1 = validation_f1

    # plot validation scores
    scores = np.array(scores)
    fig, ax = plt.subplots(1, 2, figsize=(14, 6))
    ax[0].plot(range(SentConfig.EPOCHS), scores[:, 0], 'r')
    ax[1].plot(range(SentConfig.EPOCHS), scores[:, 1])
    ax[0].set(xlabel='Epoch num', ylabel='F1 Score')
    ax[1].set(xlabel='Epoch num', ylabel='Accuracy')
    ax[0].set_title('validation set f1 score at each epoch')
    ax[1].set_title('validation set accuracy at each epoch')

    # score the best checkpoint on the test predictions
    state_dict_ = torch.load('SentimentModel.bin')
    model = SentimentModel()
    model.load_state_dict(state_dict_)
    model.to(device)
    _, results = evaluation_function(test_loader, model, device, inference=True)
    print(classification_report(results[:, 1], results[:, 0]))
    print(round(accuracy_score(results[:, 1], results[:, 0]), 4))
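# `preprocess_text` is referenced above but not defined in this snippet. A
# minimal sketch of the usual tweet cleanup (the real implementation may
# differ):
import re

def preprocess_text(text):
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # strip URLs
    text = re.sub(r'@\w+', ' ', text)                   # strip @mentions
    text = re.sub(r"[^A-Za-z0-9'\s]", ' ', text)        # drop other symbols
    text = re.sub(r'\s+', ' ', text).strip()            # collapse whitespace
    return text.lower()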