Example #1
def main():
    Config = config.get_args()
    set_seed(Config.seed)
    word2ix, ix2word, max_len, avg_len = build_word_dict(Config.train_path)

    train_data = CommentDataSet(Config.train_path, word2ix, ix2word)
    train_loader = DataLoader(
        train_data,
        batch_size=16,
        shuffle=True,
        num_workers=0,
        collate_fn=mycollate_fn,
    )
    validation_data = CommentDataSet(Config.validation_path, word2ix, ix2word)
    validation_loader = DataLoader(
        validation_data,
        batch_size=16,
        shuffle=True,
        num_workers=0,
        collate_fn=mycollate_fn,
    )
    test_data = CommentDataSet(Config.test_path, word2ix, ix2word)
    test_loader = DataLoader(
        test_data,
        batch_size=16,
        shuffle=False,
        num_workers=0,
        collate_fn=mycollate_fn,
    )

    weight = pre_weight(len(word2ix), Config.pred_word2vec_path,
                        Config.embedding_dim, word2ix, ix2word)

    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)  # move the model (and its pretrained embedding) onto the device
    optimizer = optim.Adam(model.parameters(), lr=Config.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=10,
                                                gamma=0.1)  # decay the learning rate by 10x every 10 epochs
    criterion = nn.CrossEntropyLoss()

    # TensorBoard writes many event files; clear out any stale logs before training
    if os.path.exists(Config.tensorboard_path):
        shutil.rmtree(Config.tensorboard_path)
    os.makedirs(Config.tensorboard_path)

    for epoch in range(Config.epochs):
        # wrap a fresh tqdm each epoch; rebinding train_loader would nest progress bars
        epoch_loader = tqdm(train_loader)
        epoch_loader.set_description(
            '[%s%04d/%04d %s%f]' %
            ('Epoch:', epoch + 1, Config.epochs, 'lr:', scheduler.get_last_lr()[0]))
        train(epoch, Config.epochs, epoch_loader, device, model, criterion,
              optimizer, scheduler, Config.tensorboard_path)
        validate(epoch, validation_loader, device, model, criterion,
                 Config.tensorboard_path)

    # save the trained weights
    save_dir = os.path.dirname(Config.model_save_path)
    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir)
    torch.save(model.state_dict(), Config.model_save_path)

    # confusion-matrix statistics on the held-out test set
    confuse_meter = test(test_loader, device, model, criterion)
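Example #1 depends on a mycollate_fn that is not shown. The sketch below is one plausible implementation, not the original author's code: it assumes each CommentDataSet item is a (token_indices, label) pair and that index 0 is the padding token.

import torch
from torch.nn.utils.rnn import pad_sequence

def mycollate_fn(batch):
    # batch is a list of (token_indices, label) pairs (assumed layout)
    sequences, labels = zip(*batch)
    # pad every sequence to the longest one in the batch with the assumed pad index 0
    padded = pad_sequence([torch.as_tensor(seq) for seq in sequences],
                          batch_first=True, padding_value=0)
    return padded, torch.tensor(labels)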
Example #2
def evaluate(model, iterator, criterion):
    # evaluation loop (head reconstructed from context; assumes torchtext-style
    # batches that expose .text and .label fields)
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label.float())
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

import time

N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

    # time each epoch so the summary line below can report it
    epoch_mins, epoch_secs = divmod(int(time.time() - start_time), 60)

    # keep the checkpoint with the lowest validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'Sentiment-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tValid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}%')

model.load_state_dict(torch.load('Sentiment-model.pt'))

test_loss, test_acc = evaluate(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
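Both loops in Example #2 call a binary_accuracy helper that the snippet omits. A minimal sketch, assuming the model emits raw logits and the labels are 0/1 floats:

import torch

def binary_accuracy(predictions, labels):
    # squash logits with a sigmoid, then round to the nearest class (0 or 1)
    rounded = torch.round(torch.sigmoid(predictions))
    correct = (rounded == labels).float()
    return correct.sum() / len(correct)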
Example #3
def preprocess_and_train():
    # read dataset
    data = pd.read_csv('./training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)
    data.columns = ('target','uid', 'time', 'query', 'user', 'text')

    # create new dataframe
    sent_df = pd.DataFrame(None, columns=('target', 'text'))
    sent_df['target'] = data['target']
    sent_df['text'] = data['text'].apply(preprocess_text)
    sent_df['tweet_size'] = data['text'].apply(lambda x:len(x.split()))

    # sample 400,000 tweets (200k per class) so training runs on a smaller dataset
    negative = sent_df[(sent_df['tweet_size'] > 10) & (sent_df['target'] == 0)].sample(n=200000, random_state=SentConfig.SEED)
    positive = sent_df[(sent_df['tweet_size'] > 10) & (sent_df['target'] == 4)].sample(n=200000, random_state=SentConfig.SEED)
    sent_df_sample = pd.concat([negative, positive])  # DataFrame.append was removed in pandas 2.x

    # split dataset into train, test, validation set
    train, test = train_test_split(sent_df_sample, test_size=0.1)
    train, val = train_test_split(train, test_size=0.05)

    # wrap the splits in Dataset objects and DataLoaders to get batching from PyTorch
    train_dl = SentimentDL(train)
    val_dl = SentimentDL(val)
    test_dl = SentimentDL(test)

    train_loader = DataLoader(train_dl, batch_size=SentConfig.TRAIN_BATCH_SIZE, shuffle=True)
    validation_loader = DataLoader(val_dl, batch_size=SentConfig.VALID_BATCH_SIZE, shuffle=False)  # no need to shuffle for evaluation
    test_loader = DataLoader(test_dl, batch_size=SentConfig.VALID_BATCH_SIZE, shuffle=False)

    # select the cuda device if available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # create model object
    model = SentimentModel()
    model.to(device)

    # set up the optimizer and learning-rate scheduler

    # do not apply weight decay to bias and normalization parameters in AdamW
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']  # taken from https://huggingface.co/transformers/training.html
    # every other named parameter in model.named_parameters() keeps the default decay
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    # optim = AdamW(model.parameters(), lr=5e-5)
    optim = AdamW(optimizer_grouped_parameters, lr=5e-5)

    # learning-rate schedule: cosine decay with a 5% linear warmup
    num_train_steps = int((len(train_dl) / SentConfig.TRAIN_BATCH_SIZE) * SentConfig.EPOCHS)
    num_warmup_steps = int(0.05 * num_train_steps)
    scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps, num_train_steps)

    # training: keep the checkpoint with the best F1 score on the validation set

    scores = []
    best_f1 = 0

    for epoch in range(SentConfig.EPOCHS):
        _ = train_function(train_loader, model, optim, scheduler, device)
        _, results = evaluation_function(validation_loader, model, device)

        validation_f1 = round(f1_score(results[:, 1], results[:, 0]), 4)
        accuracy = round(accuracy_score(results[:, 1], results[:, 0]), 4)

        scores.append((validation_f1, accuracy))
        print('epoch num:', epoch, 'f1 score:', validation_f1, 'accuracy:', accuracy)
        if validation_f1 > best_f1:
            # save the model whenever the validation F1 improves
            torch.save(model.state_dict(), "SentimentModel.bin")
            # update the best F1 seen so far
            best_f1 = validation_f1

    # plot validation F1 and accuracy per epoch

    scores = np.array(scores)
    fig, ax = plt.subplots(1, 2, figsize=(14, 6))
    ax[0].plot(range(SentConfig.EPOCHS), scores[:, 0], 'r')
    ax[1].plot(range(SentConfig.EPOCHS), scores[:, 1])
    ax[0].set(xlabel='Epoch num', ylabel='F1 Score')
    ax[1].set(xlabel='Epoch num', ylabel='Accuracy')
    ax[0].set_title('validation set F1 score at each epoch')
    ax[1].set_title('validation set accuracy at each epoch')
    plt.show()

    # classification report and accuracy on the test predictions

    state_dict_ = torch.load('SentimentModel.bin', map_location=device)
    model = SentimentModel()
    model.load_state_dict(state_dict_)
    model.to(device)

    _, results = evaluation_function(test_loader, model, device, inference=True)
    print(classification_report(results[:, 1], results[:, 0]))
    print(round(accuracy_score(results[:, 1], results[:, 0]), 4))
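Example #3 also leans on a preprocess_text function that is not included. The version below is a guess at typical tweet cleaning (lowercasing, stripping URLs, mentions, and punctuation); the exact rules the author used are unknown.

import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # strip URLs
    text = re.sub(r'@\w+', ' ', text)                   # strip @mentions
    text = re.sub(r'[^a-z0-9\s]', ' ', text)            # drop punctuation and symbols
    return re.sub(r'\s+', ' ', text).strip()            # collapse runs of whitespace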