Example #1
train_df = train_df.drop(['comment_text'], axis=1)
train_df['target'] = (train_df['target'] >= 0.5).astype(float)

valid_df = valid_df.fillna(0)
valid_df = valid_df.drop(['comment_text'], axis=1)
valid_df['target'] = (valid_df['toxic'] == 1) | (valid_df['severe_toxic'] == 1)
valid_df['target'] = valid_df['target'] | (valid_df['obscene'] == 1)
valid_df['target'] = valid_df['target'] | (valid_df['threat'] == 1)
valid_df['target'] = valid_df['target'] | (valid_df['insult'] == 1)
valid_df['target'] = valid_df['target'] | (valid_df['identity_hate'] == 1)
valid_df['target'] = valid_df['target'].astype(float)

model = BertForSequenceClassification(bert_config, num_labels=1)
model.load_state_dict(torch.load("./datas/bert_pytorch.bin"))
model.to(device)
for param in model.parameters():
    param.requires_grad = False

X = train_seqs[:]
y = train_df['target'].values[:]
valid_X = valid_seqs[:]
valid_y = valid_df['target'].values[:]
# stack training and validation examples along the sample axis (axis 0),
# matching the label concatenation below
X = np.concatenate((X, valid_X), axis=0)
y = np.concatenate((y, valid_y), axis=0)

train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X, dtype=torch.long), torch.tensor(y, dtype=torch.float))

output_model_file = "./datas/mybert.bin"
lr = 2e-5
batch_size = 32
        # running average of batch accuracy across the epoch
        # (y_pred and y_batch are the predictions and labels of the current batch)
        avg_accuracy += torch.mean(
            ((torch.sigmoid(y_pred[:, 0]) >= 0.5) == (y_batch >= 0.5).to(device)).to(
                torch.float)).item() / len(train_loader)
    tq.set_postfix(avg_loss=avg_loss, avg_accuracy=avg_accuracy)
    torch.save(model.state_dict(),
               output_model_file + '_epoch_' + str(epoch) + '.bin')

    # validate with the checkpoint saved for this epoch
    test_model = BertForSequenceClassification(bert_config,
                                               num_labels=len(y_columns))

    # load the checkpoint first, then wrap for multi-GPU, so the state_dict
    # keys do not need the 'module.' prefix
    test_model.load_state_dict(
        torch.load(output_model_file + '_epoch_' + str(epoch) + '.bin'))
    test_model.to(device)

    # parallelism
    test_model = nn.DataParallel(test_model)
    for param in test_model.parameters():
        param.requires_grad = False
    test_model.eval()
    valid_preds = np.zeros((len(X_val)))
    print(valid_preds.size)
    valid = torch.utils.data.TensorDataset(
        torch.tensor(X_val, dtype=torch.long))
    valid_loader = torch.utils.data.DataLoader(valid,
                                               batch_size=256,
                                               shuffle=False)

    tk0 = tqdm(valid_loader)
    for i, (x_batch, ) in enumerate(tk0):
        pred = test_model(x_batch.to(device),
                          attention_mask=(x_batch > 0).to(device),
                          labels=None)
        # store the raw logits for this batch of the validation set
        valid_preds[i * 256:(i + 1) * 256] = pred[:, 0].detach().cpu().numpy()
Example #3
def train_unfixed():
    # Configuration file
    cf = Config('./config.yaml')
    # Use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Training data
    train_data = NewsDataset("./data/cnews_final_train.txt", cf.max_seq_len)
    train_dataloader = DataLoader(train_data,
                                  batch_size=cf.batch_size,
                                  shuffle=True)
    # Test data
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data,
                                 batch_size=cf.batch_size,
                                 shuffle=True)

    # Model
    config = BertConfig("./output/pytorch_bert_config.json")
    model = BertForSequenceClassification(config, num_labels=cf.num_labels)
    model.load_state_dict(torch.load("./output/pytorch_model.bin"))

    # Use BertAdam as the optimizer
    for param in model.parameters():
        param.requires_grad = True
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    num_train_optimization_steps = int(
        len(train_data) / cf.batch_size) * cf.epoch
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=cf.lr,
                         t_total=num_train_optimization_steps)

    # Move the model to the target device
    model.to(device)

    # Run the model in parallel across GPUs
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Training
    start_time = time.time()

    total_batch = 0  # total number of batches processed
    best_acc_val = 0.0  # best validation accuracy so far
    last_improved = 0  # batch index of the last improvement
    require_improvement = 1500  # stop early after 1500 batches with no improvement

    # Get the current validation-set accuracy
    model.eval()
    _, best_acc_val = evaluate(model, test_dataloader, device)

    flag = False
    model.train()
    for epoch_id in range(cf.epoch):
        print("Epoch %d" % epoch_id)
        for step, batch in enumerate(
                tqdm(train_dataloader,
                     desc="batch",
                     total=len(train_dataloader))):
            # for step,batch in enumerate(train_dataloader):

            label_id = batch['label_id'].squeeze(1).to(device)
            word_ids = batch['word_ids'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            word_mask = batch['word_mask'].to(device)

            loss = model(word_ids, segment_ids, word_mask, label_id)
            # under multi-GPU DataParallel the forward pass returns one loss
            # per replica, so reduce to a scalar before backward()
            if torch.cuda.device_count() > 1:
                loss = loss.mean()

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_batch += 1

            if total_batch % cf.print_per_batch == 0:
                model.eval()
                with torch.no_grad():
                    loss_train, acc_train = get_model_loss_acc(
                        model, word_ids, segment_ids, word_mask, label_id)
                loss_val, acc_val = evaluate(model, test_dataloader, device)

                if acc_val > best_acc_val:
                    # Save the best result so far; unwrap DataParallel first so
                    # the checkpoint keys carry no 'module.' prefix
                    best_acc_val = acc_val
                    last_improved = total_batch

                    model_to_save = model.module if hasattr(model, 'module') else model
                    torch.save(model_to_save.state_dict(),
                               "./output/pytorch_model.bin")
                    with open("./output/pytorch_bert_config.json", 'w') as f:
                        f.write(model_to_save.config.to_json_string())

                    improved_str = "*"
                else:
                    improved_str = ""

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(
                    msg.format(total_batch, loss_train, acc_train, loss_val,
                               acc_val, time_dif, improved_str))

                model.train()

            if total_batch - last_improved > require_improvement:
                print("长时间未优化")
                flag = True
                break
        if flag:
            break
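The evaluate and get_model_loss_acc helpers referenced above are not shown in this example. A minimal sketch of what evaluate might look like, assuming the test batches are keyed exactly like the training batches, is:

def evaluate(model, data_loader, device):
    # Sketch only: mean cross-entropy loss and accuracy over a DataLoader
    # whose batches use the same keys as the training loop above.
    total_loss, total_correct, total_samples = 0.0, 0, 0
    with torch.no_grad():
        for batch in data_loader:
            label_id = batch['label_id'].squeeze(1).to(device)
            word_ids = batch['word_ids'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            word_mask = batch['word_mask'].to(device)
            # calling the model without labels returns the logits
            logits = model(word_ids, segment_ids, word_mask)
            loss = torch.nn.functional.cross_entropy(logits, label_id,
                                                     reduction='sum')
            total_loss += loss.item()
            total_correct += (logits.argmax(dim=-1) == label_id).sum().item()
            total_samples += label_id.size(0)
    return total_loss / total_samples, total_correct / total_samples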