Example #1
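# Imports assumed by this snippet (BERT, BERTDataSet, ScheduledOptim,
# WordCrossEntropy and the summary helpers are the example's own project modules):
import random

import numpy as np
import torch
from torch.utils.data import DataLoader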
def main(args):
    assert torch.cuda.is_available(), "need to use GPUs"

    cuda_devices = list(map(int, args.cuda_devices.split(",")))
    is_multigpu = len(cuda_devices) > 1
    device = "cuda"

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)  # seed the CPU RNG, not only CUDA
    torch.cuda.manual_seed(args.seed)
    if is_multigpu:
        torch.cuda.manual_seed_all(args.seed)

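    # args.data holds the preprocessed corpus (tokens, max_len, vocab); the
    # dataset serves args.batch_size * args.steps samples, so one pass over it
    # corresponds to args.steps optimizer updates.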
    data = torch.load(args.data)
    dataset = BERTDataSet(data['word'], data['max_len'], data["dict"],
                          args.batch_size * args.steps)
    training_data = DataLoader(dataset,
                               batch_size=args.batch_size,
                               num_workers=args.num_cpus)

    model = BERT(dataset.word_size, data["max_len"], args.n_stack_layers,
                 args.d_model, args.d_ff, args.n_head, args.dropout)

    print(
        f"BERT has {sum(x.numel() for x in model.parameters())} parameters in total"
    )

    # torch.nn.DataParallel wraps modules, not optimizers; the Adam instance
    # goes straight into the ScheduledOptim wrapper.
    optimizer = ScheduledOptim(
        torch.optim.Adam(model.get_trainable_parameters(),
                         lr=args.lr,
                         betas=(0.9, 0.999),
                         eps=1e-09,
                         weight_decay=0.01), args.d_model, args.n_warmup_steps)

    w_criterion = WordCrossEntropy()
    w_criterion = w_criterion.to(device)

    s_criterion = torch.nn.CrossEntropyLoss()

    model = model.to(device)
    model = torch.nn.DataParallel(model, device_ids=cuda_devices)
    model.train()
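    # One pass over args.steps batches; each batch carries both masked-word
    # (word_label) and next-sentence (sent_label) targets.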
    for step, batch in enumerate(training_data):
        inp, pos, sent_label, word_label, segment_label = list(
            map(lambda x: x.to(device), batch))
        sent_label = sent_label.view(-1)
        optimizer.zero_grad()
        word, sent = model(inp, pos, segment_label)
        w_loss, w_corrects, tgt_sum = w_criterion(word, word_label)
        s_loss = s_criterion(sent, sent_label)
        if is_multigpu:
            w_loss, s_loss = w_loss.mean(), s_loss.mean()
        loss = w_loss + s_loss
        loss.backward()
        optimizer.step()
        s_corrects = (torch.max(sent, 1)[1].data == sent_label.data).sum()

        print(
            f"[Step {step+1}/{args.steps}] [word_loss: {w_loss:.5f}, sent_loss: {s_loss:.5f}, loss: {loss:.5f}, w_pre: {w_corrects/tgt_sum*100:.2f}% {w_corrects}/{tgt_sum}, s_pre: {float(s_corrects)/args.batch_size*100:.2f}% {s_corrects}/{args.batch_size}]"
        )

        if tf is not None:
            add_summary_value("Word loss", w_loss, step)
            add_summary_value("Sent loss", s_loss, step)
            add_summary_value("Loss", loss, step)
            add_summary_value("Word predict", w_corrects / tgt_sum, step)
            add_summary_value("Sent predict",
                              float(s_corrects) / args.batch_size, step)
            tf_summary_writer.flush()
Example #2
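            # Tail of eval()'s accuracy loop: count each example and whether
            # its prediction matches the target.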
            total += 1
            if preds[j] == target[j]:
                total_correct += 1

    return total_correct/total


if __name__ == '__main__':
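    # MNLI training set plus the matched / mismatched evaluation splits.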
    mnli = BERTMNLI(TRAIN_DATA_DIR, bert_type=BERT_TYPE)
    match = BERTMNLI(MATCH_DATA_DIR, bert_type=BERT_TYPE)
    mismatch = BERTMNLI(MISMATCH_DATA_DIR, bert_type=BERT_TYPE)

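    # Restore the saved model weights (and, below, the optimizer state).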
    checkpoint = torch.load('storage/bert-base-dnli.pt')
    model = BERT(bert_type=BERT_TYPE)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)

    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    criterion = nn.CrossEntropyLoss()

    best_acc = 0

    for epoch in range(1, NUM_EPOCHS+1):
        train_loss = train(mnli, model, criterion, optimizer, device)
        match_acc = eval(match, model, device)
        mismatch_acc = eval(mismatch, model, device)
        print(f'Epoch {epoch}, Train Loss: {train_loss}, Match Acc: {match_acc}, Mismatch Acc: {mismatch_acc}')
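        # Assumed intent of best_acc above, which the source never updates:
        # track the best matched accuracy seen so far.
        if match_acc > best_acc:
            best_acc = match_acc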
Example #3
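# Imports assumed by this snippet (BERT and get_baseline_dataloader come from
# the example's own project):
import torch
from transformers import (AdamW, BertModel, BertTokenizer,
                          get_linear_schedule_with_warmup)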
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)
bert_model = BertModel.from_pretrained("bert-base-uncased")
# tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1', do_lower_case=True)
# bert_model = AlbertModel.from_pretrained("albert-base-v1")

model = BERT(2, bert_model)
model = model.to(device)

train_dataloader, validation_dataloader, test_dataloader = get_baseline_dataloader(
    args.data_file, args.batch_size, tokenizer)

optimizer = AdamW(model.parameters(), lr=args.lr)
total_steps = len(train_dataloader) * args.epochs
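# total_steps counts one optimizer update per batch across all epochs; 10% of
# those updates are spent warming the learning rate up.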
if new_version:
    # newer transformers releases name the arguments num_warmup_steps /
    # num_training_steps (run_glue.py defaults to zero warmup steps)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps)
else:
    # assumed completion of the truncated call: older releases exposed the
    # same schedule under the warmup_steps / t_total argument names
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        warmup_steps=int(0.1 * total_steps),
        t_total=total_steps)