Example #1
# neg_train_data_loader = DataLoader(
#     neg_train_dataset, batch_size=args.batch_size * args.duplicate, num_workers=args.num_workers, collate_fn=my_collate)

# test_data_loader = DataLoader(
#     test_dataset, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=my_collate)  # \
# neg_test_data_loader = DataLoader(
#     neg_test_dataset, batch_size=args.batch_size * args.duplicate, num_workers=args.num_workers, collate_fn=my_collate)  # \
# labeled_data_loader = DataLoader(
#     labeled_dataset, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=my_collate)
print("Building BERT model")
bert = BERT(len(vocab),
            hidden=args.hidden,
            n_layers=args.layers,
            attn_heads=args.attn_heads)

print("Creating BERT Trainer")
# trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader,
#                       lr=args.lr, betas=(
#                           args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
#                       with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq, pad_index=vocab.pad_index)
temp_data_loader = DataLoader(temp_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              collate_fn=my_collate)
trainer = ReconstructionBERTTrainer(bert,
                                    len(vocab),
                                    len(markdown_vocab),
                                    args.markdown_emb_size,
Example #2
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c",
                        "--train_dataset",
                        required=True,
                        type=str,
                        help="train dataset for train bert")
    parser.add_argument("-t",
                        "--test_dataset",
                        type=str,
                        default=None,
                        help="test set for evaluate train set")
    parser.add_argument("-v",
                        "--vocab_path",
                        required=True,
                        type=str,
                        help="built vocab model path with bert-vocab")
    parser.add_argument("-o",
                        "--output_path",
                        required=True,
                        type=str,
                        help="ex)output/bert.model")

    parser.add_argument("-hs",
                        "--hidden",
                        type=int,
                        default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l",
                        "--layers",
                        type=int,
                        default=8,
                        help="number of layers")
    parser.add_argument("-a",
                        "--attn_heads",
                        type=int,
                        default=8,
                        help="number of attention heads")
    parser.add_argument("-s",
                        "--seq_len",
                        type=int,
                        default=20,
                        help="maximum sequence len")

    parser.add_argument("-b",
                        "--batch_size",
                        type=int,
                        default=64,
                        help="number of batch_size")
    parser.add_argument("-e",
                        "--epochs",
                        type=int,
                        default=10,
                        help="number of epochs")
    parser.add_argument("-w",
                        "--num_workers",
                        type=int,
                        default=5,
                        help="dataloader worker size")

    parser.add_argument("--with_cuda",
                        type=bool,
                        default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--log_freq",
                        type=int,
                        default=10,
                        help="printing loss every n iter: setting n")
    parser.add_argument("--corpus_lines",
                        type=int,
                        default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices",
                        type=int,
                        nargs='+',
                        default=None,
                        help="CUDA device ids")
    parser.add_argument("--on_memory",
                        type=bool,
                        default=True,
                        help="Loading on memory: true or false")

    parser.add_argument("--lr",
                        type=float,
                        default=1e-3,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay",
                        type=float,
                        default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1",
                        type=float,
                        default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2",
                        type=float,
                        default=0.999,
                        help="adam first beta value")
    parser.add_argument(
        "--multi_segment",
        type=bool,
        default=False,
        help="whether use multiple segment_labels for entity types")
    parser.add_argument("--sep_label",
                        type=bool,
                        default=False,
                        help="whether to insert <sep>")

    args = parser.parse_args()

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset,
                                vocab,
                                seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory,
                                multi_segment=args.multi_segment,
                                sep=args.sep_label)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len,
                               on_memory=args.on_memory, multi_segment=args.multi_segment,
                               sep=args.sep_label) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building BERT model")
    bert = BERT(len(vocab),
                hidden=args.hidden,
                n_layers=args.layers,
                attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert,
                          len(vocab),
                          train_dataloader=train_data_loader,
                          test_dataloader=test_data_loader,
                          lr=args.lr,
                          betas=(args.adam_beta1, args.adam_beta2),
                          weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          log_freq=args.log_freq)

    print("Training Start")
    for epoch in range(args.epochs):
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)

        if test_data_loader is not None:
            trainer.test(epoch)
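

# --- usage sketch (not part of the original example) --------------------------
# train() takes no arguments and reads its whole configuration from the command
# line, so a minimal entry point would be the following; the script name and
# data paths are placeholders, not part of the original code:
#   python train.py -c corpus.train.txt -v vocab.pkl -o output/bert.model -t corpus.test.txt
if __name__ == "__main__":
    train()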
Example #3
    format = 'csv',
    skip_header = True,
    fields = rev_field
)

novel = TabularDataset(
    path = data_novelty_csv_path,
    format = 'csv',
    skip_header = True,
    fields = nov_field
)

review_iter = Iterator(review, batch_size=1, device=device, sort=False, sort_within_batch=False, repeat=False, shuffle=False)
novel_iter = Iterator(novel, batch_size=1, device=device, sort=False, sort_within_batch=False, repeat=False, shuffle=False)
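# batch_size=1: each iteration yields a single document whose encoder output is
# collected as a deep feature vector in the extraction loops below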

model = BERT(feature_len).to(device)

print("Computing deep features...")

review_features = []
for x in tqdm(review_iter):
    text = x.comment_text.type(torch.LongTensor)
    text = text.to(device)
    feature = model(text)
    review_features.append(feature.detach().cpu().numpy())
review_features = np.vstack(review_features)
print(review_features.shape)

novel_features = []
for x in tqdm(novel_iter):
    text = x.novel.type(torch.LongTensor)
Example #4
def get_models():
    from model import BERT
    return {'BERT': BERT()}
Example #5
def main(args):
    assert torch.cuda.is_available(), "need to use GPUs"

    use_cuda = torch.cuda.is_available()
    cuda_devices = list(map(int, args.cuda_devices.split(",")))
    is_multigpu = len(cuda_devices) > 1
    device = "cuda"

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    if is_multigpu:
        torch.cuda.manual_seed_all(args.seed)

    data = torch.load(args.data)
    dataset = BERTDataSet(data['word'], data['max_len'], data["dict"],
                          args.batch_size * args.steps)
    training_data = DataLoader(dataset,
                               batch_size=args.batch_size,
                               num_workers=args.num_cpus)

    model = BERT(dataset.word_size, data["max_len"], args.n_stack_layers,
                 args.d_model, args.d_ff, args.n_head, args.dropout)

    print(
        f"BERT has {sum(x.numel() for x in model.parameters())} parameters in total"
    )

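    # ScheduledOptim presumably applies the warmup learning-rate schedule keyed
    # on d_model and n_warmup_steps; it should wrap the Adam optimizer directly,
    # since torch.nn.DataParallel is meant for modules, not optimizers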
    optimizer = ScheduledOptim(
        torch.optim.Adam(model.get_trainable_parameters(),
                         lr=args.lr,
                         betas=(0.9, 0.999),
                         eps=1e-09,
                         weight_decay=0.01),
        args.d_model, args.n_warmup_steps)

    w_criterion = WordCrossEntropy()
    w_criterion = w_criterion.to(device)

    s_criterion = torch.nn.CrossEntropyLoss()

    model = model.to(device)
    model = torch.nn.DataParallel(model, device_ids=cuda_devices)
    model.train()
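    # pre-training loop: masked-word reconstruction loss plus sentence-level
    # classification loss, summed and backpropagated at every step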
    for step, datas in enumerate(training_data):
        inp, pos, sent_label, word_label, segment_label = list(
            map(lambda x: x.to(device), datas))
        sent_label = sent_label.view(-1)
        optimizer.zero_grad()
        word, sent = model(inp, pos, segment_label)
        w_loss, w_corrects, tgt_sum = w_criterion(word, word_label)
        s_loss = s_criterion(sent, sent_label)
        if is_multigpu:
            w_loss, s_loss = w_loss.mean(), s_loss.mean()
        loss = w_loss + s_loss
        loss.backward()
        optimizer.step()
        s_corrects = (torch.max(sent, 1)[1].data == sent_label.data).sum()

        print(
            f"[Step {step+1}/{args.steps}] [word_loss: {w_loss:.5f}, sent_loss: {s_loss:.5f}, loss: {loss:.5f}, w_pre: {w_corrects/tgt_sum*100:.2f}% {w_corrects}/{tgt_sum}, s_pre: {float(s_corrects)/args.batch_size*100:.2f}% {s_corrects}/{args.batch_size}]"
        )

        if tf is not None:
            add_summary_value("Word loss", w_loss, step)
            add_summary_value("Sent loss", s_loss, step)
            add_summary_value("Loss", loss, step)
            add_summary_value("Word predict", w_corrects / tgt_sum, step)
            add_summary_value("Sent predict",
                              float(s_corrects) / args.batch_size, step)
            tf_summary_writer.flush()
Example #6
        preds = output.argmax(dim=1)
        for j in range(len(preds)):
            total += 1
            if preds[j] == target[j]:
                total_correct += 1

    return total_correct / total


if __name__ == '__main__':
    mnli = BERTMNLI(TRAIN_DATA_DIR, bert_type=BERT_TYPE)
    match = BERTMNLI(MATCH_DATA_DIR, bert_type=BERT_TYPE)
    mismatch = BERTMNLI(MISMATCH_DATA_DIR, bert_type=BERT_TYPE)

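    # restore model and optimizer state from a previously saved checkpoint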
    checkpoint = torch.load('storage/bert-base-msrp.pt')
    model = BERT(bert_type=BERT_TYPE)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    ###

    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    criterion = nn.CrossEntropyLoss()

    best_acc = 0

    for epoch in range(1, NUM_EPOCHS + 1):
        train_loss = train(mnli, model, criterion, optimizer, device)
        match_acc = eval(match, model, device)
        mismatch_acc = eval(mismatch, model, device)
Example #7
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c",
                        "--train_dataset",
                        required=True,
                        type=str,
                        help="train dataset for train bert")
    parser.add_argument("-t",
                        "--test_dataset",
                        type=str,
                        default=None,
                        help="test set for evaluate train set")
    parser.add_argument("-v",
                        "--vocab_path",
                        required=True,
                        type=str,
                        help="built vocab model path with bert-vocab")
    parser.add_argument("-o",
                        "--output_path",
                        required=True,
                        type=str,
                        help="ex)output/bert.model")

    parser.add_argument("-hs",
                        "--hidden",
                        type=int,
                        default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l",
                        "--layers",
                        type=int,
                        default=8,
                        help="number of layers")
    parser.add_argument("-a",
                        "--attn_heads",
                        type=int,
                        default=8,
                        help="number of attention heads")
    parser.add_argument("-s",
                        "--seq_len",
                        type=int,
                        default=512,
                        help="maximum sequence len")

    parser.add_argument("-b",
                        "--batch_size",
                        type=int,
                        default=64,
                        help="number of batch_size")
    parser.add_argument("-e",
                        "--epochs",
                        type=int,
                        default=10,
                        help="number of epochs")
    parser.add_argument("-w",
                        "--num_workers",
                        type=int,
                        default=8,
                        help="dataloader worker size")

    parser.add_argument("--with_cuda",
                        type=bool,
                        default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--log_freq",
                        type=int,
                        default=500,
                        help="printing loss every n iter: setting n")
    parser.add_argument("--corpus_lines",
                        type=int,
                        default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices",
                        type=int,
                        nargs='+',
                        default=None,
                        help="CUDA device ids")
    parser.add_argument("--on_memory",
                        type=bool,
                        default=True,
                        help="Loading on memory: true or false")

    parser.add_argument("--lr",
                        type=float,
                        default=1e-3,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay",
                        type=float,
                        default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1",
                        type=float,
                        default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2",
                        type=float,
                        default=0.999,
                        help="adam first beta value")
    parser.add_argument("-restore_model_path",
                        "--restore_model_path",
                        type=str,
                        default=None,
                        help="trained model path")
    parser.add_argument("-restart_epoch",
                        "--restart_epoch",
                        type=int,
                        default=0,
                        help="restart epoch")

    args = parser.parse_args()

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset,
                                vocab,
                                seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len, on_memory=args.on_memory) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building BERT model")
    bert = BERT(len(vocab),
                hidden=args.hidden,
                n_layers=args.layers,
                attn_heads=args.attn_heads)

    print("Creating BERT Trainer")

    #import pdb;pdb.set_trace()

    trainer = BERTTrainer(bert,
                          len(vocab),
                          train_dataloader=train_data_loader,
                          test_dataloader=test_data_loader,
                          lr=args.lr,
                          betas=(args.adam_beta1, args.adam_beta2),
                          weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          log_freq=args.log_freq,
                          restore_model_path=args.restore_model_path)

    print("Training Start")
    loss_epoch_train = {}
    loss_epoch_test = {}
    best_loss = 1e10
    for epoch in range(args.epochs + args.restart_epoch):
        loss_train = trainer.train(epoch)
        loss_epoch_train[epoch] = loss_train

        if loss_train < best_loss:
            best_loss = loss_train
            trainer.save(epoch, args.output_path)

        if test_data_loader is not None:
            loss_test = trainer.test(epoch)
            loss_epoch_test[epoch] = loss_test
    print('to get embeddings use bert_pytorch/trainer/extract_embeddings.py')
    #trainer.extract(0)
    print(loss_epoch_train, loss_epoch_test)
Example #8
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)
bert_model = BertModel.from_pretrained("bert-base-uncased")
# tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1', do_lower_case=True)
# bert_model = AlbertModel.from_pretrained("albert-base-v1")

model = BERT(2, bert_model)
model = model.to(device)

train_dataloader, validation_dataloader, test_dataloader = get_baseline_dataloader(
    args.data_file, args.batch_size, tokenizer)

optimizer = AdamW(model.parameters(), lr=args.lr)
total_steps = len(train_dataloader) * args.epochs
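# one scheduler step per training batch; with the newer scheduler API, warm up
# over the first 10% of the total steps (num_warmup_steps below)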
if new_version:
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        #warmup_steps = 0, # Default value in run_glue.py
        num_training_steps=total_steps)
    #t_total = total_steps)
else: