import random

import numpy as np
import torch
from torch.utils.data import DataLoader

# BERTDataSet, BERT, ScheduledOptim and WordCrossEntropy are project-local modules;
# tf, tf_summary_writer and add_summary_value come from the optional TensorBoard
# logging setup defined alongside this script.


def main(args):
    assert torch.cuda.is_available(), "need to use GPUs"

    cuda_devices = list(map(int, args.cuda_devices.split(",")))
    is_multigpu = len(cuda_devices) > 1
    device = "cuda"

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    if is_multigpu:
        torch.cuda.manual_seed_all(args.seed)

    data = torch.load(args.data)
    dataset = BERTDataSet(data['word'], data['max_len'], data["dict"],
                          args.batch_size * args.steps)
    training_data = DataLoader(dataset,
                               batch_size=args.batch_size,
                               num_workers=args.num_cpus)

    model = BERT(dataset.word_size, data["max_len"], args.n_stack_layers,
                 args.d_model, args.d_ff, args.n_head, args.dropout)
    print(f"BERT has {sum(x.numel() for x in model.parameters())} parameters in total")

    # The scheduled optimizer wraps Adam directly; DataParallel is only for the model.
    optimizer = ScheduledOptim(
        torch.optim.Adam(model.get_trainable_parameters(),
                         lr=args.lr,
                         betas=(0.9, 0.999),
                         eps=1e-09,
                         weight_decay=0.01), args.d_model, args.n_warmup_steps)

    w_criterion = WordCrossEntropy()
    w_criterion = w_criterion.to(device)
    s_criterion = torch.nn.CrossEntropyLoss()

    model = model.to(device)
    model = torch.nn.DataParallel(model, device_ids=cuda_devices)
    model.train()

    for step, datas in enumerate(training_data):
        inp, pos, sent_label, word_label, segment_label = list(
            map(lambda x: x.to(device), datas))
        sent_label = sent_label.view(-1)

        optimizer.zero_grad()
        word, sent = model(inp, pos, segment_label)
        w_loss, w_corrects, tgt_sum = w_criterion(word, word_label)
        s_loss = s_criterion(sent, sent_label)
        if is_multigpu:
            w_loss, s_loss = w_loss.mean(), s_loss.mean()
        loss = w_loss + s_loss
        loss.backward()
        optimizer.step()

        s_corrects = (torch.max(sent, 1)[1].data == sent_label.data).sum()

        print(
            f"[Step {step+1}/{args.steps}] [word_loss: {w_loss:.5f}, sent_loss: {s_loss:.5f}, loss: {loss:.5f}, w_pre: {w_corrects/tgt_sum*100:.2f}% {w_corrects}/{tgt_sum}, s_pre: {float(s_corrects)/args.batch_size*100:.2f}% {s_corrects}/{args.batch_size}]"
        )

        if tf is not None:
            add_summary_value("Word loss", w_loss, step)
            add_summary_value("Sent loss", s_loss, step)
            add_summary_value("Loss", loss, step)
            add_summary_value("Word predict", w_corrects / tgt_sum, step)
            add_summary_value("Sent predict", float(s_corrects) / args.batch_size, step)
            tf_summary_writer.flush()
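# Usage sketch for main() above: a minimal argparse driver that wires up every
# args.* attribute the function reads. The flag names mirror those attributes;
# the default values are illustrative assumptions, not values from this repo.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="BERT pre-training")
    parser.add_argument("--data", type=str, required=True,
                        help="path to the preprocessed corpus saved with torch.save")
    parser.add_argument("--cuda_devices", type=str, default="0",
                        help="comma-separated GPU ids, e.g. '0,1'")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--steps", type=int, default=100000)
    parser.add_argument("--num_cpus", type=int, default=4, help="DataLoader workers")
    parser.add_argument("--n_stack_layers", type=int, default=12)
    parser.add_argument("--d_model", type=int, default=768)
    parser.add_argument("--d_ff", type=int, default=3072)
    parser.add_argument("--n_head", type=int, default=12)
    parser.add_argument("--dropout", type=float, default=0.1)
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--n_warmup_steps", type=int, default=10000)
    return parser.parse_args()


# main(parse_args()) would launch pre-training with the settings above.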
import os

import numpy as np
import sklearn.metrics
import torch
import torch.optim as optim

# BERT here is the project-local classification model; it returns (loss, logits).


def run_training_bert(args, dataset, train_loader, val_loader, vocab_size):
    checkpoint_path = os.path.join(args.checkpoint_path, args.checkpoint)
    device = torch.device("cuda:" + args.device if torch.cuda.is_available() else "cpu")
    model = BERT().to(device)

    # No external criterion is needed: the model computes its own loss.
    # criterion = nn.BCEWithLogitsLoss()

    # Set up the Adam optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)

    model.train()  # turn on training mode

    # Training loop
    print("Starting Training Loop...")
    for epoch in range(args.epochs):
        losses = []
        running_corrects = 0
        # For each batch in the dataloader
        for i, batch in enumerate(train_loader):
            # format batch
            text, context, label = batch.text, batch.context, batch.label
            # print(text.tolist()[0])
            # print(label.tolist()[0])
            label = label.type(torch.LongTensor).to(device)
            text = text.type(torch.LongTensor).to(device)

            output = model(text, label)
            loss, _ = output

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            losses.append(loss.item())

        epoch_loss = sum(losses) / len(losses)
        print('Epoch: {}, Training Loss: {:.4f}'.format(epoch, epoch_loss))

        # save model
        if epoch % 1 == 0 or epoch == args.epochs - 1:
            torch.save(
                {
                    'epoch': epoch + 1,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'vocab_size': vocab_size,
                    'args': vars(args)
                }, checkpoint_path)

        if args.eval:
            model.eval()
            with torch.no_grad():
                preds = []
                labels = []
                eval_losses = []
                for i, batch in enumerate(
                        val_loader if val_loader is not None else train_loader):
                    text, context, label = batch.text, batch.context, batch.label
                    label = label.type(torch.LongTensor).to(device)
                    text = text.type(torch.LongTensor).to(device)

                    output = model(text, label)
                    loss, output = output

                    pred = torch.argmax(output, 1).tolist()
                    preds.extend(pred)
                    labels.extend(label.tolist())
                    eval_losses.append(loss.item())

            print("{} Precision: {}, Recall: {}, F1: {}, Loss: {}".format(
                "Train" if val_loader is None else "Valid",
                sklearn.metrics.precision_score(
                    np.array(labels).astype('int32'), np.array(preds)),
                sklearn.metrics.recall_score(
                    np.array(labels).astype('int32'), np.array(preds)),
                sklearn.metrics.f1_score(
                    np.array(labels).astype('int32'), np.array(preds)),
                np.average(eval_losses)))

            model.train()  # switch back to training mode for the next epoch
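# Sketch of restoring a checkpoint written by run_training_bert() above, e.g. to
# resume training. The dictionary keys match the ones saved there; constructing
# BERT() with no arguments mirrors the call above and is otherwise an assumption.
def load_checkpoint(checkpoint_path, device):
    checkpoint = torch.load(checkpoint_path, map_location=device)

    model = BERT().to(device)
    model.load_state_dict(checkpoint['model_state_dict'])

    optimizer = optim.Adam(model.parameters())
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # 'epoch' was saved as (last finished epoch + 1), so training can resume here
    start_epoch = checkpoint['epoch']
    return model, optimizer, start_epoch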
        num_training_steps=total_steps)
        # t_total = total_steps)
else:
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        # num_warmup_steps = 0,
        warmup_steps=int(0.1 * total_steps),  # Default value in run_glue.py
        # num_training_steps = total_steps)
        t_total=total_steps)

loss_values = []
best_eval_acc = 0
test_acc = 0

for epoch_i in range(0, args.epochs):
    total_loss = 0
    model.train()

    # For each batch of training data...
    for step, batch in tqdm(enumerate(train_dataloader)):
        model.train()

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        outputs = model(b_input_ids, b_input_mask, b_labels)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
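# The keyword arguments toggled above correspond to two generations of the
# HuggingFace scheduler API: newer `transformers` releases provide
# get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps),
# while the older `pytorch_transformers` package used
# WarmupLinearSchedule(optimizer, warmup_steps=..., t_total=...).
# Below is a version-tolerant sketch, assuming the 10% warmup ratio used in the
# else-branch above; it is illustrative and not part of the original script.
def build_linear_warmup_scheduler(optimizer, total_steps, warmup_ratio=0.1):
    warmup_steps = int(warmup_ratio * total_steps)
    try:
        # transformers >= 2.x style
        from transformers import get_linear_schedule_with_warmup
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps)
    except ImportError:
        # older pytorch_transformers style
        from pytorch_transformers import WarmupLinearSchedule
        return WarmupLinearSchedule(
            optimizer, warmup_steps=warmup_steps, t_total=total_steps)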