def train(lr=0.001):
    """Train the BiGRU tagger end-to-end: preprocess, fit, test, save weights.

    Args:
        lr: learning rate for the Adam optimizer (default 0.001, the value
            previously hard-coded here).

    Side effects:
        Writes the trained weights to ``trained_model.pt``.
    """
    (glove_pretrained, dataloaders, dataset_sizes, tbl, tagset,
     reverse_tagset, tag_definitions) = preprocess()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = BiGRU(glove_pretrained,
                MODEL_PARAMS['gru_hidden_dim'],
                MODEL_PARAMS['gru_num_layers'],
                len(tagset),
                MODEL_PARAMS['concat']).to(device)
    # ignore_index=-1 skips padding positions when computing the loss.
    criterion = nn.NLLLoss(ignore_index=-1)
    optimizer = optim.Adam(net.parameters(), lr=lr)
    train_model(device, net, dataloaders, dataset_sizes, criterion,
                optimizer, MODEL_PARAMS['num_epochs'])
    test(device, net, dataloaders['testing'])
    torch.save(net.state_dict(), 'trained_model.pt')
matched = torch.mul(matched.type(torch.FloatTensor), masks) correct += matched.sum() total += masks.sum() # print("correct prediction: {}; total prediction: {}".format(correct, total)) # print("sum comparison:", masks.sum(), seq_len.sum()) accuracy = correct / total return accuracy rnn = BiGRU(input_size, hidden_size, num_layers, num_classes, batch_size) rnn = rnn.cuda() if use_cuda else rnn print("Model loaded!!!!!!!!") criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate) for epoch in range(num_epochs): print("Epoch {} is running".format(epoch + 1)) for i, (name, features, masks, labels, seq_len) in enumerate(train_loader): features = Variable(features) labels = labels.view(-1) labels = Variable(labels.type(torch.LongTensor)) # print("labels size: {}".format(labels.size())) if use_cuda: features = features.cuda() labels = labesl.cuda() optimizer.zero_grad() outputs = rnn(features) # print("outputs size: {}".format(outputs.size()))
def train_model(args, train_text=None, train_labels=None, eval_text=None,
                eval_labels=None, tokenizer=None):
    """Train a GRU/LSTM (or transformer) text classifier and save the best model.

    Trains for ``args.num_train_epochs`` epochs, evaluates after each epoch
    (once ``args.num_clean_epochs`` have passed), checkpoints the best model
    by eval score, supports early stopping, and finally reloads the saved
    best model from disk to report its held-out score.

    Args:
        args: namespace of training hyperparameters and output paths.
        train_text / train_labels: training examples and labels (same length).
        eval_text / eval_labels: evaluation examples and labels (same length).
        tokenizer: tokenizer used to build dataloaders; saved at the end.

    Raises:
        ValueError: if text/label lengths disagree for either split.
    """
    textattack.shared.utils.set_seed(args.random_seed)
    _make_directories(args.output_dir)
    num_gpus = torch.cuda.device_count()

    # Mirror logger output to a file inside the output directory.
    log_txt_path = os.path.join(args.output_dir, "log.txt")
    fh = logging.FileHandler(log_txt_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info(f"Writing logs to {log_txt_path}.")

    train_examples_len = len(train_text)
    label_set = set(train_labels)
    args.num_labels = len(label_set)
    logger.info(
        f"Loaded dataset. Found: {args.num_labels} labels: {sorted(label_set)}"
    )

    if len(train_labels) != len(train_text):
        raise ValueError(
            f"Number of train examples ({len(train_text)}) does not match "
            f"number of labels ({len(train_labels)})")
    if len(eval_labels) != len(eval_text):
        # fix: message previously read "teste xamples"
        raise ValueError(
            f"Number of test examples ({len(eval_text)}) does not match "
            f"number of labels ({len(eval_labels)})")

    # NOTE(review): `device` is not defined in this function — presumably a
    # module-level global; confirm before refactoring. Also note no model is
    # built for other values of args.model, which would raise NameError below.
    if args.model == "gru":
        textattack.shared.logger.info(
            "Loading textattack model: GRUForClassification")
        model = BiGRU()
        model.to(device)
    elif args.model == "lstm":
        textattack.shared.logger.info(
            "Loading textattack model: LSTMForClassification")
        model = BiLSTM()
        model.to(device)

    # Multi-GPU training via DataParallel.
    if num_gpus > 1:
        model = torch.nn.DataParallel(model)
        logger.info("Using torch.nn.DataParallel.")
        logger.info(f"Training model across {num_gpus} GPUs")

    num_train_optimization_steps = (
        int(train_examples_len / args.batch_size / args.grad_accum_steps)
        * args.num_train_epochs)

    if args.model == "lstm" or args.model == "cnn" or args.model == "gru":
        # Recurrent/conv models: plain Adam over trainable parameters only.
        def need_grad(x):
            return x.requires_grad

        optimizer = torch.optim.Adam(filter(need_grad, model.parameters()),
                                     lr=args.learning_rate)
        scheduler = None
    else:
        # Transformer path: decoupled weight decay, skipping bias/LayerNorm,
        # plus a linear warmup schedule.
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,
            },
            {
                "params": [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = transformers.optimization.AdamW(
            optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = transformers.optimization.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_proportion,
            num_training_steps=num_train_optimization_steps,
        )

    # Start Tensorboard and log hyperparams.
    from torch.utils.tensorboard import SummaryWriter
    tb_writer = SummaryWriter(args.output_dir)

    # Use Weights & Biases, if enabled.
    if args.enable_wandb:
        global wandb
        wandb = textattack.shared.utils.LazyLoader("wandb", globals(), "wandb")
        wandb.init(sync_tensorboard=True)

    # Save original args to file.
    args_save_path = os.path.join(args.output_dir, "train_args.json")
    _save_args(args, args_save_path)
    logger.info(f"Wrote original training args to {args_save_path}.")
    tb_writer.add_hparams(
        {k: v for k, v in vars(args).items() if _is_writable_type(v)}, {})

    logger.info("***** Running training *****")
    logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tBatch size = {args.batch_size}")
    logger.info(f"\tMax sequence length = {args.max_length}")
    logger.info(f"\tNum steps = {num_train_optimization_steps}")
    logger.info(f"\tNum epochs = {args.num_train_epochs}")
    logger.info(f"\tLearning rate = {args.learning_rate}")

    eval_dataloader = _make_dataloader(tokenizer, eval_text, eval_labels,
                                       args.batch_size)
    train_dataloader = _make_dataloader(tokenizer, train_text, train_labels,
                                        args.batch_size)

    global_step = 0
    tr_loss = 0
    model.train()
    args.best_eval_score = 0
    args.best_eval_score_epoch = 0
    args.epochs_since_best_eval_score = 0

    def loss_backward(loss):
        # Average across GPUs under DataParallel, scale for grad accumulation.
        if num_gpus > 1:
            loss = loss.mean()
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()
        return loss

    loss_fct = torch.nn.CrossEntropyLoss()

    for epoch in tqdm.trange(int(args.num_train_epochs), desc="Epoch",
                             position=0, leave=True):
        prog_bar = tqdm.tqdm(train_dataloader, desc="Iteration",
                             position=0, leave=True)
        # Track training accuracy during classification.
        correct_predictions = 0
        total_predictions = 0
        for step, batch in enumerate(prog_bar):
            # Batch layout: two id/mask tensor pairs plus labels — a
            # sentence-pair classification input.
            ids1, ids2, msk1, msk2, labels = batch
            labels = labels.to(device)
            ids1 = ids1.to(device)
            ids2 = ids2.to(device)
            msk1 = msk1.to(device)
            msk2 = msk2.to(device)
            logits = model(ids1, ids2, msk1, msk2)
            loss = loss_fct(logits, labels)
            pred_labels = logits.argmax(dim=-1)
            correct_predictions += (pred_labels == labels).sum().item()
            total_predictions += len(pred_labels)
            loss = loss_backward(loss)
            tr_loss += loss.item()

            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar("loss", loss.item(), global_step)
                if scheduler is not None:
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0],
                                         global_step)
                else:
                    tb_writer.add_scalar("lr", args.learning_rate, global_step)
            if global_step > 0:
                prog_bar.set_description(f"Loss {tr_loss/global_step}")
            # Only step the optimizer every grad_accum_steps batches.
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()
            # Save model checkpoint to file.
            if (global_step > 0 and (args.checkpoint_steps > 0)
                    and (global_step % args.checkpoint_steps) == 0):
                _save_model_checkpoint(model, args.output_dir, global_step)
            global_step += 1

        # Print training accuracy, if we're tracking it.
        if total_predictions > 0:
            train_acc = correct_predictions / total_predictions
            logger.info(f"Train accuracy: {train_acc*100}%")
            tb_writer.add_scalar("epoch_train_score", train_acc, epoch)

        # Check accuracy after each epoch (skipping the first
        # args.num_clean_epochs epochs).
        if (epoch >= args.num_clean_epochs):
            eval_score = _get_eval_score(model, eval_dataloader, False)
            tb_writer.add_scalar("epoch_eval_score", eval_score, epoch)
            if args.checkpoint_every_epoch:
                # fix: was `args.global_step`, an attribute that is never set;
                # use the local step counter.
                _save_model_checkpoint(model, args.output_dir, global_step)
            logger.info(
                f"Eval {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
            )
            if eval_score > args.best_eval_score:
                args.best_eval_score = eval_score
                args.best_eval_score_epoch = epoch
                args.epochs_since_best_eval_score = 0
                _save_model(model, args.output_dir, args.weights_name,
                            args.config_name)
                logger.info(f"Best acc found. Saved model to {args.output_dir}.")
                _save_args(args, args_save_path)
                logger.info(f"Saved updated args to {args_save_path}")
            else:
                args.epochs_since_best_eval_score += 1
                if (args.early_stopping_epochs > 0) and (
                        args.epochs_since_best_eval_score >
                        args.early_stopping_epochs):
                    logger.info(
                        f"Stopping early since it's been {args.early_stopping_epochs} steps since validation acc increased"
                    )
                    break

        if args.check_robustness:
            # NOTE(review): `model_wrapper` and `attack_class` are not defined
            # at this point (the adversarial-training setup above was commented
            # out), so enabling this branch raises NameError — confirm intent.
            samples_to_attack = list(zip(eval_text, eval_labels))
            samples_to_attack = random.sample(samples_to_attack, 1000)
            adv_attack_results = _generate_adversarial_examples(
                model_wrapper, attack_class, samples_to_attack)
            attack_types = [r.__class__.__name__ for r in adv_attack_results]
            attack_types = collections.Counter(attack_types)
            adv_acc = 1 - (attack_types["SkippedAttackResult"] /
                           len(adv_attack_results))
            total_attacks = (attack_types["SuccessfulAttackResult"] +
                             attack_types["FailedAttackResult"])
            adv_succ_rate = attack_types[
                "SuccessfulAttackResult"] / total_attacks
            after_attack_acc = attack_types["FailedAttackResult"] / len(
                adv_attack_results)
            tb_writer.add_scalar("robustness_test_acc", adv_acc, global_step)
            tb_writer.add_scalar("robustness_total_attacks", total_attacks,
                                 global_step)
            tb_writer.add_scalar("robustness_attack_succ_rate", adv_succ_rate,
                                 global_step)
            tb_writer.add_scalar("robustness_after_attack_acc",
                                 after_attack_acc, global_step)
            logger.info(f"Eval after-attack accuracy: {100*after_attack_acc}%")

    # Read the saved best model back and report its eval performance.
    logger.info("Finished training. Re-loading and evaluating model from disk.")
    model_wrapper = model_from_args(args, args.num_labels)
    model = model_wrapper.model
    model.load_state_dict(
        torch.load(os.path.join(args.output_dir, args.weights_name)))
    eval_score = _get_eval_score(model, eval_dataloader, args.do_regression)
    logger.info(
        f"Saved model {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
    )

    if args.save_last:
        _save_model(model, args.output_dir, args.weights_name,
                    args.config_name)

    # End of training: save tokenizer (best-effort — not all tokenizers
    # implement save_pretrained).
    try:
        tokenizer.save_pretrained(args.output_dir)
        logger.info(f"Saved tokenizer {tokenizer} to {args.output_dir}.")
    except AttributeError:
        # fix: logger.warn is a deprecated alias of logger.warning
        logger.warning(
            f"Error: could not save tokenizer {tokenizer} to {args.output_dir}."
        )

    # Save a little readme with model info.
    write_readme(args, args.best_eval_score, args.best_eval_score_epoch)
    _save_args(args, args_save_path)
    tb_writer.close()
    logger.info(f"Wrote final training args to {args_save_path}.")
def main(options):
    """Train a BiGRU language model with per-epoch dev evaluation.

    Loads pickled train/dev/test splits and a vocab, trains with the
    configured optimizer, evaluates average dev loss each epoch, saves a
    model snapshot per epoch, and stops early when the dev-loss improvement
    drops below ``options.estop``.
    """
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    train, dev, test, vocab = torch.load(open(options.data_file, 'rb'),
                                         pickle_module=dill)

    batched_train, batched_train_mask, _ = utils.tensor.advanced_batchize(
        train, options.batch_size, vocab.stoi["<pad>"])
    batched_dev, batched_dev_mask, _ = utils.tensor.advanced_batchize(
        dev, options.batch_size, vocab.stoi["<pad>"])

    vocab_size = len(vocab)

    rnnlm = BiGRU(vocab_size, use_cuda=use_cuda)
    if use_cuda > 0:
        rnnlm.cuda()
    else:
        rnnlm.cpu()

    criterion = torch.nn.NLLLoss()
    # NOTE(review): eval() on a CLI-supplied optimizer name executes arbitrary
    # code; getattr(torch.optim, options.optimizer) would be safer.
    optimizer = eval("torch.optim." + options.optimizer)(rnnlm.parameters(),
                                                         options.learning_rate)

    # Main training loop.
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))
        # fix: re-enter training mode every epoch — the dev pass below leaves
        # the model in eval() mode, so epochs after the first used to train
        # with dropout/batch-norm frozen.
        rnnlm.train()
        # srange generates a lazy sequence of shuffled range
        for i, batch_i in enumerate(utils.rand.srange(len(batched_train))):
            train_batch = Variable(
                batched_train[batch_i])  # of size (seq_len, batch_size)
            train_mask = Variable(batched_train_mask[batch_i])
            train_in_mask = train_mask.view(-1)
            train_out_mask = train_mask.view(-1)
            if use_cuda:
                train_batch = train_batch.cuda()
                train_mask = train_mask.cuda()
                train_in_mask = train_in_mask.cuda()
                train_out_mask = train_out_mask.cuda()
            sys_out_batch = rnnlm(train_batch)  # (seq_len, batch_size, vocab_size)
            # Expand the flat mask so it can select whole vocab rows.
            train_in_mask = train_in_mask.unsqueeze(1).expand(
                len(train_in_mask), vocab_size)
            sys_out_batch = sys_out_batch.view(-1, vocab_size)
            train_out_batch = train_batch.view(-1)
            sys_out_batch = sys_out_batch.masked_select(train_in_mask).view(
                -1, vocab_size)
            train_out_batch = train_out_batch.masked_select(train_out_mask)
            loss = criterion(sys_out_batch, train_out_batch)
            logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation — a crude estimation because there might be some
        # paddings at the end.
        dev_loss = 0.0
        rnnlm.eval()
        for batch_i in range(len(batched_dev)):
            dev_batch = Variable(batched_dev[batch_i], volatile=True)
            dev_mask = Variable(batched_dev_mask[batch_i], volatile=True)
            dev_in_mask = dev_mask.view(-1)
            dev_out_batch = dev_batch.view(-1)
            if use_cuda:
                dev_batch = dev_batch.cuda()
                dev_mask = dev_mask.cuda()
                dev_in_mask = dev_in_mask.cuda()
                dev_out_batch = dev_out_batch.cuda()
            sys_out_batch = rnnlm(dev_batch)
            dev_in_mask = dev_in_mask.unsqueeze(1).expand(
                len(dev_in_mask), vocab_size)
            dev_out_mask = dev_mask.view(-1)
            sys_out_batch = sys_out_batch.view(-1, vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_in_mask).view(
                -1, vocab_size)
            dev_out_batch = dev_out_batch.masked_select(dev_out_mask)
            loss = criterion(sys_out_batch, dev_out_batch)
            dev_loss += loss
        dev_avg_loss = dev_loss / len(batched_dev)
        logging.info(
            "Average loss value per instance is {0} at the end of epoch {1}".
            format(dev_avg_loss.data[0], epoch_i))

        if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
            # fix: the first format argument used to be epoch_i, which was
            # printed where the threshold belongs.
            logging.info(
                "Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})"
                .format(options.estop, last_dev_avg_loss.data[0],
                        dev_avg_loss.data[0]))
            break
        torch.save(
            rnnlm,
            open(
                options.model_file +
                ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i),
                'wb'),
            pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss
def model_train_validate_test(train_df,
                              dev_df,
                              test_df,
                              embeddings_file,
                              vocab_file,
                              target_dir,
                              mode,
                              num_labels=2,
                              max_length=50,
                              epochs=50,
                              batch_size=128,
                              lr=0.0005,
                              patience=5,
                              max_grad_norm=10.0,
                              gpu_index=0,
                              if_save_model=False,
                              checkpoint=None):
    """Train a BiGRU classifier, validating each epoch and testing on improvement.

    Uses validation accuracy for LR scheduling and early stopping. Whenever
    validation accuracy improves, optionally checkpoints the model and writes
    test-set predictions to ``target_dir/test_prediction.csv``.

    Args:
        train_df / dev_df / test_df: dataframes for the three splits.
        embeddings_file: pretrained embedding file, or None to train from scratch.
        vocab_file: vocabulary file used by My_Dataset.
        target_dir: directory for checkpoints and predictions (created if absent).
        mode: dataset mode passed through to My_Dataset.
        num_labels, max_length, epochs, batch_size, lr: usual hyperparameters.
        patience: epochs without improvement before early stopping.
        max_grad_norm: gradient clipping norm passed to train().
        gpu_index: CUDA device index to use when available.
        if_save_model: whether to save the best checkpoint.
        checkpoint: optional path of a checkpoint to resume from.
    """
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints are saved.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = My_Dataset(train_df, vocab_file, max_length, mode)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = My_Dataset(dev_df, vocab_file, max_length, mode)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    if embeddings_file is not None:
        embeddings = load_embeddings(embeddings_file)
    else:
        embeddings = None
    model = BiGRU(embeddings, num_labels=num_labels, device=device).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print(f'{total_trainable_params:,} training parameters.')
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Optimize only the parameters that require gradients.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # Decay the LR when validation accuracy plateaus (mode="max" since the
    # scheduler is stepped with accuracy, not loss).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot.
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continue training from a checkpoint if one was given as argument.
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, _, = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training BiGRU model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, _, = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            if (if_save_model):
                torch.save(
                    {
                        "epoch": epoch,
                        "model": model.state_dict(),
                        # fix: the resume path above reads
                        # checkpoint["optimizer"], but the optimizer state was
                        # never saved — resuming used to raise KeyError.
                        "optimizer": optimizer.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses
                    }, os.path.join(target_dir, "best.pth.tar"))
                # fix: message typo "succesfully"
                print("save model successfully!\n")

            print("* Test for epoch {}:".format(epoch))
            _, _, test_accuracy, predictions = validate(
                model, test_loader, criterion)
            # fix: report a percentage like every other accuracy print
            # (validate() returns a fraction).
            print("Test accuracy: {:.4f}%\n".format(test_accuracy * 100))
            test_prediction = pd.DataFrame({'prediction': predictions})
            test_prediction.to_csv(os.path.join(target_dir,
                                                "test_prediction.csv"),
                                   index=False)
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break