import os

import torch
import torch.nn as nn

# VGG, load_model, train, evaluate, and save_checkpoint are project code
# defined elsewhere in this repository.


def run_train(opt, training_data_loader, validation_data_loader):
    if not os.path.exists(opt.checkpoint_dir):
        os.makedirs(opt.checkpoint_dir)
    log_file = os.path.join(opt.checkpoint_dir, 'vgg_log.csv')

    print('[Initialize networks for training]')
    net = VGG(opt)
    L2_criterion = nn.MSELoss()
    print(net)

    if opt.resume:
        opt.start_epoch, net = load_model(opt, opt.checkpoint_dir)
    else:
        # Start a fresh CSV log with a header row.
        with open(log_file, mode='w') as f:
            f.write('epoch,train_loss,train_acc,valid_loss,valid_acc\n')

    print('===> Setting GPU')
    print('CUDA Available:', torch.cuda.is_available())
    if opt.use_cuda and torch.cuda.is_available():
        opt.use_cuda = True
        opt.device = 'cuda'
    else:
        opt.use_cuda = False
        opt.device = 'cpu'

    if torch.cuda.device_count() > 1 and opt.multi_gpu:
        print('Use %d GPUs' % torch.cuda.device_count())
        net = nn.DataParallel(net)

    if opt.use_cuda:
        net = net.to(opt.device)
        L2_criterion = L2_criterion.to(opt.device)

    print('===> Setting Optimizer')
    optimizer = torch.optim.Adam(net.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2))

    for epoch in range(opt.start_epoch, opt.n_epochs):
        opt.epoch_num = epoch
        train_loss, train_acc = train(opt, net, optimizer, training_data_loader,
                                      loss_criterion=L2_criterion)
        valid_loss, valid_acc = evaluate(opt, net, validation_data_loader,
                                         loss_criterion=L2_criterion)
        # Append this epoch's metrics to the CSV log.
        with open(log_file, mode='a') as f:
            f.write('%d,%08f,%08f,%08f,%08f\n'
                    % (epoch, train_loss, train_acc, valid_loss, valid_acc))
        save_checkpoint(opt, net, epoch, valid_loss)
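# The CSV written by run_train can be read back later for plotting or analysis.
# A small self-contained sketch using only the standard library; the example
# path is illustrative and should point at opt.checkpoint_dir/vgg_log.csv.
import csv

def read_training_log(path):
    """Parse the vgg_log.csv written above into lists of floats per column."""
    with open(path) as f:
        reader = csv.DictReader(f, skipinitialspace=True)
        rows = list(reader)
    return {k: [float(r[k]) for r in rows] for k in reader.fieldnames}

# Example: log = read_training_log('./checkpoints/vgg_log.csv')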
if train == "red":  # condition reconstructed: this branch must precede the "blue" elif below
    params.optimizer = torch.optim.SGD(trainable_vars, lr=init_lr,
                                       momentum=momentum,
                                       weight_decay=weight_decay,
                                       nesterov=nesterov)
    # Train: drop the LR when the validation metric stops improving.
    params.lr_scheduler = ReduceLROnPlateau(params.optimizer, 'min',
                                            factor=lr_decay, patience=10,
                                            cooldown=10, verbose=True)
    trainer = trainer.RedTrainer(model, params, train_dataloader, val_dataloader)
    trainer.train()
elif train == "blue":
    # Set the loss function. reduce=True/size_average=True is deprecated;
    # reduction='mean' is the equivalent current spelling.
    params.criterion = nn.MSELoss(reduction='mean')
    # Load data.
    print("Loading dataset...")
    dataset = DataReader()
    # Scale the batch size with the number of GPUs in use.
    batch_size = batch_size if len(params.gpus) == 0 else batch_size * len(params.gpus)
    train_dataloader = DataLoader(dataset.get_training_set(),
                                  batch_size=batch_size, shuffle=True,
                                  num_workers=num_workers)
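# Note: ReduceLROnPlateau, unlike epoch-based schedulers, must be stepped with
# the monitored metric. A self-contained sketch of that pattern (dummy model
# and constant loss; RedTrainer's actual loop is not shown in this snippet):
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

_m = nn.Linear(4, 1)
_opt = torch.optim.SGD(_m.parameters(), lr=0.1, momentum=0.9)
_sched = ReduceLROnPlateau(_opt, 'min', factor=0.5, patience=10, cooldown=10)
for _epoch in range(40):
    _val_loss = 1.0          # stand-in for a real validation loss that has plateaued
    _sched.step(_val_loss)   # after `patience` flat epochs, the LR is halved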
train_begin = time.time()
for epoch in range(begin_epoch, h_params.max_epochs):
    train_queue = queue.Queue(h_params.workers * 2)
    train_loader = MultiLoader(train_dataset_list, train_queue,
                               h_params.batch_size, h_params.workers)
    train_loader.start()

    # Hard-coded schedule: at epoch 25, drop the LR and raise the
    # teacher-forcing ratio. Note that re-creating Adam here also resets
    # its running moment estimates.
    if epoch == 25:
        optimizer = optim.Adam(model.module.parameters(), lr=0.00005)
        h_params.teacher_forcing = 0.99

    train_loss, train_cer = train(model, train_batch_num, train_queue, criterion,
                                  optimizer, device, train_begin, h_params.workers,
                                  10, h_params.teacher_forcing)
    logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                % (epoch, train_loss, train_cer))
    train_loader.join()

    valid_queue = queue.Queue(h_params.workers * 2)
    valid_loader = BaseDataLoader(valid_dataset, valid_queue, h_params.batch_size, 0)
    valid_loader.start()
    eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue, criterion, device)
    logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                % (epoch, eval_loss, eval_cer))
    valid_loader.join()
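# Aside: if only the LR should change at epoch 25 (keeping Adam's state), a
# MultiStepLR achieves the same drop without rebuilding the optimizer.
# Self-contained sketch with dummy parameters; milestone and gamma are chosen
# to mimic the hard-coded values above, not taken from the original config:
import torch
from torch import optim

_params = [torch.nn.Parameter(torch.zeros(1))]
_optimizer = optim.Adam(_params, lr=1e-4)
_scheduler = optim.lr_scheduler.MultiStepLR(_optimizer, milestones=[25], gamma=0.5)
for _epoch in range(30):
    _optimizer.step()
    _scheduler.step()  # at epoch 25 the LR becomes 5e-5 (i.e. 0.00005)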
def main():
    # Setup.
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="./config/example.yaml")
    parser.add_argument("--gpu", default="0", type=str)
    # Path to checkpoint (empty string means the latest checkpoint)
    # or False (means training from scratch).
    parser.add_argument("--resume", default="", type=str)
    args = parser.parse_args()
    config, inner_dir, config_name = load_config(args.config)
    saved_dir = get_saved_dir(config, inner_dir, config_name, args.resume)
    storage_dir, ckpt_dir = get_storage_dir(config, inner_dir, config_name, args.resume)
    logger = get_logger(saved_dir, "adv_training.log", args.resume)

    # Prepare data.
    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ])
    test_transform = transforms.Compose([transforms.ToTensor()])
    train_data = cifar.CIFAR10(root=config["dataset_dir"], transform=train_transform)
    test_data = cifar.CIFAR10(root=config["dataset_dir"], train=False,
                              transform=test_transform)
    train_loader = DataLoader(train_data, batch_size=config["batch_size"],
                              shuffle=True, num_workers=4)
    test_loader = DataLoader(test_data, batch_size=config["batch_size"], num_workers=4)

    # Build the model, optimizer, and scheduler, then resume training state.
    model = resnet_cifar.ResNet18()
    gpu = int(args.gpu)
    logger.info("Set GPU to {}".format(args.gpu))
    model = model.cuda(gpu)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), **config["optimizer"]["SGD"])
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         **config["lr_scheduler"]["MultiStepLR"])
    resumed_epoch = resume_state(model, optimizer, args.resume, ckpt_dir, scheduler)

    # Set up the attack first, then prepend a normalization layer.
    pgd_config = {}
    for k, v in config["pgd_attack"].items():
        # "eps" and "alpha" are stored as strings (e.g. fractions such as
        # "8/255") and evaluated into floats here.
        if k == "eps" or k == "alpha":
            pgd_config[k] = eval(v)
        else:
            pgd_config[k] = v
    attacker = PGD(model, **pgd_config)
    normalize_net = NormalizeByChannelMeanStd((0.4914, 0.4822, 0.4465),
                                              (0.2023, 0.1994, 0.2010))
    normalize_net.cuda(gpu)
    model = nn.Sequential(normalize_net, model)

    for epoch in range(config["num_epochs"] - resumed_epoch):
        logger.info("===Epoch: {}/{}===".format(epoch + resumed_epoch + 1,
                                                config["num_epochs"]))
        logger.info("Adversarial training...")
        adv_train_result = train(model, train_loader, criterion, optimizer,
                                 logger, attacker=attacker)
        if scheduler is not None:
            scheduler.step()
            logger.info("Adjust learning rate to {}".format(
                optimizer.param_groups[0]["lr"]))
        logger.info("Test model on clean data...")
        clean_test_result = test(model, test_loader, criterion, logger)
        logger.info("Test model on adversarial data...")
        adv_test_result = test(model, test_loader, criterion, logger,
                               attacker=attacker)
        result = {
            "adv_train": adv_train_result,
            "clean_test": clean_test_result,
            "adv_test": adv_test_result,
        }

        # Save checkpoint.
        saved_dict = {
            "epoch": epoch + resumed_epoch + 1,
            "result": result,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
        }
        if scheduler is not None:
            saved_dict["scheduler_state_dict"] = scheduler.state_dict()
        torch.save(
            saved_dict,
            os.path.join(ckpt_dir, "epoch{}.pt".format(epoch + resumed_epoch + 1)),
        )
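# The PGD attacker used above is project code and not shown here. For
# reference, a minimal self-contained L-infinity PGD in the same spirit;
# the function name and the eps/alpha/steps defaults are illustrative, and
# the project's PGD class may differ in detail:
import torch
import torch.nn as nn

def pgd_attack(model, x, y, eps=8 / 255, alpha=2 / 255, steps=10):
    """Return adversarial examples within an eps-ball around x (inputs in [0, 1])."""
    # Random start inside the eps-ball.
    x_adv = (x.detach() + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1)
    for _ in range(steps):
        x_adv.requires_grad_(True)
        loss = nn.functional.cross_entropy(model(x_adv), y)
        grad = torch.autograd.grad(loss, x_adv)[0]
        with torch.no_grad():
            x_adv = x_adv + alpha * grad.sign()                 # ascend the loss
            x_adv = x.detach() + (x_adv - x.detach()).clamp(-eps, eps)  # project back
            x_adv = x_adv.clamp(0, 1)                           # stay in valid range
    return x_adv.detach()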
results_log = {'train_acc': [], 'train_loss': [], 'train_Lcls': [], 'train_Ldsne': [],
               'val_acc_A': [], 'val_acc_B': [], 'val_acc_C': [], 'val_acc_D': [],
               'val_loss_A': [], 'val_loss_B': [], 'val_loss_C': [], 'val_loss_D': [],
               # 'val_Lcls_A': [], 'val_Lcls_B': [], 'val_Lcls_C': [], 'val_Lcls_D': [],
               # 'val_Ldsne_A': [], 'val_Ldsne_B': [], 'val_Ldsne_C': [], 'val_Ldsne_D': []
               }

# train/val loop
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch + 1, num_epochs))
    print('-------------')
    for phase in ['train', 'val']:
        # train
        if phase == 'train':
            results_train = train(train_steps, extractor, classifier, loader_train,
                                  optimizer, train_criterion, device)
            results_log = log_dict(results_train, results_log, phase, None)
        # val: evaluate on each of the four held-out domains
        else:
            results_A_val = val(val_steps, extractor, classifier, loader_a_val,
                                val_criterion, device)
            results_log = log_dict(results_A_val, results_log, phase, 'A')
            results_B_val = val(val_steps, extractor, classifier, loader_b_val,
                                val_criterion, device)
            results_log = log_dict(results_B_val, results_log, phase, 'B')
            results_C_val = val(val_steps, extractor, classifier, loader_c_val,
                                val_criterion, device)
            results_log = log_dict(results_C_val, results_log, phase, 'C')
            results_D_val = val(val_steps, extractor, classifier, loader_d_val,
                                val_criterion, device)
            results_log = log_dict(results_D_val, results_log, phase, 'D')
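# log_dict is project code not shown in this excerpt. From how it is called
# above, it presumably appends each result into the matching list of
# results_log. A hypothetical sketch under that assumption; the 'acc'/'loss'
# keys of `results` are inferred, not confirmed by the original:
def log_dict(results, results_log, phase, domain):
    """Append acc/loss from `results` into results_log, keyed by phase and domain."""
    suffix = '' if domain is None else '_' + domain
    results_log[phase + '_acc' + suffix].append(results['acc'])
    results_log[phase + '_loss' + suffix].append(results['loss'])
    return results_log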
test_loader = DataLoader(test_set, shuffle=False, batch_size=len(test_set))

# Loss: pick the classification loss by name.
if args.loss.lower() == 'ce':
    loss_cls = nn.CrossEntropyLoss(weight=per_cls_weights).cuda()
elif args.loss.lower() == 'ldam':
    loss_cls = LDAMLoss(cls_num_list=cls_num_list, max_m=0.5, s=30,
                        weight=per_cls_weights).cuda()
elif args.loss.lower() == 'focal':
    loss_cls = FocalLoss(weight=per_cls_weights, gamma=1).cuda()

# TODO: Modify the train and evaluation functions.
for epoch in range(args.epochs):  # epoch loop elided in the original excerpt; args.epochs assumed
    # Training
    model = train(epoch, model, optim_model, loss_cls, loss_reg, train_loader, gpu, args)
    if args.LR_schedule:
        scheduler.step()

    # Save the stage-0 model.
    if epoch % args.print_epoch == 0:
        result = test(epoch, model, loss_cls, test_loader, gpu, args)
        if args.earlystop:
            early(result['loss'], model, result)
            if early.early_stop:
                break
        if args.print_test:
            # The original excerpt is truncated after the format string;
            # the argument tuple below is assumed from the fields available.
            print('Epoch : %d, Test Acc : %2.2f, Test Loss : %.2f'
                  % (epoch, result['acc'], result['loss']))
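# `early` is an early-stopping helper from the surrounding project. A minimal
# hypothetical version matching the call signature used above (the real class
# may track different state, e.g. also checkpointing the best model):
class EarlyStopping:
    def __init__(self, patience=10):
        self.patience = patience
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model, result):
        if val_loss < self.best_loss:
            self.best_loss = val_loss
            self.counter = 0  # improvement: reset the patience counter
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

# early = EarlyStopping(patience=10)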
if epoch == 0:
    # Before any training, record baseline validation and test performance.
    train_loss, train_cls_loss, train_dom_loss, train_cls_acc, train_dom_acc = 0, 0, 0, 0, 0
    val_loss, val_cls_loss, val_dom_loss, val_cls_acc, val_dom_acc = val(
        epoch, num_epochs, num_steps_val, net, loader_abc_test,
        cls_criterion, dom_criterion, device, hp_lambda, gamma)
    test_d_loss, test_d_acc = test(net, loader_d_test, cls_criterion, device)
else:
    for phase in ['train', 'val', 'test']:
        if phase == 'train':
            train_loss, train_cls_loss, train_dom_loss, train_cls_acc, train_dom_acc = train(
                epoch, num_epochs, num_steps, net, loader_abc_train,
                cls_criterion, dom_criterion, device, optimizer2, hp_lambda, gamma)
        elif phase == 'val':
            val_loss, val_cls_loss, val_dom_loss, val_cls_acc, val_dom_acc = val(
                epoch, num_epochs, num_steps_val, net, loader_abc_test,
                cls_criterion, dom_criterion, device, hp_lambda, gamma)
        else:
            test_d_loss, test_d_acc = test(net, loader_d_test, cls_criterion, device)
            print('test D loss: {:.3f}, acc: {:.3f}'.format(test_d_loss, test_d_acc))
            # Keep the checkpoint with the best accuracy on domain D.
            if test_d_acc > best_acc:
                torch.save(net.state_dict(),
                           os.path.join(output_dir, 'model_best_acc.pth'))
                best_acc = test_d_acc
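# To reuse the best checkpoint saved above, load it back into the same
# architecture. This is the standard PyTorch state_dict pattern; `net`,
# `output_dir`, and `device` are the objects from the surrounding script:
state = torch.load(os.path.join(output_dir, 'model_best_acc.pth'), map_location=device)
net.load_state_dict(state)
net.eval()  # switch to inference mode before evaluating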