def main():
    transformations = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    train_dataset = CifarDataset(TRAIN_CSV_PATH, TRAIN_IMG_PATH, transformations)
    train_loader = CifarDataloader(train_dataset, batch_size=BATCH_SIZE,
                                   shuffle=True, num_workers=2)

    test_dataset = CifarDataset(TEST_CSV_PATH, TEST_IMG_PATH, transformations)
    # No need to shuffle the test set; shuffle=False keeps evaluation reproducible.
    test_loader = CifarDataloader(test_dataset, batch_size=BATCH_SIZE,
                                  shuffle=False, num_workers=2)

    # torchvision's resnet50 rejects pretrained=True together with a custom
    # num_classes, so load the ImageNet weights first and then swap the head
    # (assuming resnet50 here is torchvision's; a custom wrapper may differ).
    model = resnet50(pretrained=True)
    model.fc = nn.Linear(model.fc.in_features, 10)
    criterion = nn.CrossEntropyLoss()
    if USE_GPU:
        model = model.cuda()
        criterion = criterion.cuda()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # load_checkpoint(os.path.join('checkpoint', 'last_checkpoint.pth.tar'), model, optimizer)

    for epoch in range(EPOCHS):
        train(train_loader, model, criterion, optimizer, epoch + 1, USE_GPU)
        test(test_loader, model, USE_GPU)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, os.path.join('checkpoint'))
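# `save_checkpoint`/`load_checkpoint` are not defined in this file; below is a
# minimal sketch consistent with the call sites above. The argument order and
# the 'last_checkpoint.pth.tar' default are assumptions, not the repo's actual API.
import os
import torch


def save_checkpoint(state, checkpoint_dir, filename='last_checkpoint.pth.tar'):
    """Serialize a training-state dict to checkpoint_dir/filename."""
    os.makedirs(checkpoint_dir, exist_ok=True)
    torch.save(state, os.path.join(checkpoint_dir, filename))


def load_checkpoint(path, model, optimizer):
    """Restore model/optimizer state saved by save_checkpoint; returns the last epoch."""
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint.get('epoch', 0)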
def log_epoch_info(self, epoch, train_res, eval_res, epoch_elapsed):
    self.tb_sw.add_scalars(
        "epoch",
        {
            "train_loss": train_res["task_loss"],
            "train_acc": train_res["top1_acc"],
            "eval_loss": eval_res["task_loss"],
            "eval_acc": eval_res["top1_acc"],
            "lr": self.lr,
            "elapsed_time": epoch_elapsed,
        },
        global_step=epoch,
    )
    self.t_log.append([
        epoch,
        train_res["task_loss"], train_res["top1_acc"],
        eval_res["task_loss"], eval_res["top1_acc"],
        self.lr,
    ])
    self.logger.info(
        "FIN Epoch %(epoch)d/%(epochs)d LR: %(lr)f | "
        "Train Loss: %(tloss).4f Acc: %(tacc).2f | "
        "Eval Loss: %(eloss).4f Acc: %(eacc).2f | "
        "Took %(dt).1fs (%(tdt).1fs)",
        {
            "epoch": epoch,
            "epochs": self.epochs,
            "lr": self.lr,
            "tloss": train_res["task_loss"],
            "tacc": train_res["top1_acc"],
            "eloss": eval_res["task_loss"],
            "eacc": eval_res["top1_acc"],
            "dt": epoch_elapsed,
            "tdt": time() - self.exp_start,
        },
    )

    is_best = eval_res["top1_acc"] > self.best_acc1
    self.best_acc1 = max(eval_res["top1_acc"], self.best_acc1)
    state_dict = self.model.state_dict()
    if self.gpu_ids and len(self.gpu_ids) > 1:
        # unwrap the torch.nn.DataParallel
        state_dict = list(self.model.children())[0].state_dict()
    save_checkpoint(
        {
            "epoch": epoch + 1,
            "state_dict": state_dict,
            "acc": eval_res["top1_acc"],
            "best_acc1": self.best_acc1,
            "optim_state_dict": self.optimizer.state_dict(),
        },
        is_best,
        checkpoint_dir=self.config["chkpt_dir"],
    )
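# The save_checkpoint used by this trainer takes an is_best flag and keyword
# filenames; a hedged sketch of that variant. The model_best.pth.tar default
# and the copy-on-best convention are assumptions inferred from the call sites.
import os
import shutil
import torch


def save_checkpoint(state, is_best, checkpoint_dir,
                    filename="checkpoint.pth.tar",
                    best_filename="model_best.pth.tar"):
    os.makedirs(checkpoint_dir, exist_ok=True)
    path = os.path.join(checkpoint_dir, filename)
    torch.save(state, path)
    best_path = os.path.join(checkpoint_dir, best_filename)
    # Guard against filename == best_filename, which some call sites pass
    if is_best and best_path != path:
        shutil.copyfile(path, best_path)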
def run(self):
    modules = list(self.model.modules())
    # Construct the model with a smaller architecture
    if isinstance(modules[0], VGG):
        # VGG input RGB channels are never masked
        binary_masks = [torch.Tensor([1, 1, 1])]
        make_layers_config, pack_model = MaskablePackingAgent.gen_vgg_make_layers(
            modules, binary_masks)
        self.logger.info("Packed Model make_layers list: %s", make_layers_config)
        MaskablePackingAgent.transfer_vgg_parameters(self.model, pack_model,
                                                     binary_masks)
        self.logger.info("Packed model: %s", pack_model)
        num_params = sum(p.numel() for p in pack_model.parameters())
        num_lrn_p = sum(p.numel() for p in pack_model.parameters()
                        if p.requires_grad)
        self.logger.info(
            "Num Parameters: %(params)d (%(lrn_params)d requires gradient)",
            {"params": num_params, "lrn_params": num_lrn_p},
        )
        save_checkpoint(
            {
                "make_layers": make_layers_config,
                "state_dict": pack_model.state_dict(),
                "params": num_params,
                "lrn_params": num_lrn_p,
            },
            False,
            checkpoint_dir=self.config["chkpt_dir"],
            filename="vgg-pack-{:.2e}.pth.tar".format(num_params),
        )
    else:
        # Unlike logging calls, exceptions do not lazily %-format their
        # arguments, so build the message explicitly.
        raise NotImplementedError(
            "Cannot pack sparse module: %s" % type(modules[0]))
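# gen_vgg_make_layers itself is repo-specific; for intuition only, a toy sketch
# of how per-layer binary masks translate into a packed channel configuration.
# packed_channels is a hypothetical helper, not part of the codebase.
import torch


def packed_channels(binary_masks):
    # Each mask is a 0/1 tensor over a layer's output channels; the packed
    # layer keeps only the channels whose mask entry is 1.
    return [int(mask.sum().item()) for mask in binary_masks]


masks = [
    torch.Tensor([1, 1, 1]),     # input RGB channels, never masked
    torch.Tensor([1, 0, 1, 1]),  # one of four conv channels pruned away
]
print(packed_channels(masks))    # -> [3, 3]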
def Train(Model, args):
    writer = SummaryWriter()  # assumed here; `writer` is used below but never defined in the original
    Nd = args.Nd
    beta1_Adam = args.beta1
    beta2_Adam = args.beta2
    if args.cuda:
        Model.cuda()

    optimizer = optim.Adam(Model.parameters(), lr=args.lr,
                           betas=(beta1_Adam, beta2_Adam))
    # optimizer = optim.SGD(Model.parameters(), lr=args.lr)
    Model.train()
    steps = 0
    CUDNN.benchmark = True

    for epoch in range(args.start_epoch, args.epochs + 1):
        if args.step_learning:
            adjust_learning_rate(optimizer, epoch, args)

        transformed_dataset = FaceIdPoseDataset(
            args.train_csv_file,
            transform=transforms.Compose([
                transforms.Resize(256),
                transforms.RandomCrop(224),
                transforms.ToTensor(),
            ]))
        dataloader = DataLoader(transformed_dataset,
                                batch_size=args.Train_Batch, shuffle=True)

        for i, batch_data in enumerate(dataloader):
            # backward() accumulates gradients, so clear them between minibatches
            Model.zero_grad()
            batch_image = batch_data[0].float()
            batch_id_label = batch_data[2]
            if args.cuda:
                batch_image = batch_image.cuda()
                batch_id_label = batch_id_label.cuda()
            steps += 1

            Prediction = Model(batch_image)
            Loss = Model.ID_Loss(Prediction, batch_id_label)
            Loss.backward()
            optimizer.step()

            log_learning(epoch, steps, 'VGG16_Model', args.lr, Loss.item(), args)
            writer.add_scalar('Train/Train_Loss', Loss.item(), steps)

        Validation_Process(Model, epoch, writer, args)

        if epoch % args.save_freq == 0:
            if not os.path.isdir(args.snapshot_dir):
                os.makedirs(args.snapshot_dir)
            save_path = os.path.join(args.snapshot_dir, 'epoch{}.pt'.format(epoch))
            torch.save(Model.state_dict(), save_path)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'Model': Model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                save_dir=os.path.join(args.snapshot_dir, 'epoch{}'.format(epoch)))

    # export scalar data to JSON for external processing
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
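# log_learning is called throughout these Train functions but never shown; a
# minimal sketch matching the call signature. The message format and the
# learning_log.txt destination are assumptions.
import os


def log_learning(epoch, steps, model_name, lr, loss, args):
    text = "EPOCH: {}, step: {}, model: {}, lr: {}, loss: {}".format(
        epoch, steps, model_name, lr, loss)
    print(text)
    os.makedirs(args.snapshot_dir, exist_ok=True)
    with open(os.path.join(args.snapshot_dir, 'learning_log.txt'), 'a') as f:
        f.write(text + '\n')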
def train_single_fnm(D_model, G_model, C_model, args):
    writer = SummaryWriter()
    D_lr = args.lr
    G_lr = args.lr
    beta1_Adam = args.beta1
    beta2_Adam = args.beta2

    # Define the device unconditionally so the .to(device) calls below also
    # work in CPU mode (the original left `device` undefined without CUDA).
    device = torch.device("cuda:0" if args.cuda and torch.cuda.is_available() else "cpu")
    D_model.to(device)
    G_model.to(device)
    C_model.to(device)

    optimizer_D = optim.Adam(D_model.parameters(), lr=D_lr,
                             betas=(beta1_Adam, beta2_Adam),
                             weight_decay=args.lambda_reg)
    optimizer_G = optim.Adam(G_model.parameters(), lr=G_lr,
                             betas=(beta1_Adam, beta2_Adam),
                             weight_decay=args.lambda_reg)
    if args.resume:
        checkpoint = torch.load(args.resume)
        optimizer_D.load_state_dict(checkpoint['optimizer_D'])
        optimizer_G.load_state_dict(checkpoint['optimizer_G'])

    steps = 0
    CUDNN.benchmark = True

    for epoch in range(args.start_epoch, args.epochs + 1):
        D_model.train()
        G_model.train()
        C_model.eval()

        # Load augmented data
        profile_dataset = FaceIdPoseDataset(
            args.profile_list, args.data_place,
            transform=transforms.Compose([
                transforms.Resize(250),
                transforms.RandomCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
            ]))
        front_dataset = FaceIdPoseDataset(
            args.front_list, args.data_place,
            transform=transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
            ]))
        profile_dataloader = DataLoader(profile_dataset, batch_size=args.batch_size,
                                        shuffle=True)  # , num_workers=6)
        front_dataloader = DataLoader(front_dataset, batch_size=args.batch_size,
                                      shuffle=True)  # , num_workers=6)

        # Draw front batches alongside profile batches. The original called
        # next(iter(...)) on every step, which rebuilds (and reshuffles) both
        # loaders each iteration; a persistent iterator avoids that.
        front_iter = iter(front_dataloader)
        for batch_profile, imageName_profile in profile_dataloader:
            try:
                batch_front, imageName_front = next(front_iter)
            except StopIteration:
                front_iter = iter(front_dataloader)
                batch_front, imageName_front = next(front_iter)

            # Map the [-1, 1] normalized tensors back to [0, 255]
            batch_profile = ((batch_profile + 1) * 127.5).to(device)
            batch_front = ((batch_front + 1) * 127.5).to(device)
            steps += 1

            # ------------------ Discriminator updates ------------------
            enable_gradients(D_model)
            disable_gradients(C_model)
            disable_gradients(G_model)
            # Warm up the critic with extra updates at the start of training
            if steps < 25 and epoch == 1:
                critic = 25
            else:
                critic = args.num_critic_D
            for _ in range(critic):
                D_model.zero_grad()
                # Encoder feature maps of the real images
                _, Front_FeaMap = C_model(batch_front)
                _, Profile_FeaMap = C_model(batch_profile)
                gen_f = G_model(Front_FeaMap)
                gen_p = G_model(Profile_FeaMap)
                # Map each image to a single critic score
                syn_f_gan = D_model(gen_f)
                syn_p_gan = D_model(gen_p)
                real_gan = D_model(batch_front)

                # Gradient penalty on random interpolates (WGAN-GP)
                gp_alpha = torch.FloatTensor(batch_front.size(0), 1, 1, 1).to(device)
                gp_alpha.uniform_()
                interpolates = gp_alpha * gen_p.data + (1 - gp_alpha) * batch_front.data
                # requires_grad_() turns on gradient tracking for the tensor
                interpolates = interpolates.to(device).requires_grad_()

                Loss, Wdis, GP = D_model.CriticWithGP_Loss(
                    syn_f_gan, syn_p_gan, real_gan, interpolates)
                L_D = Loss
                L_D.backward()
                optimizer_D.step()

                writer.add_scalar('Discriminator/Gradient-Penalty', GP, steps)
                writer.add_scalar('Discriminator/Wasserstein-Distance', Wdis, steps)
                writer.add_scalar('Discriminator/D-LOSS', Loss, steps)
                log_learning(epoch, steps, 'D', D_lr, L_D.item(), args)

            # -------------------- Generator updates --------------------
            enable_gradients(G_model)
            disable_gradients(D_model)
            disable_gradients(C_model)
            for _ in range(args.num_critic_G):
                G_model.zero_grad()
                # Loss functions:
                # 1. Pixel-wise loss: front-to-front reconstruction
                # 2. Perceptual loss: feature distance in the pretrained face model's space
                # 3. Regularization loss: L2 weight decay (already handled via optim.Adam)
                # 4. Adversarial loss: Wasserstein distance
                # 5. Symmetric loss: NOT APPLIED
                # 6. Drift loss: NOT APPLIED
                # 7. Gradient penalty loss: gradient penalty for the discriminator

                # Encoder feature maps of the real images
                Front_Fea, Front_FeaMap = C_model(batch_front)
                Profile_Fea, Profile_FeaMap = C_model(batch_profile)
                # Synthesized images and their features
                gen_f = G_model(Front_FeaMap)
                gen_p = G_model(Profile_FeaMap)
                Front_Syn_Fea, _ = C_model(gen_f)
                Profile_Syn_Fea, _ = C_model(gen_p)
                # Critic scores of the synthesized images
                syn_f_gan = D_model(gen_f)
                syn_p_gan = D_model(gen_p)

                # Frontalization loss: L1 norm between synthesis and target
                L1 = G_model.L1Loss(gen_f, batch_front)  # (input, target)
                # Feature loss: cosine / L2 distance
                L2 = G_model.L2Loss(Front_Syn_Fea, Front_Fea,
                                    Profile_Syn_Fea, Profile_Fea)
                # Adversarial loss
                L_Gen = G_model.GLoss(syn_f_gan, syn_p_gan)
                # L2 regularization is already applied through the optimizers' weight_decay
                L_G = args.lambda_l1 * L1 + args.lambda_fea * L2 + args.lambda_gan * L_Gen
                L_G.backward()
                optimizer_G.step()

                writer.add_scalar('Generator/Pixel-Wise-Loss', L1, steps)
                writer.add_scalar('Generator/Perceptual-Loss', L2, steps)
                writer.add_scalar('Generator/Adversarial-Loss', L_Gen, steps)
                # Log the total generator loss here (the original logged L_Gen twice)
                writer.add_scalar('Generator/G-LOSS', L_G, steps)
                log_learning(epoch, steps, 'G', G_lr, L_G.item(), args)

            if steps % 500 == 0:
                x_r = vutils.make_grid(batch_front, normalize=True, scale_each=True)
                y_r = vutils.make_grid(batch_profile, normalize=True, scale_each=True)
                x_f = vutils.make_grid(gen_f, normalize=True, scale_each=True)
                y_f = vutils.make_grid(gen_p, normalize=True, scale_each=True)
                writer.add_image('Image/Front-Real', x_r, steps)
                writer.add_image('Image/Front-Generated', x_f, steps)
                writer.add_image('Image/Profile-Real', y_r, steps)
                writer.add_image('Image/Profile-Generated', y_f, steps)
                for name, batch in [('FrontInput', batch_front),
                                    ('FrontSynthesized', gen_f),
                                    ('ProfileInput', batch_profile),
                                    ('ProfileSynthesized', gen_p)]:
                    save_path_image = os.path.join(
                        args.snapshot_dir, 'epoch{}_{}.jpg'.format(epoch, name))
                    torchvision.utils.save_image(batch, save_path_image,
                                                 normalize=True, scale_each=True)

        if epoch % args.save_freq == 0:
            if not os.path.isdir(args.snapshot_dir):
                os.makedirs(args.snapshot_dir)
            save_path_D = os.path.join(args.snapshot_dir, 'epoch{}_D.pt'.format(epoch))
            torch.save(D_model.state_dict(), save_path_D)
            save_path_G = os.path.join(args.snapshot_dir, 'epoch{}_G.pt'.format(epoch))
            torch.save(G_model.state_dict(), save_path_G)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'D_model': D_model.state_dict(),
                    'optimizer_D': optimizer_D.state_dict(),
                    'G_model': G_model.state_dict(),
                    'optimizer_G': optimizer_G.state_dict(),
                },
                save_dir=os.path.join(args.snapshot_dir, 'epoch{}'.format(epoch)))

    # export scalar data to JSON for external processing
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
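# enable_gradients/disable_gradients are assumed to toggle requires_grad on
# every parameter -- the usual way to freeze the critic while the generator
# updates, and vice versa. A minimal sketch:
def enable_gradients(model):
    for p in model.parameters():
        p.requires_grad_(True)


def disable_gradients(model):
    for p in model.parameters():
        p.requires_grad_(False)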
def main():
    opt = BaseOptions().parse()  # get options
    exp_dir = osp.join(opt.checkpoints_dir, opt.name)
    log_file = osp.join(exp_dir, "trainlog.txt")
    logger = Logger(log_file)

    use_gpu = torch.cuda.is_available()
    torch.manual_seed(opt.seed)
    if use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    # Read and initialize dataset
    phys_net_data = PhysNetReal(opt.dataroot)

    # Construct train and test transform operations
    transform_train = Compose([ToTensor()])
    transform_test = Compose([ToTensor()])

    # PyTorch Dataset classes for train, validation and test sets
    train_dataset = O2P2Dataset(phys_net_data.train, transform=transform_train)
    val_dataset = O2P2Dataset(phys_net_data.val, transform=transform_test)
    test_dataset = O2P2Dataset(phys_net_data.test, transform=transform_test)

    # PyTorch DataLoaders for train, validation and test sets
    train_loader = DataLoader(train_dataset, batch_size=opt.train_batch_size,
                              shuffle=True, pin_memory=use_gpu)
    val_loader = DataLoader(val_dataset, batch_size=opt.test_batch_size,
                            shuffle=False, pin_memory=use_gpu)
    test_loader = DataLoader(test_dataset, batch_size=opt.test_batch_size,
                             shuffle=False, pin_memory=use_gpu)

    # Initialize models
    percept = Percept()
    physics = Physics()
    render = Render()
    if use_gpu:
        percept = percept.cuda()
        physics = physics.cuda()
        render = render.cuda()

    # Initialize pretrained VGG model for the perceptual loss
    vgg = Vgg16(requires_grad=False)
    vgg.eval()
    if use_gpu:
        vgg = vgg.cuda()

    # The VGG network expects images normalized with these mean and std
    vgg_normalization_mean = torch.tensor([0.485, 0.456, 0.406])
    vgg_normalization_std = torch.tensor([0.229, 0.224, 0.225])
    if use_gpu:
        vgg_normalization_mean = vgg_normalization_mean.cuda()
        vgg_normalization_std = vgg_normalization_std.cuda()

    # Initialize the normalizer required by the VGG model
    vgg_norm = Normalization(vgg_normalization_mean, vgg_normalization_std)
    if use_gpu:
        vgg_norm = vgg_norm.cuda()

    # Define loss and optimizers
    criterion = torch.nn.MSELoss()
    optim_percept = torch.optim.Adam(percept.parameters(), lr=1e-3)
    optim_physics = torch.optim.Adam(physics.parameters(), lr=1e-3)
    optim_render = torch.optim.Adam(render.parameters(), lr=1e-3)

    best_render_loss = np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(opt.max_epoch):
        start_time = time.time()
        # train for one epoch
        percept_loss, physics_loss, render_loss = train(
            epoch, train_loader, percept, physics, render, criterion,
            vgg, vgg_norm, optim_percept, optim_physics, optim_render,
            use_gpu, exp_dir, logger, opt)
        elapsed_time = time.time() - start_time

        # print training details
        print_train_stats(logger, epoch, elapsed_time,
                          percept_loss, physics_loss, render_loss)

        if (epoch + 1) % opt.eval_freq == 0:
            percept_loss, physics_loss, render_loss = validate(
                epoch, val_loader, percept, physics, render, criterion,
                vgg, vgg_norm, use_gpu, exp_dir, logger, opt)

            is_best = render_loss < best_render_loss
            if is_best:
                best_render_loss = render_loss
                best_epoch = epoch + 1

            save_checkpoint({
                'percept_state_dict': percept.state_dict(),
                'physics_state_dict': physics.state_dict(),
                'render_state_dict': render.state_dict(),
                'epoch': epoch,
            }, is_best, osp.join(exp_dir, 'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    # The render loss is an MSE value, not a percentage, so format with {:.4f}
    logger.log("==> Best Render Loss {:.4f}, achieved at epoch {}".format(
        best_render_loss, best_epoch))
    logger.log("Training completed.")
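# Normalization here is assumed to be a small module that standardizes image
# batches before the frozen VGG-16, along the lines of the PyTorch
# style-transfer tutorial -- a sketch, not necessarily this repo's implementation:
import torch
import torch.nn as nn


class Normalization(nn.Module):
    def __init__(self, mean, std):
        super(Normalization, self).__init__()
        # Reshape to [C, 1, 1] so the stats broadcast over [B, C, H, W] batches
        self.register_buffer('mean', mean.view(-1, 1, 1))
        self.register_buffer('std', std.view(-1, 1, 1))

    def forward(self, img):
        return (img - self.mean) / self.std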
def log_epoch_info(self, epoch, train_res, eval_res, epoch_elapsed):
    param_usage = 0
    epoch_sparsity = {}
    for idx, mask_module in enumerate(self.mask_modules):
        mask, factor = mask_module.get_binary_mask()
        mask_sparsity = sum(mask.view(-1))
        param_usage += sum(mask.view(-1) * factor)
        epoch_sparsity["{:02d}".format(idx)] = mask_sparsity
    self.tb_sw.add_scalars("epoch_sparsity", epoch_sparsity, global_step=epoch)
    self.tb_sw.add_scalar("epoch_params", param_usage, global_step=epoch)
    self.tb_sw.add_scalars(
        "epoch",
        {
            "train_acc": train_res["top1_acc"],
            "train_task_loss": train_res["task_loss"],
            "train_kd_loss": train_res["kd_loss"],
            "train_mask_loss": train_res["mask_loss"],
            "eval_acc": eval_res["top1_acc"],
            "eval_task_loss": eval_res["task_loss"],
            "eval_kd_loss": eval_res["kd_loss"],
            "eval_mask_loss": eval_res["mask_loss"],
            "lr": self.lr,
            "elapsed_time": epoch_elapsed,
        },
        global_step=epoch,
    )
    self.t_log.append([
        epoch,
        train_res["task_loss"], train_res["kd_loss"],
        train_res["mask_loss"], train_res["top1_acc"],
        eval_res["task_loss"], eval_res["kd_loss"],
        eval_res["mask_loss"], eval_res["top1_acc"],
        param_usage, self.lr,
    ])
    self.logger.info(
        "FIN Epoch %(epoch)d/%(epochs)d LR: %(lr)f | "
        "Train Task Loss: %(ttl).4f KDL: %(tkl).4f Mask Loss: %(tml).4f Acc: %(tacc).2f | "
        "Eval Acc: %(eacc).2f | Params: %(params).2e | "
        "Took %(dt).1fs (%(tdt).1fs)",
        {
            "epoch": epoch,
            "epochs": self.epochs,
            "lr": self.lr,
            "ttl": train_res["task_loss"],
            "tkl": train_res["kd_loss"],
            "tml": train_res["mask_loss"],
            "tacc": train_res["top1_acc"],
            "eacc": eval_res["top1_acc"],
            "dt": epoch_elapsed,
            "params": param_usage,
            "tdt": time() - self.exp_start,
        },
    )

    # Track the best accuracy separately for each parameter-usage bucket
    is_best_key = "{:.1e}".format(param_usage)
    prev_usage_best_acc = self.best_acc_per_usage.get(is_best_key, 0)
    usage_best_acc = max(eval_res["top1_acc"], prev_usage_best_acc)
    self.best_acc_per_usage[is_best_key] = usage_best_acc
    is_best = eval_res["top1_acc"] > prev_usage_best_acc

    state_dict = self.model.state_dict()
    if self.gpu_ids and len(self.gpu_ids) > 1:
        # unwrap the torch.nn.DataParallel
        state_dict = list(self.model.children())[0].state_dict()
    save_checkpoint(
        {
            "epoch": epoch + 1,
            "state_dict": state_dict,
            "acc": eval_res["top1_acc"],
            "best_acc_per_usage": self.best_acc_per_usage,
            "optim_state_dict": self.optimizer.state_dict(),
            "param_usage": param_usage,
        },
        is_best,
        checkpoint_dir=self.config["chkpt_dir"],
        filename="checkpoint-{}.pth.tar".format(is_best_key),
        best_filename="checkpoint-{}.pth.tar".format(is_best_key),
    )
def Train(Model, args):
    writer = SummaryWriter()
    beta1_Adam = args.beta1
    beta2_Adam = args.beta2
    if args.cuda:
        Model.cuda()

    # optimizer = optim.Adam(Model.parameters(), lr=args.lr, betas=(beta1_Adam, beta2_Adam))
    optimizer = optim.SGD(Model.parameters(), lr=args.lr)
    if args.resume:
        checkpoint = torch.load(args.resume)
        optimizer.load_state_dict(checkpoint['optimizer'])

    Model.train()
    steps = 0
    # loss_criterion_Angular = AngleLoss().cuda()
    CUDNN.benchmark = True

    if args.stepsize > 0:
        scheduler = lr_scheduler.StepLR(optimizer, step_size=args.stepsize,
                                        gamma=args.gamma)
    if args.dynamic_lr:
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.1, patience=3000, verbose=False,
            threshold=1e-5, threshold_mode='rel', cooldown=2000,
            min_lr=0, eps=1e-8)

    for epoch in range(args.start_epoch, args.epochs + 1):
        # Load augmented data: for ResNet 256x256 -> 224x224 (VGG would use 110x110 -> 96x96)
        transformed_dataset = FaceIdPoseDataset(
            args.train_csv_file, args.data_place,
            transforms.Compose([
                transforms.Resize(256),
                transforms.RandomCrop(224),
                transforms.ToTensor(),
            ]))
        dataloader = DataLoader(transformed_dataset, batch_size=args.Train_Batch,
                                shuffle=True, num_workers=8)

        if args.stepsize > 0:
            scheduler.step()

        for i, batch_data in enumerate(dataloader):
            # backward() accumulates gradients, so clear them between minibatches
            optimizer.zero_grad()
            batch_image = batch_data[0].float()
            batch_id_label = batch_data[2]
            if args.cuda:
                batch_image = batch_image.cuda()
                batch_id_label = batch_id_label.cuda()
            steps += 1

            Prediction = Model(batch_image)
            Loss = Model.ID_Loss(Prediction, batch_id_label)
            # Loss = loss_criterion_Angular(Prediction, batch_id_label)
            Loss.backward()
            optimizer.step()
            if args.dynamic_lr:
                scheduler.step(Loss)

            log_learning(epoch, steps, 'ResNet50_Model', args.lr, Loss.item(), args)
            writer.add_scalar('Train/Train_Loss', Loss.item(), steps)
            writer.add_scalar('Train/Model_Lr', optimizer.param_groups[0]['lr'], epoch)

        Validation_Process(Model, epoch, writer, args)

        if epoch % args.save_freq == 0:
            if not os.path.isdir(args.snapshot_dir):
                os.makedirs(args.snapshot_dir)
            save_path = os.path.join(args.snapshot_dir, 'epoch{}.pt'.format(epoch))
            torch.save(Model.state_dict(), save_path)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'Model': Model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                save_dir=os.path.join(args.snapshot_dir, 'epoch{}'.format(epoch)))

    # export scalar data to JSON for external processing
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
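# Note on scheduler placement: the function above steps StepLR at the top of
# each epoch, which matches pre-1.1 PyTorch. Since PyTorch 1.1 the documented
# order is optimizer.step() first, then scheduler.step() once per epoch. A
# minimal self-contained sketch of the recommended ordering:
import torch
import torch.optim as optim
from torch.optim import lr_scheduler

model = torch.nn.Linear(10, 2)
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
x = torch.randn(4, 10)

for epoch in range(3):
    optimizer.zero_grad()
    loss = model(x).sum()   # stand-in for the epoch's batches
    loss.backward()
    optimizer.step()        # per-batch parameter updates first
    scheduler.step()        # then decay the learning rate once per epoch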
def Train(Model, args):
    writer = SummaryWriter()  # assumed here; `writer` is used below but never defined in the original
    # Number of identity classes
    Nd = args.Nd
    beta1_Adam = args.beta1
    beta2_Adam = args.beta2
    # GPU mode
    if args.cuda:
        Model.cuda()

    # Choose the optimizer
    optimizer = optim.Adam(Model.parameters(), lr=args.lr,
                           betas=(beta1_Adam, beta2_Adam))
    if args.resume:
        checkpoint = torch.load(args.resume)
        optimizer.load_state_dict(checkpoint['optimizer'])

    Model.train()
    # Only move the criterion to the GPU when CUDA is actually requested
    loss_criterion = nn.CrossEntropyLoss()
    if args.cuda:
        loss_criterion = loss_criterion.cuda()
    steps = 0
    CUDNN.benchmark = True

    for epoch in range(args.start_epoch, args.epochs + 1):
        # Every args.lr_step epochs, scale the learning rate by args.lr_decay
        if args.step_learning:
            adjust_learning_rate(optimizer, epoch, args)

        # Load augmented data
        transformed_dataset = FaceIdPoseDataset(
            args.train_csv_file, args.data_place,
            transform=transforms.Compose([
                Resize((256, 256)),
                RandomCrop((224, 224)),
            ]))
        dataloader = DataLoader(transformed_dataset, batch_size=args.Train_Batch,
                                shuffle=True)

        for i, batch_data in enumerate(dataloader):
            # backward() accumulates gradients, so clear them between minibatches
            Model.zero_grad()
            batch_image = batch_data[0].float()
            batch_id_label = batch_data[2]
            if args.cuda:
                batch_image = batch_image.cuda()
                batch_id_label = batch_id_label.cuda()
            steps += 1

            Prediction = Model(batch_image)
            Loss = loss_criterion(Prediction[:, :Nd], batch_id_label)
            Loss.backward()
            optimizer.step()

            # Loss.data[0] is pre-0.4 PyTorch and fails on 0-dim tensors; use .item()
            log_learning(epoch, steps, 'VGG16_Model', args.lr, Loss.item(), args)
            writer.add_scalar('Train/Train_Loss', Loss.item(), steps)

        Validation_Process(Model, epoch, writer, args)

        if epoch % args.save_freq == 0:
            if not os.path.isdir(args.snapshot_dir):
                os.makedirs(args.snapshot_dir)
            save_path = os.path.join(args.snapshot_dir, 'epoch{}.pt'.format(epoch))
            torch.save(Model.state_dict(), save_path)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'Model': Model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                save_dir=os.path.join(args.snapshot_dir, 'epoch{}'.format(epoch)))

    # export scalar data to JSON for external processing
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
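# adjust_learning_rate is referenced by the step_learning branch but never
# shown; a common step-decay sketch. The args.lr_step / args.lr_decay
# attribute names are assumptions taken from the comment above.
def adjust_learning_rate(optimizer, epoch, args):
    lr = args.lr * (args.lr_decay ** (epoch // args.lr_step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr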
def log_epoch_info(self, epoch, train_res, eval_res, epoch_type, epoch_elapsed):
    param_usage = 0
    epoch_sparsity = {}
    mask_idx = 0
    for module in self.model.modules():
        if len(list(module.children())) > 0:
            # only count leaf-node modules
            continue
        elif isinstance(module, MaskSTE):
            mask, factor = module.get_binary_mask()
            mask_sparsity = sum(mask.view(-1))
            param_usage += sum(mask.view(-1) * factor)
            epoch_sparsity["{:02d}".format(mask_idx)] = mask_sparsity
            mask_idx += 1
    if mask_idx == 0:
        param_usage = sum(p.numel() for p in self.model.parameters())
    if len(epoch_sparsity) > 0:
        self.tb_sw.add_scalars("epoch_sparsity", epoch_sparsity, global_step=epoch)
    self.tb_sw.add_scalar("epoch_params", param_usage, global_step=epoch)

    epoch_scalars = {
        "train_acc": train_res["top1_acc"],
        "train_task_loss": train_res["task_loss"],
        "train_kd_loss": train_res["kd_loss"],
        "eval_acc": eval_res["top1_acc"],
        "eval_task_loss": eval_res["task_loss"],
        "eval_kd_loss": eval_res["kd_loss"],
        "lr": self.lr,
        "elapsed_time": epoch_elapsed,
    }
    if epoch_type == "Sparsity":
        epoch_scalars["train_mask_loss"] = train_res["mask_loss"]
        epoch_scalars["eval_mask_loss"] = eval_res["mask_loss"]
    self.tb_sw.add_scalars("epoch", epoch_scalars, global_step=epoch)
    self.t_log.append([
        epoch,
        train_res["task_loss"], train_res["kd_loss"],
        train_res["mask_loss"], train_res["top1_acc"],
        eval_res["task_loss"], eval_res["kd_loss"],
        eval_res["mask_loss"], eval_res["top1_acc"],
        self.lr, param_usage,
    ])
    self.logger.info(
        "FIN %(epoch_type)s Epoch %(epoch)d/%(epochs)d LR: %(lr).1e | "
        "Train Task Loss: %(ttl).4f KDL: %(tkl).4f Mask Loss: %(tml).4f Acc: %(tacc).2f | "
        "Eval Acc: %(eacc).2f | Params: %(params).2e | "
        "Took %(dt).1fs (%(tdt).1fs)",
        {
            "epoch_type": epoch_type,
            "epoch": epoch,
            "epochs": self.epochs,
            "lr": self.lr,
            "ttl": train_res["task_loss"],
            "tkl": train_res["kd_loss"],
            "tml": train_res["mask_loss"],
            "tacc": train_res["top1_acc"],
            "eacc": eval_res["top1_acc"],
            "dt": epoch_elapsed,
            "params": param_usage,
            "tdt": time() - self.exp_start,
        },
    )

    # Track the best accuracy separately for each parameter-usage bucket
    is_best_key = "{:.1e}".format(param_usage)
    prev_usage_best_acc = self.best_acc_per_usage.get(is_best_key, 0)
    usage_best_acc = max(eval_res["top1_acc"], prev_usage_best_acc)
    self.best_acc_per_usage[is_best_key] = usage_best_acc
    is_best = eval_res["top1_acc"] > prev_usage_best_acc

    state_dict = self.model.state_dict()
    if self.gpu_ids and len(self.gpu_ids) > 1:
        # unwrap the torch.nn.DataParallel
        state_dict = list(self.model.children())[0].state_dict()
    save_checkpoint(
        {
            "epoch": epoch + 1,
            "state_dict": state_dict,
            "acc": eval_res["top1_acc"],
            "best_acc_per_usage": self.best_acc_per_usage,
            "optim_state_dict": self.optimizer.state_dict(),
            "param_usage": param_usage,
        },
        is_best,
        checkpoint_dir=self.config["chkpt_dir"],
        filename="checkpoint-{}.pth.tar".format(is_best_key),
        best_filename="checkpoint-{}.pth.tar".format(is_best_key),
    )
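# The DataParallel unwrap used by these trainers can be expressed directly:
# DataParallel stores the wrapped network under .module, so
# model.module.state_dict() is equivalent to the list(model.children())[0]
# indexing above. A small helper sketch:
import torch.nn as nn


def unwrap_state_dict(model):
    if isinstance(model, nn.DataParallel):
        return model.module.state_dict()
    return model.state_dict()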