def save(self, current_epoch, best=False):
    verbose = self.kwargs['cae_type'] + '#' + self.verbose
    if best:
        save_checkpoint(self.config, self.kwargs['config_name'], self.model,
                        current_epoch, self.loss_basic.val, self.optimizer,
                        self.logger, self.kwargs['time_stamp'], self.accuarcy,
                        flag='best', verbose=verbose)
        self.result_path = save_model(self.config, self.kwargs['config_name'],
                                      self.model, self.logger,
                                      self.kwargs['time_stamp'],
                                      self.accuarcy, verbose=verbose)
    else:
        save_checkpoint(self.config, self.kwargs['config_name'], self.model,
                        current_epoch, self.loss_basic.val, self.optimizer,
                        self.logger, self.kwargs['time_stamp'], self.accuarcy,
                        verbose=verbose)

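# A minimal, self-contained sketch (not code from the repo above;
# save_checkpoint/save_model are repo utilities whose signatures are not
# shown) of the "latest vs. best" pattern that save() implements: always
# write the rolling checkpoint, and additionally snapshot when the tracked
# metric improves.
import torch


def save_sketch(model, optimizer, epoch, metric, best, out_dir='.'):
    state = {
        'epoch': epoch,
        'metric': metric,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(state, f'{out_dir}/latest.pth')    # rolling checkpoint
    if best:
        torch.save(state, f'{out_dir}/best.pth')  # snapshot of the best model
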
def main():
    args = parse_args()

    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    gpus = list(config.GPUS)
    dataset_type = get_dataset(config)
    train_data = dataset_type(config, is_train=True)
    train_loader = DataLoader(dataset=train_data,
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU * len(gpus),
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)
    val_loader = DataLoader(dataset=dataset_type(config, is_train=False),
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU * len(gpus),
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    # config.MODEL.NUM_JOINTS = train_data.get_num_points()
    model = models.get_face_alignment_net(config)

    # copy model files
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    model = nn.DataParallel(model, device_ids=gpus).cuda()

    # loss
    criterion = torch.nn.MSELoss(reduction='mean').cuda()

    optimizer = utils.get_optimizer(config, model)

    best_nme = 100
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)

    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'latest.pth')
        if os.path.islink(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")

    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        function.train(config, train_loader, model, criterion,
                       optimizer, epoch, writer_dict)

        # evaluate
        nme, predictions = function.validate(config, val_loader, model,
                                             criterion, epoch, writer_dict)

        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {"state_dict": model,
             "epoch": epoch + 1,
             "best_nme": best_nme,
             "optimizer": optimizer.state_dict(),
             }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()

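# A small, self-contained sketch (an assumption about the repo's convention,
# not code from it): the resume branch in main() checks os.path.islink()
# because the checkpoint utility is expected to keep 'latest.pth' as a
# symlink pointing at the newest numbered checkpoint, e.g.:
import os
import torch

torch.save({'epoch': 3}, 'checkpoint_3.pth')
if os.path.islink('latest.pth') or os.path.exists('latest.pth'):
    os.remove('latest.pth')
os.symlink('checkpoint_3.pth', 'latest.pth')
# 'latest.pth' now resolves to the newest checkpoint file
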
def main(cfg):
    util.init_random_seed(cfg.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.gpu_devices
    use_gpu = torch.cuda.is_available()
    if cfg.use_cpu:
        use_gpu = False

    cfg.ckpt_dir = "test" if cfg.evaluate else cfg.ckpt_dir
    cfg.save_dir = osp.join(cfg.save_dir, cfg.ckpt_dir)
    util.mkdir_if_missing(cfg.save_dir)
    if not cfg.evaluate:
        sys.stdout = util.Logger(osp.join(cfg.save_dir, 'log_train.txt'))
    else:
        sys.stdout = util.Logger(osp.join(cfg.save_dir, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(cfg))

    if use_gpu:
        print("Currently using GPU {}".format(cfg.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(cfg.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    # --------------------------------------------------------------------------------------------
    print("* Initializing dataset {}".format(cfg.dataset))
    dataset = getdata.init_dataset(name=cfg.dataset)
    cfg.num_train_pids = dataset.num_train_pids

    transform_train = T.Compose([
        T.Random2DTranslation(cfg.height, cfg.width),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    transform_test = T.Compose([
        T.Resize((cfg.height, cfg.width)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    pin_memory = True if use_gpu else False

    trainloader = DataLoader(
        # train_set,
        VideoDataset(dataset.train, seq_len=cfg.seq_len,
                     sample=cfg.train_sample_method,
                     transform=transform_train),
        sampler=RandomIdentitySampler(dataset.train,
                                      batch_size=cfg.train_batch,
                                      num_instances=cfg.num_instances),
        batch_size=cfg.train_batch,
        num_workers=cfg.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )
    queryloader = DataLoader(
        # query_set,
        VideoDataset(dataset.query, seq_len=cfg.seq_len,
                     sample=cfg.test_sample_method, transform=transform_test),
        batch_size=cfg.test_batch,
        shuffle=False,
        num_workers=0,
        pin_memory=pin_memory,
        drop_last=False,
    )
    galleryloader = DataLoader(
        # gallery_set,
        VideoDataset(dataset.gallery, seq_len=cfg.seq_len,
                     sample=cfg.test_sample_method, transform=transform_test),
        batch_size=cfg.test_batch,
        shuffle=False,
        num_workers=0,
        pin_memory=pin_memory,
        drop_last=False,
    )

    # --------------------------------------------------------------------------------------------
    # Initialize model, optimizer, and scheduler
    print("* Initializing model: {}".format(cfg.arch))
    model = get_model(cfg)
    print("Model size: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion_xent = CrossEntropyLabelSmooth(
        num_classes=dataset.num_train_pids, use_gpu=use_gpu)
    criterion_htri = TripletLoss(margin=cfg.margin)

    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr,
                                 weight_decay=cfg.weight_decay)
    assert cfg.stepsize > 0
    scheduler = lr_scheduler.StepLR(optimizer, step_size=cfg.stepsize,
                                    gamma=cfg.gamma, last_epoch=-1)
    if cfg.warmup_epoch > 0:
        scheduler = WarmUpLR(optimizer, scheduler, cfg.warmup_epoch,
                             len(trainloader))

    # --------------------------------------------------------------------------------------------
    # optionally resume from a checkpoint
    best_rank1 = -np.inf
    start_epoch = cfg.start_epoch
    if cfg.resume:
        checkpoint = torch.load(cfg.resume)
        start_epoch = checkpoint['epoch'] + 1
        best_rank1 = checkpoint['rank1']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler = lr_scheduler.StepLR(optimizer, step_size=cfg.stepsize,
                                        gamma=cfg.gamma,
                                        last_epoch=checkpoint['epoch'])
        print("loaded checkpoint '{}' (epoch {})".format(
            cfg.resume, checkpoint['epoch']))
        del checkpoint

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if cfg.evaluate:
        print("* Evaluating")
        with torch.no_grad():
            evaluate(model, queryloader, galleryloader, cfg.pool, use_gpu)
        return

    if cfg.arch == '3d':
        torch.backends.cudnn.benchmark = False

    # --------------------------------------------------------------------------------------------
    print("\n* Start training")
    start_time = time.time()

    for epoch in range(start_epoch, cfg.max_epoch):
        epoch_start_time = time.time()
        update_lr(scheduler, epoch, n_iter=None)
        train_one_epoch(model, epoch, optimizer, scheduler, trainloader,
                        cfg.warmup_epoch, criterion_xent, criterion_htri,
                        use_gpu)

        lr_msg = 'used lr: '
        for item in [pg['lr'] for pg in optimizer.param_groups]:
            lr_msg += '%.0E ' % item
        print('* end of epoch {}/{}, time taken: {:.0f} sec, {}'.format(
            epoch, cfg.max_epoch, time.time() - epoch_start_time, lr_msg))

        # scheduler.step(epoch + 1)  # setting lr for next epoch, self.last_epoch == epoch + 1

        if epoch in cfg.eval_steps or (epoch + 1) == cfg.max_epoch:
            print("* evaluate")
            with torch.no_grad():
                rank1 = eval(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            util.save_checkpoint({
                'rank1': rank1,
                'epoch': epoch,
                'state_dict': state_dict,
                'optimizer': optimizer.state_dict(),
            }, is_best, osp.join(cfg.save_dir, 'latest.pth'))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    print("Checkpoints are saved to {}".format(cfg.save_dir))
    print("Finished. Total elapsed time (h:m:s): {}".format(elapsed))

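# The trainloader above depends on RandomIdentitySampler. A minimal sketch of
# the usual P x K identity sampling in re-id (an assumption: the repo's
# implementation may differ, e.g. in how it handles identities with fewer
# than num_instances tracklets):
import random
from collections import defaultdict

from torch.utils.data import Sampler


class IdentitySamplerSketch(Sampler):
    """Yields indices so each batch holds num_instances samples per identity."""

    def __init__(self, data_source, num_instances=4):
        # data_source items are assumed to be (tracklet, pid, camid) tuples
        self.index_dict = defaultdict(list)
        for index, (_, pid, _) in enumerate(data_source):
            self.index_dict[pid].append(index)
        self.pids = list(self.index_dict)
        self.num_instances = num_instances

    def __iter__(self):
        indices = []
        for pid in random.sample(self.pids, len(self.pids)):
            candidates = self.index_dict[pid]
            if len(candidates) < self.num_instances:
                chosen = random.choices(candidates, k=self.num_instances)
            else:
                chosen = random.sample(candidates, self.num_instances)
            indices.extend(chosen)
        return iter(indices)

    def __len__(self):
        return len(self.pids) * self.num_instances
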
def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_seg_net')(
        config, is_train=True)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
        'vis_global_steps': 0,
    }

    # dump_input = torch.rand((config.TRAIN.BATCH_SIZE,
    #                          3,
    #                          config.MODEL.IMAGE_SIZE[1],
    #                          config.MODEL.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    optimizer = get_optimizer(config, model)

    # Data loading code
    if 'xception' in config.MODEL.NAME:
        # Xception uses a different mean/std for the input image
        normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                         std=[0.5, 0.5, 0.5])
    else:
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

    train_augs = aug.Compose([aug.RandomScale(0.5, 2.0),
                              aug.RandomHorizontallyFlip(0.5),
                              aug.RandomSizedCrop(config.MODEL.IMAGE_SIZE)])
    test_augs = None

    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TRAIN_SET,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
        augmentations=train_augs)
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TEST_SET,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
        augmentations=test_augs)

    # define loss function (criterion) and optimizer
    criterion = CrossEntropy2D(ignore_index=255,
                               weight=train_dataset.class_weights).cuda()

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True,
        # PyTorch's DataParallel cannot handle an empty batch on any GPU
        drop_last=True if len(gpus) > 2 else False)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    if config.TRAIN.LR_SCHEDULER == 'multistep':
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)
    elif config.TRAIN.LR_SCHEDULER == 'poly':
        max_iter = config.TRAIN.END_EPOCH * len(train_loader)
        lr_scheduler = PolynomialLR(optimizer, max_iter=max_iter,
                                    decay_iter=1)
    elif config.TRAIN.LR_SCHEDULER == 'none':
        lr_scheduler = None
    else:
        raise ValueError(
            'Scheduler {} not supported'.format(config.TRAIN.LR_SCHEDULER))

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        if config.TRAIN.LR_SCHEDULER == 'multistep':
            lr_scheduler.step()

        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, lr_scheduler,
              epoch, final_output_dir, tb_log_dir, writer_dict)

        if (epoch + 1) % config.TRAIN.EVAL_INTERVAL == 0:
            if not config.MODEL.LEARN_GAMMA:
                progress = float(lr_scheduler.last_epoch) / (
                    lr_scheduler.max_iter * config.TRAIN.NE_ITER_RATIO)
                if progress <= 1:
                    gamma = (config.TRAIN.NE_GAMMA_U -
                             config.TRAIN.NE_GAMMA_L) * \
                        (1 - progress) ** config.TRAIN.NE_GAMMA_EXP + \
                        config.TRAIN.NE_GAMMA_L
                else:
                    gamma = config.TRAIN.NE_GAMMA_L
            else:
                gamma = None

            # evaluate on validation set
            perf_indicator = validate(config, valid_loader, valid_dataset,
                                      model, criterion, final_output_dir,
                                      tb_log_dir, writer_dict, gamma=gamma)

            if perf_indicator > best_perf:
                best_perf = perf_indicator
                best_model = True
            else:
                best_model = False

            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)
        else:
            perf_indicator = 0.0

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()

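# PolynomialLR above is a repo-local scheduler. A rough equivalent can be
# sketched with the stock LambdaLR (assumptions: decay power 0.9 and
# per-iteration stepping; the repo's decay_iter/power may differ):
import torch

param = torch.zeros(1, requires_grad=True)
optimizer = torch.optim.SGD([param], lr=0.01)
max_iter, power = 1000, 0.9
poly = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lambda it: (1.0 - min(it, max_iter - 1) / max_iter) ** power)
for it in range(3):
    optimizer.step()
    poly.step()  # lr decays from 0.01 toward 0 over max_iter steps
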
def main():
    args = parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(args.gpu)
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('{}.get_pose_net'.format(args.model))(config, is_train=True)

    # report model complexity before training
    model.eval()
    params = count_parameters_in_MB(model)
    logger.info("Params = %.2fMB" % params)
    mult_adds = comp_multadds(model,
                              input_size=(3,
                                          config.MODEL.IMAGE_SIZE[1],
                                          config.MODEL.IMAGE_SIZE[0]))
    logger.info("Mult-Adds = %.2fMB" % mult_adds)
    model.train()
    model = model.cuda()

    # copy model file

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir)

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint({
            'epoch': epoch + 1,
            'model': get_model_name(config),
            'state_dict': model.state_dict(),
            'perf': perf_indicator,
            'optimizer': optimizer.state_dict(),
        }, best_model, final_output_dir)

        final_model_state_file = os.path.join(final_output_dir,
                                              'final_state.pth.tar')
        logger.info(
            'saving final model state to {}'.format(final_model_state_file))
        torch.save(model.state_dict(), final_model_state_file)

        lr_scheduler.step()

def main():
    args = parse_args()
    update_config(cfg, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    model = eval('models.' + cfg.MODEL.NAME + '.get_pose_net')(
        cfg, is_train=True)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
        final_output_dir)
    # logger.info(pprint.pformat(model))

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (1, 3, cfg.MODEL.IMAGE_SIZE[1], cfg.MODEL.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, (dump_input))
    logger.info(get_model_summary(model, dump_input))

    model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT).cuda()

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_SET, True,
        transforms.Compose([transforms.ToTensor(), normalize]))
    valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, False,
        transforms.Compose([transforms.ToTensor(), normalize]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    best_perf = 0.0
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')

    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
        last_epoch=last_epoch)

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        # train for one epoch
        train(cfg, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        lr_scheduler.step()

        # evaluate on validation set
        perf_indicator = validate(cfg, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint({
            'epoch': epoch + 1,
            'model': cfg.MODEL.NAME,
            'state_dict': model.state_dict(),
            'best_state_dict': model.module.state_dict(),
            'perf': perf_indicator,
            'optimizer': optimizer.state_dict(),
        }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()

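# A minimal, self-contained round trip of the AUTO_RESUME pattern used in
# main() above (toy model; the real script additionally stores
# 'best_state_dict' for the unwrapped module):
import os
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
checkpoint_file = 'checkpoint.pth'

torch.save({'epoch': 5, 'perf': 0.71,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()}, checkpoint_file)

begin_epoch, best_perf, last_epoch = 0, 0.0, -1
if os.path.exists(checkpoint_file):
    checkpoint = torch.load(checkpoint_file)
    begin_epoch = last_epoch = checkpoint['epoch']
    best_perf = checkpoint['perf']
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
# last_epoch is then handed to MultiStepLR so decays line up with the resumed epoch
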
def do_train(train_loader, val_loader, model, indicator_dict, cfg,
             writer_dict, final_output_dir, log_dir, visualize):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    end = time.time()
    for i, current_data in enumerate(
            train_loader, start=indicator_dict['current_iteration']):
        data_time.update(time.time() - end)
        if i > indicator_dict['total_iteration']:
            return

        # validation
        if indicator_dict['current_iteration'] % cfg.VAL.EVALUATION_FREQUENCY == 0:
            indicator_dict['current_performance'] = do_validate(
                val_loader, model, cfg, visualize, writer_dict,
                final_output_dir)
            indicator_dict['is_best'] = False
            if indicator_dict['current_performance'] < \
                    indicator_dict['best_performance']:
                indicator_dict['best_performance'] = \
                    indicator_dict['current_performance']
                indicator_dict['is_best'] = True

            # save checkpoint
            output_dictionary = {
                'indicator_dict': indicator_dict,
                'writer_dict_train_global_steps':
                    writer_dict['train_global_steps'],
                'writer_dict_val_global_steps':
                    writer_dict['val_global_steps'],
                'tb_log_dir': log_dir
            }
            if hasattr(model, 'generator'):
                output_dictionary['generator'] = model.generator.state_dict()
                output_dictionary['optimizer_generator'] = \
                    model.optimizer_generator.state_dict()
            if hasattr(model, 'discriminator'):
                output_dictionary['discriminator'] = \
                    model.discriminator.state_dict()
                output_dictionary['optimizer_discriminator'] = \
                    model.optimizer_discriminator.state_dict()
            save_checkpoint(output_dictionary, indicator_dict,
                            final_output_dir)
            model.train()

        # train
        model.set_dataset(current_data)
        model.optimize_parameters()

        # visualize
        if indicator_dict['current_iteration'] % cfg.TRAIN.DISPLAY_FREQUENCY == 0 \
                and cfg.IS_VISUALIZE:
            visualize(model, indicator_dict['current_iteration'],
                      os.path.join(final_output_dir, "train"),
                      cfg.TRAIN.DISPLAY_FREQUENCY)

        # update learning rate
        for current_scheduler in model.schedulers:
            current_scheduler.step()

        batch_time.update(time.time() - end)
        end = time.time()
        model.record_information(i, len(train_loader), batch_time, data_time,
                                 indicator_dict, writer_dict, phase='train')

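# do_train() above relies on an AverageMeter utility. A minimal sketch of the
# conventional implementation (an assumption: the repo's version may differ):
class AverageMeterSketch:
    """Tracks the latest value, running sum, count, and running average."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
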
def main():
    # set all the configurations
    args = parse_args()
    update_config(cfg, args)

    # set the logger; tb_log_dir is the tensorboard log dir
    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    # build up model
    model = get_net(cfg)
    model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()

    # Data loading
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    # define loss function (criterion) and optimizer
    criterion = get_loss(cfg).cuda()
    optimizer = get_optimizer(cfg, model)

    # load checkpoint model
    best_perf = 0.0
    best_model = False
    last_epoch = -1
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
        last_epoch=last_epoch)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH

    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')
    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    # training
    for epoch in range(begin_epoch + 1, cfg.TRAIN.END_EPOCH + 1):
        # train for one epoch
        train(cfg, train_loader, model, criterion, optimizer, epoch,
              writer_dict)
        lr_scheduler.step()

        # evaluate on validation set; always evaluate on the last epoch
        if epoch % cfg.TRAIN.VAL_FREQ == 0 or epoch == cfg.TRAIN.END_EPOCH:
            perf_indicator = validate(cfg, valid_loader, valid_dataset, model,
                                      criterion, final_output_dir, tb_log_dir,
                                      writer_dict)

            if perf_indicator >= best_perf:
                best_perf = perf_indicator
                best_model = True
            else:
                best_model = False

            # save checkpoint model and best model
            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    # save final model
    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()

def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(config, 'train')
    logger.info(pprint.pformat(config))

    # CuDNN
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # HRNet model
    mv_hrnet = get_pose_net(config, is_train=True)
    # pose_hrnet = get_pose_net(config, is_train=True)  # Pose estimation model
    # pose_hrnet.load_state_dict(torch.load(config.NETWORK.PRETRAINED), strict=False)  # Pretrained weight loading
    # mv_hrnet = get_multiview_pose_net(pose_hrnet, config)  # Multiview adopting
    # depth_hrnet = get_pose_net(config, is_train=True)  # 2.5D depth prediction model

    # Multi-GPU setting
    gpus = [int(i) for i in config.GPUS.split(',')]
    mv_hrnet = torch.nn.DataParallel(mv_hrnet, device_ids=gpus).cuda()
    logger.info('=> init data parallel model')

    # Loss
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    logger.info('=> init criterion')

    # Optimizer
    optimizer = get_optimizer(config, mv_hrnet)
    logger.info('=> init {} optimizer'.format(config.TRAIN.OPTIMIZER))

    # Loading checkpoint
    start_epoch = config.TRAIN.BEGIN_EPOCH
    if config.TRAIN.RESUME:
        start_epoch, mv_hrnet, optimizer = load_checkpoint(
            mv_hrnet, optimizer, final_output_dir)

    # Scheduler
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)
    logger.info('=> init scheduler')

    # Summary
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Data loaders
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    logger.info('=> loading train dataset')
    train_dataset = H36MDataset(
        config, config.DATASET.TRAIN_SUBSET, True,
        transforms.Compose([transforms.ToTensor(), normalize]))
    # train_dataset = MultiViewH36M(config, config.DATASET.TRAIN_SUBSET, True,
    #                               transforms.Compose([transforms.ToTensor(), normalize]))
    logger.info('=> loading validation dataset')
    valid_dataset = H36MDataset(
        config, config.DATASET.TEST_SUBSET, False,
        transforms.Compose([transforms.ToTensor(), normalize]))

    logger.info('=> loading train dataloader')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    logger.info('=> loading valid dataloader')
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    # Training loop
    best_perf = 0.0
    best_model = False
    for epoch in range(start_epoch, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # Trainer
        train(config, train_loader, mv_hrnet, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        # Performance indicator
        perf_indicator = validate(config, valid_loader, valid_dataset,
                                  mv_hrnet, criterion, final_output_dir,
                                  tb_log_dir, writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint({
            'epoch': epoch + 1,
            'model': get_model_name(config),
            'state_dict': mv_hrnet.module.state_dict(),
            'perf': perf_indicator,
            'optimizer': optimizer.state_dict(),
        }, best_model, final_output_dir)

    # End
    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(mv_hrnet.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()

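# A small, self-contained illustration (not from the original repo) of why
# these scripts save model.module.state_dict(): nn.DataParallel wraps the
# model and prefixes every parameter key with "module.", so saving the
# wrapper makes the checkpoint incompatible with the bare model.
import torch.nn as nn

model = nn.Linear(4, 2)
wrapped = nn.DataParallel(model)
print(list(model.state_dict()))    # ['weight', 'bias']
print(list(wrapped.state_dict()))  # ['module.weight', 'module.bias']
assert wrapped.module is model     # .module recovers the unwrapped model
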
def training(self, epoch):
    if self.sw is None:
        self.sw = SummaryWriter(logdir=str(self.args.logs_path),
                                flush_secs=5)

    tbar = tqdm(self.train_data, file=self.tqdm_out, ncols=100)
    self.metric.reset()
    train_loss = 0.0
    iter_per_epoch = len(self.train_data)

    for i, (data, target) in enumerate(tbar):
        global_step = iter_per_epoch * epoch + i

        data = split_and_load(data, ctx_list=self.args.ctx,
                              batch_axis=0, even_split=False)
        target = split_and_load(target, ctx_list=self.args.ctx,
                                batch_axis=0, even_split=False)

        with autograd.record(True):
            if self.with_depth:
                outputs = [self.net(*X) for X in data]
            else:
                outputs = [self.net(X) for X in data]
            losses = [self.criterion(*X, Y)
                      for X, Y in zip(outputs, target)]
            autograd.backward(losses)
        self.optimizer.step(self.args.batch_size)

        batch_loss = sum(loss.asnumpy()[0] for loss in losses) / len(losses)
        train_loss += batch_loss

        if self.image_dump_interval > 0 and \
                global_step % self.image_dump_interval == 0:
            image_blob = data[0][0][0] if self.with_depth else data[0][0]
            image = self.denormalizator(
                image_blob.as_in_context(mx.cpu(0))).asnumpy() * 255
            gt_mask = target[0][0].asnumpy() + self.trainset.pred_offset
            predicted_mask = mx.nd.squeeze(
                mx.nd.argmax(outputs[0][0][0], 0)
            ).asnumpy() + self.trainset.pred_offset

            gt_mask = visualize_mask(gt_mask.astype(np.int32),
                                     self.trainset.NUM_CLASS + 1)
            predicted_mask = visualize_mask(predicted_mask.astype(np.int32),
                                            self.trainset.NUM_CLASS + 1)

            image = image.transpose((1, 2, 0))
            if gt_mask.shape[:2] == image.shape[:2]:
                result = np.hstack((image, gt_mask, predicted_mask)) \
                    .transpose((2, 0, 1)).astype(np.uint8)
                self.sw.add_image('Images/input_image', result,
                                  global_step=global_step)
            else:
                self.sw.add_image('Images/input_image',
                                  image.transpose((2, 0, 1)).astype(np.uint8),
                                  global_step=global_step)
                result = np.hstack((gt_mask, predicted_mask)) \
                    .transpose((2, 0, 1)).astype(np.uint8)
                self.sw.add_image('Images/predicted', result,
                                  global_step=global_step)

        self.sw.add_scalar(tag='Loss/ce',
                           value={'batch': batch_loss,
                                  'epoch_avg': train_loss / (i + 1)},
                           global_step=global_step)
        self.sw.add_scalar(tag='learning_rate',
                           value=self.lr_scheduler.learning_rate,
                           global_step=global_step)
        if hasattr(self.criterion, 'k_sum'):
            self.sw.add_scalar(tag='nfl_mult',
                               value=self.criterion.k_sum,
                               global_step=global_step)

        tbar.set_description(
            f'Epoch {epoch}, training loss {train_loss/(i+1):.3f}')

    mx.nd.waitall()
    self.net.hybridize()
    save_checkpoint(self.net, self.args, epoch=None)

def main():
    args = parse_args()
    reset_config(config, args)
    set_cudnn(config)

    seed = config.RANDOM_SEED
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    os.environ['CUDA_VISIBLE_DEVICES'] = config.GPU.strip()
    gpus = list(range(len(config.GPU.strip().split(','))))

    logger, final_output_dir, tb_log_dir = create_logger(config, args.cfg)
    summary_writer = SummaryWriter(log_dir=tb_log_dir)
    this_dir = osp.dirname(__file__)

    # back up the source code and the yaml config
    if args.cfg:
        shutil.copy(args.cfg,
                    osp.join(final_output_dir, osp.basename(args.cfg)))
    if not osp.exists(osp.join(final_output_dir, "lib")):
        shutil.copytree(osp.join(this_dir, "../lib/"),
                        osp.join(final_output_dir, "lib"))

    for k, v in config.items():
        logger.info(f"{k}: {v}")

    # conditional import
    if config.TRAIN.FINETUNE_ROTATER:
        from lib.core.function3 import train, validate, evaluate
    elif config.TRAIN.USE_CYCLE:
        from lib.core.function2 import train, validate, evaluate
    else:
        from lib.core.function1 import train, validate, evaluate

    # build model
    logger.info('start building model.')
    if len(gpus) > 1:
        pose_model = torch.nn.DataParallel(
            get_pose_model(config)).cuda(gpus[0])
        discriminator = torch.nn.DataParallel(
            get_discriminator(config)).cuda(gpus[0])
        temp_discriminator = torch.nn.DataParallel(
            get_discriminator(config)).cuda(gpus[0])
    else:
        pose_model = get_pose_model(config).cuda()
        discriminator = get_discriminator(config, is_temp=False).cuda()
        temp_discriminator = get_discriminator(config, is_temp=True).cuda()

    optimizer_g = get_optimizer(config, pose_model, is_dis=False)
    optimizer_d = get_optimizer(config, discriminator, is_dis=True)
    optimizer_d_temp = get_optimizer(config, temp_discriminator,
                                     is_dis=True, is_temp=True)

    step_size, gamma = config.TRAIN.SCHEDULER_STEP_SIZE, \
        config.TRAIN.SCHEDULER_GAMMA
    scheduler_g = lr_scheduler.StepLR(optimizer_g, step_size=step_size,
                                      gamma=gamma)
    scheduler_d = lr_scheduler.StepLR(optimizer_d, step_size=step_size,
                                      gamma=gamma)
    scheduler_temp = lr_scheduler.StepLR(optimizer_d_temp,
                                         step_size=step_size, gamma=gamma)
    logger.info('finished building model.')

    if config.TRAIN.PRETRAIN_LIFTER:
        print("Load pretrained lifter...")
        state_dict = torch.load(
            config.TRAIN.LIFTER_PRETRAIN_PATH)['pose_model_state_dict']
        # state_dict = {k[7:]: v for k, v in state_dict.items()}
        pose_model.load_state_dict(state_dict, strict=False)

    if config.DATA.DATASET_NAME == 'surreal':
        loader_func = surreal
    else:
        loader_func = h36m if config.DATA.DATASET_NAME == "h36m" else mpiinf
    dataset_train = loader_func(config, is_train=True)
    dataset_test = loader_func(config, is_train=False)

    train_loader = DataLoader(dataset=dataset_train,
                              batch_size=config.BATCH_SIZE,
                              shuffle=True, drop_last=False, pin_memory=True,
                              num_workers=config.NUM_WORKERS)
    test_loader = DataLoader(dataset=dataset_test,
                             batch_size=config.BATCH_SIZE,
                             shuffle=False, drop_last=False, pin_memory=True,
                             num_workers=config.NUM_WORKERS)

    if args.eval:
        prefix = config.DATA.DATASET_NAME
        # for mode in ['train', 'valid']:
        for mode in ['valid']:
            is_train = (mode == 'train')
            v3d_to_ours = [3, 2, 1, 4, 5, 6, 16, 15, 14, 11, 12, 13, 8, 0,
                           7, 9, 10] if prefix == "h36m" \
                else np.arange(config.DATA.NUM_JOINTS)
            mpi2h36m = [10, 9, 8, 11, 12, 13, 4, 3, 2, 5, 6, 7, 1, 14, 15,
                        16, 0]
            if prefix == 'surreal':
                indices = np.arange(config.DATA.NUM_JOINTS)
            else:
                indices = v3d_to_ours if prefix == "h36m" else mpi2h36m

            read_name = f"../data/{prefix}_{mode}_pred3.h5"
            # read_name = f"../../unsupervised_mesh/data/h36m_{mode}_pred_3d_mesh.h5"
            save_name = f"../data/{prefix}_{mode}_pred_3d.h5"
            if args.eval_suffix is not None:
                save_name = save_name[:-3] + "_" + args.eval_suffix + ".h5"

            # eval mode: load the pretrained model and generate the 3d
            # predictions for all samples
            if not config.TRAIN.PRETRAIN_LIFTER:
                raise Warning("You are not using a pretrained model... "
                              "maybe you should specify the --pretrain flag")

            dataloader = DataLoader(
                dataset_train if mode == "train" else dataset_test,
                batch_size=config.BATCH_SIZE, shuffle=False, drop_last=False,
                pin_memory=True, num_workers=config.NUM_WORKERS)
            all_out_data = evaluate(dataloader, pose_model, config,
                                    is_train=(mode == "train"))
            p1_mpjpe = all_out_data['p1_mpjpe']
            p2_mpjpe = all_out_data['p2_mpjpe']

            # read out image names
            print("Reading imagenames and joints 2d...")
            fin = h5py.File(read_name, "r")
            fout = h5py.File(save_name, "w")
            imagenames = fin['imagename'][:].copy()
            joints_2d_gt = np.array(fin['joint_2d_gt'])
            fout['imagename'] = imagenames
            fout['joint_2d_gt'] = joints_2d_gt[:, indices]
            fout['joint_3d_gt'] = all_out_data['joint_3d_gt']
            fout['joint_3d_pre'] = all_out_data['joint_3d_pre']

            possible_same_keys = ['shape', 'pose', 'original_joint_2d_gt',
                                  'joint_2d_pre', 'seqlen']
            for key in possible_same_keys:
                if key in fin.keys():
                    if 'joint' in key:
                        fout[key] = np.array(fin[key])[:, indices]
                    else:
                        fout[key] = np.array(fin[key])
            if 'seqname' in fin.keys():
                fout['seqname'] = fin['seqname'][:].copy()
            if 'auc' in all_out_data.keys():
                fout['auc'] = all_out_data['auc']
                fout['pckh5'] = all_out_data['pckh5']
                fout['auc_p2'] = all_out_data['auc_p2']
                fout['pckh5_p2'] = all_out_data['pckh5_p2']
            if 'scales' in all_out_data.keys():
                fout['scale_pre'] = all_out_data['scales']
            if 'scale_mids' in all_out_data.keys():
                fout['scale_mid_pre'] = all_out_data['scale_mids']
            fin.close()
            fout.close()

            print("Evaluation on the {} set finished. "
                  "P1 MPJPE: {:.3f}, P2 MPJPE: {:.3f}, saved to {}".format(
                      "training" if is_train else "test",
                      p1_mpjpe, p2_mpjpe, save_name))
            if prefix == "mpi":
                print("PCKh@0.5: {:.3f}, AUC: {:.3f}".format(
                    all_out_data['pckh5'], all_out_data['auc']))
                print("P2: PCKh@0.5: {:.3f}, AUC: {:.3f}".format(
                    all_out_data['pckh5_p2'], all_out_data['auc_p2']))
            # uncomment this if you need to plot images
            # print("Rendering sequences...")
            # subprocess.call(f'python render.py --seq_num 10 --in_filename ../data/{prefix}_valid_pred_3d.h5 --save_dir ../vis', shell=True)
        return

    # preparation for visualization & per-sequence optimization (optional)
    if config.USE_GT:
        # note that the gt here is not the gt above (config.USE_GT)
        train_path = f"../data/{config.DATA.DATASET_NAME}_train_scales.pkl"
        valid_path = f"../data/{config.DATA.DATASET_NAME}_valid_scales.pkl"
    else:
        train_path = f"../data/{config.DATA.DATASET_NAME}_train_scales_pre.pkl"
        valid_path = f"../data/{config.DATA.DATASET_NAME}_valid_scales_pre.pkl"
    train_scale_mids_gt = load_pickle(train_path)['scale_mid'] \
        if osp.exists(train_path) else None
    valid_scale_mids_gt = load_pickle(valid_path)['scale_mid'] \
        if osp.exists(valid_path) else None
    train_seqnames = dataset_train.get_seqnames()
    valid_seqnames = dataset_test.get_seqnames()

    best_p1_mpjpe = best_p2_mpjpe = cur_p1_mpjpe = 10000.0
    best_auc_val = best_pckh5 = 0.0
    best_auc_val_p2 = best_pckh5_p2 = 0.0

    for epoch in range(config.TRAIN.NUM_EPOCHS):
        scheduler_d.step()
        scheduler_g.step()
        scheduler_temp.step()
        # scheduler_s.step()

        avg_d_loss, avg_g_loss, avg_t_loss, train_scale_mids_pre = train(
            train_loader, pose_model, discriminator, temp_discriminator,
            optimizer_g, optimizer_d, optimizer_d_temp, epoch, config,
            summary_writer=summary_writer,
            print_interval=config.PRINT_INTERVAL)
        logger.info(
            "***** Epoch: {}, Avg G Loss: {:.3f}, Avg D Loss: {:.3f}, "
            "Avg T Loss: {:.3f} *****".format(epoch, avg_g_loss, avg_d_loss,
                                              avg_t_loss))

        p1_mpjpe, p2_mpjpe, vis_image, valid_scale_mids_pre, extra_dict = \
            validate(test_loader, pose_model, epoch, config)
        logger.info(
            "Epoch: {}, P1 Mpjpe/Best P1: {:.3f}/{:.3f}, "
            "P2 Mpjpe/Best P2/Cur P1: {:.3f}/{:.3f}/{:.3f}".format(
                epoch, p1_mpjpe, best_p1_mpjpe, p2_mpjpe, best_p2_mpjpe,
                cur_p1_mpjpe))

        if p2_mpjpe < best_p2_mpjpe:
            best_p2_mpjpe = p2_mpjpe
            cur_p1_mpjpe = p1_mpjpe
            is_best = True
        else:
            is_best = False
        if p1_mpjpe < best_p1_mpjpe:
            best_p1_mpjpe = p1_mpjpe

        if extra_dict is not None:
            auc_val, pckh5 = extra_dict['auc'], extra_dict['pckh5']
            auc_val_p2, pckh5_p2 = extra_dict['auc_p2'], extra_dict['pckh5_p2']
            if auc_val_p2 > best_auc_val_p2:
                best_auc_val_p2 = auc_val_p2
                best_pckh5_p2 = pckh5_p2
                is_best = True
            else:
                is_best = False
            if auc_val > best_auc_val:
                best_auc_val = auc_val
                best_pckh5 = pckh5
            logger.info(
                "PCKh@0.5(Best): {:.3f}({:.3f}), AUC value(Best): "
                "{:.3f}({:.3f})".format(pckh5, best_pckh5, auc_val,
                                        best_auc_val))
            logger.info(
                "P2: PCKh@0.5(Best): {:.3f}({:.3f}), AUC value(Best): "
                "{:.3f}({:.3f})".format(pckh5_p2, best_pckh5_p2, auc_val_p2,
                                        best_auc_val_p2))

        save_checkpoint({
            "epoch": epoch,
            "auc": best_auc_val,
            "pckh5": best_pckh5,
            "auc_p2": best_auc_val_p2,
            "pckh5_p2": best_pckh5_p2,
            "p1_mpjpe": p1_mpjpe,
            "p2_mpjpe": p2_mpjpe,
            "pose_model_state_dict": pose_model.state_dict(),
            "discriminator_state_dict": discriminator.state_dict(),
            "temp_discriminator_state_dict": temp_discriminator.state_dict(),
            "optimizer_d": optimizer_d.state_dict(),
            "optimizer_g": optimizer_g.state_dict(),
            "optimizer_d_temp": optimizer_d_temp.state_dict(),
        }, is_best, final_output_dir)

        summary_writer.add_scalar("p1_mpjpe_3d_test/epoch", p1_mpjpe, epoch)
        summary_writer.add_scalar("p2_mpjpe_3d_test/epoch", p2_mpjpe, epoch)
        summary_writer.add_image("test_joints/epoch", vis_image, epoch)
        if extra_dict is not None:
            summary_writer.add_scalar("PCKh0.5/epoch", pckh5, epoch)
            summary_writer.add_scalar("AUC/epoch", auc_val, epoch)

        if train_scale_mids_gt is not None and \
                train_scale_mids_pre is not None and \
                len(train_scale_mids_pre) > 0:
            num_seq = config.VIS.SCALE_MID_NUM_SEQ
            vis_image_scale_mid1 = plot_scalemid_dist(
                train_scale_mids_pre, train_scale_mids_gt.tolist())
            vis_image_scale_mid1 = torch.from_numpy(vis_image_scale_mid1) \
                .type(torch.float32).permute(2, 0, 1) / 255
            vis_image_scale_mid2 = plot_scalemid_seq_dist(
                train_scale_mids_pre, train_scale_mids_gt.tolist(),
                train_seqnames, num_seq=num_seq)
            vis_image_scale_mid2 = torch.from_numpy(vis_image_scale_mid2) \
                .type(torch.float32).permute(2, 0, 1) / 255
            summary_writer.add_image("train_scalemid_distribution/epoch",
                                     vis_image_scale_mid1, epoch)
            summary_writer.add_image("train_scalemid_seq_distribution/epoch",
                                     vis_image_scale_mid2, epoch)
        if valid_scale_mids_gt is not None and \
                valid_scale_mids_pre is not None and \
                len(valid_scale_mids_pre) > 0:
            num_seq = config.VIS.SCALE_MID_NUM_SEQ
            vis_image_scale_mid1 = plot_scalemid_dist(
                valid_scale_mids_pre, valid_scale_mids_gt.tolist())
            vis_image_scale_mid1 = torch.from_numpy(vis_image_scale_mid1) \
                .type(torch.float32).permute(2, 0, 1) / 255
            vis_image_scale_mid2 = plot_scalemid_seq_dist(
                valid_scale_mids_pre, valid_scale_mids_gt.tolist(),
                valid_seqnames, num_seq=num_seq)
            vis_image_scale_mid2 = torch.from_numpy(vis_image_scale_mid2) \
                .type(torch.float32).permute(2, 0, 1) / 255
            summary_writer.add_image("valid_scalemid_distribution/epoch",
                                     vis_image_scale_mid1, epoch)
            summary_writer.add_image("valid_scalemid_seq_distribution/epoch",
                                     vis_image_scale_mid2, epoch)

    summary_writer.close()

def main():
    args = parse_args()

    # set logger and dirs
    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.experiment_name, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # set cudnn
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # currently only a single GPU is supported; TODO: add multi-GPU support

    # set model, loss, and optimizer
    model = models.get_face_alignment_net(config)
    model = model.cuda(config.GPUS[0])

    criterion = torch.nn.MSELoss(reduction='mean').cuda(config.GPUS[0])
    # criterion = AdaptiveWingLoss()

    optimizer = utils.get_optimizer(config, model)

    # get dataset
    dataset_type = get_dataset(config)

    # get dataloaders
    train_loader = DataLoader(dataset=dataset_type(config, is_train=True),
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)
    val_loader = DataLoader(dataset=dataset_type(config, is_train=False),
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU,
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    # set lr_scheduler
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)

    # set training writer
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # set training resume function; initialize best_nme first so a value
    # restored from the checkpoint is not overwritten
    best_nme = 10000
    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'latest.pth')
        if os.path.islink(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")

    # start training
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # training
        function.train(config, train_loader, model, criterion,
                       optimizer, epoch, writer_dict)

        # evaluating
        nme, predictions = function.validate(config, val_loader, model,
                                             criterion, epoch, writer_dict)

        # saving
        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {"state_dict": model,
             "epoch": epoch + 1,
             "best_nme": best_nme,
             "optimizer": optimizer.state_dict(),
             }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.state_dict(), final_model_state_file)
    writer_dict['writer'].close()

def main():
    best_perf = 0.0
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir = create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = models.pose3d_resnet.get_pose_net(config, is_train=True)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(args.cfg, final_output_dir)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) and optimizer
    loss_fn = eval('loss.' + config.LOSS.FN)
    criterion = loss_fn(num_joints=config.MODEL.NUM_JOINTS,
                        norm=config.LOSS.NORM).cuda()

    # define training, validation and evaluation routines
    train = train_integral
    validate = validate_integral
    evaluate = eval_integral

    optimizer = get_optimizer(config, model)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Resume from a trained model
    if config.MODEL.RESUME != '':
        checkpoint = torch.load(config.MODEL.RESUME)
        if 'epoch' in checkpoint.keys():
            config.TRAIN.BEGIN_EPOCH = checkpoint['epoch']
            best_perf = checkpoint['perf']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info('=> resume from pretrained model {}'.format(
                config.MODEL.RESUME))
        else:
            model.load_state_dict(checkpoint)
            logger.info('=> resume from pretrained model {}'.format(
                config.MODEL.RESUME))

    # Choose the dataset, either Human3.6M or MPII
    ds = eval('dataset.' + config.DATASET.DATASET)

    # Data loading code
    train_dataset = ds(cfg=config,
                       root=config.DATASET.ROOT,
                       image_set=config.DATASET.TRAIN_SET,
                       is_train=True)
    valid_dataset = ds(cfg=config,
                       root=config.DATASET.ROOT,
                       image_set=config.DATASET.TEST_SET,
                       is_train=False)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        preds_in_patch_with_score = validate(valid_loader, model)
        acc = evaluate(epoch, preds_in_patch_with_score, valid_loader,
                       final_output_dir, debug=config.DEBUG.DEBUG)

        # the 3D datasets report an error, so convert it to a score where
        # higher is better
        if config.DATASET.DATASET in ('h36m', 'mpii_3dhp', 'jta'):
            perf_indicator = 500. - acc
        else:
            perf_indicator = acc

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint({
            'epoch': epoch + 1,
            'model': get_model_name(config),
            'state_dict': model.state_dict(),
            'perf': perf_indicator,
            'optimizer': optimizer.state_dict(),
        }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)

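# A short aside (not from the original script) on the dataset check above:
# `x == 'a' or 'b' or 'c'` parses as `(x == 'a') or 'b' or 'c'`, which is
# truthy for every x, so a membership test over a tuple is required instead.
dataset = 'coco'
print(dataset == 'h36m' or 'mpii_3dhp' or 'jta')  # 'mpii_3dhp' (truthy) for any dataset
print(dataset in ('h36m', 'mpii_3dhp', 'jta'))    # False, the intended test
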
def main(): logger.info("Logger is set - training start") # set seed np.random.seed(config.seed) torch.manual_seed(config.seed) torch.cuda.manual_seed_all(config.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = True if config.distributed: config.gpu = config.local_rank % torch.cuda.device_count() torch.cuda.set_device(config.gpu) # distributed init torch.distributed.init_process_group(backend='nccl', init_method=config.dist_url, world_size=config.world_size, rank=config.local_rank) config.world_size = torch.distributed.get_world_size() config.total_batch_size = config.world_size * config.batch_size else: config.total_batch_size = config.batch_size loaders, samplers = get_search_datasets(config) train_loader, valid_loader = loaders train_sampler, valid_sampler = samplers net_crit = nn.CrossEntropyLoss().cuda() controller = CDARTSController(config, net_crit, n_nodes=4, stem_multiplier=config.stem_multiplier) if config.param_pool_path is not None: param_pool = torch.load(config.param_pool_path, map_location='cpu') controller.load_state_dict(param_pool, strict=False) resume_state = None if config.resume: resume_state = torch.load(config.resume_path, map_location='cpu') sta_layer_idx = 0 if config.resume: controller.load_state_dict(resume_state['controller']) sta_layer_idx = resume_state['sta_layer_idx'] controller = controller.cuda() if config.sync_bn: if config.use_apex: controller = apex.parallel.convert_syncbn_model(controller) else: controller = torch.nn.SyncBatchNorm.convert_sync_batchnorm( controller) if config.use_apex: controller = DDP(controller, delay_allreduce=True) else: controller = DDP(controller, device_ids=[config.gpu]) # warm up model_search layer_idx = 0 if config.ensemble_param: w_optim = torch.optim.SGD( [{ "params": controller.module.feature_extractor.parameters() }, { "params": controller.module.super_layers[layer_idx].parameters(), 'lr': config.w_lr }, { "params": controller.module.super_layers[layer_idx + 1:].parameters() }, { "params": controller.module.fc_super.parameters() }, { "params": controller.module.distill_aux_head1.parameters() }, { "params": controller.module.distill_aux_head2.parameters() }, { "params": controller.module.ensemble_param }, { "params": controller.module.nas_layers[:layer_idx].parameters() }], lr=config.w_lr, momentum=config.w_momentum, weight_decay=config.w_weight_decay) else: w_optim = torch.optim.SGD( [{ "params": controller.module.feature_extractor.parameters() }, { "params": controller.module.super_layers[layer_idx].parameters(), 'lr': config.w_lr }, { "params": controller.module.super_layers[layer_idx + 1:].parameters() }, { "params": controller.module.fc_super.parameters() }, { "params": controller.module.distill_aux_head1.parameters() }, { "params": controller.module.distill_aux_head2.parameters() }, { "params": controller.module.nas_layers[:layer_idx].parameters() }], lr=config.w_lr, momentum=config.w_momentum, weight_decay=config.w_weight_decay) for layer_idx in range(sta_layer_idx, config.layer_num): if config.one_stage: if layer_idx > 0: break # clean arch params in model_search if config.clean_arch: controller.module.init_arch_params(layer_idx) # search training loop best_top1 = 0. 
best_genotypes = [] best_connects = [] sta_search_iter, sta_search_epoch = 0, 0 is_best = True if (layer_idx == sta_layer_idx) and (resume_state is not None): sta_search_iter = resume_state['sta_search_iter'] sta_search_epoch = resume_state['sta_search_epoch'] best_top1 = resume_state['best_top1'] best_genotypes = resume_state['best_genotypes'] best_connects = resume_state['best_connects'] else: # init model main if config.gumbel_sample: genotype, connect = controller.module.generate_genotype_gumbel( 0) else: genotype, connect = controller.module.generate_genotype(0) for i in range(config.layer_num): best_genotypes.append(genotype) best_connects.append(connect) for i in range(config.layer_num): controller.module.genotypes[i] = best_genotypes[i] controller.module.connects[i] = best_connects[i] lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( w_optim, config.search_iter * config.search_iter_epochs, eta_min=config.w_lr_min) lr_scheduler_retrain = nn.ModuleList() alpha_optim = nn.ModuleList() optimizer = nn.ModuleList() sub_epoch = 0 for search_iter in range(sta_search_iter, config.search_iter): if search_iter < config.pretrain_epochs: if config.local_rank == 0: logger.info("####### Super model warmup #######") train_sampler.set_epoch(search_iter) retrain_warmup(train_loader, controller, w_optim, layer_idx, search_iter, writer, logger, True, config.pretrain_epochs, config) #lr_scheduler.step() else: # build new controller for i, genotype in enumerate(best_genotypes): controller.module.build_nas_layers(i, genotype, config.same_structure) controller_b = copy.deepcopy(controller.module) del controller controller = controller_b.cuda() controller.fix_pre_layers(layer_idx) #if search_iter > config.regular_ratio * config.search_iter: # config.regular = False # sync params from super layer pool for i in range(layer_idx, config.layer_num): controller.copy_params_from_super_layer(i) if config.sync_bn: if config.use_apex: controller = apex.parallel.convert_syncbn_model( controller) else: controller = torch.nn.SyncBatchNorm.convert_sync_batchnorm( controller) if config.use_apex: controller = DDP(controller, delay_allreduce=True) else: controller = DDP(controller, device_ids=[config.gpu]) # weights optimizer if config.ensemble_param: w_optim = torch.optim.SGD([{ "params": controller.module.feature_extractor.parameters() }, { "params": controller.module.super_layers[layer_idx].parameters(), 'lr': config.w_lr }, { "params": controller.module.super_layers[layer_idx + 1:].parameters() }, { "params": controller.module.fc_super.parameters() }, { "params": controller.module.distill_aux_head1.parameters() }, { "params": controller.module.distill_aux_head2.parameters() }, { "params": controller.module.ensemble_param }, { "params": controller.module.nas_layers[:layer_idx].parameters() }], lr=config.w_lr, momentum=config.w_momentum, weight_decay=config. w_weight_decay) else: w_optim = torch.optim.SGD([{ "params": controller.module.feature_extractor.parameters() }, { "params": controller.module.super_layers[layer_idx].parameters(), 'lr': config.w_lr }, { "params": controller.module.super_layers[layer_idx + 1:].parameters() }, { "params": controller.module.fc_super.parameters() }, { "params": controller.module.distill_aux_head1.parameters() }, { "params": controller.module.distill_aux_head2.parameters() }, { "params": controller.module.nas_layers[:layer_idx].parameters() }], lr=config.w_lr, momentum=config.w_momentum, weight_decay=config. 
w_weight_decay) # arch_params optimizer if config.repeat_cell: alpha_optim = torch.optim.Adam( controller.module.super_layers_arch[0].parameters(), config.alpha_lr, betas=(0.5, 0.999), weight_decay=config.alpha_weight_decay) else: alpha_optim = torch.optim.Adam( controller.module.super_layers_arch[layer_idx:]. parameters(), config.alpha_lr, betas=(0.5, 0.999), weight_decay=config.alpha_weight_decay) if config.ensemble_param: optimizer = torch.optim.SGD( [{ "params": controller.module.feature_extractor.parameters() }, { "params": controller.module.nas_layers.parameters(), 'lr': config.nasnet_lr * 0.1 if config.param_pool_path else config.nasnet_lr }, { "params": controller.module.ensemble_param }, { "params": controller.module.distill_aux_head1.parameters() }, { "params": controller.module.distill_aux_head2.parameters() }, { "params": controller.module.fc_nas.parameters() }], lr=config.nasnet_lr, momentum=config.w_momentum, weight_decay=config.w_weight_decay) else: optimizer = torch.optim.SGD( [{ "params": controller.module.feature_extractor.parameters() }, { "params": controller.module.nas_layers.parameters(), 'lr': config.nasnet_lr * 0.1 if config.param_pool_path else config.nasnet_lr }, { "params": controller.module.distill_aux_head1.parameters() }, { "params": controller.module.distill_aux_head2.parameters() }, { "params": controller.module.fc_nas.parameters() }], lr=config.nasnet_lr, momentum=config.w_momentum, weight_decay=config.w_weight_decay) lr_scheduler_retrain = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, config.search_iter_epochs, eta_min=config.w_lr_min) lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( w_optim, config.search_iter * config.search_iter_epochs, eta_min=config.w_lr_min) if (layer_idx == sta_layer_idx) and (resume_state is not None) and ( resume_state['sta_search_epoch'] > config.pretrain_epochs): w_optim.load_state_dict(resume_state['w_optim']) alpha_optim.load_state_dict(resume_state['alpha_optim']) lr_scheduler.load_state_dict(resume_state['lr_scheduler']) lr_scheduler_retrain.load_state_dict( resume_state['lr_scheduler_retrain']) else: # lr_scheduler pass #for i in range(search_iter * config.search_iter_epochs): # lr_scheduler.step() # warmup model main if config.local_rank == 0: logger.info("####### Sub model warmup #######") for warmup_epoch in range(config.nasnet_warmup): valid_sampler.set_epoch(warmup_epoch) retrain_warmup(valid_loader, controller, optimizer, layer_idx, warmup_epoch, writer, logger, False, config.nasnet_warmup, config) best_top1 = 0. sub_epoch = 0 for sub_epoch in range(sta_search_epoch, config.search_iter_epochs): lr_search = lr_scheduler.get_lr()[0] lr_main = lr_scheduler_retrain.get_lr()[0] search_epoch = search_iter * config.search_iter_epochs + sub_epoch # reset iterators train_sampler.set_epoch(search_epoch) valid_sampler.set_epoch(search_epoch) # training search(train_loader, valid_loader, controller, optimizer, w_optim, alpha_optim, layer_idx, search_epoch, writer, logger, config) # validation step_num = len(valid_loader) cur_step = (search_epoch + 1) * step_num top1 = 1. 
                genotypes = []
                connects = []
                if config.gumbel_sample:
                    genotype, connect = controller.module.generate_genotype_gumbel(0)
                else:
                    genotype, connect = controller.module.generate_genotype(0)
                for i in range(config.layer_num):
                    genotypes.append(genotype)
                    connects.append(connect)

                if config.local_rank == 0:
                    # for i in range(config.layer_num - layer_idx):
                    #     logger.info("Stage: {} Layer: {}".format(layer_idx, i + layer_idx + 1))
                    logger.info("Genotypes: ")
                    # controller.module.print_arch_params(logger, i + layer_idx)
                    controller.module.print_arch_params(logger, 0)

                for i in range(config.layer_num - layer_idx):
                    if config.local_rank == 0:
                        # genotype
                        genotype = genotypes[i]
                        logger.info("Stage: {} Layer: {} genotype = {}".format(
                            layer_idx, i + layer_idx + 1, genotype))
                        # genotype as an image
                        plot_path = os.path.join(
                            config.plot_path,
                            "Stage_{}_Layer_{}_EP_{:02d}".format(
                                layer_idx, layer_idx + i + 1, search_epoch + 1))
                        caption = "Stage_{}_Layer_{}_Epoch_{}".format(
                            layer_idx, layer_idx + i + 1, search_epoch + 1)
                        plot(genotype.normal, plot_path + "-normal", caption)
                        plot(genotype.reduce, plot_path + "-reduce", caption)

                # sync params to the super layer pool
                for i in range(layer_idx, config.layer_num):
                    controller.module.copy_params_from_nas_layer(i)

                # save
                best_top1 = top1
                best_genotypes = genotypes
                best_connects = connects
                for i in range(config.layer_num):
                    controller.module.genotypes[i] = best_genotypes[i]
                    controller.module.connects[i] = best_connects[i]

                # lr_scheduler.step()
                # lr_scheduler_retrain.step()

                if config.local_rank == 0:
                    utils.save_checkpoint(controller.module, config.path,
                                          is_best)
                    torch.save(
                        {
                            'controller': controller.module.state_dict(),
                            'sta_layer_idx': layer_idx,
                            'w_optim': w_optim.state_dict(),
                            'alpha_optim': alpha_optim.state_dict(),
                            'lr_scheduler': lr_scheduler.state_dict(),
                            'sta_search_iter': search_iter,
                            'sta_search_epoch': sub_epoch + 1,
                            'best_top1': best_top1,
                            'best_genotypes': best_genotypes,
                            'best_connects': best_connects,
                            'lr_scheduler_retrain': lr_scheduler_retrain.state_dict(),
                            'optimizer': optimizer.state_dict()
                        },
                        os.path.join(config.path, 'search_resume.pth.tar'))
                torch.cuda.empty_cache()

            sta_search_epoch = 0

            # clean up before the next search iteration; scoped to this
            # branch so the warmup path above keeps its optimizer
            del w_optim
            del alpha_optim
            del optimizer
            torch.cuda.empty_cache()

        config.pretrain_epochs = max(
            config.pretrain_epochs - config.pretrain_decay, 0)

    # final genotypes as images
    for i in range(config.layer_num):
        genotype, connect = controller.module.generate_genotype(i)
        controller.module.genotypes[i] = genotype
        controller.module.connects[i] = connect

    if config.local_rank == 0:
        for layer_idx, genotype in controller.module.genotypes.items():
            logger.info("layer_idx : {}".format(layer_idx + 1))
            logger.info("genotype = {}".format(genotype))
            plot_path = os.path.join(
                config.plot_path,
                "Final_Layer_{}_genotype".format(layer_idx + 1))
            caption = "Layer_{}".format(layer_idx + 1)
            plot(genotype.normal, plot_path + "-normal", caption)
            plot(genotype.reduce, plot_path + "-reduce", caption)

    # save the genotype dict as JSON
    if config.local_rank == 0:
        for layer_idx, genotype in controller.module.genotypes.items():
            controller.module.genotypes[layer_idx] = str(genotype)
        with open('genotypes.json', 'w') as f:
            f.write(json.dumps(controller.module.genotypes))
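# A minimal sketch of the inverse of the JSON dump above: reading
# 'genotypes.json' back into Genotype objects via gt.from_str, mirroring what
# the augment entry point below does with config.cell_file. The helper name
# load_genotypes is ours, not part of the repo.
def load_genotypes(cell_file='genotypes.json'):
    """Load {layer_idx: Genotype} from a JSON file written by the search loop."""
    with open(cell_file, 'r') as f:
        r_dict = json.loads(f.read())
    return {int(layer_idx): gt.from_str(genotype)
            for layer_idx, genotype in r_dict.items()}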
def main():
    args = parse_args()

    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    gpus = list(config.GPUS)
    dataset_type = get_dataset(config)

    train_data = dataset_type(config, split="train")
    train_loader = DataLoader(dataset=train_data,
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU * len(gpus),
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)

    val_data = dataset_type(config, split="valid")
    val_loader = DataLoader(dataset=val_data,
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU * len(gpus),
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    # config.MODEL.NUM_JOINTS = train_data.get_num_points()
    model = models.get_face_alignment_net(config)

    # tensorboard writer
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    model = nn.DataParallel(model, device_ids=gpus).cuda()

    # mean-reduced MSE loss over the predicted heatmaps
    criterion = torch.nn.MSELoss(reduction='mean').cuda()

    optimizer = utils.get_optimizer(config, model)
    best_nme = 100
    last_epoch = config.TRAIN.BEGIN_EPOCH

    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
            last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
            last_epoch - 1)

    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'final.pth')
        if os.path.islink(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")

    loss = []
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        losses, diff = function.train(config, train_loader, model, criterion,
                                      optimizer, epoch, writer_dict)
        loss.append(losses)
        lr_scheduler.step()
        np.save(
            os.path.join(final_output_dir,
                         "train_diff@epoch{}".format(epoch)), diff)

        # evaluate
        nme, predictions, diff = function.validate(config, val_loader, model,
                                                   criterion, epoch,
                                                   writer_dict)
        np.save(
            os.path.join(final_output_dir,
                         "valid_diff@epoch{}".format(epoch)), diff)

        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {
                "state_dict": model,
                "epoch": epoch + 1,
                "best_nme": best_nme,
                "optimizer": optimizer.state_dict(),
            }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))

        if is_best:
            # write the best predictions back to per-image annotation files,
            # rescaled to original image coordinates
            for i in range(len(predictions)):
                afile = val_data.annotation_files[i]
                new_afile = '{}.{}.txt'.format(
                    afile, os.path.basename(args.cfg).split('.')[0])
                with open(new_afile, 'wt') as f:
                    pts = predictions[i].cpu().numpy()
                    for j in range(len(pts)):
                        f.write("{},{}\n".format(
                            pts[j][1] / val_data.factor[1],
                            pts[j][0] / val_data.factor[0]))

    pd.DataFrame(data=loss).to_csv('loss2.csv')

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
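# The best-epoch branch above maps predicted landmarks from network output
# coordinates back to original image scale by dividing by val_data.factor and
# swapping (row, col) into (x, y) order before writing them out. A minimal
# standalone sketch of that mapping; rescale_points is a hypothetical helper,
# not part of this repo:
def rescale_points(pts, factor):
    """pts: (N, 2) tensor in (row, col) order at network resolution;
    factor: (row_scale, col_scale) between original image and network input.
    Returns a list of (x, y) pairs in original image coordinates."""
    return [(p[1] / factor[1], p[0] / factor[0]) for p in pts.cpu().numpy()]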
def main():
    logger.info("Logger is set - training start")

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

    if config.distributed:
        config.gpu = config.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(config.gpu)
        # distributed init
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=config.dist_url,
                                             world_size=config.world_size,
                                             rank=config.local_rank)
        config.world_size = torch.distributed.get_world_size()
        config.total_batch_size = config.world_size * config.batch_size
    else:
        config.total_batch_size = config.batch_size

    loaders, samplers = get_augment_datasets(config)
    train_loader, valid_loader = loaders
    train_sampler, valid_sampler = samplers

    net_crit = nn.CrossEntropyLoss().cuda()
    controller = CDARTSController(config, net_crit, n_nodes=4,
                                  stem_multiplier=config.stem_multiplier)

    # load the searched genotypes
    with open(config.cell_file, 'r') as f:
        r_dict = json.loads(f.read())
    if config.local_rank == 0:
        logger.info(r_dict)

    genotypes_dict = {}
    for layer_idx, genotype in r_dict.items():
        genotypes_dict[int(layer_idx)] = gt.from_str(genotype)

    controller.build_augment_model(controller.init_channel, genotypes_dict)

    resume_state = None
    if config.resume:
        resume_state = torch.load(config.resume_path, map_location='cpu')
        controller.model_main.load_state_dict(resume_state['model_main'])

    controller.model_main = controller.model_main.cuda()
    param_size = utils.param_size(controller.model_main)
    logger.info("param size = %fMB", param_size)

    # adjust training hyper-parameters according to the cell size
    if 'cifar' in config.dataset:
        if param_size < 3.0:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.2
        elif 3.0 < param_size < 3.5:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.3
        else:
            config.weight_decay = 5e-4
            config.drop_path_prob = 0.3

    if config.local_rank == 0:
        logger.info("Current weight decay: {}".format(config.weight_decay))
        logger.info("Current drop path prob: {}".format(
            config.drop_path_prob))

    controller.model_main = apex.parallel.convert_syncbn_model(
        controller.model_main)

    # weights optimizer
    optimizer = torch.optim.SGD(controller.model_main.parameters(),
                                lr=config.lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)
    # optimizer = torch.optim.SGD(controller.model_main.parameters(),
    #                             lr=config.lr, momentum=config.momentum,
    #                             weight_decay=config.weight_decay,
    #                             nesterov=True)

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, config.epochs)

    if config.use_amp:
        controller.model_main, optimizer = amp.initialize(
            controller.model_main, optimizer, opt_level=config.opt_level)

    if config.distributed:
        controller.model_main = DDP(controller.model_main,
                                    delay_allreduce=True)

    best_top1 = 0.
    best_top5 = 0.
    sta_epoch = 0

    # training loop
    if config.resume:
        optimizer.load_state_dict(resume_state['optimizer'])
        lr_scheduler.load_state_dict(resume_state['lr_scheduler'])
        best_top1 = resume_state['best_top1']
        best_top5 = resume_state['best_top5']
        sta_epoch = resume_state['sta_epoch']

    # ImageNet epochs at which an extra snapshot is kept
    epoch_pool = [220, 230, 235, 240, 245]
    for epoch in range(sta_epoch, config.epochs):
        # reset iterators
        train_sampler.set_epoch(epoch)
        valid_sampler.set_epoch(epoch)
        current_lr = lr_scheduler.get_lr()[0]
        # current_lr = utils.adjust_lr(optimizer, epoch, config)
        if config.local_rank == 0:
            logger.info('Epoch: %d lr %e', epoch, current_lr)

        # linear LR warm-up for large total batch sizes
        if epoch < config.warmup_epochs and config.total_batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr * (epoch + 1) / 5.0
            if config.local_rank == 0:
                logger.info('Warming-up Epoch: %d, LR: %e', epoch,
                            current_lr * (epoch + 1) / 5.0)

        # drop-path probability ramps up linearly over training
        drop_prob = config.drop_path_prob * epoch / config.epochs
        controller.model_main.module.drop_path_prob(drop_prob)

        # training
        train(train_loader, controller.model_main, optimizer, epoch, writer,
              logger, config)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1, top5 = validate(valid_loader, controller.model_main, epoch,
                              cur_step, writer, logger, config)

        if ('cifar' in config.dataset) or ('imagenet' in config.dataset):
            lr_scheduler.step()
        else:
            raise Exception('Lr error!')

        # track the best result
        if best_top1 < top1:
            best_top1 = top1
            best_top5 = top5
            is_best = True
        else:
            is_best = False

        # save
        if config.local_rank == 0:
            if ('imagenet' in config.dataset) and \
                    ((epoch + 1) in epoch_pool) and (not config.resume):
                torch.save(
                    {
                        "model_main": controller.model_main.module.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "lr_scheduler": lr_scheduler.state_dict(),
                        "best_top1": best_top1,
                        "best_top5": best_top5,
                        "sta_epoch": epoch + 1
                    },
                    os.path.join(config.path,
                                 "epoch_{}.pth.tar".format(epoch + 1)))
                utils.save_checkpoint(
                    controller.model_main.module.state_dict(), config.path,
                    is_best)
            torch.save(
                {
                    "model_main": controller.model_main.module.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "lr_scheduler": lr_scheduler.state_dict(),
                    "best_top1": best_top1,
                    "best_top5": best_top5,
                    "sta_epoch": epoch + 1
                }, os.path.join(config.path, "retrain_resume.pth.tar"))
            utils.save_checkpoint(controller.model_main.module.state_dict(),
                                  config.path, is_best)

    if config.local_rank == 0:
        logger.info("Final best Prec@1 = {:.4%}, Prec@5 = {:.4%}".format(
            best_top1, best_top5))
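# Both schedules used in the loop above are linear: drop-path probability
# ramps from 0 toward config.drop_path_prob over training, and the warm-up
# scales the LR by (epoch + 1) / 5 in the first epochs. A minimal sketch of
# the two rules as pure functions (hypothetical helpers, shown for clarity;
# the warm-up divisor of 5 matches the hard-coded constant above):
def drop_path_prob_at(epoch, total_epochs, max_prob):
    # Linear ramp: 0 at epoch 0, approaching max_prob at the final epoch.
    return max_prob * epoch / total_epochs


def warmup_lr_at(epoch, base_lr, warmup_epochs=5):
    # Linear warm-up over the first warmup_epochs epochs, then base_lr.
    return base_lr * min((epoch + 1) / warmup_epochs, 1.0)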