def main(cfg):
    """Set up a VIBE training experiment from the experiment config `cfg`.

    NOTE(review): this variant still contains debug prints and a dummy
    forward pass, and stops after building the optimizer — no training
    loop / Trainer call follows in this block.
    """
    # Seed every RNG source so runs are reproducible when a seed is set.
    if cfg.SEED_VALUE >= 0:
        print(f'Seed value for the experiment {cfg.SEED_VALUE}')
        os.environ['PYTHONHASHSEED'] = str(cfg.SEED_VALUE)
        random.seed(cfg.SEED_VALUE)
        torch.manual_seed(cfg.SEED_VALUE)
        np.random.seed(cfg.SEED_VALUE)

    logger = create_logger(cfg.LOGDIR, phase='train')

    # Disabled setup (GPU logging, cudnn flags, TensorBoard writer) kept by
    # the original author as an inert string literal.
    '''
    logger.info(f'GPU name -> {torch.cuda.get_device_name()}')
    logger.info(f'GPU feat -> {torch.cuda.get_device_properties("cuda")}')
    logger.info(pprint.pformat(cfg))
    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED
    writer = SummaryWriter(log_dir=cfg.LOGDIR)
    writer.add_text('config', pprint.pformat(cfg), 0)
    '''

    # ========= Dataloaders ========= #
    data_loaders = get_data_loaders(cfg)

    # ========= Compile Loss ========= #
    loss = VIBELoss(
        e_pose_loss_weight=cfg.LOSS.POSE_W,    # weight of the pose term
        e_shape_loss_weight=cfg.LOSS.SHAPE_W,  # weight of the shape term
    )
    print(loss)  # debug: dump the loss module

    # ========= Initialize networks, optimizers and lr_schedulers ========= #
    # temporal generator include the neural network ResNet50, temporal
    # encoder and smpl regressor
    # VIBE
    generator = VIBE_Demo(
        batch_size=cfg.TRAIN.BATCH_SIZE,  # mini-batch size (e.g. 64)
    )
    print(generator)  # debug: dump the network architecture

    # Smoke test: push a dummy 2048-d feature through the generator and print
    # the shape of every output tensor.
    # NOTE(review): debug-only code — presumably removed for real runs.
    a = torch.ones((1, 2048))
    z = generator(a)
    for c in z:
        for key in c.keys():
            print(key)
            print(c[key].shape)

    # Build the generator optimizer (common choices: sgd, adam).
    gen_optimizer = get_optimizer(
        model=generator,                  # model whose parameters to optimize
        optim_type=cfg.TRAIN.GEN_OPTIM,   # optimizer type: sgd or adam
        lr=cfg.TRAIN.GEN_LR,              # learning rate
        weight_decay=cfg.TRAIN.GEN_WD,    # L2 regularization strength
        momentum=cfg.TRAIN.GEN_MOMENTUM,  # momentum hyper-parameter
    )
def main():
    """Train a face-alignment network end to end.

    Builds dataloaders, model, loss and optimizer from the global `config`,
    optionally resumes from `latest.pth`, then runs the train/validate loop,
    checkpointing every epoch and tracking the best NME
    (normalized mean error — lower is better).
    """
    args = parse_args()

    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related settings
    cudnn.benchmark = config.CUDNN.BENCHMARK
    # BUGFIX: was `cudnn.determinstic` (typo), which set a nonexistent
    # attribute and silently ignored the configured determinism flag.
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    gpus = list(config.GPUS)
    dataset_type = get_dataset(config)

    train_data = dataset_type(config, is_train=True)
    # Global batch size scales with the number of DataParallel GPUs.
    train_loader = DataLoader(dataset=train_data,
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU * len(gpus),
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)

    val_loader = DataLoader(dataset=dataset_type(config, is_train=False),
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU * len(gpus),
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    # config.MODEL.NUM_JOINTS = train_data.get_num_points()
    model = models.get_face_alignment_net(config)

    # copy model files
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    model = nn.DataParallel(model, device_ids=gpus).cuda()

    # loss
    # BUGFIX: `size_average=True` is deprecated; `reduction='mean'` is the
    # equivalent modern spelling with identical behavior.
    criterion = torch.nn.MSELoss(reduction='mean').cuda()

    optimizer = utils.get_optimizer(config, model)

    best_nme = 100
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)

    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'latest.pth')
        # NOTE(review): `islink` implies latest.pth is written as a symlink
        # by the checkpointing code — confirm; use os.path.isfile otherwise.
        if os.path.islink(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")

    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        # NOTE(review): stepping before training follows the pre-1.1 PyTorch
        # scheduler convention; on PyTorch >= 1.1 step after optimizer steps.
        lr_scheduler.step()

        function.train(config, train_loader, model, criterion,
                       optimizer, epoch, writer_dict)

        # evaluate
        nme, predictions = function.validate(config, val_loader, model,
                                             criterion, epoch, writer_dict)

        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {
                # NOTE(review): stores the whole DataParallel module rather
                # than model.state_dict() — produces large files; kept for
                # compatibility with the matching load code.
                "state_dict": model,
                "epoch": epoch + 1,
                "best_nme": best_nme,
                "optimizer": optimizer.state_dict(),
            }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main(cfg):
    """Run one complete VIBE training session configured by `cfg`."""
    # Make the run reproducible whenever a non-negative seed is supplied.
    if cfg.SEED_VALUE >= 0:
        print(f'Seed value for the experiment {cfg.SEED_VALUE}')
        os.environ['PYTHONHASHSEED'] = str(cfg.SEED_VALUE)
        random.seed(cfg.SEED_VALUE)
        torch.manual_seed(cfg.SEED_VALUE)
        np.random.seed(cfg.SEED_VALUE)

    # Logging: file logger plus TensorBoard, both rooted at cfg.LOGDIR.
    logger = create_logger(cfg.LOGDIR, phase='train')
    logger.info(f'GPU name -> {torch.cuda.get_device_name()}')
    logger.info(f'GPU feat -> {torch.cuda.get_device_properties("cuda")}')
    logger.info(pprint.pformat(cfg))

    # cuDNN behavior flags from the experiment config.
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    tb_writer = SummaryWriter(log_dir=cfg.LOGDIR)
    tb_writer.add_text('config', pprint.pformat(cfg), 0)

    # ========= Dataloaders ========= #
    loaders = get_data_loaders(cfg)

    # ========= Compile Loss ========= #
    criterion = VIBELoss(
        e_pose_loss_weight=cfg.LOSS.POSE_W,
        e_shape_loss_weight=cfg.LOSS.SHAPE_W,
    )

    # ========= Initialize networks, optimizers and lr_schedulers ========= #
    # The generator bundles the ResNet50 backbone, the temporal encoder and
    # the SMPL regressor (VIBE).
    net = VIBE_Demo(
        batch_size=cfg.TRAIN.BATCH_SIZE,  # mini-batch size (e.g. 64)
    ).to(cfg.DEVICE)

    # Generator optimizer (typically sgd or adam).
    opt = get_optimizer(
        model=net,
        optim_type=cfg.TRAIN.GEN_OPTIM,
        lr=cfg.TRAIN.GEN_LR,
        weight_decay=cfg.TRAIN.GEN_WD,
        momentum=cfg.TRAIN.GEN_MOMENTUM,
    )

    # Shrink the generator LR when the monitored metric plateaus.
    plateau_sched = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt,
        mode='min',
        factor=0.1,
        patience=cfg.TRAIN.LR_PATIENCE,
        verbose=True,
    )

    # ========= Start Training ========= #
    trainer = Trainer(
        data_loaders=loaders,
        generator=net,
        criterion=criterion,
        gen_optimizer=opt,
        start_epoch=cfg.TRAIN.START_EPOCH,
        end_epoch=cfg.TRAIN.END_EPOCH,
        device=cfg.DEVICE,
        writer=tb_writer,
        debug=cfg.DEBUG,
        logdir=cfg.LOGDIR,
        lr_scheduler=plateau_sched,
        num_iters_per_epoch=cfg.TRAIN.NUM_ITERS_PER_EPOCH,
        debug_freq=cfg.DEBUG_FREQ,
    )
    trainer.fit()
def main():
    """Train a semantic-segmentation network.

    Builds the model/optimizer/dataloaders from the global `config`, then
    runs the epoch loop with periodic validation, checkpointing the best
    model by `perf_indicator` and finally saving the last state dict.
    """
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    # Resolve the model constructor by configured name:
    # models.<NAME>.get_seg_net(config, is_train=True).
    model = eval('models.'+config.MODEL.NAME+'.get_seg_net')(
        config, is_train=True
    )

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
        'vis_global_steps': 0,
    }

    # dump_input = torch.rand((config.TRAIN.BATCH_SIZE,
    #                          3,
    #                          config.MODEL.IMAGE_SIZE[1],
    #                          config.MODEL.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    optimizer = get_optimizer(config, model)

    # Data loading code
    if 'xception' in config.MODEL.NAME:
        # Xception uses different mean std for input image
        normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                         std=[0.5, 0.5, 0.5])
    else:
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

    # Training-time geometric augmentation; validation uses none.
    train_augs = aug.Compose([aug.RandomScale(0.5, 2.0),
                              aug.RandomHorizontallyFlip(0.5),
                              aug.RandomSizedCrop(config.MODEL.IMAGE_SIZE)])
    test_augs = None

    train_dataset = eval('dataset.'+config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TRAIN_SET,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
        augmentations=train_augs
    )
    valid_dataset = eval('dataset.'+config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TEST_SET,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
        augmentations=test_augs
    )

    # define loss function (criterion) and optimizer
    criterion = CrossEntropy2D(ignore_index=255,
                               weight=train_dataset.class_weights).cuda()

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE*len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True,
        drop_last=True if len(gpus) > 2 else False  # PyTorch's DataParallel model cannot handle 0 image on either of the GPUs
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True
    )

    # Select the LR schedule; 'poly' decays every iteration, 'multistep'
    # decays once per epoch (stepped in the loop below).
    if config.TRAIN.LR_SCHEDULER == 'multistep':
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR
        )
    elif config.TRAIN.LR_SCHEDULER == 'poly':
        max_iter = config.TRAIN.END_EPOCH * len(train_loader)
        lr_scheduler = PolynomialLR(optimizer, max_iter=max_iter,
                                    decay_iter=1)
    elif config.TRAIN.LR_SCHEDULER == 'none':
        lr_scheduler = None
    else:
        raise ValueError(
            'Scheduler {} not supported'.format(config.TRAIN.LR_SCHEDULER))

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        if config.TRAIN.LR_SCHEDULER == 'multistep':
            lr_scheduler.step()

        # train for one epoch
        train(config, train_loader, model, criterion, optimizer,
              lr_scheduler, epoch, final_output_dir, tb_log_dir, writer_dict)

        # Validate (and checkpoint) only every EVAL_INTERVAL epochs.
        if (epoch + 1) % config.TRAIN.EVAL_INTERVAL == 0:
            if not config.MODEL.LEARN_GAMMA:
                # Anneal gamma from NE_GAMMA_U down to NE_GAMMA_L over the
                # first NE_ITER_RATIO fraction of training iterations.
                # NOTE(review): relies on lr_scheduler.max_iter, which only
                # exists on PolynomialLR — this path appears to assume the
                # 'poly' scheduler; confirm before using with 'multistep'.
                if float(lr_scheduler.last_epoch) / (lr_scheduler.max_iter * config.TRAIN.NE_ITER_RATIO) <= 1:
                    gamma = (config.TRAIN.NE_GAMMA_U - config.TRAIN.NE_GAMMA_L) * \
                        (1 - float(lr_scheduler.last_epoch) / (lr_scheduler.max_iter * config.TRAIN.NE_ITER_RATIO)) ** \
                        config.TRAIN.NE_GAMMA_EXP + config.TRAIN.NE_GAMMA_L
                else:
                    gamma = config.TRAIN.NE_GAMMA_L
            else:
                gamma = None

            # evaluate on validation set
            perf_indicator = validate(config, valid_loader, valid_dataset,
                                      model, criterion, final_output_dir,
                                      tb_log_dir, writer_dict, gamma=gamma)

            if perf_indicator > best_perf:
                best_perf = perf_indicator
                best_model = True
            else:
                best_model = False

            logger.info(
                '=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)
        else:
            perf_indicator = 0.0

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main():
    """Train a pose-estimation network found/defined by `args.model`.

    Logs parameter count and Mult-Adds, then runs the train/validate loop,
    checkpointing each epoch and saving the final state dict.
    """
    args = parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(args.gpu)
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    # Resolve model constructor by module name: <args.model>.get_pose_net.
    model = eval('{}.get_pose_net'.format(args.model))(config, is_train=True)

    # Report model complexity in eval mode (no dropout/BN noise), then
    # switch back to training mode.
    model.eval()
    params = count_parameters_in_MB(model)
    logger.info("Params = %.2fMB" % params)
    mult_adds = comp_multadds(model,
                              input_size=(3,
                                          config.MODEL.IMAGE_SIZE[1],
                                          config.MODEL.IMAGE_SIZE[0]))
    logger.info("Mult-Adds = %.2fMB" % mult_adds)
    model.train()
    model = model.cuda()

    # copy model file

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir)

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset,
                                  model, criterion, final_output_dir,
                                  tb_log_dir)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

        # BUGFIX: lr_scheduler.step() previously ran exactly once, after the
        # whole training loop, so MultiStepLR never decayed the learning
        # rate during training. Step once per epoch instead.
        lr_scheduler.step()

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.state_dict(), final_model_state_file)
def main():
    """Train the CRNN text-recognition model described by the yaml config.

    Supports two warm-start modes: FINETUNE (load only the CNN backbone,
    optionally freezing it) and RESUME (load the full model and epoch).
    """
    # load config
    config = parse_arg()

    # create output folder
    output_dict = utils.create_log_folder(config, phase='train')

    # cudnn
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # writer dict
    writer_dict = {
        'writer': SummaryWriter(log_dir=output_dict['tb_dir']),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # construct face related neural networks
    model = crnn.get_crnn(config)

    # get device
    if torch.cuda.is_available():
        device = torch.device("cuda:{}".format(config.GPUID))
    else:
        device = torch.device("cpu:0")
    model = model.to(device)

    # define loss function
    # criterion = torch.nn.CTCLoss()
    criterion = CTCLoss()

    last_epoch = config.TRAIN.BEGIN_EPOCH
    optimizer = utils.get_optimizer(config, model)
    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)

    if config.TRAIN.FINETUNE.IS_FINETUNE:
        model_state_file = config.TRAIN.FINETUNE.FINETUNE_CHECKPOINIT
        # BUGFIX: an empty path used to print the warning and then crash
        # inside torch.load(''); the load is now skipped entirely.
        if model_state_file == '':
            print(" => no checkpoint found")
        else:
            checkpoint = torch.load(model_state_file, map_location='cpu')
            if 'state_dict' in checkpoint.keys():
                checkpoint = checkpoint['state_dict']

            from collections import OrderedDict
            model_dict = OrderedDict()
            # keep only CNN weights, stripping the leading 'cnn.' prefix
            for k, v in checkpoint.items():
                if 'cnn' in k:
                    model_dict[k[4:]] = v
            model.cnn.load_state_dict(model_dict)
            if config.TRAIN.FINETUNE.FREEZE:
                for p in model.cnn.parameters():
                    p.requires_grad = False

    elif config.TRAIN.RESUME.IS_RESUME:
        model_state_file = config.TRAIN.RESUME.FILE
        # BUGFIX: same empty-path guard as the finetune branch above.
        if model_state_file == '':
            print(" => no checkpoint found")
        else:
            checkpoint = torch.load(model_state_file, map_location='cpu')
            if 'state_dict' in checkpoint.keys():
                model.load_state_dict(checkpoint['state_dict'])
                last_epoch = checkpoint['epoch']
                # optimizer.load_state_dict(checkpoint['optimizer'])
                # lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            else:
                model.load_state_dict(checkpoint)

    model_info(model)

    train_dataset = get_dataset(config)(config, is_train=True)
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=config.PIN_MEMORY,
    )

    val_dataset = get_dataset(config)(config, is_train=False)
    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=config.TEST.BATCH_SIZE_PER_GPU,
        shuffle=config.TEST.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=config.PIN_MEMORY,
    )

    best_acc = 0.5
    # Maps between text labels and integer sequences for CTC decoding.
    converter = utils.strLabelConverter(config.DATASET.ALPHABETS)
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        function.train(config, train_loader, train_dataset, converter,
                       model, criterion, optimizer, device, epoch,
                       writer_dict, output_dict)
        lr_scheduler.step()

        acc = function.validate(config, val_loader, val_dataset, converter,
                                model, criterion, device, epoch,
                                writer_dict, output_dict)

        is_best = acc > best_acc
        best_acc = max(acc, best_acc)

        print("is best:", is_best)
        print("best acc is:", best_acc)
        # save checkpoint
        torch.save(
            {
                "state_dict": model.state_dict(),
                "epoch": epoch + 1,
                # "optimizer": optimizer.state_dict(),
                # "lr_scheduler": lr_scheduler.state_dict(),
                "best_acc": best_acc,
            },
            os.path.join(output_dict['chs_dir'],
                         "checkpoint_{}_acc_{:.4f}.pth".format(epoch, acc)))

    writer_dict['writer'].close()
def main():
    """Train an HRNet-style pose-estimation network.

    Builds model/loss/dataloaders from the global `cfg`, optionally
    auto-resumes from `checkpoint.pth`, then runs the train/validate loop,
    checkpointing per epoch and saving the final state dict.
    """
    args = parse_args()
    update_config(cfg, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    # Resolve model constructor by name: models.<NAME>.get_pose_net.
    model = eval('models.' + cfg.MODEL.NAME + '.get_pose_net')(
        cfg, is_train=True)

    # copy model file (snapshot the model source next to the outputs)
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
        final_output_dir)
    # logger.info(pprint.pformat(model))

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Dummy input used only to log the model summary.
    dump_input = torch.rand(
        (1, 3, cfg.MODEL.IMAGE_SIZE[1], cfg.MODEL.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, (dump_input))
    logger.info(get_model_summary(model, dump_input))

    model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT).cuda()

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_SET, True,
        transforms.Compose([transforms.ToTensor(), normalize]))
    valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, False,
        transforms.Compose([transforms.ToTensor(), normalize]))

    # Global batch size scales with the number of DataParallel GPUs.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    best_perf = 0.0
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')

    # Auto-resume: restore epoch, best perf, weights and optimizer state.
    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    # last_epoch keeps the scheduler in sync after a resume.
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
        last_epoch=last_epoch)

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        # train for one epoch
        train(cfg, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        lr_scheduler.step()

        # evaluate on validation set
        perf_indicator = validate(cfg, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                # unwrapped (non-DataParallel) weights of the best model
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
# ---- Parameter statistics ----
pytorch_total_params = sum(p.numel() for p in model.parameters())
# BUGFIX: `pytorch_total_TR_params` was printed but never defined
# (NameError at runtime); compute the trainable-parameter count it was
# evidently meant to report.
pytorch_total_TR_params = sum(p.numel() for p in model.parameters()
                              if p.requires_grad)
print('TOTAL TRAINABLE parameters', pytorch_total_TR_params)
print('TOTAL parameters', pytorch_total_params)

## MY loss
loss = VIBELoss2(
    e_loss_weight=1.0,
    e_3d_loss_weight=50.0,
    e_pose_loss_weight=100.0,
)

gen_optimizer = get_optimizer(
    model=model,
    optim_type='Adam',
    lr=0.0001,
    weight_decay=0.0,
    momentum=0.9,
)

## EXTRA things below
#import torch
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from cv2 import projectPoints

''' GLOBAL VARIABLES '''
angle_idx = 0  # Bone angle to adjust
direction = 0  # Direction to rotate, (0 - x, 1 - y, 2 - z) for upper arm only
step = 3  # 3 degrees for step size
step_radian = step * np.pi / 180  # step converted to radians
def main():
    """Train the multi-view HRNet pose-estimation model.

    Standard train/validate loop over H36MDataset with per-epoch
    checkpointing of the best-performing model.
    """
    # BUGFIX: removed the dead `final_output_dir = 'output'` assignment;
    # the name was unconditionally rebound by create_logger() below before
    # any use.
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(config, 'train')
    logger.info(pprint.pformat(config))

    # CuDNN
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # HRNet Model
    mv_hrnet = get_pose_net(config, is_train=True)
    #pose_hrnet = get_pose_net(config, is_train=True)  # Pose estimation model
    #pose_hrnet.load_state_dict(torch.load(config.NETWORK.PRETRAINED), strict=False)  # Pretrained weight loading
    #mv_hrnet = get_multiview_pose_net(pose_hrnet, config)  # Multiview adopting
    #depth_hrnet = get_pose_net(config, is_train=True)  # 2.5D depth prediction model

    # Multi GPUs Setting
    gpus = [int(i) for i in config.GPUS.split(',')]
    mv_hrnet = torch.nn.DataParallel(mv_hrnet, device_ids=gpus).cuda()
    logger.info('=> init data parallel model')

    # Loss
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    logger.info('=> init criterion')

    # Optimizer
    optimizer = get_optimizer(config, mv_hrnet)
    logger.info('=> init {} optimizer'.format(config.TRAIN.OPTIMIZER))

    # Loading checkpoint
    start_epoch = config.TRAIN.BEGIN_EPOCH
    if config.TRAIN.RESUME:
        start_epoch, mv_hrnet, optimizer = load_checkpoint(
            mv_hrnet, optimizer, final_output_dir)

    # Scheduler
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)
    logger.info('=> init scheduler')

    # Summary
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Data loader
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    logger.info('=> loading train dataset')
    train_dataset = H36MDataset(
        config, config.DATASET.TRAIN_SUBSET, True,
        transforms.Compose([transforms.ToTensor(), normalize]))
    #train_dataset = MultiViewH36M(config, config.DATASET.TRAIN_SUBSET, True, transforms.Compose([transforms.ToTensor(), normalize]))

    logger.info('=> loading validation dataset')
    valid_dataset = H36MDataset(
        config, config.DATASET.TEST_SUBSET, False,
        transforms.Compose([transforms.ToTensor(), normalize]))

    # Global batch size scales with the number of DataParallel GPUs.
    logger.info('=> loading train dataloader')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)

    logger.info('=> loading valid dataloader')
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    # Training loop
    best_perf = 0.0
    best_model = False
    for epoch in range(start_epoch, config.TRAIN.END_EPOCH):
        # NOTE(review): stepping before training follows the pre-1.1
        # PyTorch convention; on PyTorch >= 1.1 call step() after the
        # epoch's optimizer updates.
        lr_scheduler.step()

        # Trainer
        train(config, train_loader, mv_hrnet, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        # Performance indicator
        perf_indicator = validate(config, valid_loader, valid_dataset,
                                  mv_hrnet, criterion, final_output_dir,
                                  tb_log_dir, writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': mv_hrnet.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    # End
    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(mv_hrnet.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main(): args = parse_args() reset_config(config, args) set_cudnn(config) seed = config.RANDOM_SEED np.random.seed(seed) random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) os.environ['CUDA_VISIBLE_DEVICES'] = config.GPU.strip() gpus = list(range(len(config.GPU.strip().split(',')))) logger, final_output_dir, tb_log_dir = create_logger(config, args.cfg) summary_writer = SummaryWriter(log_dir=tb_log_dir) this_dir = osp.dirname(__file__) # backup the source code and the yaml config if args.cfg: shutil.copy(args.cfg, osp.join(final_output_dir, osp.basename(args.cfg))) if not osp.exists(osp.join(final_output_dir, "lib")): shutil.copytree(osp.join(this_dir, "../lib/"), osp.join(final_output_dir, "lib")) for k, v in config.items(): logger.info(f"{k}: {v}") # conditional import if config.TRAIN.FINETUNE_ROTATER: from lib.core.function3 import train, validate, evaluate elif config.TRAIN.USE_CYCLE: from lib.core.function2 import train, validate, evaluate else: from lib.core.function1 import train, validate, evaluate # build model logger.info('start building model.') if len(gpus) > 1: pose_model = torch.nn.DataParallel(get_pose_model(config)).cuda( gpus[0]) discriminator = torch.nn.DataParallel(get_discriminator(config)).cuda( gpus[0]) temp_discriminator = torch.nn.DataParallel( get_discriminator(config)).cuda(gpus[0]) else: pose_model = get_pose_model(config).cuda() discriminator = get_discriminator(config, is_temp=False).cuda() temp_discriminator = get_discriminator(config, is_temp=True).cuda() optimizer_g = get_optimizer(config, pose_model, is_dis=False) optimizer_d = get_optimizer(config, discriminator, is_dis=True) optimizer_d_temp = get_optimizer(config, temp_discriminator, is_dis=True, is_temp=True) step_size, gamma = config.TRAIN.SCHEDULER_STEP_SIZE, config.TRAIN.SCHEDULER_GAMMA scheduler_g = lr_scheduler.StepLR(optimizer_g, step_size=step_size, gamma=gamma) scheduler_d = lr_scheduler.StepLR(optimizer_d, step_size=step_size, gamma=gamma) 
scheduler_temp = lr_scheduler.StepLR(optimizer_d_temp, step_size=step_size, gamma=gamma) logger.info('finished building model.') # print out the model arch if config.TRAIN.PRETRAIN_LIFTER: print("Load pretrained lifter...") state_dict = torch.load( config.TRAIN.LIFTER_PRETRAIN_PATH)['pose_model_state_dict'] # state_dict = {k[7:]:v for k, v in state_dict.items()} pose_model.load_state_dict(state_dict, strict=False) if config.DATA.DATASET_NAME == 'surreal': loader_func = surreal else: loader_func = h36m if config.DATA.DATASET_NAME == "h36m" else mpiinf dataset_train = loader_func(config, is_train=True) dataset_test = loader_func(config, is_train=False) train_loader = DataLoader(dataset=dataset_train, batch_size=config.BATCH_SIZE, shuffle=True, drop_last=False, pin_memory=True, num_workers=config.NUM_WORKERS) test_loader = DataLoader(dataset=dataset_test, batch_size=config.BATCH_SIZE, shuffle=False, drop_last=False, pin_memory=True, num_workers=config.NUM_WORKERS) if args.eval: prefix = config.DATA.DATASET_NAME # for mode in ['train', 'valid']: for mode in ['valid']: is_train = True if mode == 'train' else False v3d_to_ours = [ 3, 2, 1, 4, 5, 6, 16, 15, 14, 11, 12, 13, 8, 0, 7, 9, 10 ] if prefix == "h36m" else np.arange(config.DATA.NUM_JOINTS) mpi2h36m = [ 10, 9, 8, 11, 12, 13, 4, 3, 2, 5, 6, 7, 1, 14, 15, 16, 0 ] if prefix == 'surreal': indices = np.arange(config.DATA.NUM_JOINTS) else: indices = v3d_to_ours if prefix == "h36m" else mpi2h36m mode = "train" if is_train else "valid" read_name = f"../data/{prefix}_{mode}_pred3.h5" # read_name = f"../../unsupervised_mesh/data/h36m_{mode}_pred_3d_mesh.h5" save_name = f"../data/{prefix}_{mode}_pred_3d.h5" if args.eval_suffix is not None: save_name = save_name[:-3] + "_" + args.eval_suffix + ".h5" # eval mode, load the pretrained model and generate the 3d prediction of all 3ds if not config.TRAIN.PRETRAIN_LIFTER: raise Warning( "You are not using a pretrain model... 
may be you can specify --pretrain flag" ) dataloader = DataLoader(dataset_train if mode == "train" else dataset_test, batch_size=config.BATCH_SIZE, \ shuffle=False, drop_last=False, pin_memory=True, num_workers=config.NUM_WORKERS) all_out_data = evaluate(dataloader, pose_model, config, is_train=(mode == "train")) p1_mpjpe, p2_mpjpe = all_out_data['p1_mpjpe'], all_out_data[ 'p2_mpjpe'] # read out imagenames print("Reading imagenames and joints 2d...") fin = h5py.File(read_name, "r") fout = h5py.File(save_name, "w") imagenames = fin['imagename'][:].copy() joints_2d_gt = np.array(fin['joint_2d_gt']) fout['imagename'] = imagenames fout['joint_2d_gt'] = joints_2d_gt[:, indices] fout['joint_3d_gt'] = all_out_data['joint_3d_gt'] fout['joint_3d_pre'] = all_out_data['joint_3d_pre'] possible_same_keys = [ 'shape', 'pose', 'original_joint_2d_gt', 'joint_2d_pre', 'seqlen' ] for key in possible_same_keys: if key in fin.keys(): if 'joint' in key: fout[key] = np.array(fin[key])[:, indices] else: fout[key] = np.array(fin[key]) if 'seqname' in fin.keys(): fout['seqname'] = fin['seqname'][:].copy() if 'auc' in all_out_data.keys(): fout['auc'] = all_out_data['auc'] fout['pckh5'] = all_out_data['pckh5'] fout['auc_p2'] = all_out_data['auc_p2'] fout['pckh5_p2'] = all_out_data['pckh5_p2'] if 'scales' in all_out_data.keys(): fout['scale_pre'] = all_out_data['scales'] if 'scale_mids' in all_out_data.keys(): fout['scale_mid_pre'] = all_out_data['scale_mids'] fin.close() fout.close() print( "Evaluation on the {} set finished. 
P1 Mpjpe: {:.3f}, P2 Mpjpe: {:.3f}, saved to {}" .format("training" if is_train else "test", p1_mpjpe, p2_mpjpe, save_name)) if prefix == "mpi": print("[email protected]: {:.3f}, AUC: {:.3f}".format( all_out_data['pckh5'], all_out_data['auc'])) print("P2: [email protected]: {:.3f}, AUC: {:.3f}".format( all_out_data['pckh5_p2'], all_out_data['auc_p2'])) # uncomment this if you need to plot images # print("Rendering sequences...") # subprocess.call(f'python render.py --seq_num 10 --in_filename ../data/{prefix}_valid_pred_3d.h5 --save_dir ../vis', shell=True) return # preparation for visualization & perseq optimization(optional) if config.USE_GT: # note that the gt here is not the gt above(config.USE_GT) train_path = f"../data/{config.DATA.DATASET_NAME}_train_scales.pkl" valid_path = f"../data/{config.DATA.DATASET_NAME}_valid_scales.pkl" else: train_path = f"../data/{config.DATA.DATASET_NAME}_train_scales_pre.pkl" valid_path = f"../data/{config.DATA.DATASET_NAME}_valid_scales_pre.pkl" train_scale_mids_gt = load_pickle(train_path)['scale_mid'] if osp.exists( train_path) else None valid_scale_mids_gt = load_pickle(valid_path)['scale_mid'] if osp.exists( valid_path) else None train_seqnames, valid_seqnames = dataset_train.get_seqnames( ), dataset_test.get_seqnames() best_p1_mpjpe = best_p2_mpjpe = cur_p1_mpjpe = 10000.0 best_auc_val = best_pckh5 = 0.0 best_auc_val_p2 = best_pckh5_p2 = 0.0 for epoch in range(config.TRAIN.NUM_EPOCHS): scheduler_d.step() scheduler_g.step() scheduler_temp.step() # scheduler_s.step() avg_d_loss, avg_g_loss, avg_t_loss, train_scale_mids_pre = train( train_loader, pose_model, discriminator, temp_discriminator, optimizer_g, optimizer_d, optimizer_d_temp, epoch, config, summary_writer=summary_writer, print_interval=config.PRINT_INTERVAL) logger.info( "***** Epoch: {}, Avg G Loss: {:.3f}, Avg D Loss: {:.3f} Avg T Loss: {:.3f} *****" .format(epoch, avg_g_loss, avg_d_loss, avg_t_loss)) p1_mpjpe, p2_mpjpe, vis_image, valid_scale_mids_pre, extra_dict 
= validate( test_loader, pose_model, epoch, config) logger.info( "Epoch: {}, P1 Mpjpe/Best P1: {:.3f}/{:.3f}, P2 Mpjpe/Best P2/Cur P1: {:.3f}/{:.3f}/{:.3f}" .format(epoch, p1_mpjpe, best_p1_mpjpe, p2_mpjpe, best_p2_mpjpe, cur_p1_mpjpe)) if p2_mpjpe < best_p2_mpjpe: best_p2_mpjpe = p2_mpjpe cur_p1_mpjpe = p1_mpjpe is_best = True else: is_best = False if p1_mpjpe < best_p1_mpjpe: best_p1_mpjpe = p1_mpjpe if extra_dict is not None: auc_val, pckh5 = extra_dict['auc'], extra_dict['pckh5'] auc_val_p2, pckh5_p2 = extra_dict['auc_p2'], extra_dict['pckh5_p2'] if auc_val_p2 > best_auc_val_p2: best_auc_val_p2 = auc_val_p2 best_pckh5_p2 = pckh5_p2 is_best = True else: is_best = False if auc_val > best_auc_val: best_auc_val = auc_val best_pckh5 = pckh5 logger.info( "[email protected](Best): {:.3f}({:.3f}), AUC value(Best): {:.3f}({:.3f})" .format(pckh5, best_pckh5, auc_val, best_auc_val)) logger.info( "P2: [email protected](Best): {:.3f}({:.3f}), AUC value(Best): {:.3f}({:.3f})" .format(pckh5_p2, best_pckh5_p2, auc_val_p2, best_auc_val_p2)) save_checkpoint( { "epoch": epoch, "auc": best_auc_val, "pckh5": best_pckh5, "auc_p2": best_auc_val_p2, "pckh5_p2": best_pckh5_p2, "p1_mpjpe": p1_mpjpe, "p2_mpjpe": p2_mpjpe, "pose_model_state_dict": pose_model.state_dict(), "discriminator_state_dict": discriminator.state_dict(), "temp_discriminator_state_dict": temp_discriminator.state_dict(), "optimizer_d": optimizer_d.state_dict(), "optimizer_g": optimizer_g.state_dict(), "optimizer_d_temp": optimizer_d_temp.state_dict() }, is_best, final_output_dir) summary_writer.add_scalar("p1_mpjpe_3d_test/epoch", p1_mpjpe, epoch) summary_writer.add_scalar("p2_mpjpe_3d_test/epoch", p2_mpjpe, epoch) summary_writer.add_image("test_joints/epoch", vis_image, epoch) if extra_dict is not None: summary_writer.add_scalar("PCKh0.5/epoch", pckh5, epoch) summary_writer.add_scalar("AUC/epoch", auc_val, epoch) if train_scale_mids_gt is not None and train_scale_mids_pre is not None and len( train_scale_mids_pre) > 
0: num_seq = config.VIS.SCALE_MID_NUM_SEQ vis_image_scale_mid1 = plot_scalemid_dist( train_scale_mids_pre, train_scale_mids_gt.tolist()) vis_image_scale_mid1 = torch.from_numpy(vis_image_scale_mid1).type( torch.float32).permute(2, 0, 1) / 255 vis_image_scale_mid2 = plot_scalemid_seq_dist( train_scale_mids_pre, train_scale_mids_gt.tolist(), train_seqnames, num_seq=num_seq) vis_image_scale_mid2 = torch.from_numpy(vis_image_scale_mid2).type( torch.float32).permute(2, 0, 1) / 255 summary_writer.add_image("train_scalemid_distribution/epoch", vis_image_scale_mid1, epoch) summary_writer.add_image("train_scalemid_seq_distribution/epoch", vis_image_scale_mid2, epoch) if valid_scale_mids_gt is not None and valid_scale_mids_pre is not None and len( valid_scale_mids_pre) > 0: num_seq = config.VIS.SCALE_MID_NUM_SEQ vis_image_scale_mid1 = plot_scalemid_dist( valid_scale_mids_pre, valid_scale_mids_gt.tolist()) vis_image_scale_mid1 = torch.from_numpy(vis_image_scale_mid1).type( torch.float32).permute(2, 0, 1) / 255 vis_image_scale_mid2 = plot_scalemid_seq_dist( valid_scale_mids_pre, valid_scale_mids_gt.tolist(), valid_seqnames, num_seq=num_seq) vis_image_scale_mid2 = torch.from_numpy(vis_image_scale_mid2).type( torch.float32).permute(2, 0, 1) / 255 summary_writer.add_image("valid_scalemid_distribution/epoch", vis_image_scale_mid1, epoch) summary_writer.add_image("valid_scalemid_seq_distribution/epoch", vis_image_scale_mid2, epoch) summary_writer.close()
def main(cfg):
    """Train TCMR: seed RNGs, build loaders/loss/networks, then run Trainer."""
    # Make the run reproducible when a non-negative seed is configured.
    if cfg.SEED_VALUE >= 0:
        print(f'Seed value for the experiment {cfg.SEED_VALUE}')
        os.environ['PYTHONHASHSEED'] = str(cfg.SEED_VALUE)
        random.seed(cfg.SEED_VALUE)
        torch.manual_seed(cfg.SEED_VALUE)
        np.random.seed(cfg.SEED_VALUE)

    logger = create_logger(cfg.LOGDIR, phase='train')
    logger.info(f'GPU name -> {torch.cuda.get_device_name()}')
    logger.info(f'GPU feat -> {torch.cuda.get_device_properties("cuda")}')
    logger.info(pprint.pformat(cfg))

    # cuDNN behaviour is driven entirely by the config.
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    writer = SummaryWriter(log_dir=cfg.LOGDIR)
    writer.add_text('config', pprint.pformat(cfg), 0)

    # ========= Dataloaders ========= #
    data_loaders = get_data_loaders(cfg)

    # ========= Compile Loss ========= #
    criterion = TCMRLoss(
        e_loss_weight=cfg.LOSS.KP_2D_W,
        e_3d_loss_weight=cfg.LOSS.KP_3D_W,
        e_pose_loss_weight=cfg.LOSS.POSE_W,
        e_shape_loss_weight=cfg.LOSS.SHAPE_W,
        d_motion_loss_weight=cfg.LOSS.D_MOTION_LOSS_W,
    )

    # ========= Initialize networks, optimizers and lr_schedulers ========= #
    generator = TCMR(
        n_layers=cfg.MODEL.TGRU.NUM_LAYERS,
        batch_size=cfg.TRAIN.BATCH_SIZE,
        seqlen=cfg.DATASET.SEQLEN,
        hidden_size=cfg.MODEL.TGRU.HIDDEN_SIZE,
        pretrained=cfg.TRAIN.PRETRAINED_REGRESSOR,
    ).to(cfg.DEVICE)

    gen_optimizer = get_optimizer(
        model=generator,
        optim_type=cfg.TRAIN.GEN_OPTIM,
        lr=cfg.TRAIN.GEN_LR,
        weight_decay=cfg.TRAIN.GEN_WD,
        momentum=cfg.TRAIN.GEN_MOMENTUM,
    )

    # Attention hyper-parameters only apply when the discriminator pools its
    # features through attention; otherwise they are passed as None.
    use_attention = cfg.TRAIN.MOT_DISCR.FEATURE_POOL == 'attention'
    motion_discriminator = MotionDiscriminator(
        rnn_size=cfg.TRAIN.MOT_DISCR.HIDDEN_SIZE,
        input_size=69,
        num_layers=cfg.TRAIN.MOT_DISCR.NUM_LAYERS,
        output_size=1,
        feature_pool=cfg.TRAIN.MOT_DISCR.FEATURE_POOL,
        attention_size=cfg.TRAIN.MOT_DISCR.ATT.SIZE if use_attention else None,
        attention_layers=cfg.TRAIN.MOT_DISCR.ATT.LAYERS if use_attention else None,
        attention_dropout=cfg.TRAIN.MOT_DISCR.ATT.DROPOUT if use_attention else None,
    ).to(cfg.DEVICE)

    dis_motion_optimizer = get_optimizer(
        model=motion_discriminator,
        optim_type=cfg.TRAIN.MOT_DISCR.OPTIM,
        lr=cfg.TRAIN.MOT_DISCR.LR,
        weight_decay=cfg.TRAIN.MOT_DISCR.WD,
        momentum=cfg.TRAIN.MOT_DISCR.MOMENTUM,
    )

    motion_lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        dis_motion_optimizer,
        mode='min',
        factor=0.1,
        patience=cfg.TRAIN.LR_PATIENCE,
        verbose=True,
    )
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        gen_optimizer,
        mode='min',
        factor=0.1,
        patience=cfg.TRAIN.LR_PATIENCE,
        verbose=True,
    )

    # ========= Start Training ========= #
    Trainer(
        data_loaders=data_loaders,
        generator=generator,
        motion_discriminator=motion_discriminator,
        criterion=criterion,
        dis_motion_optimizer=dis_motion_optimizer,
        dis_motion_update_steps=cfg.TRAIN.MOT_DISCR.UPDATE_STEPS,
        gen_optimizer=gen_optimizer,
        start_epoch=cfg.TRAIN.START_EPOCH,
        end_epoch=cfg.TRAIN.END_EPOCH,
        device=cfg.DEVICE,
        writer=writer,
        debug=cfg.DEBUG,
        logdir=cfg.LOGDIR,
        lr_scheduler=lr_scheduler,
        motion_lr_scheduler=motion_lr_scheduler,
        resume=cfg.TRAIN.RESUME,
        num_iters_per_epoch=cfg.TRAIN.NUM_ITERS_PER_EPOCH,
        debug_freq=cfg.DEBUG_FREQ,
    ).fit()
def main():
    """Train the face-alignment network (validation disabled in this variant).

    Builds the training dataloader, optionally restores weights from
    ``latest.pth``, trains for the configured number of epochs, and saves a
    per-epoch checkpoint plus the final model state.
    """
    args = parse_args()

    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    dataset_type = get_dataset(config)
    train_dataset = dataset_type(config, is_train=True)

    # Reuse the dataset built above instead of constructing a second,
    # identical one (the original instantiated the dataset twice).
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS)

    model = models.get_face_alignment_net(config)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    model.to("cuda")

    # loss -- 'size_average' was removed from torch.nn losses;
    # reduction='mean' is the equivalent of size_average=True.
    criterion = torch.nn.MSELoss(reduction='mean').cuda()

    optimizer = utils.get_optimizer(config, model)

    best_nme = 100
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'latest.pth')
        if os.path.isfile(model_state_file):
            # Plain state-dict checkpoint (no epoch/optimizer state stored).
            with open(model_state_file, "rb") as fp:
                state_dict = torch.load(fp)
            model.load_state_dict(state_dict)
            last_epoch = 1
            print("=> loaded checkpoint (epoch {})"
                  .format(last_epoch))
        else:
            print("=> no checkpoint found")

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10,
                                                   gamma=0.5)

    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        function.train(config, train_loader, model, criterion,
                       optimizer, epoch, writer_dict)

        # Step AFTER the epoch's optimizer updates (PyTorch >= 1.1 ordering);
        # stepping at the top of the loop effectively skipped the initial LR.
        lr_scheduler.step()

        # Validation is intentionally disabled in this variant; every epoch
        # is treated as "best" and checkpointed unconditionally.
        nme = 0
        is_best = True
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        torch.save(model.state_dict(),
                   os.path.join(final_output_dir,
                                'mse_relu_checkpoint_{}.pth'.format(epoch)))

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main():
    """Train a face-alignment model on a single GPU, with optional resume.

    Sets up logging/cuDNN, builds the model, loss, optimizer and dataloaders,
    optionally restores ``latest.pth``, then trains with per-epoch validation
    and best-NME checkpointing.
    """
    args = parse_args()

    # set logger and dir
    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.experiment_name, 'train')
    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # set cudnn.  Fix: the original wrote 'cudnn.determinstic' (typo), which
    # silently created a new attribute and left cuDNN determinism unset.
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # Only single-GPU training is supported for now; TODO: add multi-GPU support.
    # set model and loss and criterion
    model = models.get_face_alignment_net(config)
    model = model.cuda(config.GPUS[0])

    # 'size_average' was removed from torch.nn losses; reduction='mean'
    # is the equivalent of size_average=True.
    criterion = torch.nn.MSELoss(reduction='mean').cuda(config.GPUS[0])
    # criterion = AdaptiveWingLoss()

    optimizer = utils.get_optimizer(config, model)

    # get dataset
    dataset_type = get_dataset(config)

    # get dataloader
    train_loader = DataLoader(dataset=dataset_type(config, is_train=True),
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)
    val_loader = DataLoader(dataset=dataset_type(config, is_train=False),
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU,
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    # set lr_scheduler
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
            last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                       config.TRAIN.LR_STEP,
                                                       config.TRAIN.LR_FACTOR,
                                                       last_epoch - 1)

    # set training writer
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Fix: best_nme must be initialised BEFORE resume; the original set it to
    # 10000 after loading the checkpoint, clobbering the restored value.
    best_nme = 10000

    # set training resume function
    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'latest.pth')
        # Fix: islink() only matched symlinks, so resume silently failed for
        # regular checkpoint files; isfile() accepts both.
        if os.path.isfile(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            # NOTE(review): the scheduler above was built from BEGIN_EPOCH and
            # is not rebuilt with the restored epoch -- confirm intended.
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")

    # starting training
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        # training
        function.train(config, train_loader, model, criterion,
                       optimizer, epoch, writer_dict)

        # Step AFTER the optimizer updates (PyTorch >= 1.1 ordering);
        # stepping at the top of the loop skipped the initial LR.
        lr_scheduler.step()

        # evaluating
        nme, predictions = function.validate(config, val_loader, model,
                                             criterion, epoch, writer_dict)

        # saving
        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {
                "state_dict": model,
                "epoch": epoch + 1,
                "best_nme": best_nme,
                "optimizer": optimizer.state_dict(),
            }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main():
    """Train an integral-regression 3D pose network and keep the best model.

    Supports resuming from either a full checkpoint (with epoch/optimizer
    state) or a bare state-dict pointed to by ``config.MODEL.RESUME``.
    """
    best_perf = 0.0
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir = create_logger(config, args.cfg, 'train')
    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = models.pose3d_resnet.get_pose_net(config, is_train=True)

    # copy model file
    shutil.copy2(args.cfg, final_output_dir)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) and optimizer
    loss_fn = eval('loss.' + config.LOSS.FN)
    criterion = loss_fn(num_joints=config.MODEL.NUM_JOINTS,
                        norm=config.LOSS.NORM).cuda()

    # define training, validation and evaluation routines
    train = train_integral
    validate = validate_integral
    evaluate = eval_integral

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Resume from a trained model.
    # Fix: the original tested "config.MODEL.RESUME is ''", an identity
    # comparison with implementation-defined result; compare by value.
    if config.MODEL.RESUME != '':
        checkpoint = torch.load(config.MODEL.RESUME)
        if 'epoch' in checkpoint:
            # Full checkpoint: restore epoch, metric and optimizer state too.
            config.TRAIN.BEGIN_EPOCH = checkpoint['epoch']
            best_perf = checkpoint['perf']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info('=> resume from pretrained model {}'.format(
                config.MODEL.RESUME))
        else:
            # Bare state-dict checkpoint.
            model.load_state_dict(checkpoint)
            logger.info('=> resume from pretrained model {}'.format(
                config.MODEL.RESUME))

    # Choose the dataset, either Human3.6M or mpii
    ds = eval('dataset.' + config.DATASET.DATASET)

    # Data loading code
    train_dataset = ds(cfg=config,
                       root=config.DATASET.ROOT,
                       image_set=config.DATASET.TRAIN_SET,
                       is_train=True)
    valid_dataset = ds(cfg=config,
                       root=config.DATASET.ROOT,
                       image_set=config.DATASET.TEST_SET,
                       is_train=False)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch)

        # Step AFTER training (PyTorch >= 1.1 ordering); stepping at the top
        # of the loop skipped the initial learning rate.
        lr_scheduler.step()

        # evaluate on validation set
        preds_in_patch_with_score = validate(valid_loader, model)
        acc = evaluate(epoch, preds_in_patch_with_score, valid_loader,
                       final_output_dir, debug=config.DEBUG.DEBUG)

        # Fix: "== 'h36m' or 'mpii_3dhp' or 'jta'" was always true because a
        # non-empty string is truthy; test membership instead.  For the 3D
        # datasets a LOWER error is better, so invert it into a score.
        if config.DATASET.DATASET in ('h36m', 'mpii_3dhp', 'jta'):
            perf_indicator = 500. - acc
        else:
            perf_indicator = acc

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
def main():
    """Train a CRNN text recognizer with CTC loss, checkpointing every epoch."""
    # load config
    config = parse_arg()

    # create output folder
    output_dict = utils.create_log_folder(config, phase='train')

    # cudnn
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # writer dict
    writer_dict = {
        'writer': SummaryWriter(log_dir=output_dict['tb_dir']),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # construct face related neural networks
    model = crnn.get_crnn(config)

    # get device
    if torch.cuda.is_available():
        device = torch.device("cuda:{}".format(config.GPUID))
    else:
        device = torch.device("cpu:0")
    model = model.to(device)

    # define loss function
    criterion = torch.nn.CTCLoss()

    optimizer = utils.get_optimizer(config, model)

    last_epoch = config.TRAIN.BEGIN_EPOCH
    if config.TRAIN.RESUME.IS_RESUME:
        model_state_file = config.TRAIN.RESUME.FILE
        # Fix: the original fell through and called torch.load('') after
        # printing "no checkpoint found", crashing on an empty path.
        if model_state_file == '':
            print(" => no checkpoint found")
        else:
            checkpoint = torch.load(model_state_file, map_location='cpu')
            model.load_state_dict(checkpoint['state_dict'])
            last_epoch = checkpoint['epoch']

    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1
        )
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1
        )

    train_dataset = get_dataset(config)(config, is_train=True)
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=config.PIN_MEMORY,
    )

    val_dataset = get_dataset(config)(config, is_train=False)
    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=config.TEST.BATCH_SIZE_PER_GPU,
        shuffle=config.TEST.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=config.PIN_MEMORY,
    )

    best_acc = 0.5
    converter = utils.strLabelConverter(config.DATASET.ALPHABETS)
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        function.train(config, train_loader, train_dataset, converter, model,
                       criterion, optimizer, device, epoch, writer_dict,
                       output_dict)
        lr_scheduler.step()

        acc = function.validate(config, val_loader, val_dataset, converter,
                                model, criterion, device, epoch, writer_dict,
                                output_dict)

        is_best = acc > best_acc
        best_acc = max(acc, best_acc)

        print("is best:", is_best)
        print("best acc is:", best_acc)
        # save checkpoint
        torch.save(
            {
                "state_dict": model.state_dict(),
                "epoch": epoch + 1,
                "best_acc": best_acc,
            },
            os.path.join(output_dict['chs_dir'],
                         "checkpoint_{}_acc_{:.4f}.pth".format(epoch, acc))
        )

    writer_dict['writer'].close()
def main():
    """Train the network with periodic validation and best-model checkpointing."""
    # set all the configurations
    args = parse_args()
    update_config(cfg, args)

    # set the logger, tb_log_dir means tensorboard logdir
    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')
    logger.info(pprint.pformat(args))
    logger.info(cfg)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    # build up model
    model = get_net(cfg)
    model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()

    # Data loading
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, True, transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, False, transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    # define loss function (criterion) and optimizer
    criterion = get_loss(cfg).cuda()
    optimizer = get_optimizer(cfg, model)

    # load checkpoint model
    best_perf = 0.0
    best_model = False
    last_epoch = -1
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
        last_epoch=last_epoch)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')
    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    # Fix: perf_indicator was referenced by save_checkpoint even on epochs
    # where validation was skipped (VAL_FREQ > 1), raising NameError; carry
    # the last known value instead.
    perf_indicator = best_perf

    # training
    for epoch in range(begin_epoch + 1, cfg.TRAIN.END_EPOCH + 1):

        # train for one epoch
        train(cfg, train_loader, model, criterion, optimizer, epoch,
              writer_dict)

        lr_scheduler.step()

        # evaluate on validation set.
        # Fix: the final-epoch test used END_EPOCH + 1, which range() never
        # produces, so the last epoch could go unvalidated; use END_EPOCH.
        if epoch % cfg.TRAIN.VAL_FREQ == 0 or epoch == cfg.TRAIN.END_EPOCH:
            perf_indicator = validate(cfg, valid_loader, valid_dataset, model,
                                      criterion, final_output_dir, tb_log_dir,
                                      writer_dict)

            if perf_indicator >= best_perf:
                best_perf = perf_indicator
                best_model = True
            else:
                best_model = False

        # save checkpoint model and best model
        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    # save final model
    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main(cfg):
    """Fine-tune a REFINER on top of a pretrained VIBE generator (no motion
    discriminator)."""
    # Seed every RNG we rely on when reproducibility is requested.
    if cfg.SEED_VALUE >= 0:
        print(f'Seed value for the experiment {cfg.SEED_VALUE}')
        os.environ['PYTHONHASHSEED'] = str(cfg.SEED_VALUE)
        random.seed(cfg.SEED_VALUE)
        torch.manual_seed(cfg.SEED_VALUE)
        np.random.seed(cfg.SEED_VALUE)

    logger = create_logger(cfg.LOGDIR, phase='train')
    logger.info(f'GPU name -> {torch.cuda.get_device_name()}')
    logger.info(f'GPU feat -> {torch.cuda.get_device_properties("cuda")}')
    logger.info(pprint.pformat(cfg))

    # cuDNN flags come straight from the config.
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    writer = SummaryWriter(log_dir=cfg.LOGDIR)
    writer.add_text('config', pprint.pformat(cfg), 0)

    # ========= Dataloaders ========= #
    data_loaders = get_data_loaders(cfg)

    # ========= Compile Loss ========= #
    criterion = VIBELoss(
        e_loss_weight=cfg.LOSS.KP_2D_W,
        e_3d_loss_weight=cfg.LOSS.KP_3D_W,
        e_pose_loss_weight=cfg.LOSS.POSE_W,
        e_shape_loss_weight=cfg.LOSS.SHAPE_W,
        d_motion_loss_weight=cfg.LOSS.D_MOTION_LOSS_W,
    )

    # ========= Build the pretrained VIBE backbone ========= #
    backbone = VIBE(
        n_layers=cfg.MODEL.TGRU.NUM_LAYERS,
        batch_size=cfg.TRAIN.BATCH_SIZE,
        seqlen=cfg.DATASET.SEQLEN,
        hidden_size=cfg.MODEL.TGRU.HIDDEN_SIZE,
        pretrained=cfg.TRAIN.PRETRAINED_REGRESSOR,
        add_linear=cfg.MODEL.TGRU.ADD_LINEAR,
        bidirectional=cfg.MODEL.TGRU.BIDIRECTIONAL,
        use_residual=cfg.MODEL.TGRU.RESIDUAL,
    ).to(cfg.DEVICE)

    pretrained_path = cfg.TRAIN.PRETRAINED
    if pretrained_path != '' and os.path.isfile(pretrained_path):
        checkpoint = torch.load(pretrained_path)
        best_performance = checkpoint['performance']
        backbone.load_state_dict(checkpoint['gen_state_dict'])
        print(f'==> Loaded pretrained model from {cfg.TRAIN.PRETRAINED}...')
        print(f'Performance on 3DPW test set {best_performance}')
    else:
        print(f'{cfg.TRAIN.PRETRAINED} is not a pretrained model!!!!')

    # The refiner wraps the (frozen-or-not) VIBE backbone.
    generator = REFINER(vibe=backbone).to(cfg.DEVICE)

    gen_optimizer = get_optimizer(
        model=generator,
        optim_type=cfg.TRAIN.GEN_OPTIM,
        lr=cfg.TRAIN.GEN_LR,
        weight_decay=cfg.TRAIN.GEN_WD,
        momentum=cfg.TRAIN.GEN_MOMENTUM,
    )

    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        gen_optimizer,
        mode='min',
        factor=0.1,
        patience=cfg.TRAIN.LR_PATIENCE,
        verbose=True,
    )

    # The motion discriminator is disabled for refiner training.
    motion_discriminator = None
    dis_motion_optimizer = None
    motion_lr_scheduler = None

    # ========= Start Training ========= #
    Trainer(
        data_loaders=data_loaders,
        generator=generator,
        motion_discriminator=motion_discriminator,
        criterion=criterion,
        dis_motion_optimizer=dis_motion_optimizer,
        dis_motion_update_steps=cfg.TRAIN.MOT_DISCR.UPDATE_STEPS,
        gen_optimizer=gen_optimizer,
        start_epoch=cfg.TRAIN.START_EPOCH,
        end_epoch=cfg.TRAIN.END_EPOCH,
        device=cfg.DEVICE,
        writer=writer,
        debug=cfg.DEBUG,
        logdir=cfg.LOGDIR,
        lr_scheduler=lr_scheduler,
        motion_lr_scheduler=motion_lr_scheduler,
        resume=cfg.TRAIN.RESUME,
        num_iters_per_epoch=cfg.TRAIN.NUM_ITERS_PER_EPOCH,
        debug_freq=cfg.DEBUG_FREQ,
    ).fit()
def main():
    """Train a face-alignment model, dump per-epoch diff arrays, and export
    the best-epoch predictions back into per-image annotation text files."""
    args = parse_args()

    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # Fix: 'cudnn.determinstic' (typo) silently created a new attribute and
    # left cuDNN determinism unconfigured.
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    gpus = list(config.GPUS)
    dataset_type = get_dataset(config)

    train_data = dataset_type(config, split="train")
    train_loader = DataLoader(dataset=train_data,
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU *
                              len(gpus),
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)

    val_data = dataset_type(config, split="valid")
    val_loader = DataLoader(dataset=val_data,
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU *
                            len(gpus),
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    model = models.get_face_alignment_net(config)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    model = nn.DataParallel(model, device_ids=gpus).cuda()

    # loss -- 'size_average' was removed from torch.nn losses;
    # reduction='mean' is the equivalent of size_average=True.
    criterion = torch.nn.MSELoss(reduction='mean').cuda()

    optimizer = utils.get_optimizer(config, model)

    best_nme = 100
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
            last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                       config.TRAIN.LR_STEP,
                                                       config.TRAIN.LR_FACTOR,
                                                       last_epoch - 1)

    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'final.pth')
        # Fix: islink() only matched symlinks, so resume silently failed for
        # regular checkpoint files; isfile() accepts both.
        if os.path.isfile(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            # NOTE(review): the scheduler above was built from BEGIN_EPOCH and
            # is not rebuilt with the restored epoch -- confirm intended.
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")

    # Per-epoch training losses, written to CSV after the loop.  Renamed from
    # 'loss' to avoid shadowing and confusion with the criterion.
    train_losses = []
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        losses, diff = function.train(config, train_loader, model, criterion,
                                      optimizer, epoch, writer_dict)
        train_losses.append(losses)
        lr_scheduler.step()

        np.save(
            os.path.join(final_output_dir,
                         "train_diff@epoch{}".format(epoch)), diff)

        # evaluate
        nme, predictions, diff = function.validate(config, val_loader, model,
                                                   criterion, epoch,
                                                   writer_dict)
        np.save(
            os.path.join(final_output_dir,
                         "valid_diff@epoch{}".format(epoch)), diff)

        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {
                "state_dict": model,
                "epoch": epoch + 1,
                "best_nme": best_nme,
                "optimizer": optimizer.state_dict(),
            }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))

        # On the best epoch so far, write the predicted landmarks back next to
        # the original annotation files (undoing the dataset scaling factor).
        if is_best:
            for i in range(len(predictions)):
                afile = val_data.annotation_files[i]
                new_afile = '{}.{}.txt'.format(
                    afile, os.path.basename(args.cfg).split('.')[0])
                with open(new_afile, 'wt') as f:
                    pts = predictions[i].cpu().numpy()
                    for j in range(len(pts)):
                        f.write("{},{}\n".format(
                            pts[j][1] / val_data.factor[1],
                            pts[j][0] / val_data.factor[0]))

    pd.DataFrame(data=train_losses).to_csv('loss2.csv')

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()