def main():
    parser = argparse.ArgumentParser(description='Train keypoints network')
    # general
    parser.add_argument('--cfg', help='experiment configure file name',
                        required=True, type=str)
    args, rest = parser.parse_known_args()

    # init config
    config = Configuration(args.cfg)
    logger = create_logger(config)

    # TODO: task 1: finish create_data_loader
    train_loader, train_set = create_data_loader(config, training=True)
    test_loader = create_data_loader(config, training=False)

    model = create_model(config)
    # TODO: load pre-trained model provided by PyTorch
    model.load(config.MODEL.PRETRAINED, is_pretrained=True)

    loss_fn = create_loss_fn(config)
    optimizer, scheduler = create_optim_and_scheduler(config, model)

    for i in range(config.TRAIN.N_DAYS):
        if i == 0:
            train(config, train_loader, test_loader, model, loss_fn,
                  optimizer, scheduler, logger, day=i)
        else:
            new_data = None  # TODO: set up new data by yourself.
            # TODO: task 2: finish select new data for online-learning
            new_data_selected = online_data_selection(model, new_data,
                                                      config.TRAIN.CONFIDENCE)
            train_loader, _ = create_data_loader(config, dataset=train_set,
                                                 extra_data=new_data_selected,
                                                 training=True)
            train(config, train_loader, test_loader, model, loss_fn,
                  optimizer, scheduler, logger, day=i)
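# A minimal sketch of the confidence-based selection that the "task 2" TODO
# above asks for. Everything here is an assumption: it presumes the model
# exposes a per-sample confidence through its forward pass and that
# `new_data` is an iterable of dict-like samples with an 'image' tensor;
# adapt it to the real interfaces.
import torch

def online_data_selection(model, new_data, confidence_threshold):
    selected = []
    model.eval()
    with torch.no_grad():
        for sample in new_data:
            output = model(sample['image'].unsqueeze(0))
            # keep only samples the current model is confident about
            # (a hypothetical pseudo-labeling criterion)
            if output.max().item() >= confidence_threshold:
                selected.append(sample)
    return selected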
def main():
    # load config
    config = parse_arg()

    # create output folder
    output_dict = utils.create_log_folder(config, phase='train')

    # cudnn
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # writer dict
    writer_dict = {
        'writer': SummaryWriter(log_dir=output_dict['tb_dir']),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # construct the recognition network
    model = crnn.get_crnn(config)

    # get device
    if torch.cuda.is_available():
        device = torch.device("cuda:{}".format(config.GPUID))
    else:
        device = torch.device("cpu")
    model = model.to(device)

    # define loss function
    criterion = torch.nn.CTCLoss()

    optimizer = utils.get_optimizer(config, model)

    last_epoch = config.TRAIN.BEGIN_EPOCH
    if config.TRAIN.RESUME.IS_RESUME:
        model_state_file = config.TRAIN.RESUME.FILE
        if model_state_file == '':
            print(" => no checkpoint found")
        else:
            # only load when a path was actually given
            checkpoint = torch.load(model_state_file, map_location='cpu')
            model.load_state_dict(checkpoint['state_dict'])
            last_epoch = checkpoint['epoch']

    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1
        )
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1
        )

    train_dataset = get_dataset(config)(config, is_train=True)
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=config.PIN_MEMORY,
    )

    val_dataset = get_dataset(config)(config, is_train=False)
    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=config.TEST.BATCH_SIZE_PER_GPU,
        shuffle=config.TEST.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=config.PIN_MEMORY,
    )

    best_acc = 0.5
    converter = utils.strLabelConverter(config.DATASET.ALPHABETS)
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        function.train(config, train_loader, train_dataset, converter, model,
                       criterion, optimizer, device, epoch, writer_dict,
                       output_dict)
        lr_scheduler.step()

        acc = function.validate(config, val_loader, val_dataset, converter,
                                model, criterion, device, epoch, writer_dict,
                                output_dict)

        is_best = acc > best_acc
        best_acc = max(acc, best_acc)
        print("is best:", is_best)
        print("best acc is:", best_acc)

        # save checkpoint
        torch.save(
            {
                "state_dict": model.state_dict(),
                "epoch": epoch + 1,
                "best_acc": best_acc,
            },
            os.path.join(output_dict['chs_dir'],
                         "checkpoint_{}_acc_{:.4f}.pth".format(epoch, acc))
        )

    writer_dict['writer'].close()
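# Hedged illustration of how a CRNN forward pass typically feeds
# torch.nn.CTCLoss. The shapes are the standard PyTorch contract
# (log-softmaxed (T, N, C) log-probs, targets as class indices, per-sample
# lengths); the variable names are assumptions, not taken from
# function.train above.
import torch

ctc = torch.nn.CTCLoss(blank=0)
T, N, C = 32, 4, 37  # time steps, batch size, alphabet size incl. blank
log_probs = torch.randn(T, N, C).log_softmax(2)            # model output
targets = torch.randint(1, C, (N, 10), dtype=torch.long)   # label indices
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 10, dtype=torch.long)
loss = ctc(log_probs, targets, input_lengths, target_lengths)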
def main():
    args = parse_args()

    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    gpus = list(config.GPUS)
    dataset_type = get_dataset(config)

    train_data = dataset_type(config, is_train=True)
    train_loader = DataLoader(dataset=train_data,
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU * len(gpus),
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)
    val_loader = DataLoader(dataset=dataset_type(config, is_train=False),
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU * len(gpus),
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    # config.MODEL.NUM_JOINTS = train_data.get_num_points()
    model = models.get_face_alignment_net(config)

    # copy model files
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    model = nn.DataParallel(model, device_ids=gpus).cuda()

    # loss (size_average is deprecated; reduction='mean' is equivalent)
    criterion = torch.nn.MSELoss(reduction='mean').cuda()

    optimizer = utils.get_optimizer(config, model)
    best_nme = 100
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)

    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'latest.pth')
        if os.path.islink(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")

    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        function.train(config, train_loader, model, criterion,
                       optimizer, epoch, writer_dict)
        # step the schedule after the epoch's optimizer updates
        lr_scheduler.step()

        # evaluate
        nme, predictions = function.validate(config, val_loader, model,
                                             criterion, epoch, writer_dict)

        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {
                # save the weights, not the full module, so the resume
                # path above can restore them with load_state_dict
                "state_dict": model.state_dict(),
                "epoch": epoch + 1,
                "best_nme": best_nme,
                "optimizer": optimizer.state_dict(),
            }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
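# `utils.save_checkpoint` is defined elsewhere in the repo. A hedged sketch
# consistent with the call sites and with the resume path above (which
# checks os.path.islink('latest.pth'), so the helper presumably maintains a
# 'latest.pth' symlink); the body below is an assumption, not the repo's
# actual implementation.
import os
import torch

def save_checkpoint(states, predictions, is_best, output_dir, filename):
    torch.save(states, os.path.join(output_dir, filename))
    torch.save(predictions, os.path.join(output_dir, 'current_pred.pth'))
    latest = os.path.join(output_dir, 'latest.pth')
    if os.path.islink(latest):
        os.remove(latest)
    os.symlink(os.path.join(output_dir, filename), latest)
    if is_best and 'state_dict' in states:
        torch.save(states['state_dict'],
                   os.path.join(output_dir, 'model_best.pth'))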
def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_seg_net')(
        config, is_train=True
    )

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
        'vis_global_steps': 0,
    }

    # dump_input = torch.rand((config.TRAIN.BATCH_SIZE,
    #                          3,
    #                          config.MODEL.IMAGE_SIZE[1],
    #                          config.MODEL.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    optimizer = get_optimizer(config, model)

    # Data loading code
    if 'xception' in config.MODEL.NAME:
        # Xception uses a different mean/std for the input image
        normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                         std=[0.5, 0.5, 0.5])
    else:
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

    train_augs = aug.Compose([aug.RandomScale(0.5, 2.0),
                              aug.RandomHorizontallyFlip(0.5),
                              aug.RandomSizedCrop(config.MODEL.IMAGE_SIZE)])
    test_augs = None

    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TRAIN_SET,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
        augmentations=train_augs
    )
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TEST_SET,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
        augmentations=test_augs
    )

    # define loss function (criterion) and optimizer
    criterion = CrossEntropy2D(ignore_index=255,
                               weight=train_dataset.class_weights).cuda()

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True,
        # PyTorch's DataParallel cannot handle 0 images on any of the GPUs
        drop_last=True if len(gpus) > 2 else False
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True
    )

    if config.TRAIN.LR_SCHEDULER == 'multistep':
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR
        )
    elif config.TRAIN.LR_SCHEDULER == 'poly':
        max_iter = config.TRAIN.END_EPOCH * len(train_loader)
        lr_scheduler = PolynomialLR(optimizer, max_iter=max_iter, decay_iter=1)
    elif config.TRAIN.LR_SCHEDULER == 'none':
        lr_scheduler = None
    else:
        raise ValueError(
            'Scheduler {} not supported'.format(config.TRAIN.LR_SCHEDULER))

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        if config.TRAIN.LR_SCHEDULER == 'multistep':
            lr_scheduler.step()

        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, lr_scheduler,
              epoch, final_output_dir, tb_log_dir, writer_dict)

        if (epoch + 1) % config.TRAIN.EVAL_INTERVAL == 0:
            if not config.MODEL.LEARN_GAMMA:
                if float(lr_scheduler.last_epoch) / \
                        (lr_scheduler.max_iter * config.TRAIN.NE_ITER_RATIO) <= 1:
                    gamma = (config.TRAIN.NE_GAMMA_U - config.TRAIN.NE_GAMMA_L) * \
                        (1 - float(lr_scheduler.last_epoch) /
                         (lr_scheduler.max_iter * config.TRAIN.NE_ITER_RATIO)) ** \
                        config.TRAIN.NE_GAMMA_EXP + config.TRAIN.NE_GAMMA_L
                else:
                    gamma = config.TRAIN.NE_GAMMA_L
            else:
                gamma = None

            # evaluate on validation set
            perf_indicator = validate(config, valid_loader, valid_dataset,
                                      model, criterion, final_output_dir,
                                      tb_log_dir, writer_dict, gamma=gamma)

            if perf_indicator > best_perf:
                best_perf = perf_indicator
                best_model = True
            else:
                best_model = False

            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)
        else:
            perf_indicator = 0.0

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
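# `PolynomialLR` above is a project-local scheduler. A hedged sketch of the
# usual poly policy, lr = base_lr * (1 - iter / max_iter) ** power, written
# against torch's _LRScheduler; the `power` default and the `decay_iter`
# semantics are assumptions. (Recent PyTorch also ships a built-in
# torch.optim.lr_scheduler.PolynomialLR with a similar policy.)
from torch.optim.lr_scheduler import _LRScheduler

class PolynomialLR(_LRScheduler):
    def __init__(self, optimizer, max_iter, decay_iter=1,
                 power=0.9, last_epoch=-1):
        self.max_iter = max_iter
        self.decay_iter = decay_iter
        self.power = power
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # only decay every `decay_iter` steps, and freeze past max_iter
        if self.last_epoch % self.decay_iter or self.last_epoch >= self.max_iter:
            return [group['lr'] for group in self.optimizer.param_groups]
        factor = (1 - self.last_epoch / float(self.max_iter)) ** self.power
        return [base_lr * factor for base_lr in self.base_lrs]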
def main():
    args = parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(args.gpu)
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('{}.get_pose_net'.format(args.model))(config, is_train=True)

    # report model complexity on the configured input size
    model.eval()
    params = count_parameters_in_MB(model)
    logger.info("Params = %.2fMB" % params)
    mult_adds = comp_multadds(model,
                              input_size=(3,
                                          config.MODEL.IMAGE_SIZE[1],
                                          config.MODEL.IMAGE_SIZE[0]))
    logger.info("Mult-Adds = %.2fMB" % mult_adds)
    model.train()

    model = model.cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir)
        # advance the schedule once per epoch (in the original it trailed
        # the final save, so it would have run only once)
        lr_scheduler.step()

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.state_dict(), final_model_state_file)
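# `count_parameters_in_MB` is a project helper. A common definition, on the
# assumption that "MB" here means millions of parameters (as in DARTS-style
# repos, which also exclude auxiliary heads), not megabytes:
def count_parameters_in_MB(model):
    return sum(p.numel() for name, p in model.named_parameters()
               if 'auxiliary' not in name) / 1e6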
def main():
    # load config
    config = parse_arg()

    # create output folder
    output_dict = utils.create_log_folder(config, phase='train')

    # cudnn
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # writer dict
    writer_dict = {
        'writer': SummaryWriter(log_dir=output_dict['tb_dir']),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # construct the recognition network
    model = crnn.get_crnn(config)
    # checkpoint = torch.load('/data/yolov5/CRNN_Chinese_Characters_Rec/output/OWN/crnn/2020-09-15-22-13/checkpoints/checkpoint_98_acc_1.0983.pth')
    # if 'state_dict' in checkpoint.keys():
    #     model.load_state_dict(checkpoint['state_dict'])
    # else:
    #     model.load_state_dict(checkpoint)

    # get device
    if torch.cuda.is_available():
        device = torch.device("cuda:{}".format(config.GPUID))
    else:
        device = torch.device("cpu")
    model = model.to(device)

    # define loss function
    # criterion = torch.nn.CTCLoss()
    criterion = CTCLoss()

    last_epoch = config.TRAIN.BEGIN_EPOCH
    optimizer = utils.get_optimizer(config, model)
    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)

    if config.TRAIN.FINETUNE.IS_FINETUNE:
        # (sic: key name matches the config definition)
        model_state_file = config.TRAIN.FINETUNE.FINETUNE_CHECKPOINIT
        if model_state_file == '':
            print(" => no checkpoint found")
        else:
            checkpoint = torch.load(model_state_file, map_location='cpu')
            if 'state_dict' in checkpoint.keys():
                checkpoint = checkpoint['state_dict']

            # keep only the CNN weights, stripping the 'cnn.' prefix
            from collections import OrderedDict
            model_dict = OrderedDict()
            for k, v in checkpoint.items():
                if 'cnn' in k:
                    model_dict[k[4:]] = v
            model.cnn.load_state_dict(model_dict)
            if config.TRAIN.FINETUNE.FREEZE:
                for p in model.cnn.parameters():
                    p.requires_grad = False

    elif config.TRAIN.RESUME.IS_RESUME:
        model_state_file = config.TRAIN.RESUME.FILE
        if model_state_file == '':
            print(" => no checkpoint found")
        else:
            checkpoint = torch.load(model_state_file, map_location='cpu')
            if 'state_dict' in checkpoint.keys():
                model.load_state_dict(checkpoint['state_dict'])
                last_epoch = checkpoint['epoch']
                # optimizer.load_state_dict(checkpoint['optimizer'])
                # lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            else:
                model.load_state_dict(checkpoint)

    model_info(model)

    train_dataset = get_dataset(config)(config, is_train=True)
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=config.PIN_MEMORY,
    )

    val_dataset = get_dataset(config)(config, is_train=False)
    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=config.TEST.BATCH_SIZE_PER_GPU,
        shuffle=config.TEST.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=config.PIN_MEMORY,
    )

    best_acc = 0.5
    converter = utils.strLabelConverter(config.DATASET.ALPHABETS)
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        function.train(config, train_loader, train_dataset, converter, model,
                       criterion, optimizer, device, epoch, writer_dict,
                       output_dict)
        lr_scheduler.step()

        acc = function.validate(config, val_loader, val_dataset, converter,
                                model, criterion, device, epoch, writer_dict,
                                output_dict)

        is_best = acc > best_acc
        best_acc = max(acc, best_acc)
        print("is best:", is_best)
        print("best acc is:", best_acc)

        # save checkpoint
        torch.save(
            {
                "state_dict": model.state_dict(),
                "epoch": epoch + 1,
                # "optimizer": optimizer.state_dict(),
                # "lr_scheduler": lr_scheduler.state_dict(),
                "best_acc": best_acc,
            },
            os.path.join(output_dict['chs_dir'],
                         "checkpoint_{}_acc_{:.4f}.pth".format(epoch, acc)))

    writer_dict['writer'].close()
def main():
    args = parse_args()
    update_config(cfg, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    model = eval('models.' + cfg.MODEL.NAME + '.get_pose_net')(
        cfg, is_train=True)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
        final_output_dir)
    # logger.info(pprint.pformat(model))

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (1, 3, cfg.MODEL.IMAGE_SIZE[1], cfg.MODEL.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, (dump_input,))
    logger.info(get_model_summary(model, dump_input))

    model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT).cuda()

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_SET, True,
        transforms.Compose([transforms.ToTensor(), normalize]))
    valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, False,
        transforms.Compose([transforms.ToTensor(), normalize]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    best_perf = 0.0
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')

    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
        last_epoch=last_epoch)

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        # train for one epoch
        train(cfg, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)
        lr_scheduler.step()

        # evaluate on validation set
        perf_indicator = validate(cfg, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
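# Hedged aside on the resume logic above: the scheduler is constructed after
# the optimizer state is restored, with last_epoch set to the checkpointed
# epoch so the milestone schedule continues where it stopped. A tiny
# self-contained illustration of MultiStepLR's decay (values arbitrary):
import torch

params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.SGD(params, lr=0.1)
sched = torch.optim.lr_scheduler.MultiStepLR(opt, milestones=[2, 4],
                                             gamma=0.1)
for _ in range(5):
    opt.step()    # optimizer update first, then the schedule
    sched.step()
print(opt.param_groups[0]['lr'])  # 0.001 after passing both milestones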
def main():
    # set all the configurations
    args = parse_args()
    update_config(cfg, args)

    # set the logger; tb_log_dir is the tensorboard log dir
    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    # build up the model
    model = get_net(cfg)
    model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()

    # Data loading
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    # define loss function (criterion) and optimizer
    criterion = get_loss(cfg).cuda()
    optimizer = get_optimizer(cfg, model)

    # load checkpoint model
    best_perf = 0.0
    best_model = False
    last_epoch = -1
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
        last_epoch=last_epoch)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')
    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    # training
    for epoch in range(begin_epoch + 1, cfg.TRAIN.END_EPOCH + 1):
        # train for one epoch
        train(cfg, train_loader, model, criterion, optimizer, epoch,
              writer_dict)
        lr_scheduler.step()

        # evaluate on validation set (the original `== END_EPOCH + 1` test
        # could never fire inside this loop; validate on the final epoch)
        if epoch % cfg.TRAIN.VAL_FREQ == 0 or epoch == cfg.TRAIN.END_EPOCH:
            perf_indicator = validate(cfg, valid_loader, valid_dataset,
                                      model, criterion, final_output_dir,
                                      tb_log_dir, writer_dict)
            if perf_indicator >= best_perf:
                best_perf = perf_indicator
                best_model = True
            else:
                best_model = False

            # save checkpoint model and best model
            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint(
                {
                    'epoch': epoch,
                    'model': cfg.MODEL.NAME,
                    'state_dict': model.state_dict(),
                    'best_state_dict': model.module.state_dict(),
                    'perf': perf_indicator,
                    'optimizer': optimizer.state_dict(),
                }, best_model, final_output_dir)

    # save final model
    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(config, 'train')
    logger.info(pprint.pformat(config))

    # CuDNN
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # HRNet model
    mv_hrnet = get_pose_net(config, is_train=True)
    # pose_hrnet = get_pose_net(config, is_train=True)  # Pose estimation model
    # pose_hrnet.load_state_dict(torch.load(config.NETWORK.PRETRAINED), strict=False)  # Pretrained weight loading
    # mv_hrnet = get_multiview_pose_net(pose_hrnet, config)  # Multiview adopting
    # depth_hrnet = get_pose_net(config, is_train=True)  # 2.5D depth prediction model

    # Multi-GPU setting
    gpus = [int(i) for i in config.GPUS.split(',')]
    mv_hrnet = torch.nn.DataParallel(mv_hrnet, device_ids=gpus).cuda()
    logger.info('=> init data parallel model')

    # Loss
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    logger.info('=> init criterion')

    # Optimizer
    optimizer = get_optimizer(config, mv_hrnet)
    logger.info('=> init {} optimizer'.format(config.TRAIN.OPTIMIZER))

    # Loading checkpoint
    start_epoch = config.TRAIN.BEGIN_EPOCH
    if config.TRAIN.RESUME:
        start_epoch, mv_hrnet, optimizer = load_checkpoint(
            mv_hrnet, optimizer, final_output_dir)

    # Scheduler
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)
    logger.info('=> init scheduler')

    # Summary
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Data loader
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    logger.info('=> loading train dataset')
    train_dataset = H36MDataset(
        config, config.DATASET.TRAIN_SUBSET, True,
        transforms.Compose([transforms.ToTensor(), normalize]))
    # train_dataset = MultiViewH36M(config, config.DATASET.TRAIN_SUBSET, True,
    #                               transforms.Compose([transforms.ToTensor(), normalize]))
    logger.info('=> loading validation dataset')
    valid_dataset = H36MDataset(
        config, config.DATASET.TEST_SUBSET, False,
        transforms.Compose([transforms.ToTensor(), normalize]))

    logger.info('=> loading train dataloader')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    logger.info('=> loading valid dataloader')
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    # Training loop
    best_perf = 0.0
    best_model = False
    for epoch in range(start_epoch, config.TRAIN.END_EPOCH):
        # Trainer
        train(config, train_loader, mv_hrnet, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)
        # step the schedule once the epoch's updates are done
        lr_scheduler.step()

        # Performance indicator
        perf_indicator = validate(config, valid_loader, valid_dataset,
                                  mv_hrnet, criterion, final_output_dir,
                                  tb_log_dir, writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': mv_hrnet.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    # End
    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(mv_hrnet.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
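# `load_checkpoint` above is a project helper; a plausible hedged sketch
# that matches the call site (returns start_epoch, model, optimizer). The
# 'checkpoint.pth.tar' filename and the DataParallel-wrapped `model.module`
# access are assumptions:
import os
import torch

def load_checkpoint(model, optimizer, output_dir,
                    filename='checkpoint.pth.tar'):
    path = os.path.join(output_dir, filename)
    if os.path.isfile(path):
        ckpt = torch.load(path)
        model.module.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        return ckpt['epoch'], model, optimizer
    # nothing to resume from: start at epoch 0 with untouched states
    return 0, model, optimizer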
def main():
    args = parse_args()

    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn.benchmark = config.CUDNN.BENCHMARK
    # cudnn.deterministic = config.CUDNN.DETERMINISTIC
    # cudnn.enabled = config.CUDNN.ENABLED

    # if isinstance(config.TRAIN.LR_STEP, list):
    #     lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    #         optimizer, config.TRAIN.LR_STEP,
    #         # config.TRAIN.LR_FACTOR, last_epoch-1
    #         config.TRAIN.LR_FACTOR, 0
    #     )
    # else:
    #     lr_scheduler = torch.optim.lr_scheduler.StepLR(
    #         optimizer, config.TRAIN.LR_STEP,
    #         # config.TRAIN.LR_FACTOR, last_epoch-1
    #         config.TRAIN.LR_FACTOR, 0
    #     )

    dataset_type = get_dataset(config)
    train_dataset = dataset_type(config, is_train=True)
    # train_dataset[0]
    # return 0

    train_loader = DataLoader(
        dataset=dataset_type(config, is_train=True),
        # batch_size=config.TRAIN.BATCH_SIZE_PER_GPU*len(gpus),
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS)

    # val_loader = DataLoader(
    #     dataset=dataset_type(config, is_train=True),
    #     # batch_size=config.TEST.BATCH_SIZE_PER_GPU*len(gpus),
    #     batch_size=config.TEST.BATCH_SIZE_PER_GPU,
    #     shuffle=False,
    #     num_workers=config.WORKERS,
    #     # pin_memory=config.PIN_MEMORY
    # )

    model = models.get_face_alignment_net(config)

    # copy model files
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    gpus = list(config.GPUS)
    # model = nn.DataParallel(model, device_ids=gpus).cuda()
    model.to("cuda")

    # loss (size_average is deprecated; reduction='mean' is equivalent)
    criterion = torch.nn.MSELoss(reduction='mean').cuda()
    # criterion = fnn.mse_loss
    # criterion = WingLoss()
    # criterion = Loss_weighted()

    optimizer = utils.get_optimizer(config, model)

    best_nme = 100
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'latest.pth')
        if os.path.isfile(model_state_file):
            with open(model_state_file, "rb") as fp:
                state_dict = torch.load(fp)
            model.load_state_dict(state_dict)
            last_epoch = 1
            # checkpoint = torch.load(model_state_file)
            # last_epoch = checkpoint['epoch']
            # best_nme = checkpoint['best_nme']
            # model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(last_epoch))
        else:
            print("=> no checkpoint found")

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10,
                                                   gamma=0.5)

    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        function.train(config, train_loader, model, criterion,
                       optimizer, epoch, writer_dict)
        # step after the epoch's optimizer updates (the old step-first
        # ordering skips the initial learning rate)
        lr_scheduler.step()

        # evaluate
        nme = 0
        # nme, predictions = function.validate(config, val_loader, model,
        #                                      criterion, epoch, writer_dict)

        is_best = True
        # is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        torch.save(model.state_dict(),
                   os.path.join(final_output_dir,
                                'mse_relu_checkpoint_{}.pth'.format(epoch)))
        # utils.save_checkpoint(
        #     {"state_dict": model,
        #      "epoch": epoch + 1,
        #      "best_nme": best_nme,
        #      "optimizer": optimizer.state_dict(),
        #      }, predictions, is_best, final_output_dir,
        #     'checkpoint_{}.pth'.format(epoch))

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
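# `WingLoss` is referenced (commented out) above. A hedged sketch of the
# wing loss from Feng et al. (2018); the `w` and `epsilon` defaults follow
# the paper, and the mean reduction is an assumption:
import math
import torch
import torch.nn as nn

class WingLoss(nn.Module):
    def __init__(self, w=10.0, epsilon=2.0):
        super().__init__()
        self.w = w
        self.epsilon = epsilon
        # constant that joins the log and linear pieces continuously
        self.C = w - w * math.log(1.0 + w / epsilon)

    def forward(self, pred, target):
        x = (pred - target).abs()
        # logarithmic near zero, linear for large errors
        loss = torch.where(x < self.w,
                           self.w * torch.log(1.0 + x / self.epsilon),
                           x - self.C)
        return loss.mean()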
def main():
    args = parse_args()

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(config)

    writer_dict = {
        'writer': SummaryWriter(tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # cudnn related setting
    # cudnn.benchmark = config.CUDNN.BENCHMARK
    # cudnn.deterministic = config.CUDNN.DETERMINISTIC
    # cudnn.enabled = config.CUDNN.ENABLED
    gpus = list(config.GPUS)

    # build model
    model = get_seg_model(config)

    dump_input = torch.rand(
        (1, 3, config.TRAIN.IMAGE_SIZE[1], config.TRAIN.IMAGE_SIZE[0]))
    logger.info(get_model_summary(model.cuda(), dump_input.cuda()))

    # copy model file
    this_dir = os.path.dirname(__file__)
    models_dst_dir = os.path.join(final_output_dir, 'models')
    if os.path.exists(models_dst_dir):
        shutil.rmtree(models_dst_dir)
    shutil.copytree(os.path.join(this_dir, '../lib/models'), models_dst_dir)

    # prepare data
    crop_size = (config.TRAIN.IMAGE_SIZE[1], config.TRAIN.IMAGE_SIZE[0])
    trfm = A.Compose([
        A.Resize(crop_size[0], crop_size[1]),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.OneOf([
            A.RandomGamma(),
            A.RandomBrightnessContrast(),
            A.ColorJitter(brightness=0.07, contrast=0.07,
                          saturation=0.1, hue=0.1,
                          always_apply=False, p=0.3),
        ], p=0.3),
        A.OneOf([
            A.ElasticTransform(
                alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03),
            A.GridDistortion(),
            A.OpticalDistortion(distort_limit=2, shift_limit=0.5),
        ], p=0.0),
        A.ShiftScaleRotate(),
    ])
    ds = HubDataset(config.DATA_PATH, window=config.WINDOW,
                    overlap=config.MIN_OVERLAP, transform=trfm)

    # hold out slide 7 for validation
    valid_idx, train_idx = [], []
    for i in range(len(ds)):
        if ds.slices[i][0] == 7:
            valid_idx.append(i)
        else:
            train_idx.append(i)
    train_dataset = torch.utils.data.Subset(ds, train_idx)
    test_dataset = torch.utils.data.Subset(ds, valid_idx)

    # define training and validation data loaders
    trainloader = DataLoader(train_dataset,
                             batch_size=config.TRAIN.BATCH_SIZE_PER_GPU * len(gpus),
                             shuffle=True,
                             num_workers=config.WORKERS,
                             drop_last=True)
    testloader = DataLoader(test_dataset,
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU * len(gpus),
                            shuffle=False,
                            num_workers=config.WORKERS,
                            drop_last=False)

    """
    train_dataset = eval('datasets.'+config.DATASET.DATASET)(
        root=config.DATASET.ROOT,
        list_path=config.DATASET.TRAIN_SET,
        num_samples=None,
        num_classes=config.DATASET.NUM_CLASSES,
        multi_scale=config.TRAIN.MULTI_SCALE,
        flip=config.TRAIN.FLIP,
        ignore_label=config.TRAIN.IGNORE_LABEL,
        base_size=config.TRAIN.BASE_SIZE,
        crop_size=crop_size,
        downsample_rate=config.TRAIN.DOWNSAMPLERATE,
        scale_factor=config.TRAIN.SCALE_FACTOR)

    trainloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU*len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True,
        drop_last=True)

    if config.DATASET.EXTRA_TRAIN_SET:
        extra_train_dataset = eval('datasets.'+config.DATASET.DATASET)(
            root=config.DATASET.ROOT,
            list_path=config.DATASET.EXTRA_TRAIN_SET,
            num_samples=None,
            num_classes=config.DATASET.NUM_CLASSES,
            multi_scale=config.TRAIN.MULTI_SCALE,
            flip=config.TRAIN.FLIP,
            ignore_label=config.TRAIN.IGNORE_LABEL,
            base_size=config.TRAIN.BASE_SIZE,
            crop_size=crop_size,
            downsample_rate=config.TRAIN.DOWNSAMPLERATE,
            scale_factor=config.TRAIN.SCALE_FACTOR)

        extra_trainloader = torch.utils.data.DataLoader(
            extra_train_dataset,
            batch_size=config.TRAIN.BATCH_SIZE_PER_GPU*len(gpus),
            shuffle=config.TRAIN.SHUFFLE,
            num_workers=config.WORKERS,
            pin_memory=True,
            drop_last=True)

    test_size = (config.TEST.IMAGE_SIZE[1], config.TEST.IMAGE_SIZE[0])
    test_dataset = eval('datasets.'+config.DATASET.DATASET)(
        root=config.DATASET.ROOT,
        list_path=config.DATASET.TEST_SET,
        num_samples=config.TEST.NUM_SAMPLES,
        num_classes=config.DATASET.NUM_CLASSES,
        multi_scale=False,
        flip=False,
        ignore_label=config.TRAIN.IGNORE_LABEL,
        base_size=config.TEST.BASE_SIZE,
        crop_size=test_size,
        downsample_rate=1)

    testloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.TEST.BATCH_SIZE_PER_GPU*len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)
    """

    # criterion
    if config.LOSS.USE_OHEM:
        criterion = OhemCrossEntropy(ignore_label=config.TRAIN.IGNORE_LABEL,
                                     thres=config.LOSS.OHEMTHRES,
                                     min_kept=config.LOSS.OHEMKEEP,
                                     weight=train_dataset.class_weights)
    else:
        # criterion = CrossEntropy(ignore_label=config.TRAIN.IGNORE_LABEL,
        #                          weight=train_dataset.class_weights)
        criterion = list()
        criterion.append(nn.BCEWithLogitsLoss())
        criterion.append(SoftDiceLoss())

    model = FullModel(model, criterion).cuda()
    # model = nn.DataParallel(model, device_ids=gpus).cuda()

    # optimizer
    if config.TRAIN.OPTIMIZER == 'sgd':
        optimizer = torch.optim.SGD(
            [{'params': filter(lambda p: p.requires_grad,
                               model.parameters()),
              'lr': config.TRAIN.LR}],
            lr=config.TRAIN.LR,
            momentum=config.TRAIN.MOMENTUM,
            weight_decay=config.TRAIN.WD,
            nesterov=config.TRAIN.NESTEROV,
        )
    else:
        raise ValueError('Only Support SGD optimizer')

    # np.int is deprecated; plain int is equivalent here
    epoch_iters = int(len(train_dataset) /
                      config.TRAIN.BATCH_SIZE_PER_GPU / len(gpus))

    best_mIoU = 0
    last_epoch = 0
    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir,
                                        'checkpoint.pth.tar')
        if os.path.isfile(model_state_file):
            checkpoint = torch.load(model_state_file)
            best_mIoU = checkpoint['best_mIoU']
            last_epoch = checkpoint['epoch']
            model.module.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))

    start = timeit.default_timer()
    end_epoch = config.TRAIN.END_EPOCH + config.TRAIN.EXTRA_EPOCH
    num_iters = config.TRAIN.END_EPOCH * epoch_iters
    extra_iters = config.TRAIN.EXTRA_EPOCH * epoch_iters

    for epoch in range(last_epoch, end_epoch):
        if epoch >= config.TRAIN.END_EPOCH:
            train(config, epoch - config.TRAIN.END_EPOCH,
                  config.TRAIN.EXTRA_EPOCH, epoch_iters,
                  config.TRAIN.EXTRA_LR, extra_iters,
                  extra_trainloader, optimizer, model, writer_dict)
        else:
            train(config, epoch, config.TRAIN.END_EPOCH,
                  epoch_iters, config.TRAIN.LR, num_iters,
                  trainloader, optimizer, model, writer_dict)

        logger.info('=> saving checkpoint to {}'.format(
            os.path.join(final_output_dir, 'checkpoint.pth.tar')))
        torch.save(
            {
                'epoch': epoch + 1,
                'best_mIoU': best_mIoU,
                'state_dict': model.module.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, os.path.join(final_output_dir, 'checkpoint.pth.tar'))

        valid_loss, mean_IoU, IoU_array = validate(config, testloader,
                                                   model, writer_dict)
        if mean_IoU > best_mIoU:
            best_mIoU = mean_IoU
            torch.save(model.module.state_dict(),
                       os.path.join(final_output_dir, 'best.pth'))
        msg = 'Loss: {:.3f}, MeanIU: {: 4.4f}, Best_mIoU: {: 4.4f}'.format(
            valid_loss, mean_IoU, best_mIoU)
        logging.info(msg)
        logging.info(IoU_array)

    torch.save(model.module.state_dict(),
               os.path.join(final_output_dir, 'final_state.pth'))

    writer_dict['writer'].close()
    end = timeit.default_timer()
    logger.info('Hours: %d' % int((end - start) / 3600))
    logger.info('Done')
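# `SoftDiceLoss` above comes from the project. A standard hedged sketch of
# soft Dice computed on logits; the smoothing term and the mean reduction
# are assumptions:
import torch
import torch.nn as nn

class SoftDiceLoss(nn.Module):
    def __init__(self, smooth=1.0):
        super().__init__()
        self.smooth = smooth

    def forward(self, logits, targets):
        probs = torch.sigmoid(logits)
        probs = probs.view(probs.size(0), -1)
        targets = targets.view(targets.size(0), -1)
        inter = (probs * targets).sum(dim=1)
        union = probs.sum(dim=1) + targets.sum(dim=1)
        # Dice = 2|X ∩ Y| / (|X| + |Y|); loss is its complement
        dice = (2.0 * inter + self.smooth) / (union + self.smooth)
        return 1.0 - dice.mean()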
def main():
    args = parse_args()

    # set logger and dir
    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.experiment_name, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # set cudnn
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # only single-GPU training is supported for now; TODO: add multi-GPU support
    # set model, loss, and criterion
    model = models.get_face_alignment_net(config)
    model = model.cuda(config.GPUS[0])
    # size_average is deprecated; reduction='mean' is equivalent
    criterion = torch.nn.MSELoss(reduction='mean').cuda(config.GPUS[0])
    # criterion = AdaptiveWingLoss()
    optimizer = utils.get_optimizer(config, model)

    # get dataset
    dataset_type = get_dataset(config)

    # get dataloader
    train_loader = DataLoader(dataset=dataset_type(config, is_train=True),
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)
    val_loader = DataLoader(dataset=dataset_type(config, is_train=False),
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU,
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    # set lr_scheduler
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)

    # set training writer
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # set training resume function; initialize best_nme *before* resuming
    # so a checkpointed value is not overwritten
    best_nme = 10000
    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'latest.pth')
        if os.path.islink(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")

    # start training
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        # training
        function.train(config, train_loader, model, criterion,
                       optimizer, epoch, writer_dict)
        lr_scheduler.step()

        # evaluating
        nme, predictions = function.validate(config, val_loader, model,
                                             criterion, epoch, writer_dict)

        # saving
        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {
                "state_dict": model.state_dict(),
                "epoch": epoch + 1,
                "best_nme": best_nme,
                "optimizer": optimizer.state_dict(),
            }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main():
    args = parse_args()

    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    gpus = list(config.GPUS)
    dataset_type = get_dataset(config)

    train_data = dataset_type(config, split="train")
    train_loader = DataLoader(dataset=train_data,
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU * len(gpus),
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)
    val_data = dataset_type(config, split="valid")
    val_loader = DataLoader(dataset=val_data,
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU * len(gpus),
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    # config.MODEL.NUM_JOINTS = train_data.get_num_points()
    model = models.get_face_alignment_net(config)

    # copy model files
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    model = nn.DataParallel(model, device_ids=gpus).cuda()

    # loss (size_average is deprecated; reduction='mean' is equivalent)
    criterion = torch.nn.MSELoss(reduction='mean').cuda()

    optimizer = utils.get_optimizer(config, model)
    best_nme = 100
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, config.TRAIN.LR_STEP,
            config.TRAIN.LR_FACTOR, last_epoch - 1)

    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'final.pth')
        if os.path.islink(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")

    loss = []
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        losses, diff = function.train(config, train_loader, model, criterion,
                                      optimizer, epoch, writer_dict)
        loss.append(losses)
        lr_scheduler.step()
        np.save(
            os.path.join(final_output_dir,
                         "train_diff@epoch{}".format(epoch)), diff)

        # evaluate
        nme, predictions, diff = function.validate(config, val_loader, model,
                                                   criterion, epoch,
                                                   writer_dict)
        np.save(
            os.path.join(final_output_dir,
                         "valid_diff@epoch{}".format(epoch)), diff)

        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {
                # save the weights rather than the module so the resume
                # path above can load them with load_state_dict
                "state_dict": model.state_dict(),
                "epoch": epoch + 1,
                "best_nme": best_nme,
                "optimizer": optimizer.state_dict(),
            }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))

        # write the best predictions back next to the annotation files
        if is_best:
            for i in range(len(predictions)):
                afile = val_data.annotation_files[i]
                new_afile = '{}.{}.txt'.format(
                    afile, os.path.basename(args.cfg).split('.')[0])
                with open(new_afile, 'wt') as f:
                    pts = predictions[i].cpu().numpy()
                    for j in range(len(pts)):
                        f.write("{},{}\n".format(
                            pts[j][1] / val_data.factor[1],
                            pts[j][0] / val_data.factor[0]))

    pd.DataFrame(data=loss).to_csv('loss2.csv')

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
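# Several functions above report an NME (normalized mean error), the usual
# face-alignment metric: mean per-landmark L2 error divided by a normalizing
# distance such as the inter-ocular distance. A hedged reference
# implementation; the eye landmark indices are dataset-specific assumptions
# (36/45 follows the 68-point convention):
import numpy as np

def compute_nme(preds, targets, left_eye_idx=36, right_eye_idx=45):
    # preds, targets: (N, L, 2) arrays of landmark coordinates
    norm = np.linalg.norm(
        targets[:, left_eye_idx] - targets[:, right_eye_idx], axis=1)  # (N,)
    per_image = np.linalg.norm(preds - targets, axis=2).mean(axis=1)   # (N,)
    return (per_image / norm).mean()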