def main():
    """Entry point: train a single-view pose network and validate each epoch.

    Reads CLI args and the global ``config``, builds the model named by
    ``config.MODEL.NAME`` (resolved via ``eval`` over the ``models``
    package), trains for the configured epoch range, checkpoints every
    epoch, and writes the final weights plus TensorBoard logs.
    """
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    # NOTE(review): eval() on a config-derived string — fine for trusted
    # configs, but do not feed untrusted input through MODEL.NAME.
    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True)

    # Optionally warm-start from a full-model checkpoint given on the CLI.
    if args.model:
        logger.info('=> loading model from {}'.format(args.model))
        checkpoint = torch.load(args.model)
        #from collections import OrderedDict
        #new_state_dict = OrderedDict()
        #for k,v in checkpoint.items():
        #    k='haha'+k
        #    new_state_dict[k]=v
        #model.load_state_dict(new_state_dict)
        model.load_state_dict(checkpoint)

    # copy model file (snapshot the architecture source next to the outputs)
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Dummy forward input only used to trace the graph for TensorBoard.
    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code — ImageNet normalization statistics.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    # Per-GPU batch size is scaled by the number of GPUs for DataParallel.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    print(len(train_loader))
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)
    print(len(valid_loader))

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # NOTE(review): scheduler stepped before the optimizer each epoch —
        # PyTorch >= 1.1 expects step() after training; kept as-is here.
        lr_scheduler.step()

        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict,
              data_flag='posetrack')

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        # Track whether this epoch beat the best validation score so far.
        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False
        logger.info('Now epoch is {}, now best model is {}'.format(
            epoch + 1, best_model))
        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        # NOTE(review): saves the DataParallel-wrapped state_dict (keys have
        # a 'module.' prefix), while the final save below uses model.module —
        # the two artifacts have different key layouts; confirm loaders cope.
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main():
    """Entry point: train a multiview pose network on TotalCapture-style data.

    Builds the backbone named by ``config.BACKBONE_MODEL``, wraps it in the
    multiview fusion network, optionally resumes from a checkpoint, then
    trains/validates per epoch, checkpointing every epoch.

    Fix: ``ckpt_perf`` was previously only bound inside the
    ``config.TRAIN.RESUME`` branch, so a fresh (non-resume) run crashed with
    ``NameError`` at ``best_perf = ckpt_perf``. It now defaults to 0.0.
    """
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    # print code version info (git hash / branch / uncommitted diff)
    repo = Repo('')
    repo_git = repo.git
    working_tree_diff_head = repo_git.diff('HEAD')
    this_commit_hash = repo.commit()
    cur_branches = repo_git.branch('--list')
    logger.info('Current Code Version is {}'.format(this_commit_hash))
    logger.info('Current Branch Info :\n{}'.format(cur_branches))
    logger.info(
        'Working Tree diff with HEAD: \n{}'.format(working_tree_diff_head))

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related settings
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    backbone_model = eval('models.' + config.BACKBONE_MODEL +
                          '.get_pose_net')(config, is_train=True)
    model = models.multiview_pose_net.get_multiview_pose_net(
        backbone_model, config)
    # logger.info(pprint.pformat(model))

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # dump_input = torch.rand(
    #     (config.TRAIN.BATCH_SIZE, 3,
    #      config.NETWORK.NUM_JOINTS,
    #      config.NETWORK.IMAGE_SIZE[1], config.NETWORK.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, dump_input)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    # criterion_fuse = JointsMSELoss(use_target_weight=True).cuda()

    optimizer = get_optimizer(config, model)
    start_epoch = config.TRAIN.BEGIN_EPOCH
    # BUGFIX: give ckpt_perf a default so a non-resume run does not raise
    # NameError at `best_perf = ckpt_perf` below.
    ckpt_perf = 0.0
    if config.TRAIN.RESUME:
        start_epoch, model, optimizer, ckpt_perf = load_checkpoint(
            model, optimizer, final_output_dir)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading — ImageNet normalization statistics.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.TRAIN_DATASET)(
        config, config.DATASET.TRAIN_SUBSET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.TEST_DATASET)(
        config, config.DATASET.TEST_SUBSET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        collate_fn=totalcapture_collate,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        collate_fn=totalcapture_collate,
        pin_memory=True)

    best_perf = ckpt_perf  # resume keeps the previous best score
    best_epoch = -1
    best_model = False
    for epoch in range(start_epoch, config.TRAIN.END_EPOCH):
        lr_scheduler.step()
        extra_param = dict()
        # extra_param['loss2'] = criterion_fuse

        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, writer_dict, **extra_param)

        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, writer_dict,
                                  **extra_param)

        logger.info(
            '=> perf indicator at epoch {} is {}. old best is {} '.format(
                epoch, perf_indicator, best_perf))
        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
            best_epoch = epoch
            logger.info(
                '====> find new best model at end of epoch {}. (start from 0)'.
                format(epoch))
        else:
            best_model = False
        logger.info(
            'epoch of best validation results is {}'.format(best_epoch))
        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

        # save final state at every epoch
        final_model_state_file = os.path.join(
            final_output_dir, 'final_state_ep{}.pth.tar'.format(epoch))
        logger.info(
            'saving final model state to {}'.format(final_model_state_file))
        torch.save(model.module.state_dict(), final_model_state_file)

    writer_dict['writer'].close()
def main():
    """Entry point: train a pose network with the MetaBatch bookkeeping hooks.

    Same shape as the standard training script, but each epoch constructs a
    ``MetaData_Container`` (sized by dataset length / batch size / total
    epochs) that is passed into ``train`` and dumped to CSV afterwards.
    """
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True)

    # Log parameter count in millions.
    logger.info(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))

    # copy model file (snapshot the architecture source next to the outputs)
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Dummy forward input only used to trace the graph for TensorBoard.
    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    # NOTE(review): DataParallel without explicit device_ids — uses all
    # visible GPUs; batch sizes below are NOT scaled by GPU count here.
    model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code — ImageNet normalization statistics.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train for one epoch
        """metabatch: args: 1.dataset_num 2.batchsize 3.total_epoch"""
        ##################### METABATCH #####################################
        dataset_num = len(train_dataset)
        batch_size = config.TRAIN.BATCH_SIZE
        total_epoch = config.TRAIN.END_EPOCH
        logger.info('dataset_size={}, batchsize = {} ,total_epoch = {}'.format(
            dataset_num, batch_size, total_epoch))
        # NOTE(review): a fresh container is built every epoch — presumably
        # intentional (per-epoch statistics); confirm against train().
        SEU_YS = MetaData_Container(dataset_num, batch_size, total_epoch)
        #########################################################################
        train(config, SEU_YS, train_loader, model, criterion, optimizer,
              epoch, final_output_dir, tb_log_dir, writer_dict)
        SEU_YS.Output_CSV_Table()  # dump the table to a CSV file and print it each epoch
        #########################################################################

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def validate(config, val_loader, val_dataset, model, criterion, output_dir,
             tb_log_dir, writer_dict=None):
    """Run full validation: per-batch loss/accuracy plus dataset evaluation.

    Collects predicted joint coordinates (and confidences) for every sample,
    optionally averaging with a horizontally flipped forward pass
    (``config.TEST.FLIP_TEST``), then calls ``val_dataset.evaluate`` to get
    the benchmark metric.

    Returns the dataset's performance indicator (scalar used for
    best-model selection by the caller).
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

    num_samples = len(val_dataset)
    # (x, y, confidence) per joint per sample.
    all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3),
                         dtype=np.float32)
    # (center_x, center_y, scale_x, scale_y, area, detection_score).
    all_boxes = np.zeros((num_samples, 6))
    image_path = []
    filenames = []
    imgnums = []
    idx = 0
    with torch.no_grad():
        end = time.time()
        for i, (input, target, target_weight, meta) in enumerate(val_loader):
            # compute output
            input = input.cuda()
            output = model(input)
            if config.TEST.FLIP_TEST:
                # this part is ugly, because pytorch has not supported negative index
                # input_flipped = model(input[:, :, :, ::-1])
                input_flipped = np.flip(input.cpu().numpy(), 3).copy()
                input_flipped = torch.from_numpy(input_flipped).cuda()
                output_flipped = model(input_flipped)
                # Swap left/right joint channels back after the flip.
                output_flipped = flip_back(output_flipped.cpu().numpy(),
                                           val_dataset.flip_pairs)
                output_flipped = torch.from_numpy(
                    output_flipped.copy()).cuda()

                # feature is not aligned, shift flipped heatmap for higher accuracy
                if config.TEST.SHIFT_HEATMAP:
                    output_flipped[:, :, :, 1:] = \
                        output_flipped.clone()[:, :, :, 0:-1]
                    # output_flipped[:, :, :, 0] = 0

                # Average the normal and flipped heatmaps.
                output = (output + output_flipped) * 0.5

            target = target.cuda(non_blocking=True)
            target_weight = target_weight.cuda(non_blocking=True)

            loss = criterion(output, target, target_weight)

            num_images = input.size(0)
            # measure accuracy and record loss
            losses.update(loss.item(), num_images)
            _, avg_acc, cnt, pred = accuracy(output.cpu().numpy(),
                                             target.cpu().numpy())

            acc.update(avg_acc, cnt)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            c = meta['center'].numpy()
            s = meta['scale'].numpy()
            score = meta['score'].numpy()

            # Map heatmap-space argmax back to original image coordinates.
            preds, maxvals = get_final_preds(
                config, output.clone().cpu().numpy(), c, s)

            all_preds[idx:idx + num_images, :, 0:2] = preds[:, :, 0:2]
            all_preds[idx:idx + num_images, :, 2:3] = maxvals
            # double check this all_boxes parts
            all_boxes[idx:idx + num_images, 0:2] = c[:, 0:2]
            all_boxes[idx:idx + num_images, 2:4] = s[:, 0:2]
            all_boxes[idx:idx + num_images, 4] = np.prod(s * 200, 1)
            all_boxes[idx:idx + num_images, 5] = score
            image_path.extend(meta['image'])
            if config.DATASET.DATASET == 'posetrack':
                filenames.extend(meta['filename'])
                imgnums.extend(meta['imgnum'].numpy())

            idx += num_images

            if i % config.PRINT_FREQ == 0:
                msg = 'Test: [{0}/{1}]\t' \
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                      'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                          i, len(val_loader), batch_time=batch_time,
                          loss=losses, acc=acc)
                logger.info(msg)

                prefix = '{}_{}'.format(os.path.join(output_dir, 'val'), i)
                # pred * 4: heatmaps are presumably 1/4 of the input
                # resolution — TODO confirm against the model config.
                save_debug_images(config, input, meta, target, pred * 4,
                                  output, prefix, target_weight)

        name_values, perf_indicator = val_dataset.evaluate(
            config, all_preds, output_dir, all_boxes, image_path, filenames,
            imgnums)

        _, full_arch_name = get_model_name(config)
        if isinstance(name_values, list):
            for name_value in name_values:
                _print_name_value(name_value, full_arch_name)
        else:
            _print_name_value(name_values, full_arch_name)

        if writer_dict:
            writer = writer_dict['writer']
            global_steps = writer_dict['valid_global_steps']
            writer.add_scalar('valid_loss', losses.avg, global_steps)
            writer.add_scalar('valid_acc', acc.avg, global_steps)
            if isinstance(name_values, list):
                for name_value in name_values:
                    writer.add_scalars('valid', dict(name_value),
                                       global_steps)
            else:
                writer.add_scalars('valid', dict(name_values), global_steps)
            writer_dict['valid_global_steps'] = global_steps + 1

    return perf_indicator
def validate(config, val_loader, val_dataset, model, criterion, output_dir,
             tb_log_dir, writer_dict=None):
    """Debug variant of validate(): inspects a single hard-coded image.

    Skips every batch whose ``meta['image_id']`` is not 10003420000, and for
    the matching sample re-reads the raw image from disk, re-applies the
    affine crop + normalization by hand, and prints the predicted keypoints.
    Intended for interactive debugging, not for benchmark evaluation.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

    num_samples = len(val_dataset)
    all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3),
                         dtype=np.float32)
    all_boxes = np.zeros((num_samples, 6))
    image_path = []
    filenames = []
    imgnums = []
    idx = 0
    with torch.no_grad():
        end = time.time()
        for i, (input_, target, target_weight, meta) in enumerate(val_loader):
            # compute output
            # NOTE(review): comparing a tensor to an int yields a tensor;
            # this only behaves as a scalar test when the batch size is 1 —
            # confirm TEST.BATCH_SIZE == 1 for this debug path.
            if meta['image_id'] != 10003420000:
                continue
            # Rebuild the network input from the raw image on disk instead of
            # using the loader-provided `input_`.
            root = config.DATASET.ROOT
            file_name = index_to_path(root, meta['image_id'][0].item())
            data_numpy = cv2.imread(
                file_name, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
            c_dt = meta['center'][0].numpy()
            s_dt = meta['scale'][0].numpy()
            r = 0
            trans = get_affine_transform(c_dt, s_dt, r,
                                         config.MODEL.IMAGE_SIZE)
            input = cv2.warpAffine(data_numpy, trans,
                                   (int(config.MODEL.IMAGE_SIZE[0]),
                                    int(config.MODEL.IMAGE_SIZE[1])),
                                   flags=cv2.INTER_LINEAR)
            normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
            transform = transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ])
            input = transform(input)
            # print(type(input))
            # print(input.shape)
            # Add an explicit batch dimension of 1.
            new_input = np.zeros(
                [1, 3, config.MODEL.IMAGE_SIZE[1],
                 config.MODEL.IMAGE_SIZE[0]])
            new_input[0, :, :, :] = input[:, :, :]
            input = torch.from_numpy(new_input).float()
            # NOTE(review): `input` is left on CPU here while the flipped
            # branch below moves data to CUDA — confirm where `model` lives.
            output = model(input)
            if config.TEST.FLIP_TEST:
                # this part is ugly, because pytorch has not supported negative index
                # input_flipped = model(input[:, :, :, ::-1])
                input_flipped = np.flip(input.cpu().numpy(), 3).copy()
                input_flipped = torch.from_numpy(input_flipped).cuda()
                output_flipped = model(input_flipped)
                output_flipped = flip_back(output_flipped.cpu().numpy(),
                                           val_dataset.flip_pairs)
                output_flipped = torch.from_numpy(
                    output_flipped.copy()).cuda()

                # feature is not aligned, shift flipped heatmap for higher accuracy
                if config.TEST.SHIFT_HEATMAP:
                    output_flipped[:, :, :, 1:] = \
                        output_flipped.clone()[:, :, :, 0:-1]
                    # output_flipped[:, :, :, 0] = 0

                output = (output + output_flipped) * 0.5

            target = target.cuda(non_blocking=True)
            target_weight = target_weight.cuda(non_blocking=True)

            loss = criterion(output, target, target_weight)

            num_images = input.size(0)
            # measure accuracy and record loss
            losses.update(loss.item(), num_images)
            _, avg_acc, cnt, pred = accuracy(output.cpu().numpy(),
                                             target.cpu().numpy())

            acc.update(avg_acc, cnt)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            c = meta['center'].numpy()
            s = meta['scale'].numpy()
            score = meta['score'].numpy()
            c_d = meta['center'].numpy()
            s_d = meta['scale'].numpy()

            preds, maxvals = get_final_preds(
                config, output.clone().cpu().numpy(), c_d, s_d)
            # Debug print of the keypoints for the inspected image.
            print('id--{},\nkpts:\n{}'.format(meta['image_id'], preds[0]))
            # time.sleep(10)

            all_preds[idx:idx + num_images, :, 0:2] = preds[:, :, 0:2]
            all_preds[idx:idx + num_images, :, 2:3] = maxvals
            # double check this all_boxes parts
            all_boxes[idx:idx + num_images, 0:2] = c[:, 0:2]
            all_boxes[idx:idx + num_images, 2:4] = s[:, 0:2]
            all_boxes[idx:idx + num_images, 4] = np.prod(s * 200, 1)
            all_boxes[idx:idx + num_images, 5] = score
            image_path.extend(meta['image'])
            # if config.DATASET.DATASET == 'posetrack':
            #     filenames.extend(meta['filename'])
            #     imgnums.extend(meta['imgnum'].numpy())

            idx += num_images

            if i % config.PRINT_FREQ == 0:
                msg = 'Test: [{0}/{1}]\t' \
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                      'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                          i, len(val_loader), batch_time=batch_time,
                          loss=losses, acc=acc)
                logger.info(msg)

                prefix = '{}_{}'.format(os.path.join(output_dir, 'val'), i)
                save_debug_images(config, input, meta, target, pred * 4,
                                  output, prefix)

        name_values, perf_indicator = val_dataset.evaluate(
            config, all_preds, output_dir, all_boxes, image_path, filenames,
            imgnums)

        _, full_arch_name = get_model_name(config)
        if isinstance(name_values, list):
            for name_value in name_values:
                _print_name_value(name_value, full_arch_name)
        else:
            _print_name_value(name_values, full_arch_name)

        if writer_dict:
            writer = writer_dict['writer']
            global_steps = writer_dict['valid_global_steps']
            writer.add_scalar('valid_loss', losses.avg, global_steps)
            writer.add_scalar('valid_acc', acc.avg, global_steps)
            if isinstance(name_values, list):
                for name_value in name_values:
                    writer.add_scalars('valid', dict(name_value),
                                       global_steps)
            else:
                writer.add_scalars('valid', dict(name_values), global_steps)
            writer_dict['valid_global_steps'] = global_steps + 1

    return perf_indicator
def main():
    """Entry point: train or evaluate the AdaFuse multiview pose network.

    ``args.runMode`` selects 'train' or 'test'. Only the view-weight-network
    parameters are optimized; the backbone is frozen and loaded from a
    pretrained file. In test mode the loop runs validation once and breaks.

    Fixes over the original:
    - ``ckpt_perf`` now defaults to 0.0, so a train run without
      ``config.TRAIN.RESUME`` no longer raises ``NameError`` at
      ``best_perf = ckpt_perf``.
    - The bare ``except:`` around the git introspection is narrowed to
      ``except Exception:`` (keeps the best-effort behavior without
      swallowing KeyboardInterrupt/SystemExit).
    - Removed the unused ``extra_param`` dict that was never passed on.
    """
    torch.set_printoptions(precision=2, sci_mode=False, linewidth=300)
    args = parse_args()
    reset_config(config, args)
    run_phase = args.runMode  # train or test
    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, run_phase)
    model_file = 'final_state_ep{}.pth.tar'.format(args.modelFile)

    # print code version info (best-effort; tolerate missing git repo)
    try:
        repo = Repo('')
        repo_git = repo.git
        working_tree_diff_head = repo_git.diff('HEAD')
        this_commit_hash = repo.commit()
        cur_branches = repo_git.branch('--list')
        logger.info('Current Code Version is {}'.format(this_commit_hash))
        logger.info('Current Branch Info :\n{}'.format(cur_branches))
        logger.info('Working Tree diff with HEAD: \n{}'.format(
            working_tree_diff_head))
    except Exception:
        logger.info('Git repo not initialized')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    backbone_model = eval('models.' + config.BACKBONE_MODEL +
                          '.get_pose_net')(config, is_train=True)
    model = models.adafuse_network.get_multiview_pose_net(
        backbone_model, config)

    writer_dict = {
        'writer': SummaryWriter(tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # load pretrained backbone
    # Note this backbone is already trained on current dataset
    pretrained_backbone_file = Path(
        config.DATA_DIR) / config.NETWORK.PRETRAINED
    if os.path.exists(pretrained_backbone_file):
        model.load_state_dict(torch.load(pretrained_backbone_file),
                              strict=False)

    if args.evaluate:
        # Evaluate a released AdaFuse checkpoint.
        run_phase = 'test'
        model_file_path = config.NETWORK.ADAFUSE
        model.load_state_dict(torch.load(model_file_path), strict=True)
        logger.info(
            '=> loading model from {} for evaluating'.format(model_file_path))
    elif run_phase == 'test':
        # Evaluate an epoch snapshot from this experiment's output dir.
        model_state_file = os.path.join(final_output_dir, model_file)
        logger.info('=> loading model from {}'.format(model_state_file))
        model.load_state_dict(torch.load(model_state_file), strict=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    criterion_mpjpe = JointMPJPELoss().cuda()

    # Freeze everything except the view-weight network.
    view_weight_params = []
    for name, param in model.named_parameters():
        if 'view_weight_net' in name:
            param.requires_grad = True
            view_weight_params.append(param)
        else:
            param.requires_grad = False
    optimizer = torch.optim.Adam(params=view_weight_params,
                                 lr=config.TRAIN.LR)

    start_epoch = config.TRAIN.BEGIN_EPOCH
    # BUGFIX: default so `best_perf = ckpt_perf` below is safe when the run
    # is not resuming from a checkpoint.
    ckpt_perf = 0.0
    if run_phase == 'train' and config.TRAIN.RESUME:
        start_epoch, model, optimizer, ckpt_perf = load_checkpoint(
            model, optimizer, final_output_dir)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code — ImageNet normalization statistics.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    if run_phase == 'train':
        train_dataset = eval('dataset.' + config.DATASET.TRAIN_DATASET)(
            config, config.DATASET.TRAIN_SUBSET, True,
            transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ]))
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
            shuffle=config.TRAIN.SHUFFLE,
            num_workers=config.WORKERS,
            collate_fn=adafuse_collate,
            pin_memory=True)

    valid_dataset = eval('dataset.' + config.DATASET.TEST_DATASET)(
        config, config.DATASET.TEST_SUBSET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        collate_fn=adafuse_collate,
        pin_memory=True)

    if run_phase == 'train':
        best_perf = ckpt_perf
        best_epoch = -1
        best_model = False
        perf_indicator = 0

    for epoch in range(start_epoch, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        if run_phase == 'train':
            params = {
                'config': config,
                'dataset': train_dataset,
                'loader': train_loader,
                'model': model,
                'criterion_mse': criterion,
                'criterion_mpjpe': criterion_mpjpe,
                'final_output_dir': final_output_dir,
                'tb_writer': writer_dict,
                'optimizer': optimizer,
                'epoch': epoch,
                'is_train': True,
                'save_heatmaps': False,
            }
            # train
            run_model(**params)

            # save checkpoint and model before validation
            if divmod(epoch + 1, 1)[1] == 0:
                # save checkpoint every x epoch
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'model': get_model_name(config),
                        'state_dict': model.module.state_dict(),
                        'perf': perf_indicator,
                        'optimizer': optimizer.state_dict(),
                    },
                    False,
                    final_output_dir,
                    filename='checkpoint_ep{}.pth.tar'.format(epoch))
                # save final state at every epoch
                final_model_state_file = os.path.join(
                    final_output_dir,
                    'final_state_ep{}.pth.tar'.format(epoch))
                logger.info('saving final model state to {}'.format(
                    final_model_state_file))
                torch.save(model.module.state_dict(),
                           final_model_state_file)

        valid_params = {
            'config': config,
            'dataset': valid_dataset,
            'loader': valid_loader,
            'model': model,
            'criterion_mse': criterion,
            'criterion_mpjpe': criterion_mpjpe,
            'final_output_dir': final_output_dir,
            'tb_writer': writer_dict,
            'optimizer': optimizer,
            'epoch': epoch,
            'is_train': False,
            'save_heatmaps': False,
        }
        perf_indicator = run_model(**valid_params)

        if run_phase == 'test':
            break  # if run mode is test, only run test one time is enough

        logger.info(
            '=> perf indicator at epoch {} is {}. old best is {} '.format(
                epoch, perf_indicator, best_perf))
        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
            best_epoch = epoch
            logger.info(
                '====> find new best model at end of epoch {}. (start from 0)'.
                format(epoch))
        else:
            best_model = False
        logger.info(
            'epoch of best validation results is {}'.format(best_epoch))
        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
    # --- End all epoch
    writer_dict['writer'].close()
def main():
    """Entry point: train a pose network (MPII-style) without validation.

    Builds the model in eval-construction mode (``is_train=False``), trains
    for the configured epoch range, and checkpoints every epoch. There is no
    validation pass, so ``best_model`` never becomes True.
    """
    args = parse_args()
    # args holds what the cfg file configures (learning rate, batch size, GPU settings)
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    # logger.info(pprint.pformat(args))
    # logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    # NOTE(review): is_train=False while training — presumably deliberate
    # (e.g. to skip pretrained-weight loading inside get_pose_net); confirm.
    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        # config, is_train=True
        config, is_train=False)

    # copy model file (snapshot the architecture source next to the outputs)
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(logdir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Dummy input for the (currently disabled) graph trace.
    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()  # use multiple GPUs
    # if torch.cuda.device_count() > 1:
    #     model = torch.nn.DataParallel(model, device_ids=[0,1])

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code — ImageNet normalization statistics.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(  # dataset.mpii
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        train_set=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))  # data augmentation?

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),  # with 1 GPU, batch size is 32
        # batch_size=config.TRAIN.BATCH_SIZE * 2,  # with 1 GPU, batch size is 32
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH,
                       config.TRAIN.END_EPOCH):  # train for the configured number of epochs
        lr_scheduler.step()

        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir)  # , writer_dict)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        if epoch % 1 == 0:
            # best_model is always False here (no validation), so
            # save_checkpoint never promotes a "best" snapshot.
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': get_model_name(config),
                    'state_dict': model.state_dict(),
                    #'perf': perf_indicator,
                    'optimizer': optimizer.state_dict(),
                }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    #
def train_normale(DANN, args, config):
    """Train a pose network, optionally with DANN domain adaptation.

    Parameters
    ----------
    DANN : bool
        When True, also builds painting / no-augmentation loaders and trains
        with ``train_dann`` (gradient-reversal domain classifier, fixed
        alpha); otherwise runs the plain ``train`` loop.
    args : argparse.Namespace
        CLI arguments (provides ``args.cfg`` for the logger).
    config : config object
        Global experiment configuration.
    """
    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True)

    # copy model file (snapshot the architecture source next to the outputs)
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Dummy forward input only used to trace the graph for TensorBoard.
    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    ################# ADDED #######################
    # Domain-classification loss for the DANN branch.
    criterion_dann = nn.CrossEntropyLoss()
    ########################################

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code — ImageNet normalization statistics.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    ################# ADDED #######################
    # Target-domain (paintings) and source-domain-without-augmentation
    # datasets used only by the DANN training path.
    if DANN == True:
        painting_dataset = eval('dataset.painting')(
            config, config.DATASET.ROOT, config.DATASET.PAINTING_SET, False,
            transforms.Compose([
                #transforms.Resize(256),
                #transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ]))
        train_no_aug_dataset = eval('dataset.painting')(
            config, config.DATASET.ROOT, config.DATASET.TRAIN_NO_AUG_SET,
            False,
            transforms.Compose([
                #transforms.Resize(256),
                #transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ]))
    ##################################

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    ################# ADDED #######################
    if DANN == True:
        painting_loader = torch.utils.data.DataLoader(
            painting_dataset,  #valid_dataset,
            batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
            shuffle=config.TRAIN.SHUFFLE,
            num_workers=config.WORKERS,
            pin_memory=True)
        #painting_loader=copy.deepcopy(valid_loader)
        train_no_aug_loader = torch.utils.data.DataLoader(
            train_no_aug_dataset,  #valid_dataset,
            batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
            shuffle=config.TRAIN.SHUFFLE,
            num_workers=config.WORKERS,
            pin_memory=True)
    ##################################

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # train for one epoch
        ################# ADDED #######################
        if DANN == True:
            # Fixed gradient-reversal strength for the domain classifier.
            alpha = 0.03
            print(f"Alpha = {alpha}")
            model = train_dann(config, train_loader, train_no_aug_loader,
                               painting_loader, model, criterion, optimizer,
                               epoch, final_output_dir, tb_log_dir,
                               writer_dict, alpha, criterion_dann)
        else:
            train(config, train_loader, model, criterion, optimizer, epoch,
                  final_output_dir, tb_log_dir, writer_dict)
        ##################################

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)
        lr_scheduler.step()

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main():
    """Train the second-deconv head on top of a pose network.

    Builds the base pose net plus a separate ``second_deconv`` module; only
    the second-deconv parameters get an optimizer/scheduler, so the base
    network is effectively fixed during this run. Checkpoints go into a
    timestamped sub-folder of the experiment output directory.
    """
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    # Per-run sub-folder named after the tensorboard timestamp so repeated
    # runs of the same config do not overwrite each other's checkpoints.
    time_stamp = tb_log_dir.split('_')[-1]
    new_folder = os.path.join(final_output_dir, time_stamp)
    os.makedirs(new_folder)

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_pose_net_second_deconv')(
        config, is_train=True
    ).cuda()
    second_deconv = eval('models.' + config.MODEL.NAME + '.get_second_deconv')(
        config
    ).cuda()

    # copy model file alongside the experiment output for reproducibility
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    gpus = [int(i) for i in config.GPUS.split(',')]

    # define loss function (criterion) and optimizer; only the second-deconv
    # parameters are optimized in this script.
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT,
        use_gain_loss=False
    ).cuda()
    second_deconv_optimizer = get_optimizer(config, second_deconv)
    second_deconv_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        second_deconv_optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR
    )

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    )
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True
    )

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # train for one epoch
        train(config, train_loader, model, second_deconv, criterion,
              second_deconv_optimizer, epoch, new_folder, tb_log_dir,
              writer_dict)

        # BUGFIX: step the scheduler AFTER the epoch's optimizer updates.
        # Stepping before train() (the original ordering) is the pre-1.1
        # PyTorch convention and effectively skips the first LR value.
        second_deconv_lr_scheduler.step()

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  second_deconv, criterion, new_folder,
                                  tb_log_dir, writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(new_folder))
        save_checkpoint({
            'epoch': epoch + 1,
            'model': get_model_name(config),
            'state_dict': second_deconv.state_dict(),
            'perf': perf_indicator,
            'optimizer': second_deconv_optimizer.state_dict(),
        }, best_model, new_folder)

    final_model_state_file = os.path.join(new_folder, 'final_state.pth.tar')
    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    # BUGFIX: ``model`` is never wrapped in DataParallel here (the wrap is
    # commented out upstream), so the original ``model.module.state_dict()``
    # raised AttributeError at the very end of training. Save the trained
    # second-deconv weights, consistent with the per-epoch checkpoints above.
    torch.save(second_deconv.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main():
    """Training entry point with resume-from-checkpoint support.

    Builds the pose network, optionally restores model weights / best-perf
    bookkeeping from ``config.MODEL.CHECKPOINT``, rebases the LR milestones
    relative to the resumed epoch, and runs the train/validate loop.
    """
    args = parse_args()
    reset_config(
        config, args)  # config has edict type and is imported from core.config
    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')
    print('final_output_dir: ', final_output_dir)
    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    # Model class is selected by name from the models package.
    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True)

    # copy model file into the output dir for reproducibility
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Dummy forward input used only to trace the graph into tensorboard.
    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    best_perf = 0.0
    best_model = False
    best_epoch = 0
    # for continue training from checkpoint
    if os.path.isfile(config.MODEL.CHECKPOINT):
        # NOTE(review): the checkpoint path is logged as the literal text
        # 'config.MODEL.CHECKPOINT' — presumably .format() was intended.
        logger.info(
            '=> load model from the checkpoint: config.MODEL.CHECKPOINT')
        ckp = torch.load(config.MODEL.CHECKPOINT)
        model.load_state_dict(ckp['state_dict'])
        # Tensorboard step counters resume from the checkpoint's epoch so
        # curves continue where the previous run stopped.
        writer_dict['train_global_steps'] = ckp['epoch']
        writer_dict['valid_global_steps'] = writer_dict['train_global_steps']
        config.TRAIN.BEGIN_EPOCH = writer_dict['train_global_steps']
        best_perf = ckp['best_perf']
        best_epoch = ckp['best_epoch']
        # Rebase LR milestones to be relative to the resumed begin epoch,
        # since the scheduler below is created fresh (epoch counter 0).
        config.TRAIN.LR_STEP = [
            lr - config.TRAIN.BEGIN_EPOCH for lr in config.TRAIN.LR_STEP
        ]
        logger.info('begin_epoch: {}'.format(config.TRAIN.BEGIN_EPOCH))
        logger.info('train_global_steps: {}'.format(
            writer_dict['train_global_steps']))
        logger.info('valid_global_steps: {}'.format(
            writer_dict['valid_global_steps']))
        logger.info('best_perf: {}'.format(best_perf))
        logger.info('best_epoch: {}'.format(best_epoch))
        logger.info('lr_step: {}'.format(config.TRAIN.LR_STEP))
        # A milestone that went negative means a resume past an LR step;
        # MultiStepLR cannot represent that here, so abort.
        for lr in config.TRAIN.LR_STEP:
            if lr < 0:
                logger.info(
                    'ERROR: learning rate must be larger than 0, but lr_step is {}'
                    .format(config.TRAIN.LR_STEP))
                exit()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    optimizer = get_optimizer(config, model)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)
        # Scheduler stepped after training, per PyTorch >=1.1 convention.
        lr_scheduler.step()

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
            best_epoch = epoch
            logger.info('=> best model, epoch: {}, perf: {}'.format(
                epoch, perf_indicator))
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'best_perf': best_perf,
                'best_epoch': best_epoch,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    # Unwrap DataParallel so the saved state_dict has no 'module.' prefix.
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
def main():
    """Training entry point using spatial-softmax style criteria.

    The tensorboard log dir is suffixed with the sweep hyper-parameters
    (pointMaxW, probMargin, rs, t) so concurrent sweep runs stay separate.
    The loss is selected by ``config.TRAIN.CRITERION``.
    """
    args = parse_args()
    cf.args = args
    reset_config(config, args)
    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')
    # Encode the sweep hyper-parameters into the tensorboard dir name.
    tb_log_dir = pathjoin(
        dirname(tb_log_dir),
        'w%s,m%s,rs%s,t%s_' % (args.pointMaxW, args.probMargin,
                               ''.join(map(str, args.rs)), args.t)
        + basename(tb_log_dir))
    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True)

    # copy model file for reproducibility
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    #writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) selected by config
    if config.TRAIN.CRITERION == 'msssm_mean':
        criterion = MultiScaleSpatialSoftmax(log_freq=60 * 10,
                                             cyc_rs=args.rs,
                                             poolings=['avg', 'max'][:],
                                             pointMaxW=args.pointMaxW,
                                             probMargin=args.probMargin,
                                             temper=args.t)
    elif config.TRAIN.CRITERION == 'ssm_mean':
        criterion = SpatialSoftmax()
    else:
        # BUGFIX: previously ``criterion`` was left unbound here and the
        # script crashed later with a confusing NameError inside train().
        raise ValueError(
            'unsupported TRAIN.CRITERION: {}'.format(config.TRAIN.CRITERION))

    # Debug override: when enabled, replace the criterion with a
    # point-max-only variant that logs frequently.
    cf.debugPoinMax = False
    if cf.debugPoinMax:
        criterion = MultiScaleSpatialSoftmax(
            log_freq=30,
            cyc_rs=[],
            poolings=['avg', 'max'][:],
            pointMaxW=args.pointMaxW,
        )

    optimizer = get_optimizer(config, model)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        # BUGFIX: step the scheduler AFTER the epoch's optimizer updates.
        # The original stepped it before train(), which skips the initial
        # learning-rate value (pre-PyTorch-1.1 ordering).
        lr_scheduler.step()

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    # model is DataParallel here, so unwrap via .module before saving.
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
    print(args)
def main():
    """Training entry point for the MobileNet-family keypoint models.

    Selects a model class by ``config.MODEL.NAME``, optionally resumes model
    and optimizer state from ``args.resume``, and saves per-epoch snapshots
    (plus an optional OneDrive mirror) alongside the regular checkpoints.
    """
    args = parse_args()
    reset_config(config, args)
    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')
    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # Optional mirror of snapshots/metrics to OneDrive.
    if args.useOneDrive:
        oneDriveLogger = OneDriveLogger()
    else:
        oneDriveLogger = None

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    # Model selection by name; fall back to evaluating the name directly.
    if config.MODEL.NAME == "MobileNet16_":
        model = MobileNet16_()
    elif config.MODEL.NAME == "MnasNet16_":
        model = MnasNet_()
    elif config.MODEL.NAME == "MobileNet162_":
        model = MobileNet162_()
    else:
        model = eval(config.MODEL.NAME)()

    # Resume model weights and (deferred) optimizer state from a checkpoint.
    optimizer_state_dict = None
    if args.resume:
        checkpoint = torch.load(args.resume)
        state_dict = checkpoint['state_dict']
        model.load_state_dict(state_dict)
        optimizer_state_dict = checkpoint['optimizer']

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, config.MODEL.NUM_JOINTS,
         config.MODEL.IMAGE_SIZE[1], config.MODEL.IMAGE_SIZE[0]))
    #writer_dict['writer'].add_graph(model, (dump_input, ))

    gpus = [int(i) for i in config.GPUS.split(',')]
    # Single-GPU: the DataParallel wrap is intentionally disabled.
    model.cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELossCoco(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT,
        heatmap_size=config.MODEL.EXTRA.HEATMAP_SIZE[0]).cuda()

    optimizer = get_optimizer(config, model)
    if optimizer_state_dict is not None:
        optimizer.load_state_dict(optimizer_state_dict)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code. NOTE: normalize is defined but deliberately left
    # out of the transform pipelines below (kept commented, as before).
    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                     std=[0.5, 0.5, 0.5])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            #normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            #normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict, oneDriveLogger,
              args.useOffset)

        # BUGFIX: scheduler stepped after the epoch's optimizer updates
        # (the original stepped before train(), skipping the initial LR).
        lr_scheduler.step()

        # Per-epoch raw snapshots: numbered, latest, and optional OneDrive.
        filename = os.path.join(final_output_dir,
                                'epoch-{0}'.format(epoch + 1))
        torch.save(model.state_dict(), filename + '.model')
        lastestname = os.path.join(final_output_dir, 'lastest')
        torch.save(model.state_dict(), lastestname + '.model')
        if args.useOneDrive:
            torch.save(model.state_dict(),
                       'C:/Users/aoyag/OneDrive/pytorch/lastest.model')

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict, oneDriveLogger, args.useOffset)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    # BUGFIX: the model was never wrapped in DataParallel (only .cuda()),
    # so the original ``model.module.state_dict()`` raised AttributeError
    # at the very end of training; save the plain state_dict instead.
    torch.save(model.state_dict(), final_model_state_file)
def validate(config, loader, dataset, model_dict, criterion_dict,
             output_dir,
             writer_dict,  # None
             rank):
    """Multi-view validation pass; returns the dataset's perf indicator.

    Only rank 0 enters this function in distributed runs; non-zero ranks
    would still compute but only rank 0 logs, dumps heatmaps to HDF5, and
    runs dataset.evaluate(). Each loader item yields per-view lists of
    (input, target, weight) plus per-view meta dicts.
    """
    # only rank 0 process will enter this function
    device = torch.device('cuda', rank)
    for model in model_dict.values():
        model.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    avg_acc = AverageMeter()

    # NOTE(review): nsamples assumes exactly 4 views per sample — confirm
    # against the dataset; the k::nviews interleave below relies on it too.
    nsamples = len(dataset) * 4
    is_aggre = config.NETWORK.AGGRE
    njoints = config.NETWORK.NUM_JOINTS
    height = int(config.NETWORK.HEATMAP_SIZE[0])
    width = int(config.NETWORK.HEATMAP_SIZE[1])
    all_preds = np.zeros((nsamples, njoints, 3), dtype=np.float32)
    all_heatmaps = np.zeros(
        (nsamples, njoints, height, width), dtype=np.float32)

    idx = 0  # write cursor into all_preds / all_heatmaps
    with torch.no_grad():
        end = time.time()
        for i, (input, target, weight, meta) in enumerate(loader):
            input = [view.to(device, non_blocking=False) for view in input]
            raw_features, aggre_features, _, _ = model_dict['base_model'](input)
            # Either fuse the cross-view aggregated features into the output
            # or use the per-view raw heatmaps directly.
            if is_aggre and config.TEST.FUSE_OUTPUT:
                output = fuse_routing(raw_features, aggre_features, is_aggre,
                                      meta)
            else:
                output = raw_features

            if config.TEST.FLIP_TEST:  # only support MPII flip
                # input : a list of [N, 3, H, W]
                input_flipped = [torch.flip(view, dims=[3]).to(
                    device, non_blocking=False) for view in input]
                raw_features_flipped, aggre_features_flipped, _, _ = \
                    model_dict['base_model'](input_flipped)
                if is_aggre and config.TEST.FUSE_OUTPUT:
                    output_flipped = fuse_routing(raw_features_flipped,
                                                  aggre_features_flipped,
                                                  is_aggre, meta)
                else:
                    output_flipped = raw_features_flipped
                output_flipped = flip_back_th(output_flipped,
                                              dataset.flip_pairs)

                if config.TEST.SHIFT_HEATMAP:
                    # in-place shift: compensate the one-pixel feature
                    # misalignment introduced by flipping.
                    for view in output_flipped:
                        view[:, :, :, 1:] = view.clone()[:, :, :, 0:-1]
                # Average original and flipped predictions per view.
                output = [(view + view_flipped) * 0.5
                          for view, view_flipped in zip(output,
                                                        output_flipped)]

            loss = 0
            target_cuda = []
            weight_cuda = []
            # loss on single view, with ground truth heat maps
            for t, w, r in zip(target, weight, raw_features):
                t = t.to(device, non_blocking=False)
                w = w.to(device, non_blocking=False)
                target_cuda.append(t)
                weight_cuda.append(w)
                loss += criterion_dict['mse_weights'](r, t, w)

            # loss on multivew h36m, consistent loss
            if is_aggre:
                if config.LOSS.USE_CONSISTENT_LOSS:
                    # Consistency between raw and aggregated predictions on
                    # the h36m subset of the batch (may be empty).
                    raw_h36m, agg_h36m = select_out_h36m(raw_features,
                                                         aggre_features,
                                                         meta)
                    assert len(raw_h36m[0]) == len(agg_h36m[0])
                    if len(raw_h36m[0]) != 0:
                        raw_h36m, agg_h36m = torch.cat(raw_h36m, dim=0), \
                            torch.cat(agg_h36m, dim=0)
                        loss += criterion_dict['mse'](raw_h36m, agg_h36m)
                if config.DATASET.PSEUDO_LABEL_PATH:
                    # mse loss on output, with pseudo heat maps
                    for t, w, o in zip(target_cuda, weight_cuda, output):
                        cal_loss = criterion_dict['mse_weights'](o, t, w)
                        loss += cal_loss * config.LOSS.MSE_LOSS_WEIGHT

            # Total images this batch = views * per-view batch size.
            nimgs = len(input) * input[0].size(0)
            losses.update(loss.item(), nimgs)

            nviews = len(output)
            acc = [None] * nviews
            cnt = [None] * nviews
            pre = [None] * nviews
            for j in range(nviews):
                _, acc[j], cnt[j], pre[j] = accuracy(
                    output[j].detach().cpu().numpy(),
                    target_cuda[j].detach().cpu().numpy())
            acc = np.mean(acc)
            cnt = np.mean(cnt)
            avg_acc.update(acc, cnt)

            batch_time.update(time.time() - end)
            end = time.time()

            # Convert per-view heatmaps to image-space joint predictions and
            # interleave views so samples stay grouped (k::nviews stride).
            preds = np.zeros((nimgs, njoints, 3), dtype=np.float32)
            heatmaps = np.zeros(
                (nimgs, njoints, height, width), dtype=np.float32)
            for k, o, m in zip(range(nviews), output, meta):
                pred, maxval = get_final_preds(config,
                                               o.clone().cpu().numpy(),
                                               m['center'].numpy(),
                                               m['scale'].numpy())
                pred = pred[:, :, 0:2]
                pred = np.concatenate((pred, maxval), axis=2)
                preds[k::nviews] = pred
                heatmaps[k::nviews] = o.clone().cpu().numpy()

            all_preds[idx:idx + nimgs] = preds
            all_heatmaps[idx:idx + nimgs] = heatmaps
            idx += nimgs

            if i % config.PRINT_FREQ == 0 and rank == 0:
                msg = 'Test: [{0}/{1}]\t' \
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                      'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                          i, len(loader), batch_time=batch_time,
                          loss=losses, acc=avg_acc)
                logger.info(msg)

                # Debug images per view (pre[k] is in heatmap coords, *4
                # scales to input resolution).
                for k in range(len(input)):
                    view_name = 'view_{}'.format(k + 1)
                    prefix = '{}_{}_{:08}'.format(
                        os.path.join(output_dir, 'validation'), view_name, i)
                    save_debug_images(config, input[k], meta[k],
                                      target_cuda[k], pre[k] * 4, output[k],
                                      prefix)

    # Sentinel value returned by non-zero ranks (never a real metric).
    perf_indicator = 1000
    if rank == 0:
        # save heatmaps and joint locations, restricted to the joints that
        # map into the union (mpii) joint set ('*' entries are unmapped).
        u2a = dataset.u2a_mapping
        u2a = {k: v for k, v in u2a.items() if v != '*'}
        sorted_u2a = sorted(u2a.items(), key=lambda x: x[0])
        u = np.array([mapping[0] for mapping in sorted_u2a])

        file_name = os.path.join(
            output_dir, 'heatmaps_locations_%s_%s.h5' %
            (dataset.subset, dataset.dataset_type))
        file = h5py.File(file_name, 'w')
        file['heatmaps'] = all_heatmaps[:, u, :, :]
        file['locations'] = all_preds[:, u, :]
        file['joint_names_order'] = u  # names order in union(mpii) dataset
        file.close()

        name_value, perf_indicator = dataset.evaluate(
            all_preds[:, u, :],
            output_dir if config.DEBUG.SAVE_ALL_PREDS else None)
        names = name_value.keys()
        values = name_value.values()
        num_values = len(name_value)
        _, full_arch_name = get_model_name(config)
        # Log the per-metric results as a markdown-style table.
        logger.info('| Arch ' +
                    ' '.join(['| {}'.format(name) for name in names]) + ' |')
        logger.info('|---' * (num_values + 1) + '|')
        logger.info('| ' + full_arch_name + ' ' +
                    ' '.join(['| {:.3f}'.format(value) for value in values]) +
                    ' |')

    return perf_indicator
def validate(config, val_loader, val_dataset, model, criterion, pointcri,
             anglecri, output_dir, tb_log_dir, writer_dict=None):
    """Single-view validation; returns the dataset's perf indicator.

    Only the heatmap score loss (``criterion``) contributes to the reported
    loss here — the point/angle criteria are accepted for signature
    compatibility but their loss terms are commented out below. Collects
    predictions plus extended per-box metadata (22 columns) for
    ``val_dataset.evaluate``.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()
    #lossAngle = AverageMeter()
    lossPoint = AverageMeter()
    lossScore = AverageMeter()
    accPearson = AverageMeter()
    accMAE = AverageMeter()

    # switch to evaluate mode
    model.eval()

    num_samples = len(val_dataset)
    #all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 2), dtype=np.float32) #ori landmark model
    all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3),
                         dtype=np.float32)
    all_preds_point = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3),
                               dtype=np.float32)
    #all_boxes = np.zeros((num_samples, 6))
    # 22 columns: center(2), scale(2), area, score, box_list(3), id,
    # joints_vis(12) — see the fills below.
    all_boxes = np.zeros((num_samples, 22))
    all_boxes_point = np.zeros((num_samples, 6))
    image_path = []
    filenames = []
    imgnums = []
    idx = 0  # write cursor into the all_* accumulators
    with torch.no_grad():
        end = time.time()
        for i, (input, target, target_weight, meta,
                points) in enumerate(val_loader):
            # compute output
            outputs = model(input)
            if isinstance(outputs, list):
                output = outputs
                # output = outputs[-1]
            else:
                output = outputs
                # output = output[0]
            #import pdb
            #pdb.set_trace()
            if config.TEST.FLIP_TEST:
                # this part is ugly, because pytorch has not supported negative index
                # input_flipped = model(input[:, :, :, ::-1])
                input_flipped = np.flip(input.cpu().numpy(), 3).copy()
                input_flipped = torch.from_numpy(input_flipped).cuda()
                outputs_flipped = model(input_flipped)

                if isinstance(outputs_flipped, list):
                    output_flipped = outputs_flipped[0]
                else:
                    output_flipped = outputs_flipped

                output_flipped = flip_back(output_flipped.cpu().numpy(),
                                           val_dataset.flip_pairs)
                output_flipped = torch.from_numpy(
                    output_flipped.copy()).cuda()

                # feature is not aligned, shift flipped heatmap for higher accuracy
                if config.TEST.SHIFT_HEATMAP:
                    output_flipped[:, :, :, 1:] = \
                        output_flipped.clone()[:, :, :, 0:-1]

                # Average original and flipped heatmaps.
                output = (output + output_flipped) * 0.5

            target = target.cuda(non_blocking=True)
            target_weight = target_weight.cuda(non_blocking=True)
            points = points.cuda(non_blocking=True)
            input_w = config.MODEL.IMAGE_SIZE[0]
            input_h = config.MODEL.IMAGE_SIZE[1]

            scoreloss = criterion(output, target, target_weight)
            #pointloss = pointcri(output, points, input_w, input_h)
            aa = 1
            # Alternative loss mixes kept for reference; only the heatmap
            # score loss is active.
            #loss = 1*angleloss + 0.1*pointloss + 0*scoreloss
            #loss = (1-aa)*pointloss + aa*scoreloss
            loss = scoreloss

            num_images = input.size(0)
            # measure accuracy and record loss
            losses.update(loss.item(), num_images)
            #lossPoint.update(pointloss.item(), input.size(0))
            lossScore.update(scoreloss.item(), input.size(0))
            _, avg_acc, cnt, pred = accuracy(output.cpu().numpy(),
                                             target.cpu().numpy())
            acc.update(avg_acc, cnt)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # Per-sample meta needed to map heatmap coords back to the
            # original image and to fill the extended box columns.
            c = meta['center'].numpy()
            s = meta['scale'].numpy()
            r = meta['rotation'].numpy()
            score = meta['score'].numpy()
            w_rate = meta['w_rate']
            h_rate = meta['h_rate']
            box_list = meta['box_list'].numpy()
            id = meta['id'].numpy()
            joints_vis = meta['joints_vis'][:, :, 0].numpy()  #shape = [num_joints]
            scoremap_height = output.shape[2]
            scoremap_width = output.shape[3]

            preds, maxvals = get_final_preds(config,
                                             output.clone().cpu().numpy(), c,
                                             s)

            all_preds[idx:idx + num_images, :, 0:2] = preds[:, :, 0:2]
            all_preds[idx:idx + num_images, :, 2:3] = maxvals
            # double check this all_boxes parts
            all_boxes[idx:idx + num_images, 0:2] = c[:, 0:2]
            all_boxes[idx:idx + num_images, 2:4] = s[:, 0:2]
            all_boxes[idx:idx + num_images, 4] = np.prod(s * 200, 1)
            all_boxes[idx:idx + num_images, 5] = score
            all_boxes[idx:idx + num_images, 6:9] = box_list[:, 0:3]
            all_boxes[idx:idx + num_images, 9] = id
            all_boxes[idx:idx + num_images, 10:22] = joints_vis[:, 0:12]
            image_path.extend(meta['image'])
            #import pdb
            #pdb.set_trace()
            idx += num_images

            if i % config.PRINT_FREQ == 0:
                msg = 'Test: [{0}/{1}]\t' \
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                      'lossScore {scoreloss.val:.5f} ({scoreloss.avg:.5f})\t' \
                      'Loss {loss.val:.5f} ({loss.avg:.5f})\t' \
                      'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                          i, len(val_loader), batch_time=batch_time,
                          scoreloss=lossScore, loss=losses, acc=acc)
                logger.info(msg)

                # pred is in heatmap coords; *4 scales to input resolution.
                prefix = '{}_{}'.format(os.path.join(output_dir, 'val'), i)
                save_debug_images(config, input, meta, target, pred * 4,
                                  output, prefix)
        #import pdb
        #pdb.set_trace()
        name_values, perf_indicator = val_dataset.evaluate(
            config, all_preds, output_dir, all_boxes, image_path, filenames,
            imgnums)

        _, full_arch_name = get_model_name(config)
        if isinstance(name_values, list):
            for name_value in name_values:
                _print_name_value(name_value, full_arch_name)
        else:
            _print_name_value(name_values, full_arch_name)

        if writer_dict:
            writer = writer_dict['writer']
            global_steps = writer_dict['valid_global_steps']
            writer.add_scalar('valid_loss', losses.avg, global_steps)
            writer.add_scalar('valid_acc', acc.avg, global_steps)
            if isinstance(name_values, list):
                for name_value in name_values:
                    writer.add_scalars('valid', dict(name_value),
                                       global_steps)
            else:
                writer.add_scalars('valid', dict(name_values), global_steps)
            writer_dict['valid_global_steps'] = global_steps + 1

    return perf_indicator
def main():
    """Train a pose-estimation model end to end.

    Parses CLI args, builds the model, criterions and data loaders,
    optionally resumes from ``checkpoint.pth`` in the output directory,
    runs the train/validate loop, and saves per-epoch checkpoints plus
    best and final model states.
    """
    args = parse_args()
    reset_config(config, args)

    if args.prevModelDir and args.modelDir:
        # copy pre models for philly
        copy_prev_models(args.prevModelDir, args.modelDir)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    # NOTE(review): eval() on a config-derived string is fine for trusted
    # configs but must never see untrusted input.
    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True
    )

    # Keep a copy of the model definition next to the experiment outputs
    # so the run stays reproducible.
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # Loss functions (criterions); only `criterion` feeds the total loss,
    # the point/angle criterions are forwarded to train()/validate().
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT
    ).cuda()
    anglecri = AnglesLoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    pointcri = PointsLoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    # Normalization constants tuned for SpineWeb CLAHE images.
    normalize = transforms.Normalize(mean=[0.43, 0.43, 0.43],
                                     std=[0.223, 0.223, 0.223])

    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TRAIN_SET,
        True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    )
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TEST_SET,
        False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True
    )

    best_perf = 0.0
    best_model = True
    last_epoch = -1
    # Build the optimizer once. (The original built it twice and attached
    # the first LR scheduler to the discarded first instance.)
    optimizer = get_optimizer(config, model)
    begin_epoch = config.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')

    if config.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    # Create the scheduler after the (possible) resume so `last_epoch`
    # reflects the checkpoint.
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
        last_epoch=last_epoch
    )

    # BUGFIX: iterate from `begin_epoch` (updated on resume), not from the
    # static config value — otherwise a resumed run retrains the epochs it
    # has already completed.
    for epoch in range(begin_epoch, config.TRAIN.END_EPOCH):
        # NOTE(review): stepping the scheduler before the optimizer is the
        # pre-1.1 PyTorch ordering; kept as-is to preserve the existing LR
        # schedule — confirm against the installed torch version.
        lr_scheduler.step()

        # train for one epoch
        train(config, train_loader, model, criterion, pointcri, anglecri,
              optimizer, epoch, final_output_dir, tb_log_dir, writer_dict)

        # evaluate on validation set
        perf_indicator = validate(
            config, valid_loader, valid_dataset, model, criterion,
            pointcri, anglecri, final_output_dir, tb_log_dir, writer_dict
        )

        # Ties count as "best" (>=), matching the original behavior.
        best_model = perf_indicator >= best_perf
        if best_model:
            best_perf = perf_indicator

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint({
            'epoch': epoch + 1,
            'model': get_model_name(config),
            'state_dict': model.state_dict(),
            'best_state_dict': model.module.state_dict(),
            'perf': perf_indicator,
            'optimizer': optimizer.state_dict(),
        }, best_model, final_output_dir)

        if best_model:
            best_model_state_file = os.path.join(final_output_dir,
                                                 'best_state.pth.tar')
            logger.info('saving best model state to {}'.format(
                best_model_state_file))
            # Save the unwrapped module so the weights load without the
            # DataParallel 'module.' prefix.
            torch.save(model.module.state_dict(), best_model_state_file)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()