def dataloader(bs, gpus):
    """Build a DataLoader over the 'cityscapes_semseg_val' roidb.

    Args:
        bs: total batch size for the loader.
        gpus: number of GPUs; also reused as the worker count.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding collated semseg minibatches.
    """
    # NOTE(review): this builds the *val* split with training=True — presumably
    # a deliberate "train on val" helper; confirm against callers.
    roidb, ratio_list, ratio_index = \
        combined_roidb_for_training_semseg('cityscapes_semseg_val')
    batch_sampler = MinibatchSampler(ratio_list, ratio_index)
    semseg_dataset = RoiDataLoader(roidb, 19, training=True)
    loader = torch.utils.data.DataLoader(
        semseg_dataset,
        batch_size=bs,
        sampler=batch_sampler,
        num_workers=gpus,
        collate_fn=collate_minibatch_semseg,
    )
    return loader
def dataloader(bs, gpus):
    """Build a DataLoader over the 'cityscapes_semseg_train' roidb.

    Args:
        bs: total batch size for the loader.
        gpus: number of GPUs; also reused as the worker count.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding collated semseg minibatches.
    """
    roidb, ratio_list, ratio_index = \
        combined_roidb_for_training_semseg('cityscapes_semseg_train')
    sampler = MinibatchSampler(ratio_list, ratio_index)
    dataset = RoiDataLoader(roidb, 19, training=True)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=bs,
        sampler=sampler,
        num_workers=gpus,
        collate_fn=collate_minibatch_semseg)
    return dataloader
    # Fix: removed an unreachable second return that followed the one above
    # (`return torch.randn(...) , torch.LongTensor(np.random.randint(..., dtype=np.long))`)
    # — dead debug code, and `np.long` is removed in modern NumPy.
def main():
    # NOTE(review): assigned before the bare string below, so that string is
    # not actually a docstring; `saveNetStructure` is never read in this
    # function — presumably leftover debug state.
    saveNetStructure=False

    """Main function"""

    args = parse_args()
    print('Called with args:')
    print(args)

    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the code.")

    if args.cuda or cfg.NUM_GPUS > 0:
        #set gpu device
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(ids) for ids in args.device_ids])
        torch.backends.cudnn.benchmark=True
        cfg.CUDA = True
    else:
        raise ValueError("Need Cuda device to run !")

    # Map the dataset name from the command line onto the config; all the
    # cityscapes variants use 19 semantic classes.
    if args.dataset == "coco2017":
        cfg.TRAIN.DATASETS = ('coco_2017_train',)
        cfg.MODEL.NUM_CLASSES = 81
    elif args.dataset == "keypoints_coco2017":
        cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',)
        cfg.MODEL.NUM_CLASSES = 2
    elif args.dataset == "cityscapes":
        cfg.TRAIN.DATASETS = ('cityscapes_semseg_train', )
        cfg.MODEL.NUM_CLASSES = 19
    elif args.dataset == "cityscape_train_on_val":
        cfg.TRAIN.DATASETS = ('cityscape_train_on_val', )
        cfg.MODEL.NUM_CLASSES = 19
    elif args.dataset == "cityscapes_coarse":
        cfg.TRAIN.DATASETS = ('cityscapes_coarse', )
        cfg.MODEL.NUM_CLASSES = 19
    elif args.dataset == "cityscapes_all":
        cfg.TRAIN.DATASETS = ('cityscapes_all', )
        cfg.MODEL.NUM_CLASSES = 19
    elif args.dataset == "cityscapes_trainval":
        cfg.TRAIN.DATASETS = ('cityscapes_trainval', )
        cfg.MODEL.NUM_CLASSES = 19
    elif args.dataset == "cityscapes_fineturn":
        cfg.TRAIN.DATASETS = ('cityscapes_fineturn', )
        cfg.MODEL.NUM_CLASSES = 19
    else:
        raise ValueError("Unexpected args.dataset: {}".format(args.dataset))

    cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    ### Adaptively adjust some configs ###
    original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH
    if args.batch_size is None:
        args.batch_size = original_batch_size
    # Re-derive NUM_GPUS from the visible devices (set above) and split the
    # global batch evenly across them.
    cfg.NUM_GPUS = torch.cuda.device_count()
    assert (args.batch_size % cfg.NUM_GPUS) == 0, \
        'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS)
    cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS
    print('Batch size change from {} (in config file) to {}'.format(
        original_batch_size, args.batch_size))
    print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' % (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH))

    if args.num_workers is not None:
        cfg.DATA_LOADER.NUM_THREADS = args.num_workers
    print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS)

    ### Adjust learning based on batch size change linearly
    old_base_lr = cfg.SOLVER.BASE_LR
    cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size
    print('Adjust BASE_LR linearly according to batch size change: {} --> {}'.format(
        old_base_lr, cfg.SOLVER.BASE_LR))

    ### Overwrite some solver settings from command line arguments
    if args.optimizer is not None:
        cfg.SOLVER.TYPE = args.optimizer
    if args.lr is not None:
        cfg.SOLVER.BASE_LR = args.lr
    if args.lr_decay_gamma is not None:
        cfg.SOLVER.GAMMA = args.lr_decay_gamma

    timers = defaultdict(Timer)

    ### Dataset ###
    timers['roidb'].tic()
    # Semseg/disparity heads use the semseg roidb builder; otherwise fall back
    # to the detection roidb with proposal files.
    if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON:
        roidb, ratio_list, ratio_index = combined_roidb_for_training_semseg(
            cfg.TRAIN.DATASETS)
    else:
        roidb, ratio_list, ratio_index = combined_roidb_for_training(
            cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    timers['roidb'].toc()
    train_size = len(roidb)
    logger.info('{:d} roidb entries'.format(train_size))
    logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time)

    # Aspect-ratio-grouping sampler intentionally disabled; DataLoader shuffles.
    #sampler = MinibatchSampler(ratio_list, ratio_index)
    sampler = None
    dataset = RoiDataLoader(
        roidb,
        cfg.MODEL.NUM_CLASSES,
        training=True)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        sampler=sampler,
        num_workers=cfg.DATA_LOADER.NUM_THREADS,
        collate_fn=collate_minibatch_semseg_all if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON else collate_minibatch,
        drop_last=False,
        shuffle=True,
        pin_memory=True)

    assert_and_infer_cfg()

    # Debug visualization of one loaded sample (kept for reference):
    #for data in dataloader:
    #    image = data['data'][0][0].numpy()
    #    print (image.shape)
    #    image=image.transpose(1,2,0)+cfg.PIXEL_MEANS
    #    cv2.imwrite('image.png', image[:,:,::-1])
    #    cv2.imwrite('label.png',10*data['semseg_label_0'][0][0].numpy())
    #    return

    # Instantiate the model class named by the config (e.g. a Generalized_RCNN
    # variant defined at module scope).
    maskRCNN = eval(cfg.MODEL.TYPE)()

    # Optionally warm-start from PSPNet weights; the config value is a path,
    # so len(...) > 1 effectively means "a path was provided".
    if len(cfg.SEM.PSPNET_PRETRAINED_WEIGHTS)>1:
        print("loading pspnet weights")
        state_dict={}  # NOTE(review): unused — presumably leftover
        pretrained=torch.load(cfg.SEM.PSPNET_PRETRAINED_WEIGHTS, map_location=lambda storage, loc: storage)
        pretrained = pretrained['model']
        if cfg.SEM.SPN_ON:
            maskRCNN.pspnet.load_state_dict(pretrained,strict=True)
        elif 'deeplab' in cfg.SEM.DECODER_TYPE:
            # Strip the 'encoder.' prefix and drop decoder weights so the
            # checkpoint fits the encoder submodule.
            encoder = dict()
            for k, v in pretrained.items():
                if 'decoder' in k:
                    continue
                encoder[k.replace('encoder.','')] = v
            maskRCNN.encoder.load_state_dict(encoder,strict=True)
            del encoder
        else:
            maskRCNN.load_state_dict(pretrained,strict=True)
        del pretrained
        print("weights load success")

    # With SPN on, the pspnet backbone is frozen (eval mode, no gradients).
    if cfg.SEM.SPN_ON:
        maskRCNN.pspnet.eval()
        for p in maskRCNN.pspnet.parameters():
            p.requires_grad = False

    # load nets into gpu
    maskRCNN = UserScatteredDataParallel(maskRCNN)
    # For sync bn
    patch_replication_callback(maskRCNN)
    if cfg.CUDA:
        maskRCNN.to('cuda')

    ### Optimizer ###
    # Biases get a doubled lr and (optionally) no weight decay.
    bias_params = []
    nonbias_params = []
    for key, value in dict(maskRCNN.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                bias_params.append(value)
            else:
                nonbias_params.append(value)
    params = [
        {'params': nonbias_params,
         'lr': cfg.SOLVER.BASE_LR,
         'weight_decay': cfg.SOLVER.WEIGHT_DECAY},
        {'params': bias_params,
         'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1),
         'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0}
    ]

    # NOTE(review): the first SGD branch is `if`, not `elif`, so with
    # LR_POLICY == 'ReduceLROnPlateau' the optimizer is built twice (second
    # wins) — confirm intended.
    if cfg.SOLVER.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM)
        print("Using STEP as Lr reduce policy!")
    if cfg.SOLVER.TYPE == 'SGD' and cfg.SOLVER.LR_POLICY == 'ReduceLROnPlateau':
        optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,'min',patience=10)
        print("Using ReduceLROnPlateau as Lr reduce policy!")
    elif cfg.SOLVER.TYPE == "Adam":
        optimizer = torch.optim.Adam(params)
    elif "poly" in cfg.SOLVER.TYPE:
        optimizer = create_optimizers(maskRCNN,args)
        print("Using Poly as Lr reduce policy!")
        # Total iteration budget used by the poly lr schedule.
        args.max_iters = (int(train_size / args.batch_size)) * args.num_epochs

    ### Load checkpoint
    if args.load_ckpt:
        load_name = args.load_ckpt
        logging.info("loading checkpoint %s", load_name)
        checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage)
        net_utils.load_ckpt(maskRCNN, checkpoint['model'])
        if args.resume:
            assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \
                "iters_per_epoch should match for resume"
            # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1.
            # However it's fixed on master.
            # optimizer.load_state_dict(checkpoint['optimizer'])
            misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer'])
            if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1):
                # Resume from end of an epoch
                args.start_epoch = checkpoint['epoch'] + 1
                args.start_iter = 0
            else:
                # Resume from the middle of an epoch.
                # NOTE: dataloader is not synced with previous state
                args.start_epoch = checkpoint['epoch']
                args.start_iter = checkpoint['step'] + 1
        del checkpoint
        torch.cuda.empty_cache()

    if args.load_detectron:  #TODO resume for detectron weights (load sgd momentum values)
        logging.info("loading Detectron weights %s", args.load_detectron)
        load_detectron_weight(maskRCNN, args.load_detectron)

    # For 'step_poly', reconstruct the initial lr the schedule expects;
    # otherwise read it straight off the optimizer.
    if cfg.SOLVER.TYPE=='step_poly':
        lr = cfg.SOLVER.BASE_LR / (cfg.SOLVER.GAMMA**len(args.lr_decay_epochs))
    else:
        lr = optimizer.param_groups[0]['lr']  # lr of non-bias parameters, for command line outputs.

    ### Training Setups ###
    args.run_name = misc_utils.get_run_name()
    output_dir = misc_utils.get_output_dir(args, args.run_name)
    args.cfg_filename = os.path.basename(args.cfg_file)

    if not args.no_save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Persist the effective config and args alongside checkpoints.
        blob = {'cfg': yaml.dump(cfg), 'args': args}
        with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f:
            pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL)
        if args.use_tfboard:
            #from tensorboardX import SummaryWriter
            # Set the Tensorboard logger
            # (SummaryWriter presumably imported at module level — the local
            # import above is commented out; verify.)
            tblogger = SummaryWriter(output_dir)

    ### Training Loop ###
    maskRCNN.train()

    training_stats = TrainingStats(
        args,
        args.disp_interval,
        tblogger if args.use_tfboard and not args.no_save else None)

    iters_per_epoch = int(train_size / args.batch_size)  # drop last
    args.iters_per_epoch = iters_per_epoch
    ckpt_interval_per_epoch = iters_per_epoch // args.ckpt_num_per_epoch
    try:
        logger.info('Training starts !')
        args.step = args.start_iter
        global_step = iters_per_epoch * args.start_epoch + args.step
        for args.epoch in range(args.start_epoch, args.start_epoch + args.num_epochs):
            # ---- Start of epoch ----

            # adjust learning rate (epoch-boundary step decay)
            if args.lr_decay_epochs and args.epoch == args.lr_decay_epochs[0] and args.start_iter == 0 and cfg.SOLVER.LR_POLICY=='steps_with_decay' :
                args.lr_decay_epochs.pop(0)
                net_utils.decay_learning_rate(optimizer, lr, cfg.SOLVER.GAMMA)
                lr *= cfg.SOLVER.GAMMA

            for args.step, input_data in zip(range(args.start_iter, iters_per_epoch), dataloader):
                # Disabled right-image concat / disparity scan prep:
                #if cfg.DISP.DISP_ON:
                #    input_data['data'] = list(map(lambda x,y: torch.cat((x,y), dim=0),
                #        input_data['data'], input_data['data_R']))
                #    if cfg.SEM.DECODER_TYPE.endswith('3D'):
                #        input_data['disp_scans'] = torch.arange(1,
                #            cfg.DISP.MAX_DISPLACEMENT+1).float().view(1,cfg.DISP.MAX_DISPLACEMENT).repeat(args.batch_size,1)
                #    del input_data['data_R']
                # Disabled manual host->device transfer (handled elsewhere):
                #for key in input_data:
                #    if key != 'roidb': # roidb is a list of ndarrays with inconsistent length
                #        input_data[key] = list(map(lambda x: Variable(x, requires_grad=False).to('cuda'), input_data[key]))

                training_stats.IterTic()
                net_outputs = maskRCNN(input_data)
                training_stats.UpdateIterStats(net_outputs)
                #loss = net_outputs['losses']['loss_semseg']
                #acc  = net_outputs['metrics']['accuracy_pixel']
                #print (loss.item(), acc)
                #for key in net_outputs.keys():
                #    print(key)
                loss = net_outputs['total_loss']
                #print("loss.shape:",loss)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # Per-iteration lr schedules (poly variants) update lr here.
                if cfg.SOLVER.TYPE=='poly':
                    lr = adjust_learning_rate(optimizer, global_step, args)
                if cfg.SOLVER.TYPE=='step_poly':
                    lr = step_adjust_learning_rate(optimizer, lr, global_step, args)
                training_stats.IterToc()

                if args.step % args.disp_interval == 0:
                    disp_image=''
                    semseg_image=''
                    #tblogger.add_image('disp_image',disp_image,global_step)
                    #tblogger.add_image('semseg_image',semseg_image,global_step)
                    log_training_stats(training_stats, global_step, lr)
                global_step += 1
            # ---- End of epoch ----
            # save checkpoint
            if cfg.SOLVER.TYPE == 'SGD' and cfg.SOLVER.LR_POLICY == 'ReduceLROnPlateau':
                # Scheduler stepped on the last iteration's loss of the epoch.
                lr_scheduler.step(loss)
                lr = optimizer.param_groups[0]['lr']
            if (args.epoch+1) % args.ckpt_num_per_epoch ==0:
                net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer)
            # reset starting iter number after first epoch
            args.start_iter = 0

        # ---- Training ends ----
        #if iters_per_epoch % args.disp_interval != 0:
        #    # log last stats at the end
        #    log_training_stats(training_stats, global_step, lr)

        # save final model (only if the epoch loop didn't just save it)
        if (args.epoch+1) % args.ckpt_num_per_epoch:
            net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer)

    except (RuntimeError, KeyboardInterrupt):
        # Best-effort checkpoint on crash/interrupt, then print the traceback.
        logger.info('Save ckpt on exception ...')
        net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer)
        logger.info('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
    finally:
        if args.use_tfboard and not args.no_save:
            tblogger.close()
def main():
    """Main function.

    Epoch-based Mask R-CNN training entry point: parses CLI args, adapts the
    config to the available GPUs, builds the roidb/dataloader, constructs the
    model and optimizer, optionally restores a checkpoint, then runs the
    training loop with periodic checkpointing and logging.
    """
    args = parse_args()
    print('Called with args:')
    print(args)

    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the code.")

    if args.cuda or cfg.NUM_GPUS > 0:
        cfg.CUDA = True
    else:
        raise ValueError("Need Cuda device to run !")

    # Map the dataset name from the command line onto the config.
    if args.dataset == "coco2017":
        cfg.TRAIN.DATASETS = ('coco_2017_train', )
        cfg.MODEL.NUM_CLASSES = 81
    elif args.dataset == "keypoints_coco2017":
        cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train', )
        cfg.MODEL.NUM_CLASSES = 2
    elif args.dataset == "common":
        cfg.TRAIN.DATASETS = ('common_train', )
        cfg.MODEL.NUM_CLASSES = 81
    else:
        raise ValueError("Unexpected args.dataset: {}".format(args.dataset))

    cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    ### Adaptively adjust some configs ###
    original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH
    if args.batch_size is None:
        args.batch_size = original_batch_size
    # Re-derive NUM_GPUS from the visible devices and split the global batch
    # evenly across them.
    cfg.NUM_GPUS = torch.cuda.device_count()
    assert (args.batch_size % cfg.NUM_GPUS) == 0, \
        'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS)
    cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS
    print('Batch size change from {} (in config file) to {}'.format(
        original_batch_size, args.batch_size))
    print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' %
          (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH))

    if args.num_workers is not None:
        cfg.DATA_LOADER.NUM_THREADS = args.num_workers
    print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS)

    ### Adjust learning based on batch size change linearly
    old_base_lr = cfg.SOLVER.BASE_LR
    cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size
    print('Adjust BASE_LR linearly according to batch size change: {} --> {}'.
          format(old_base_lr, cfg.SOLVER.BASE_LR))

    ### Overwrite some solver settings from command line arguments
    if args.optimizer is not None:
        cfg.SOLVER.TYPE = args.optimizer
    if args.lr is not None:
        cfg.SOLVER.BASE_LR = args.lr
    if args.lr_decay_gamma is not None:
        cfg.SOLVER.GAMMA = args.lr_decay_gamma

    timers = defaultdict(Timer)

    ### Dataset ###
    timers['roidb'].tic()
    roidb, ratio_list, ratio_index = combined_roidb_for_training(
        cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    timers['roidb'].toc()
    train_size = len(roidb)
    logger.info('{:d} roidb entries'.format(train_size))
    logger.info('Takes %.2f sec(s) to construct roidb',
                timers['roidb'].average_time)

    # Aspect-ratio-grouping sampler drives iteration order.
    sampler = MinibatchSampler(ratio_list, ratio_index)
    dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        sampler=sampler,
        num_workers=cfg.DATA_LOADER.NUM_THREADS,
        collate_fn=collate_minibatch)

    assert_and_infer_cfg()

    ### Model ###
    maskRCNN = Generalized_RCNN()

    if cfg.CUDA:
        maskRCNN.cuda()

    ### Optimizer ###
    # Biases get a doubled lr and (optionally) no weight decay.
    bias_params = []
    nonbias_params = []
    for key, value in dict(maskRCNN.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                bias_params.append(value)
            else:
                nonbias_params.append(value)
    params = [{
        'params': nonbias_params,
        'lr': cfg.SOLVER.BASE_LR,
        'weight_decay': cfg.SOLVER.WEIGHT_DECAY
    }, {
        'params': bias_params,
        'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1),
        'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0
    }]

    if cfg.SOLVER.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM)
    elif cfg.SOLVER.TYPE == "Adam":
        optimizer = torch.optim.Adam(params)

    ### Load checkpoint
    if args.load_ckpt:
        load_name = args.load_ckpt
        logging.info("loading checkpoint %s", load_name)
        checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage)
        net_utils.load_ckpt(maskRCNN, checkpoint['model'])
        if args.resume:
            assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \
                "iters_per_epoch should match for resume"
            # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1.
            # However it's fixed on master.
            # optimizer.load_state_dict(checkpoint['optimizer'])
            misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer'])
            if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1):
                # Resume from end of an epoch
                args.start_epoch = checkpoint['epoch'] + 1
                args.start_iter = 0
            else:
                # Resume from the middle of an epoch.
                # NOTE: dataloader is not synced with previous state
                args.start_epoch = checkpoint['epoch']
                args.start_iter = checkpoint['step'] + 1
        del checkpoint
        torch.cuda.empty_cache()

    if args.load_detectron:  #TODO resume for detectron weights (load sgd momentum values)
        logging.info("loading Detectron weights %s", args.load_detectron)
        load_detectron_weight(maskRCNN, args.load_detectron)

    lr = optimizer.param_groups[0][
        'lr']  # lr of non-bias parameters, for command line outputs.

    # Wrap AFTER checkpoint loading so state_dict keys match the bare model.
    maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'],
                                 minibatch=True)

    ### Training Setups ###
    args.run_name = misc_utils.get_run_name()
    output_dir = misc_utils.get_output_dir(args, args.run_name)
    args.cfg_filename = os.path.basename(args.cfg_file)

    if not args.no_save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Persist the effective config and args alongside checkpoints.
        blob = {'cfg': yaml.dump(cfg), 'args': args}
        with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f:
            pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL)
        if args.use_tfboard:
            from tensorboardX import SummaryWriter
            # Set the Tensorboard logger
            tblogger = SummaryWriter(output_dir)

    ### Training Loop ###
    maskRCNN.train()

    training_stats = TrainingStats(
        args, args.disp_interval,
        tblogger if args.use_tfboard and not args.no_save else None)

    iters_per_epoch = int(train_size / args.batch_size)  # drop last
    args.iters_per_epoch = iters_per_epoch
    ckpt_interval_per_epoch = iters_per_epoch // args.ckpt_num_per_epoch
    try:
        logger.info('Training starts !')
        args.step = args.start_iter
        global_step = iters_per_epoch * args.start_epoch + args.step
        for args.epoch in range(args.start_epoch,
                                args.start_epoch + args.num_epochs):
            # ---- Start of epoch ----

            # adjust learning rate (epoch-boundary step decay)
            if args.lr_decay_epochs and args.epoch == args.lr_decay_epochs[
                    0] and args.start_iter == 0:
                args.lr_decay_epochs.pop(0)
                net_utils.decay_learning_rate(optimizer, lr, cfg.SOLVER.GAMMA)
                lr *= cfg.SOLVER.GAMMA

            for args.step, input_data in zip(
                    range(args.start_iter, iters_per_epoch), dataloader):

                for key in input_data:
                    if key != 'roidb':  # roidb is a list of ndarrays with inconsistent length
                        input_data[key] = list(map(Variable, input_data[key]))

                training_stats.IterTic()
                net_outputs = maskRCNN(**input_data)
                training_stats.UpdateIterStats(net_outputs)
                loss = net_outputs['total_loss']
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                training_stats.IterToc()

                # Mid-epoch checkpointing at a fixed interval.
                if (args.step + 1) % ckpt_interval_per_epoch == 0:
                    net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer)

                if args.step % args.disp_interval == 0:
                    log_training_stats(training_stats, global_step, lr)

                global_step += 1
            # ---- End of epoch ----
            # save checkpoint
            net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer)
            # reset starting iter number after first epoch
            args.start_iter = 0

        # ---- Training ends ----
        if iters_per_epoch % args.disp_interval != 0:
            # log last stats at the end
            log_training_stats(training_stats, global_step, lr)

    except (RuntimeError, KeyboardInterrupt):
        # Best-effort checkpoint on crash/interrupt, then print the traceback.
        logger.info('Save ckpt on exception ...')
        net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer)
        logger.info('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
    finally:
        if args.use_tfboard and not args.no_save:
            tblogger.close()
def main():
    """Main function.

    Step-based (iteration-counted) training entry point: adapts batch size,
    lr, SOLVER.STEPS/MAX_ITER to the available GPUs and gradient-accumulation
    `iter_size`, builds the dataloader, model and optimizer (with a separate
    GroupNorm parameter group), optionally resumes, then trains for
    SOLVER.MAX_ITER steps with warm-up and stepwise lr decay.
    """
    args = parse_args()
    print('Called with args:')
    print(args)

    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the code.")

    if args.cuda or cfg.NUM_GPUS > 0:
        cfg.CUDA = True
    else:
        raise ValueError("Need Cuda device to run !")

    if args.dataset == "coco2017":
        cfg.TRAIN.DATASETS = ('coco_2017_train',)
        cfg.MODEL.NUM_CLASSES = 81
    elif args.dataset == "keypoints_coco2017":
        cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',)
        cfg.MODEL.NUM_CLASSES = 2
    else:
        raise ValueError("Unexpected args.dataset: {}".format(args.dataset))

    cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    ### Adaptively adjust some configs ###
    original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH
    original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH
    original_num_gpus = cfg.NUM_GPUS
    if args.batch_size is None:
        args.batch_size = original_batch_size
    cfg.NUM_GPUS = torch.cuda.device_count()
    assert (args.batch_size % cfg.NUM_GPUS) == 0, \
        'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS)
    cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS
    # effective batch = per-step batch * gradient-accumulation factor
    effective_batch_size = args.iter_size * args.batch_size
    print('effective_batch_size = batch_size * iter_size = %d * %d' %
          (args.batch_size, args.iter_size))

    print('Adaptive config changes:')
    print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size))
    print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS))
    print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH))

    ### Adjust learning based on batch size change linearly
    # For iter_size > 1, gradients are `accumulated`, so lr is scaled based
    # on batch_size instead of effective_batch_size
    old_base_lr = cfg.SOLVER.BASE_LR
    cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size
    print('Adjust BASE_LR linearly according to batch_size change:\n'
          ' BASE_LR: {} --> {}'.format(old_base_lr, cfg.SOLVER.BASE_LR))

    ### Adjust solver steps
    # Fewer effective samples per step => proportionally more steps.
    step_scale = original_batch_size / effective_batch_size
    old_solver_steps = cfg.SOLVER.STEPS
    old_max_iter = cfg.SOLVER.MAX_ITER
    cfg.SOLVER.STEPS = list(map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS))
    cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5)
    print('Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n'
          ' SOLVER.STEPS: {} --> {}\n'
          ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS,
                                               old_max_iter, cfg.SOLVER.MAX_ITER))

    # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function
    # of `collect_and_distribute_fpn_rpn_proposals.py`
    #
    # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5)
    if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN:
        cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch
        print('Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n'
              ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format(cfg.FPN.RPN_COLLECT_SCALE))

    if args.num_workers is not None:
        cfg.DATA_LOADER.NUM_THREADS = args.num_workers
    print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS)

    ### Overwrite some solver settings from command line arguments
    if args.optimizer is not None:
        cfg.SOLVER.TYPE = args.optimizer
    if args.lr is not None:
        cfg.SOLVER.BASE_LR = args.lr
    if args.lr_decay_gamma is not None:
        cfg.SOLVER.GAMMA = args.lr_decay_gamma
    assert_and_infer_cfg()

    timers = defaultdict(Timer)

    ### Dataset ###
    timers['roidb'].tic()
    roidb, ratio_list, ratio_index = combined_roidb_for_training(
        cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    timers['roidb'].toc()
    roidb_size = len(roidb)
    logger.info('{:d} roidb entries'.format(roidb_size))
    logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time)

    # Effective training sample size for one epoch
    train_size = roidb_size // args.batch_size * args.batch_size

    batchSampler = BatchSampler(
        sampler=MinibatchSampler(ratio_list, ratio_index),
        batch_size=args.batch_size,
        drop_last=True
    )
    dataset = RoiDataLoader(
        roidb,
        cfg.MODEL.NUM_CLASSES,
        training=True)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=batchSampler,
        num_workers=cfg.DATA_LOADER.NUM_THREADS,
        collate_fn=collate_minibatch)
    # Manually-advanced iterator so the loop below can restart it on
    # StopIteration (epochs are implicit in the step count).
    dataiterator = iter(dataloader)

    ### Model ###
    maskRCNN = Generalized_RCNN()

    if cfg.CUDA:
        maskRCNN.cuda()

    ### Optimizer ###
    # Collect GroupNorm affine parameter names so they can get their own
    # weight-decay setting (WEIGHT_DECAY_GN).
    gn_param_nameset = set()
    for name, module in maskRCNN.named_modules():
        if isinstance(module, nn.GroupNorm):
            gn_param_nameset.add(name+'.weight')
            gn_param_nameset.add(name+'.bias')
    gn_params = []
    gn_param_names = []
    bias_params = []
    bias_param_names = []
    nonbias_params = []
    nonbias_param_names = []
    nograd_param_names = []
    for key, value in maskRCNN.named_parameters():
        if value.requires_grad:
            if 'bias' in key:
                bias_params.append(value)
                bias_param_names.append(key)
            elif key in gn_param_nameset:
                gn_params.append(value)
                gn_param_names.append(key)
            else:
                nonbias_params.append(value)
                nonbias_param_names.append(key)
        else:
            nograd_param_names.append(key)
    # Sanity check: every trainable GN param landed in the GN group
    # ('bias' match wins first, so GN biases are in bias_params).
    assert (gn_param_nameset - set(nograd_param_names) - set(bias_param_names)) == set(gn_param_names)

    # Learning rate of 0 is a dummy value to be set properly at the start of training
    params = [
        {'params': nonbias_params,
         'lr': 0,
         'weight_decay': cfg.SOLVER.WEIGHT_DECAY},
        {'params': bias_params,
         'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1),
         'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0},
        {'params': gn_params,
         'lr': 0,
         'weight_decay': cfg.SOLVER.WEIGHT_DECAY_GN}
    ]
    # names of paramerters for each paramter
    param_names = [nonbias_param_names, bias_param_names, gn_param_names]

    if cfg.SOLVER.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM)
    elif cfg.SOLVER.TYPE == "Adam":
        optimizer = torch.optim.Adam(params)

    ### Load checkpoint
    if args.load_ckpt:
        load_name = args.load_ckpt
        logging.info("loading checkpoint %s", load_name)
        checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage)
        net_utils.load_ckpt(maskRCNN, checkpoint['model'])
        if args.resume:
            args.start_step = checkpoint['step'] + 1
            if 'train_size' in checkpoint:  # For backward compatibility
                if checkpoint['train_size'] != train_size:
                    print('train_size value: %d different from the one in checkpoint: %d'
                          % (train_size, checkpoint['train_size']))
            # reorder the params in optimizer checkpoint's params_groups if needed
            # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint)
            # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1.
            # However it's fixed on master.
            optimizer.load_state_dict(checkpoint['optimizer'])
            # misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer'])
        del checkpoint
        torch.cuda.empty_cache()

    if args.load_detectron:  #TODO resume for detectron weights (load sgd momentum values)
        logging.info("loading Detectron weights %s", args.load_detectron)
        load_detectron_weight(maskRCNN, args.load_detectron)

    lr = optimizer.param_groups[0]['lr']  # lr of non-bias parameters, for command line outputs.

    # Wrap AFTER checkpoint loading so state_dict keys match the bare model.
    maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'],
                                 minibatch=True)

    ### Training Setups ###
    args.run_name = misc_utils.get_run_name() + '_step'
    output_dir = misc_utils.get_output_dir(args, args.run_name)
    args.cfg_filename = os.path.basename(args.cfg_file)

    if not args.no_save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Persist the effective config and args alongside checkpoints.
        blob = {'cfg': yaml.dump(cfg), 'args': args}
        with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f:
            pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL)
        if args.use_tfboard:
            from tensorboardX import SummaryWriter
            # Set the Tensorboard logger
            tblogger = SummaryWriter(output_dir)

    ### Training Loop ###
    maskRCNN.train()

    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    # Set index for decay steps: first entry of SOLVER.STEPS not yet passed.
    decay_steps_ind = None
    for i in range(1, len(cfg.SOLVER.STEPS)):
        if cfg.SOLVER.STEPS[i] >= args.start_step:
            decay_steps_ind = i
            break
    if decay_steps_ind is None:
        decay_steps_ind = len(cfg.SOLVER.STEPS)

    training_stats = TrainingStats(
        args,
        args.disp_interval,
        tblogger if args.use_tfboard and not args.no_save else None)
    try:
        logger.info('Training starts !')
        step = args.start_step
        for step in range(args.start_step, cfg.SOLVER.MAX_ITER):

            # Warm up
            if step < cfg.SOLVER.WARM_UP_ITERS:
                method = cfg.SOLVER.WARM_UP_METHOD
                if method == 'constant':
                    warmup_factor = cfg.SOLVER.WARM_UP_FACTOR
                elif method == 'linear':
                    # Linearly interpolate WARM_UP_FACTOR -> 1 over warm-up.
                    alpha = step / cfg.SOLVER.WARM_UP_ITERS
                    warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha
                else:
                    raise KeyError('Unknown SOLVER.WARM_UP_METHOD: {}'.format(method))
                lr_new = cfg.SOLVER.BASE_LR * warmup_factor
                net_utils.update_learning_rate(optimizer, lr, lr_new)
                lr = optimizer.param_groups[0]['lr']
                assert lr == lr_new
            elif step == cfg.SOLVER.WARM_UP_ITERS:
                net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR)
                lr = optimizer.param_groups[0]['lr']
                assert lr == cfg.SOLVER.BASE_LR

            # Learning rate decay
            if decay_steps_ind < len(cfg.SOLVER.STEPS) and \
                    step == cfg.SOLVER.STEPS[decay_steps_ind]:
                logger.info('Decay the learning on step %d', step)
                lr_new = lr * cfg.SOLVER.GAMMA
                net_utils.update_learning_rate(optimizer, lr, lr_new)
                lr = optimizer.param_groups[0]['lr']
                assert lr == lr_new
                decay_steps_ind += 1

            training_stats.IterTic()
            optimizer.zero_grad()
            # Gradient accumulation: iter_size forward/backward passes per
            # optimizer step; iterator restarted when the dataset is exhausted.
            for inner_iter in range(args.iter_size):
                try:
                    input_data = next(dataiterator)
                except StopIteration:
                    dataiterator = iter(dataloader)
                    input_data = next(dataiterator)

                for key in input_data:
                    if key != 'roidb':  # roidb is a list of ndarrays with inconsistent length
                        input_data[key] = list(map(Variable, input_data[key]))

                net_outputs = maskRCNN(**input_data)
                training_stats.UpdateIterStats(net_outputs, inner_iter)
                loss = net_outputs['total_loss']
                loss.backward()
            optimizer.step()
            training_stats.IterToc()

            training_stats.LogIterStats(step, lr)

            if (step+1) % CHECKPOINT_PERIOD == 0:
                save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer)

        # ---- Training ends ----
        # Save last checkpoint
        save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer)

    except (RuntimeError, KeyboardInterrupt):
        # Drop the iterator first so its worker processes shut down, then
        # best-effort checkpoint and print the traceback.
        del dataiterator
        logger.info('Save ckpt on exception ...')
        save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer)
        logger.info('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)

    finally:
        if args.use_tfboard and not args.no_save:
            tblogger.close()
def main():
    """Train Mask R-CNN with a step-based (iteration-count) schedule.

    Parses CLI args, builds the roidb/dataloader, constructs the model and
    a two-group (bias / non-bias) optimizer, optionally restores a
    checkpoint, then runs the SGD loop with warm-up and stepwise LR decay.

    Fixes versus the previous revision:
      * the decay-index scan now ``break``s at the first qualifying step
        (it previously kept scanning and ended on the *last* one, skipping
        earlier decays after a resume), and uses ``>=`` so a resume exactly
        at a decay boundary still applies that decay;
      * at the end of warm-up the optimizer's param groups are synced back
        to the exact base LR instead of only updating the local ``lr``.
    """
    args = parse_args()
    print('Called with args:')
    print(args)

    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the code.")

    if args.cuda or cfg.NUM_GPUS > 0:
        cfg.CUDA = True
    else:
        raise ValueError("Need Cuda device to run !")

    # Dataset selection fixes the class count before the config file is read.
    if args.dataset == "coco2017":
        cfg.TRAIN.DATASETS = ('coco_2017_train', )
        cfg.MODEL.NUM_CLASSES = 81
    elif args.dataset == "keypoints_coco2017":
        cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train', )
        cfg.MODEL.NUM_CLASSES = 2
    else:
        raise ValueError("Unexpected args.dataset: {}".format(args.dataset))

    cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    ### Adaptively adjust some configs ###
    original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH
    if args.batch_size is None:
        args.batch_size = original_batch_size
    # Re-derive per-GPU batch size from the GPUs actually visible.
    cfg.NUM_GPUS = torch.cuda.device_count()
    assert (args.batch_size % cfg.NUM_GPUS) == 0, \
        'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS)
    cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS
    print('Batch size change from {} (in config file) to {}'.format(
        original_batch_size, args.batch_size))
    print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' %
          (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH))

    if args.num_workers is not None:
        cfg.DATA_LOADER.NUM_THREADS = args.num_workers
    print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS)

    ### Adjust learning rate based on batch size change, linearly.
    old_base_lr = cfg.SOLVER.BASE_LR
    cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size
    print('Adjust BASE_LR linearly according to batch size change: {} --> {}'.
          format(old_base_lr, cfg.SOLVER.BASE_LR))

    ### Overwrite some solver settings from command line arguments
    if args.optimizer is not None:
        cfg.SOLVER.TYPE = args.optimizer
    if args.lr is not None:
        cfg.SOLVER.BASE_LR = args.lr
    if args.lr_decay_gamma is not None:
        cfg.SOLVER.GAMMA = args.lr_decay_gamma

    assert_and_infer_cfg()

    timers = defaultdict(Timer)

    ### Dataset ###
    timers['roidb'].tic()
    roidb, ratio_list, ratio_index = combined_roidb_for_training(
        cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    timers['roidb'].toc()
    roidb_size = len(roidb)
    logger.info('{:d} roidb entries'.format(roidb_size))
    logger.info('Takes %.2f sec(s) to construct roidb',
                timers['roidb'].average_time)

    # Effective training sample size for one epoch (full batches only).
    train_size = roidb_size // args.batch_size * args.batch_size

    sampler = MinibatchSampler(ratio_list, ratio_index)
    dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        drop_last=True,
        sampler=sampler,
        num_workers=cfg.DATA_LOADER.NUM_THREADS,
        collate_fn=collate_minibatch)
    dataiterator = iter(dataloader)

    ### Model ###
    maskRCNN = Generalized_RCNN()
    if cfg.CUDA:
        maskRCNN.cuda()

    ### Optimizer ###
    # Bias parameters get their own group so they can use a doubled LR and
    # optionally no weight decay, as in the original Detectron.
    bias_params = []
    nonbias_params = []
    for key, value in dict(maskRCNN.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                bias_params.append(value)
            else:
                nonbias_params.append(value)
    # Learning rate of 0 is a dummy value to be set properly at the start of training
    params = [{
        'params': nonbias_params,
        'lr': 0,
        'weight_decay': cfg.SOLVER.WEIGHT_DECAY
    }, {
        'params': bias_params,
        'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1),
        'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0
    }]

    if cfg.SOLVER.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM)
    elif cfg.SOLVER.TYPE == "Adam":
        optimizer = torch.optim.Adam(params)

    ### Load checkpoint
    if args.load_ckpt:
        load_name = args.load_ckpt
        logging.info("loading checkpoint %s", load_name)
        # map_location keeps the load on CPU regardless of saving device.
        checkpoint = torch.load(load_name,
                                map_location=lambda storage, loc: storage)
        net_utils.load_ckpt(maskRCNN, checkpoint['model'])
        if args.resume:
            args.start_step = checkpoint['step'] + 1
            assert checkpoint['train_size'] == train_size
            # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1.
            # However it's fixed on master.
            # optimizer.load_state_dict(checkpoint['optimizer'])
            misc_utils.load_optimizer_state_dict(optimizer,
                                                 checkpoint['optimizer'])
        del checkpoint
        torch.cuda.empty_cache()

    if args.load_detectron:  #TODO resume for detectron weights (load sgd momentum values)
        logging.info("loading Detectron weights %s", args.load_detectron)
        load_detectron_weight(maskRCNN, args.load_detectron)

    lr = optimizer.param_groups[0][
        'lr']  # lr of non-bias parameters, for commmand line outputs.

    maskRCNN = mynn.DataParallel(maskRCNN,
                                 cpu_keywords=['im_info', 'roidb'],
                                 minibatch=True)

    ### Training Setups ###
    run_name = misc_utils.get_run_name()
    output_dir = misc_utils.get_output_dir(args, run_name)
    if not args.no_save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        blob = {'cfg': yaml.dump(cfg), 'args': args}
        with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f:
            pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL)
        if args.use_tfboard:
            from tensorboardX import SummaryWriter
            # Set the Tensorboard logger
            tblogger = SummaryWriter(output_dir)

    ### Training Loop ###
    maskRCNN.train()

    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    # Find the first decay step that is still ahead of (or at) start_step.
    # FIX: break at the first match; without it the loop ended on the LAST
    # matching index and all earlier decays were skipped after a resume.
    decay_steps_ind = None
    for i in range(1, len(cfg.SOLVER.STEPS)):
        if cfg.SOLVER.STEPS[i] >= args.start_step:
            decay_steps_ind = i
            break
    if decay_steps_ind is None:
        decay_steps_ind = len(cfg.SOLVER.STEPS)

    logger.info('Training starts !')
    loss_avg = 0
    try:
        timers['train_loop'].tic()
        for step in range(args.start_step, cfg.SOLVER.MAX_ITER):

            # Warm up
            if step < cfg.SOLVER.WARM_UP_ITERS:
                method = cfg.SOLVER.WARM_UP_METHOD
                if method == 'constant':
                    warmup_factor = cfg.SOLVER.WARM_UP_FACTOR
                elif method == 'linear':
                    alpha = step / cfg.SOLVER.WARM_UP_ITERS
                    warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha
                else:
                    raise KeyError(
                        'Unknown SOLVER.WARM_UP_METHOD: {}'.format(method))
                lr_new = cfg.SOLVER.BASE_LR * warmup_factor
                net_utils.update_learning_rate(optimizer, lr, lr_new)
                lr = lr_new
            elif step == cfg.SOLVER.WARM_UP_ITERS:
                # FIX: also push the exact base LR into the optimizer's param
                # groups; only rebinding the local `lr` left the optimizer at
                # the last warm-up value for the rest of training.
                net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR)
                lr = cfg.SOLVER.BASE_LR

            # Learning rate decay
            if decay_steps_ind < len(cfg.SOLVER.STEPS) and \
                    step == cfg.SOLVER.STEPS[decay_steps_ind]:
                logger.info('Decay the learning on step %d', step)
                lr_new = lr * cfg.SOLVER.GAMMA
                net_utils.update_learning_rate(optimizer, lr, lr_new)
                lr = lr_new
                decay_steps_ind += 1

            # The sampler is finite; restart the iterator on exhaustion so the
            # step-based loop can run for an arbitrary number of iterations.
            try:
                input_data = next(dataiterator)
            except StopIteration:
                dataiterator = iter(dataloader)
                input_data = next(dataiterator)

            for key in input_data:
                if key != 'roidb':  # roidb is a list of ndarrays with inconsistent length
                    input_data[key] = list(map(Variable, input_data[key]))

            outputs = maskRCNN(**input_data)

            rois_label = outputs['rois_label']
            cls_score = outputs['cls_score']
            bbox_pred = outputs['bbox_pred']
            # .mean() reduces the per-GPU losses from DataParallel.
            loss_rpn_cls = outputs['loss_rpn_cls'].mean()
            loss_rpn_bbox = outputs['loss_rpn_bbox'].mean()
            loss_rcnn_cls = outputs['loss_rcnn_cls'].mean()
            loss_rcnn_bbox = outputs['loss_rcnn_bbox'].mean()

            loss = loss_rpn_cls + loss_rpn_bbox + loss_rcnn_cls + loss_rcnn_bbox

            if cfg.MODEL.MASK_ON:
                loss_rcnn_mask = outputs['loss_rcnn_mask'].mean()
                loss += loss_rcnn_mask

            if cfg.MODEL.KEYPOINTS_ON:
                loss_rcnn_keypoints = outputs['loss_rcnn_keypoints'].mean()
                loss += loss_rcnn_keypoints

            loss_avg += loss.data.cpu().numpy()[0]

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (step + 1) % CHECKPOINT_PERIOD == 0:
                save_ckpt(output_dir, args, step, train_size, maskRCNN,
                          optimizer)

            if ((step % args.disp_interval == 0 and
                 (step - args.start_step >= args.disp_interval)) or
                    step == cfg.SOLVER.MAX_ITER - 1):
                diff = timers['train_loop'].toc(average=False)
                loss_avg /= args.disp_interval

                # Convert 0-dim tensors to Python floats (PyTorch 0.3 style).
                loss_rpn_cls = loss_rpn_cls.data[0]
                loss_rpn_bbox = loss_rpn_bbox.data[0]
                loss_rcnn_cls = loss_rcnn_cls.data[0]
                loss_rcnn_bbox = loss_rcnn_bbox.data[0]
                fg_cnt = torch.sum(rois_label.data.ne(0))
                bg_cnt = rois_label.data.numel() - fg_cnt
                print("[ %s ][ step %d ]" % (run_name, step))
                print("\t\tloss: %.4f, lr: %.2e" % (loss_avg, lr))
                print("\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, diff))
                print(
                    "\t\trpn_cls: %.4f, rpn_bbox: %.4f, rcnn_cls: %.4f, rcnn_bbox %.4f"
                    % (loss_rpn_cls, loss_rpn_bbox, loss_rcnn_cls, loss_rcnn_bbox))

                print_prefix = "\t\t"
                if cfg.MODEL.MASK_ON:
                    loss_rcnn_mask = loss_rcnn_mask.data[0]
                    print("%srcnn_mask %.4f" % (print_prefix, loss_rcnn_mask))
                    print_prefix = ", "
                if cfg.MODEL.KEYPOINTS_ON:
                    loss_rcnn_keypoints = loss_rcnn_keypoints.data[0]
                    print("%srcnn_keypoints %.4f" %
                          (print_prefix, loss_rcnn_keypoints))

                if args.use_tfboard and not args.no_save:
                    info = {
                        'lr': lr,
                        'loss': loss_avg,
                        'loss_rpn_cls': loss_rpn_cls,
                        'loss_rpn_box': loss_rpn_bbox,
                        'loss_rcnn_cls': loss_rcnn_cls,
                        'loss_rcnn_box': loss_rcnn_bbox,
                    }
                    if cfg.MODEL.MASK_ON:
                        info['loss_rcnn_mask'] = loss_rcnn_mask
                    if cfg.MODEL.KEYPOINTS_ON:
                        info['loss_rcnn_keypoints'] = loss_rcnn_keypoints
                    for tag, value in info.items():
                        tblogger.add_scalar(tag, value, step)

                loss_avg = 0
                timers['train_loop'].tic()

        # Save last checkpoint
        save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer)

    except (RuntimeError, KeyboardInterrupt) as e:
        print('Save on exception:', e)
        save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer)
        stack_trace = traceback.format_exc()
        print(stack_trace)

    finally:
        # ---- Training ends ----
        if args.use_tfboard and not args.no_save:
            tblogger.close()
def main():
    """Train the DispSeg (semantic-segmentation + disparity) model.

    Builds the semseg/disparity roidb and dataloader, creates the DispSeg
    network with separate optimizers for its ``pspnet`` and ``glassGCN``
    submodules, optionally restores a checkpoint, and runs an epoch-based
    training loop with per-epoch LR decay.

    Fixes versus the previous revision: the checkpoint-load path referenced
    the undefined names ``pspnet``, ``segdisp3d`` and ``optimizer`` and
    therefore raised NameError whenever ``--load_ckpt`` was given; it now
    loads the model state into the composite ``dispSeg`` module and the
    optimizer state into ``optimizerS`` (the optimizer whose state
    ``net_utils.save_ckpt`` below actually stores).
    """
    args = parse_args()
    print('Called with args:')
    print(args)

    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the code.")

    if args.cuda or cfg.NUM_GPUS > 0:
        # set gpu device
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            [str(ids) for ids in args.device_ids])
        torch.backends.cudnn.benchmark = True
        cfg.CUDA = True
    else:
        raise ValueError("Need Cuda device to run !")

    if args.dataset == "coco2017":
        cfg.TRAIN.DATASETS = ('coco_2017_train', )
        cfg.MODEL.NUM_CLASSES = 81
    elif args.dataset == "keypoints_coco2017":
        cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train', )
        cfg.MODEL.NUM_CLASSES = 2
    elif args.dataset == "cityscapes":
        cfg.TRAIN.DATASETS = ('cityscapes_semseg_train', )
        cfg.MODEL.NUM_CLASSES = 19
    else:
        raise ValueError("Unexpected args.dataset: {}".format(args.dataset))

    cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    ### Adaptively adjust some configs ###
    original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH
    if args.batch_size is None:
        args.batch_size = original_batch_size
    cfg.NUM_GPUS = torch.cuda.device_count()
    assert (args.batch_size % cfg.NUM_GPUS) == 0, \
        'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS)
    cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS
    print('Batch size change from {} (in config file) to {}'.format(
        original_batch_size, args.batch_size))
    print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' %
          (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH))

    if args.num_workers is not None:
        cfg.DATA_LOADER.NUM_THREADS = args.num_workers
    print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS)

    ### Adjust learning rate based on batch size change, linearly.
    old_base_lr = cfg.SOLVER.BASE_LR
    cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size
    print('Adjust BASE_LR linearly according to batch size change: {} --> {}'.
          format(old_base_lr, cfg.SOLVER.BASE_LR))

    ### Overwrite some solver settings from command line arguments
    if args.optimizer is not None:
        cfg.SOLVER.TYPE = args.optimizer
    if args.lr is not None:
        cfg.SOLVER.BASE_LR = args.lr
    if args.lr_decay_gamma is not None:
        cfg.SOLVER.GAMMA = args.lr_decay_gamma

    timers = defaultdict(Timer)

    ### Dataset ###
    timers['roidb'].tic()
    # Semseg/disparity training uses its own roidb builder and collate fn.
    if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON:
        roidb, ratio_list, ratio_index = combined_roidb_for_training_semseg(
            cfg.TRAIN.DATASETS)
    else:
        roidb, ratio_list, ratio_index = combined_roidb_for_training(
            cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    timers['roidb'].toc()
    train_size = len(roidb)
    logger.info('{:d} roidb entries'.format(train_size))
    logger.info('Takes %.2f sec(s) to construct roidb',
                timers['roidb'].average_time)

    sampler = MinibatchSampler(ratio_list, ratio_index)
    dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        sampler=sampler,
        num_workers=cfg.DATA_LOADER.NUM_THREADS,
        collate_fn=collate_minibatch_semseg
        if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON else collate_minibatch)

    assert_and_infer_cfg()

    ### Model ###
    dispSeg = DispSeg()
    if cfg.CUDA:
        dispSeg.to('cuda')

    ### Optimizers: one per submodule, each with bias/non-bias groups. ###
    pspnet_bias_params = []
    pspnet_nonbias_params = []
    for key, value in dict(dispSeg.pspnet.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                pspnet_bias_params.append(value)
            else:
                pspnet_nonbias_params.append(value)
    pspnet_params = [{
        'params': pspnet_nonbias_params,
        'lr': cfg.SOLVER.BASE_LR,
        'weight_decay': cfg.SOLVER.WEIGHT_DECAY
    }, {
        'params': pspnet_bias_params,
        'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1),
        'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0
    }]

    glassGCN_bias_params = []
    glassGCN_nonbias_params = []
    for key, value in dict(dispSeg.glassGCN.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                glassGCN_bias_params.append(value)
            else:
                glassGCN_nonbias_params.append(value)
    segdisp3d_params = [{
        'params': glassGCN_nonbias_params,
        'lr': cfg.SOLVER.BASE_LR,
        'weight_decay': cfg.SOLVER.WEIGHT_DECAY
    }, {
        'params': glassGCN_bias_params,
        'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1),
        'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0
    }]

    if cfg.SOLVER.TYPE == "SGD":
        optimizerP = torch.optim.SGD(pspnet_params,
                                     momentum=cfg.SOLVER.MOMENTUM)
        optimizerS = torch.optim.SGD(segdisp3d_params,
                                     momentum=cfg.SOLVER.MOMENTUM)
    elif cfg.SOLVER.TYPE == "Adam":
        optimizerP = torch.optim.Adam(pspnet_params)
        optimizerS = torch.optim.Adam(segdisp3d_params)

    ### Load checkpoint
    if args.load_ckpt:
        load_name = args.load_ckpt
        logging.info("loading checkpoint %s", load_name)
        checkpoint = torch.load(load_name,
                                map_location=lambda storage, loc: storage)
        # FIX: the checkpoint stores the whole DispSeg model (save_ckpt below
        # is called with `dispSeg`), so load it into the composite module.
        # The previous code referenced undefined `pspnet` / `segdisp3d`.
        net_utils.load_ckpt(dispSeg, checkpoint['model'])
        if args.resume:
            assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \
                "iters_per_epoch should match for resume"
            # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1.
            # However it's fixed on master.
            # FIX: only optimizerS's state is saved (see save_ckpt calls
            # below); the previous code referenced an undefined `optimizer`.
            # NOTE(review): optimizerP's momentum state is NOT restored —
            # confirm whether it should also be checkpointed.
            misc_utils.load_optimizer_state_dict(optimizerS,
                                                 checkpoint['optimizer'])
            if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1):
                # Resume from end of an epoch
                args.start_epoch = checkpoint['epoch'] + 1
                args.start_iter = 0
            else:
                # Resume from the middle of an epoch.
                # NOTE: dataloader is not synced with previous state
                args.start_epoch = checkpoint['epoch']
                args.start_iter = checkpoint['step'] + 1
        del checkpoint
        torch.cuda.empty_cache()

    lr = optimizerP.param_groups[0][
        'lr']  # lr of non-bias parameters, for commmand line outputs.

    dispSeg = mynn.DataParallel(dispSeg,
                                cpu_keywords=['im_info', 'roidb'],
                                minibatch=True)

    ### Training Setups ###
    args.run_name = misc_utils.get_run_name()
    output_dir = misc_utils.get_output_dir(args, args.run_name)
    args.cfg_filename = os.path.basename(args.cfg_file)

    if not args.no_save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        blob = {'cfg': yaml.dump(cfg), 'args': args}
        with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f:
            pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL)
        if args.use_tfboard:
            # Set the Tensorboard logger
            tblogger = SummaryWriter(output_dir)

    ### Training Loop ###
    dispSeg.train()

    training_stats = TrainingStats(
        args, args.disp_interval,
        tblogger if args.use_tfboard and not args.no_save else None)

    iters_per_epoch = int(train_size / args.batch_size)  # drop last
    args.iters_per_epoch = iters_per_epoch
    ckpt_interval_per_epoch = iters_per_epoch // args.ckpt_num_per_epoch

    try:
        logger.info('Training starts !')
        args.step = args.start_iter
        global_step = iters_per_epoch * args.start_epoch + args.step
        for args.epoch in range(args.start_epoch,
                                args.start_epoch + args.num_epochs):
            # ---- Start of epoch ----

            # adjust learning rate: decay once at the start of each epoch
            # listed in --lr_decay_epochs (skipped on mid-epoch resume).
            if args.lr_decay_epochs and args.epoch == args.lr_decay_epochs[
                    0] and args.start_iter == 0:
                args.lr_decay_epochs.pop(0)
                net_utils.decay_learning_rate(optimizerP, lr, cfg.SOLVER.GAMMA)
                net_utils.decay_learning_rate(optimizerS, lr, cfg.SOLVER.GAMMA)
                lr *= cfg.SOLVER.GAMMA

            for args.step, input_data in zip(
                    range(args.start_iter, iters_per_epoch), dataloader):

                if cfg.DISP.DISP_ON:
                    # Stack left/right images along the batch dimension.
                    input_data['data'] = list(
                        map(lambda x, y: torch.cat((x, y), dim=0),
                            input_data['data'], input_data['data_R']))
                    if cfg.SEM.DECODER_TYPE.endswith('3Ddeepsup'):
                        input_data['disp_scans'] = torch.arange(
                            0, cfg.DISP.MAX_DISPLACEMENT).float().view(
                                1, cfg.DISP.MAX_DISPLACEMENT, 1,
                                1).repeat(args.batch_size, 1, 1, 1)
                        input_data['semseg_scans'] = torch.arange(
                            0, cfg.MODEL.NUM_CLASSES).long().view(
                                1, cfg.MODEL.NUM_CLASSES, 1,
                                1).repeat(args.batch_size, 1, 1, 1)
                    del input_data['data_R']

                for key in input_data:
                    if key != 'roidb':  # roidb is a list of ndarrays with inconsistent length
                        input_data[key] = list(
                            map(
                                lambda x: Variable(x, requires_grad=False).to(
                                    'cuda'), input_data[key]))

                training_stats.IterTic()
                net_outputs = dispSeg(**input_data)
                training_stats.UpdateIterStats(net_outputs)
                loss = net_outputs['total_loss']

                # Both optimizers step on the same backward pass.
                optimizerP.zero_grad()
                optimizerS.zero_grad()
                loss.backward()
                optimizerP.step()
                optimizerS.step()

                training_stats.IterToc()

                if args.step % args.disp_interval == 0:
                    log_training_stats(training_stats, global_step, lr)

                global_step += 1
            # ---- End of epoch ----
            # save checkpoint
            net_utils.save_ckpt(output_dir, args, dispSeg, optimizerS)
            # reset starting iter number after first epoch
            args.start_iter = 0

        # ---- Training ends ----

    except (RuntimeError, KeyboardInterrupt):
        logger.info('Save ckpt on exception ...')
        net_utils.save_ckpt(output_dir, args, dispSeg, optimizerS)
        logger.info('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)

    finally:
        if args.use_tfboard and not args.no_save:
            tblogger.close()
def main():
    """Train Mask R-CNN with an epoch-based schedule.

    Parses CLI args, builds the roidb/dataloader, constructs the model and a
    two-group (bias / non-bias) optimizer, optionally restores a checkpoint,
    then trains for ``args.num_epochs`` epochs with per-epoch LR decay and
    periodic console / TensorBoard logging.

    Fix versus the previous revision: ``tblogger`` is only created when
    ``--use_tfboard`` is given WITHOUT ``--no_save``, but the logging branch
    and the ``finally`` clause tested only ``args.use_tfboard`` — raising
    NameError when both flags were set. Both sites now use
    ``args.use_tfboard and not args.no_save``, matching the step-based
    variant of this script.
    """
    args = parse_args()
    print('Called with args:')
    print(args)

    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the code.")

    if args.cuda or cfg.NUM_GPUS > 0:
        cfg.CUDA = True
    else:
        raise ValueError("Need Cuda device to run !")

    if args.dataset == "coco2017":
        cfg.TRAIN.DATASETS = ('coco_2017_train',)
    elif args.dataset == "keypoints_coco2017":
        cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',)
    else:
        raise ValueError("Unexpected args.dataset: {}".format(args.dataset))

    cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    ### Adaptively adjust some configs ###
    original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH
    if args.batch_size is None:
        args.batch_size = original_batch_size
    cfg.NUM_GPUS = torch.cuda.device_count()
    assert (args.batch_size % cfg.NUM_GPUS) == 0, \
        'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS)
    cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS
    print('Batch size change from {} (in config file) to {}'.format(
        original_batch_size, args.batch_size))
    print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' %
          (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH))

    if args.num_workers is not None:
        cfg.DATA_LOADER.NUM_THREADS = args.num_workers
    print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS)

    ### Adjust learning rate based on batch size change, linearly.
    old_base_lr = cfg.SOLVER.BASE_LR
    cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size
    print('Adjust BASE_LR linearly according to batch size change: {} --> {}'.format(
        old_base_lr, cfg.SOLVER.BASE_LR))

    ### Overwrite some solver settings from command line arguments
    if args.optimizer is not None:
        cfg.SOLVER.TYPE = args.optimizer
    if args.lr is not None:
        cfg.SOLVER.BASE_LR = args.lr
    if args.lr_decay_gamma is not None:
        cfg.SOLVER.GAMMA = args.lr_decay_gamma

    args.mGPUs = (cfg.NUM_GPUS > 1)

    timers = defaultdict(Timer)

    ### Dataset ###
    timers['roidb'].tic()
    roidb, ratio_list, ratio_index = combined_roidb_for_training(
        cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    timers['roidb'].toc()
    train_size = len(roidb)
    logger.info('{:d} roidb entries'.format(train_size))
    logger.info('Takes %.2f sec(s) to construct roidb',
                timers['roidb'].average_time)

    sampler = MinibatchSampler(ratio_list, ratio_index)
    dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        sampler=sampler,
        num_workers=cfg.DATA_LOADER.NUM_THREADS,
        collate_fn=collate_minibatch)

    assert_and_infer_cfg()

    ### Model ###
    maskRCNN = Generalized_RCNN()

    if cfg.CUDA:
        maskRCNN.cuda()

    ### Optimizer ###
    # Bias parameters get their own group (doubled LR, optional no decay).
    bias_params = []
    nonbias_params = []
    for key, value in dict(maskRCNN.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                bias_params.append(value)
            else:
                nonbias_params.append(value)
    params = [
        {'params': nonbias_params,
         'lr': cfg.SOLVER.BASE_LR,
         'weight_decay': cfg.SOLVER.WEIGHT_DECAY},
        {'params': bias_params,
         'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1),
         'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0}
    ]

    if cfg.SOLVER.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM)
    elif cfg.SOLVER.TYPE == "Adam":
        optimizer = torch.optim.Adam(params)

    ### Load checkpoint
    if args.load_ckpt:
        load_name = args.load_ckpt
        logging.info("loading checkpoint %s", load_name)
        checkpoint = torch.load(load_name,
                                map_location=lambda storage, loc: storage)
        net_utils.load_ckpt(maskRCNN, checkpoint['model'])
        if args.resume:
            assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \
                "iters_per_epoch should match for resume"
            # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1.
            # However it's fixed on master.
            # optimizer.load_state_dict(checkpoint['optimizer'])
            misc_utils.load_optimizer_state_dict(optimizer,
                                                 checkpoint['optimizer'])
            if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1):
                # Resume from end of an epoch
                args.start_epoch = checkpoint['epoch'] + 1
                args.start_iter = 0
            else:
                # Resume from the middle of an epoch.
                # NOTE: dataloader is not synced with previous state
                args.start_epoch = checkpoint['epoch']
                args.start_iter = checkpoint['step'] + 1
        del checkpoint
        torch.cuda.empty_cache()

    if args.load_detectron:  #TODO resume for detectron weights (load sgd momentum values)
        logging.info("loading Detectron weights %s", args.load_detectron)
        load_detectron_weight(maskRCNN, args.load_detectron)

    lr = optimizer.param_groups[0][
        'lr']  # lr of non-bias parameters, for commmand line outputs.

    maskRCNN = mynn.DataParallel(maskRCNN,
                                 cpu_keywords=['im_info', 'roidb'],
                                 minibatch=True)

    ### Training Setups ###
    run_name = misc_utils.get_run_name()
    output_dir = misc_utils.get_output_dir(args, run_name)

    if not args.no_save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        blob = {'cfg': yaml.dump(cfg), 'args': args}
        with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f:
            pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL)
        if args.use_tfboard:
            from tensorboardX import SummaryWriter
            # Set the Tensorboard logger
            tblogger = SummaryWriter(output_dir)

    ### Training Loop ###
    maskRCNN.train()

    iters_per_epoch = int(train_size / args.batch_size)  # drop last
    ckpt_interval_per_epoch = iters_per_epoch // args.ckpt_num_per_epoch
    step = 0
    try:
        logger.info('Training starts !')
        for epoch in range(args.start_epoch, args.start_epoch + args.num_epochs):
            # ---- Start of epoch ----
            loss_avg = 0
            timers['train_loop'].tic()

            # adjust learning rate once at the start of each decay epoch
            # (skipped when resuming mid-epoch).
            if args.lr_decay_epochs and epoch == args.lr_decay_epochs[0] \
                    and args.start_iter == 0:
                args.lr_decay_epochs.pop(0)
                net_utils.decay_learning_rate(optimizer, lr, cfg.SOLVER.GAMMA)
                lr *= cfg.SOLVER.GAMMA

            for step, input_data in zip(range(args.start_iter, iters_per_epoch),
                                        dataloader):

                for key in input_data:
                    if key != 'roidb':  # roidb is a list of ndarrays with inconsistent length
                        input_data[key] = list(map(Variable, input_data[key]))

                outputs = maskRCNN(**input_data)

                rois_label = outputs['rois_label']
                cls_score = outputs['cls_score']
                bbox_pred = outputs['bbox_pred']
                # .mean() reduces the per-GPU losses from DataParallel.
                loss_rpn_cls = outputs['loss_rpn_cls'].mean()
                loss_rpn_bbox = outputs['loss_rpn_bbox'].mean()
                loss_rcnn_cls = outputs['loss_rcnn_cls'].mean()
                loss_rcnn_bbox = outputs['loss_rcnn_bbox'].mean()

                loss = loss_rpn_cls + loss_rpn_bbox + loss_rcnn_cls + loss_rcnn_bbox

                if cfg.MODEL.MASK_ON:
                    loss_rcnn_mask = outputs['loss_rcnn_mask'].mean()
                    loss += loss_rcnn_mask

                if cfg.MODEL.KEYPOINTS_ON:
                    loss_rcnn_keypoints = outputs['loss_rcnn_keypoints'].mean()
                    loss += loss_rcnn_keypoints

                loss_avg += loss.data.cpu().numpy()[0]

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if (step+1) % ckpt_interval_per_epoch == 0:
                    net_utils.save_ckpt(output_dir, args, epoch, step,
                                        maskRCNN, optimizer, iters_per_epoch)

                if (step+1) % args.disp_interval == 0:
                    if (step + 1 - args.start_iter) >= args.disp_interval:  # for the case of resume
                        diff = timers['train_loop'].toc(average=False)
                        loss_avg /= args.disp_interval

                        # Convert 0-dim tensors to floats (PyTorch 0.3 style).
                        loss_rpn_cls = loss_rpn_cls.data[0]
                        loss_rpn_bbox = loss_rpn_bbox.data[0]
                        loss_rcnn_cls = loss_rcnn_cls.data[0]
                        loss_rcnn_bbox = loss_rcnn_bbox.data[0]
                        fg_cnt = torch.sum(rois_label.data.ne(0))
                        bg_cnt = rois_label.data.numel() - fg_cnt
                        print("[%s][epoch %2d][iter %4d / %4d]"
                              % (run_name, epoch, step, iters_per_epoch))
                        print("\t\tloss: %.4f, lr: %.2e" % (loss_avg, lr))
                        print("\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, diff))
                        print("\t\trpn_cls: %.4f, rpn_bbox: %.4f, rcnn_cls: %.4f, rcnn_bbox %.4f"
                              % (loss_rpn_cls, loss_rpn_bbox, loss_rcnn_cls, loss_rcnn_bbox))

                        print_prefix = "\t\t"
                        if cfg.MODEL.MASK_ON:
                            loss_rcnn_mask = loss_rcnn_mask.data[0]
                            print("%srcnn_mask %.4f" % (print_prefix, loss_rcnn_mask))
                            print_prefix = ", "
                        if cfg.MODEL.KEYPOINTS_ON:
                            loss_rcnn_keypoints = loss_rcnn_keypoints.data[0]
                            print("%srcnn_keypoints %.4f"
                                  % (print_prefix, loss_rcnn_keypoints))

                        # FIX: tblogger only exists when saving is enabled;
                        # testing use_tfboard alone raised NameError under
                        # --no_save.
                        if args.use_tfboard and not args.no_save:
                            info = {
                                'loss': loss_avg,
                                'loss_rpn_cls': loss_rpn_cls,
                                'loss_rpn_box': loss_rpn_bbox,
                                'loss_rcnn_cls': loss_rcnn_cls,
                                'loss_rcnn_box': loss_rcnn_bbox,
                            }
                            if cfg.MODEL.MASK_ON:
                                info['loss_rcnn_mask'] = loss_rcnn_mask
                            if cfg.MODEL.KEYPOINTS_ON:
                                info['loss_rcnn_keypoints'] = loss_rcnn_keypoints
                            for tag, value in info.items():
                                tblogger.add_scalar(tag, value,
                                                    iters_per_epoch * epoch + step)

                        loss_avg = 0
                        timers['train_loop'].tic()

            # ---- End of epoch ----
            # save checkpoint
            net_utils.save_ckpt(output_dir, args, epoch, step, maskRCNN,
                                optimizer, iters_per_epoch)
            # reset timer
            timers['train_loop'].reset()
            # reset starting iter number after first epoch
            args.start_iter = 0

    except (RuntimeError, KeyboardInterrupt) as e:
        print('Save on exception:', e)
        net_utils.save_ckpt(output_dir, args, epoch, step, maskRCNN,
                            optimizer, iters_per_epoch)
        stack_trace = traceback.format_exc()
        print(stack_trace)

    finally:
        # ---- Training ends ----
        # FIX: same guard as above — tblogger may not exist under --no_save.
        if args.use_tfboard and not args.no_save:
            tblogger.close()
def main():
    # Step-based trainer variant: model is resolved dynamically from
    # args.model, LR scheduling is delegated to OptimizerHandler, and
    # gradients are accumulated over cfg.TRAIN.ITERATION_SIZE minibatches
    # per optimizer step.
    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the training code, sry bro :(.")
    else:
        cfg.CUDA = True
        cfg.NUM_GPUS = torch.cuda.device_count()

    #######~~~.Parameters stuff.~~~#######
    args = parse_args()
    print('Called with args:\n', args)

    # Enables fixed seed (numpy, CPU and all CUDA devices) for
    # reproducibility; cudnn.deterministic trades speed for repeatability.
    if args.fixed_seed:
        np.random.seed(cfg.RNG_SEED)
        torch.manual_seed(cfg.RNG_SEED)
        if cfg.CUDA:
            torch.cuda.manual_seed_all(cfg.RNG_SEED)
        torch.backends.cudnn.deterministic = True

    cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)
    check_and_overwrite_params(cfg, args)
    assert_and_infer_cfg()

    # Identification of this specific run.
    args.run_name = misc_utils.get_run_name() + '_step'
    output_dir = misc_utils.get_output_dir(args)
    save_training_config(output_dir, cfg, args)

    if args.use_tfboard:
        from tensorboardX import SummaryWriter
        # Set the Tensorboard logger
        tblogger = SummaryWriter(output_dir)
    else:
        tblogger = None

    #######~~~.Dataset.~~~#######
    timers = defaultdict(Timer)
    # Roi datasets
    timers['roidb'].tic()
    roidb, ratio_list, ratio_index = combined_roidb_for_training(
        cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    timers['roidb'].toc()
    roidb_size = len(roidb)
    logger.info('{:d} roidb entries'.format(roidb_size))
    logger.info('Takes %.2f sec(s) to construct roidb',
                timers['roidb'].average_time)

    # BatchSampler wraps the ratio-ordered sampler; drop_last keeps every
    # batch at exactly args.batch_size.
    batchSampler = BatchSampler(sampler=MinibatchSampler(
        ratio_list, ratio_index),
                                batch_size=args.batch_size,
                                drop_last=True)
    dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=batchSampler,
        num_workers=cfg.DATA_LOADER.NUM_THREADS,
        collate_fn=collate_minibatch)
    dataiterator = iter(dataloader)

    ########~~~.Model.~~~#######
    # NOTE(review): eval() on a CLI-supplied string executes arbitrary code;
    # args.model should be validated against a whitelist of model names.
    model = eval(args.model).loot_model(args)
    if cfg.CUDA:
        model.cuda()
    optimizer = OptimizerHandler(model, cfg)
    load_ckpt(args.load_ckpt, model, optimizer)

    #######~~~.Training Loop.~~~#######
    try:
        # Effective training sample size for one epoch
        train_size = roidb_size // args.batch_size * args.batch_size
        CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS /
                                (cfg.NUM_GPUS * cfg.TRAIN.ITERATION_SIZE))
        training_stats = TrainingStats(args, tblogger)
        model.train()
        logger.info('Training starts !')
        step = args.start_step
        for step in range(args.start_step, cfg.SOLVER.MAX_ITER):
            # LR schedule is handled inside OptimizerHandler.
            optimizer.update_learning_rate(step)
            training_stats.IterTic()
            # Gradients are accumulated across the inner loop; a single
            # optimizer step is taken per outer iteration.
            optimizer.zero_grad()
            for inner_iter in range(cfg.TRAIN.ITERATION_SIZE):
                # The sampler is finite; restart the iterator on exhaustion.
                try:
                    input_data = next(dataiterator)
                except StopIteration:
                    dataiterator = iter(dataloader)
                    input_data = next(dataiterator)

                for key in input_data:
                    if key != 'roidb':  # roidb is a list of ndarrays with inconsistent length
                        input_data[key] = list(map(Variable, input_data[key]))

                # input_data['inner_iter'] = torch.tensor((inner_iter))
                model.set_inner_iter(step)
                # Only the first element of each list is used here;
                # presumably batches are single-chunk in this variant —
                # TODO confirm against collate_minibatch.
                im_data = input_data['data'][0].cuda()
                rois = input_data['rois'][0].cuda().type(im_data.dtype)
                labels = input_data['labels'][0].cuda().type(im_data.dtype)

                net_outputs = model(im_data, rois, labels)
                training_stats.UpdateIterStats(net_outputs, inner_iter)
                loss = net_outputs['total_loss']
                # NOTE(review): retain_graph=True keeps the graph alive
                # across backward calls — confirm it is actually needed; it
                # increases memory use.
                loss.backward(retain_graph=True)
            optimizer.step()
            training_stats.IterToc()
            training_stats.LogIterStats(step, optimizer.get_lr())
            if (step + 1) % CHECKPOINT_PERIOD == 0:
                save_ckpt(output_dir, args, step, train_size, model, optimizer)

        # Training ends, saves the last checkpoint
        save_ckpt(output_dir, args, step, train_size, model, optimizer)

    except (RuntimeError, KeyboardInterrupt):
        # Release loader workers before saving on interrupt/error.
        del dataiterator
        logger.info('Save ckpt on exception ...')
        save_ckpt(output_dir, args, step, train_size, model, optimizer)
        logger.info('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print()
        print(stack_trace)

    finally:
        if args.use_tfboard and not args.no_save:
            tblogger.close()
def main(): """Main function""" args = parse_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size batchSampler = BatchSampler(sampler=MinibatchSampler( ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) dataiterator = iter(dataloader) ### Model ### maskRCNN = GetRCNNModel() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### gn_param_nameset = set() for name, module in maskRCNN.named_modules(): if isinstance(module, nn.GroupNorm): gn_param_nameset.add(name + '.weight') gn_param_nameset.add(name + '.bias') gn_params = [] gn_param_names = [] bias_params = [] bias_param_names = [] nonbias_params = [] nonbias_param_names = [] nograd_param_names = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) bias_param_names.append(key) elif key in gn_param_nameset: gn_params.append(value) gn_param_names.append(key) else: nonbias_params.append(value) nonbias_param_names.append(key) else: nograd_param_names.append(key) assert (gn_param_nameset - set(nograd_param_names) - set(bias_param_names)) == set(gn_param_names) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [{ 'params': nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 
'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }, { 'params': gn_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY_GN }] # names of paramerters for each paramter param_names = [nonbias_param_names, bias_param_names, gn_param_names] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility if checkpoint['train_size'] != train_size: print( 'train_size value: %d different from the one in checkpoint: %d' % (train_size, checkpoint['train_size'])) # reorder the params in optimizer checkpoint's params_groups if needed # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint) # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0][ 'lr'] # lr of non-bias parameters, for commmand line outputs. 
maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) if cfg.TRAIN_SYNC_BN: # Shu:For synchorinized BN patch_replication_callback(maskRCNN) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_step' output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError( 'Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate 
decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 training_stats.IterTic() optimizer.zero_grad() for inner_iter in range(args.iter_size): try: input_data = next(dataiterator) except StopIteration: dataiterator = iter(dataloader) input_data = next(dataiterator) for key in input_data: if cfg.MODEL.LR_VIEW_ON or cfg.MODEL.GIF_ON or cfg.MODEL.LRASY_MAHA_ON: if key != 'roidb' and key != 'data': # roidb is a list of ndarrays with inconsistent length input_data[key] = list( map(Variable, input_data[key])) if key == 'data': input_data[key] = [ torch.squeeze(item) for item in input_data[key] ] input_data[key] = list( map(Variable, input_data[key])) else: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list( map(Variable, input_data[key])) net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs, inner_iter) loss = net_outputs['total_loss'] loss.backward() optimizer.step() training_stats.IterToc() training_stats.LogIterStats(step, lr) if (step + 1) % CHECKPOINT_PERIOD == 0: net_utils.train_save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) # ---- Training ends ---- # Save last checkpoint net_utils.train_save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt): del dataiterator logger.info('Save ckpt on exception ...') net_utils.train_save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 2 # ADE20k as a detection dataset elif args.dataset == "ade_train": cfg.TRAIN.DATASETS = ('ade_train', ) cfg.MODEL.NUM_CLASSES = 446 # Noisy CS6+WIDER datasets elif args.dataset == 'cs6_noise020+WIDER': cfg.TRAIN.DATASETS = ('cs6_noise020', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise050+WIDER': cfg.TRAIN.DATASETS = ('cs6_noise050', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise080+WIDER': cfg.TRAIN.DATASETS = ('cs6_noise080', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise085+WIDER': cfg.TRAIN.DATASETS = ('cs6_noise085', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise090+WIDER': cfg.TRAIN.DATASETS = ('cs6_noise090', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise095+WIDER': cfg.TRAIN.DATASETS = ('cs6_noise095', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise100+WIDER': cfg.TRAIN.DATASETS = ('cs6_noise100', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 # Just Noisy CS6 datasets elif args.dataset == 'cs6_noise020': cfg.TRAIN.DATASETS = ('cs6_noise020', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise030': cfg.TRAIN.DATASETS = ('cs6_noise030', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise040': cfg.TRAIN.DATASETS = ('cs6_noise040', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise050': cfg.TRAIN.DATASETS = ('cs6_noise050', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise060': 
cfg.TRAIN.DATASETS = ('cs6_noise060', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise070': cfg.TRAIN.DATASETS = ('cs6_noise070', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise080': cfg.TRAIN.DATASETS = ('cs6_noise080', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise085': cfg.TRAIN.DATASETS = ('cs6_noise085', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise090': cfg.TRAIN.DATASETS = ('cs6_noise090', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise095': cfg.TRAIN.DATASETS = ('cs6_noise095', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'cs6_noise100': cfg.TRAIN.DATASETS = ('cs6_noise100', ) cfg.MODEL.NUM_CLASSES = 2 # Cityscapes 7 classes elif args.dataset == "cityscapes": cfg.TRAIN.DATASETS = ('cityscapes_train', ) cfg.MODEL.NUM_CLASSES = 8 # BDD 7 classes elif args.dataset == "bdd_any_any_any": cfg.TRAIN.DATASETS = ('bdd_any_any_any_train', ) cfg.MODEL.NUM_CLASSES = 8 elif args.dataset == "bdd_any_any_daytime": cfg.TRAIN.DATASETS = ('bdd_any_any_daytime_train', ) cfg.MODEL.NUM_CLASSES = 8 elif args.dataset == "bdd_clear_any_daytime": cfg.TRAIN.DATASETS = ('bdd_clear_any_daytime_train', ) cfg.MODEL.NUM_CLASSES = 8 # Cistyscapes Pedestrian sets elif args.dataset == "cityscapes_peds": cfg.TRAIN.DATASETS = ('cityscapes_peds_train', ) cfg.MODEL.NUM_CLASSES = 2 # Cityscapes Car sets elif args.dataset == "cityscapes_cars_HPlen3+kitti_car_train": cfg.TRAIN.DATASETS = ('cityscapes_cars_HPlen3', 'kitti_car_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cityscapes_cars_HPlen5+kitti_car_train": cfg.TRAIN.DATASETS = ('cityscapes_cars_HPlen5', 'kitti_car_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cityscapes_car_train+kitti_car_train": cfg.TRAIN.DATASETS = ('cityscapes_car_train', 'kitti_car_train') cfg.MODEL.NUM_CLASSES = 2 # KITTI Car set elif args.dataset == "kitti_car_train": cfg.TRAIN.DATASETS = ('kitti_car_train', ) cfg.MODEL.NUM_CLASSES = 2 # BDD pedestrians sets elif args.dataset == 
"bdd_peds": cfg.TRAIN.DATASETS = ('bdd_peds_train', ) # bdd peds: clear_any_daytime cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "bdd_peds_full": cfg.TRAIN.DATASETS = ('bdd_peds_full_train', ) # bdd peds: any_any_any cfg.MODEL.NUM_CLASSES = 2 # Pedestrians with constraints elif args.dataset == "bdd_peds_not_clear_any_daytime": cfg.TRAIN.DATASETS = ('bdd_peds_not_clear_any_daytime_train', ) cfg.MODEL.NUM_CLASSES = 2 # Ashish's 20k samples videos elif args.dataset == "bdd_peds_not_clear_any_daytime_20k": cfg.TRAIN.DATASETS = ('bdd_peds_not_clear_any_daytime_20k_train', ) cfg.MODEL.NUM_CLASSES = 2 # Source domain + Target domain detections elif args.dataset == "bdd_peds+DETS_20k": cfg.TRAIN.DATASETS = ('bdd_peds_dets_20k_target_domain', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 # Source domain + Target domain detections -- same 18k images as HP18k elif args.dataset == "bdd_peds+DETS18k": cfg.TRAIN.DATASETS = ('bdd_peds_dets18k_target_domain', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 # Only Dets elif args.dataset == "DETS20k": cfg.TRAIN.DATASETS = ('bdd_peds_dets_20k_target_domain', ) cfg.MODEL.NUM_CLASSES = 2 # Only Dets18k - same images as HP18k elif args.dataset == 'DETS18k': cfg.TRAIN.DATASETS = ('bdd_peds_dets18k_target_domain', ) cfg.MODEL.NUM_CLASSES = 2 # Only HP elif args.dataset == 'HP': cfg.TRAIN.DATASETS = ('bdd_peds_HP_target_domain', ) cfg.MODEL.NUM_CLASSES = 2 # Only HP 18k videos elif args.dataset == 'HP18k': cfg.TRAIN.DATASETS = ('bdd_peds_HP18k_target_domain', ) cfg.MODEL.NUM_CLASSES = 2 # Source domain + Target domain HP elif args.dataset == 'bdd_peds+HP': cfg.TRAIN.DATASETS = ('bdd_peds_train', 'bdd_peds_HP_target_domain') cfg.MODEL.NUM_CLASSES = 2 # Source domain + Target domain HP 18k videos cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+HP18k': cfg.TRAIN.DATASETS = ('bdd_peds_HP18k_target_domain', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 #### Source domain + Target domain with different conf threshold theta #### elif 
args.dataset == 'bdd_peds+HP18k_thresh-050': cfg.TRAIN.DATASETS = ('bdd_HP18k_thresh-050', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+HP18k_thresh-060': cfg.TRAIN.DATASETS = ('bdd_HP18k_thresh-060', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+HP18k_thresh-070': cfg.TRAIN.DATASETS = ('bdd_HP18k_thresh-070', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+HP18k_thresh-090': cfg.TRAIN.DATASETS = ('bdd_HP18k_thresh-090', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 ############################## #### Data distillation on BDD -- for rebuttal elif args.dataset == 'bdd_peds+bdd_data_dist_small': cfg.TRAIN.DATASETS = ('bdd_data_dist_small', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+bdd_data_dist_mid': cfg.TRAIN.DATASETS = ('bdd_data_dist_mid', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+bdd_data_dist': cfg.TRAIN.DATASETS = ('bdd_data_dist', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 ############################## #### Source domain + **Labeled** Target domain with varying number of images elif args.dataset == 'bdd_peds+labeled_100': cfg.TRAIN.DATASETS = ('bdd_peds_not_clear_any_daytime_train_100', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+labeled_075': cfg.TRAIN.DATASETS = ('bdd_peds_not_clear_any_daytime_train_075', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+labeled_050': cfg.TRAIN.DATASETS = ('bdd_peds_not_clear_any_daytime_train_050', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+labeled_025': cfg.TRAIN.DATASETS = ('bdd_peds_not_clear_any_daytime_train_025', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+labeled_010': cfg.TRAIN.DATASETS = ('bdd_peds_not_clear_any_daytime_train_010', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+labeled_005': cfg.TRAIN.DATASETS = 
('bdd_peds_not_clear_any_daytime_train_005', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+labeled_001': cfg.TRAIN.DATASETS = ('bdd_peds_not_clear_any_daytime_train_001', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 ############################## # Source domain + Target domain HP tracker bboxes only elif args.dataset == 'bdd_peds+HP18k_track_only': cfg.TRAIN.DATASETS = ('bdd_HP18k_track_only', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 ##### subsets of bdd_HP18k with different constraints # Source domain + HP tracker images at NIGHT elif args.dataset == 'bdd_peds+HP18k_any_any_night': cfg.TRAIN.DATASETS = ('bdd_HP18k_any_any_night', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+HP18k_rainy_any_daytime': cfg.TRAIN.DATASETS = ('bdd_HP18k_rainy_any_daytime', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+HP18k_rainy_any_night': cfg.TRAIN.DATASETS = ('bdd_HP18k_rainy_any_night', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+HP18k_overcast,rainy_any_daytime': cfg.TRAIN.DATASETS = ('bdd_HP18k_overcast,rainy_any_daytime', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+HP18k_overcast,rainy_any_night': cfg.TRAIN.DATASETS = ('bdd_HP18k_overcast,rainy_any_night', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+HP18k_overcast,rainy,snowy_any_daytime': cfg.TRAIN.DATASETS = ('bdd_HP18k_overcast,rainy,snowy_any_daytime', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 ############# end of bdd constraned subsets ##################### # Source domain + Target domain HP18k -- after histogram matching elif args.dataset == 'bdd_peds+HP18k_remap_hist': cfg.TRAIN.DATASETS = ('bdd_peds_HP18k_target_domain_remap_hist', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+HP18k_remap_cityscape_hist': cfg.TRAIN.DATASETS = ( 'bdd_peds_HP18k_target_domain_remap_cityscape_hist', 'bdd_peds_train') 
cfg.MODEL.NUM_CLASSES = 2 # Source domain + Target domain HP18k -- after histogram matching elif args.dataset == 'bdd_peds+HP18k_remap_random': cfg.TRAIN.DATASETS = ('bdd_peds_HP18k_target_domain_remap_random', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 # Source+Noisy Target domain -- prevent domain adv from using HP roi info elif args.dataset == 'bdd_peds+bdd_HP18k_noisy_100k': cfg.TRAIN.DATASETS = ('bdd_HP18k_noisy_100k', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+bdd_HP18k_noisy_080': cfg.TRAIN.DATASETS = ('bdd_HP18k_noisy_080', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+bdd_HP18k_noisy_060': cfg.TRAIN.DATASETS = ('bdd_HP18k_noisy_060', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == 'bdd_peds+bdd_HP18k_noisy_070': cfg.TRAIN.DATASETS = ('bdd_HP18k_noisy_070', 'bdd_peds_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "wider_train": cfg.TRAIN.DATASETS = ('wider_train', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-subset": cfg.TRAIN.DATASETS = ('cs6-subset', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-subset-score": cfg.TRAIN.DATASETS = ('cs6-subset-score', ) elif args.dataset == "cs6-subset-gt": cfg.TRAIN.DATASETS = ('cs6-subset-gt', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-3013-gt": cfg.TRAIN.DATASETS = ('cs6-3013-gt', ) # DEBUG: overfit on one video annots cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-subset-gt+WIDER": cfg.TRAIN.DATASETS = ('cs6-subset-gt', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-subset+WIDER": cfg.TRAIN.DATASETS = ('cs6-subset', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-gt": cfg.TRAIN.DATASETS = ('cs6-train-gt', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-gt-noisy-0.3": cfg.TRAIN.DATASETS = ('cs6-train-gt-noisy-0.3', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-gt-noisy-0.5": cfg.TRAIN.DATASETS = ('cs6-train-gt-noisy-0.5', ) 
cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-det-score": cfg.TRAIN.DATASETS = ('cs6-train-det-score', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-det-score-0.5": cfg.TRAIN.DATASETS = ('cs6-train-det-score-0.5', ) elif args.dataset == "cs6-train-det": cfg.TRAIN.DATASETS = ('cs6-train-det', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-det-0.5": cfg.TRAIN.DATASETS = ('cs6-train-det-0.5', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-hp": cfg.TRAIN.DATASETS = ('cs6-train-hp', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-easy-gt": cfg.TRAIN.DATASETS = ('cs6-train-easy-gt', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-easy-gt-sub": cfg.TRAIN.DATASETS = ('cs6-train-easy-gt-sub', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-easy-hp": cfg.TRAIN.DATASETS = ('cs6-train-easy-hp', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-easy-det": cfg.TRAIN.DATASETS = ('cs6-train-easy-det', ) cfg.MODEL.NUM_CLASSES = 2 # Joint training with CS6 and WIDER elif args.dataset == "cs6-train-easy-gt-sub+WIDER": cfg.TRAIN.DATASETS = ('cs6-train-easy-gt-sub', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-gt+WIDER": cfg.TRAIN.DATASETS = ('cs6-train-gt', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-hp+WIDER": cfg.TRAIN.DATASETS = ('cs6-train-hp', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-dummy+WIDER": cfg.TRAIN.DATASETS = ('cs6-train-dummy', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cs6-train-det+WIDER": cfg.TRAIN.DATASETS = ('cs6-train-det', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 # Dets dataset created by removing tracker results from the HP json elif args.dataset == "cs6_train_det_from_hp+WIDER": cfg.TRAIN.DATASETS = ('cs6_train_det_from_hp', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 # Dataset created by removing det results from the HP json -- HP tracker only elif 
args.dataset == "cs6_train_hp_tracker_only+WIDER": cfg.TRAIN.DATASETS = ('cs6_train_hp_tracker_only', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 # HP dataset with noisy labels: used to prevent DA from getting any info from HP elif args.dataset == "cs6_train_hp_noisy_100+WIDER": cfg.TRAIN.DATASETS = ('cs6_train_hp_noisy_100', 'wider_train') cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH original_num_gpus = cfg.NUM_GPUS if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS effective_batch_size = args.iter_size * args.batch_size print('effective_batch_size = batch_size * iter_size = %d * %d' % (args.batch_size, args.iter_size)) print('Adaptive config changes:') print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size)) print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS)) print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH)) ### Adjust learning based on batch size change linearly # For iter_size > 1, gradients are `accumulated`, so lr is scaled based # on batch_size instead of effective_batch_size old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch_size change:\n' ' BASE_LR: {} --> {}'.format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Adjust solver steps step_scale = original_batch_size / effective_batch_size old_solver_steps = cfg.SOLVER.STEPS old_max_iter = cfg.SOLVER.MAX_ITER cfg.SOLVER.STEPS = list( 
map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS)) cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5) print( 'Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n' ' SOLVER.STEPS: {} --> {}\n' ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS, old_max_iter, cfg.SOLVER.MAX_ITER)) # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function # of `collect_and_distribute_fpn_rpn_proposals.py` # # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5) if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN: cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch print( 'Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n' ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format( cfg.FPN.RPN_COLLECT_SCALE)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma assert_and_infer_cfg() timers = defaultdict(Timer) """def _init_fn(worker_id): random.seed(999) np.random.seed(999) torch.cuda.manual_seed(999) torch.cuda.manual_seed_all(999) torch.manual_seed(999) torch.backends.cudnn.deterministic = True """ ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) if cfg.TRAIN.JOINT_TRAINING: if len(cfg.TRAIN.DATASETS) == 2: print('Joint training on two datasets') else: raise NotImplementedError 
joint_training_roidb = [] for i, dataset_name in enumerate(cfg.TRAIN.DATASETS): # ROIDB construction timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( (dataset_name), cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) if i == 0: roidb_size = len(roidb) batchSampler = BatchSampler(sampler=MinibatchSampler( ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) #worker_init_fn=_init_fn) # decrease num-threads when using two dataloaders dataiterator = iter(dataloader) joint_training_roidb.append({ 'dataloader': dataloader, 'dataiterator': dataiterator, 'dataset_name': dataset_name }) else: roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) batchSampler = BatchSampler(sampler=MinibatchSampler( ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) #worker_init_fn=init_fn) dataiterator = iter(dataloader) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size if cfg.TRAIN.JOINT_SELECTIVE_FG: orig_fg_batch_ratio = cfg.TRAIN.FG_FRACTION ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### gn_param_nameset 
= set() for name, module in maskRCNN.named_modules(): if isinstance(module, nn.GroupNorm): gn_param_nameset.add(name + '.weight') gn_param_nameset.add(name + '.bias') gn_params = [] gn_param_names = [] bias_params = [] bias_param_names = [] nonbias_params = [] nonbias_param_names = [] nograd_param_names = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) bias_param_names.append(key) elif key in gn_param_nameset: gn_params.append(value) gn_param_names.append(key) else: nonbias_params.append(value) nonbias_param_names.append(key) else: nograd_param_names.append(key) assert (gn_param_nameset - set(nograd_param_names) - set(bias_param_names)) == set(gn_param_names) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [{ 'params': nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }, { 'params': gn_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY_GN }] # names of paramerters for each paramter param_names = [nonbias_param_names, bias_param_names, gn_param_names] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility if checkpoint['train_size'] != train_size: print( 'train_size value: %d different from the one in checkpoint: %d' % (train_size, checkpoint['train_size'])) # reorder the params in optimizer checkpoint's 
params_groups if needed # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint) # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0][ 'lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_' + str( cfg.TRAIN.DATASETS) + '_step' output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): """ random.seed(cfg.RNG_SEED) np.random.seed(cfg.RNG_SEED) 
torch.cuda.manual_seed(cfg.RNG_SEED) torch.cuda.manual_seed_all(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) torch.backends.cudnn.deterministic = True """ # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError( 'Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 training_stats.IterTic() optimizer.zero_grad() for inner_iter in range(args.iter_size): # use a iter counter for optional alternating batches if args.iter_size == 1: iter_counter = step else: iter_counter = inner_iter if cfg.TRAIN.JOINT_TRAINING: # alternate batches between dataset[0] and dataset[1] if iter_counter % 2 == 0: if True: #DEBUG: print('Dataset: %s' % joint_training_roidb[0]['dataset_name']) dataloader = joint_training_roidb[0]['dataloader'] dataiterator = joint_training_roidb[0]['dataiterator'] # NOTE: if available FG samples cannot fill minibatch # then batchsize will be smaller than cfg.TRAIN.BATCH_SIZE_PER_IM. 
else: if True: #DEBUG: print('Dataset: %s' % joint_training_roidb[1]['dataset_name']) dataloader = joint_training_roidb[1]['dataloader'] dataiterator = joint_training_roidb[1]['dataiterator'] try: input_data = next(dataiterator) except StopIteration: # end of epoch for dataloader dataiterator = iter(dataloader) input_data = next(dataiterator) if cfg.TRAIN.JOINT_TRAINING: if iter_counter % 2 == 0: joint_training_roidb[0][ 'dataiterator'] = dataiterator else: joint_training_roidb[1][ 'dataiterator'] = dataiterator for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs, inner_iter) loss = net_outputs['total_loss'] # [p.data.get_device() for p in maskRCNN.parameters()] # [(name, p.data.get_device()) for name, p in maskRCNN.named_parameters()] loss.backward() optimizer.step() training_stats.IterToc() training_stats.LogIterStats(step, lr) if (step + 1) % CHECKPOINT_PERIOD == 0: save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) # ---- Training ends ---- # Save last checkpoint save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt): del dataiterator logger.info('Save ckpt on exception ...') save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
cfg.TRAIN.IMS_PER_BATCH = 1 if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON: roidb, ratio_list, ratio_index = combined_roidb_for_training_semseg( cfg.TRAIN.DATASETS) val_roidb, val_ratio_list, val_ratio_index = combined_roidb_for_training_semseg( cfg.VALIDATION.VAL_LIST) else: roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) train_size = len(roidb) print("total samples to be evaluated:", train_size) val_size = len(val_roidb) sampler = MinibatchSampler(ratio_list, ratio_index) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=False) dataloader = torch.utils.data.DataLoader( dataset, batch_size=1, sampler=sampler, shuffle=False, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch_semseg if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON else collate_minibatch) for args.step, input_data, in zip(range(args.index_start, args.index_end), dataloader): #for key in input_data: #if key != 'roidb': # roidb is a list of ndarrays with inconsistent length # input_data[key] = list(map(lambda x: Variable(x, requires_grad=False).to('cuda'), input_data[key]))
def main():
    """Run few-shot detection/segmentation evaluation.

    Builds the test roidb plus query set, loads the model from the checkpoint
    given on the command line, runs detection over ``args.average`` independent
    query draws, dumps a per-draw detection pickle, evaluates each draw, and
    finally writes the metrics averaged over draws to
    ``avg_cocoeval_*_results.json`` under ``args.output_dir``.
    """
    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the code.")

    logger = utils.logging.setup_logging(__name__)
    args = parse_args()
    logger.info('Called with args:')
    logger.info(args)

    # Single-GPU and multi-GPU testing are mutually exclusive modes.
    assert (torch.cuda.device_count() == 1) ^ bool(args.multi_gpu_testing)
    assert bool(args.load_ckpt) ^ bool(args.load_detectron), \
        'Exactly one of --load_ckpt and --load_detectron should be specified.'

    if args.output_dir is None:
        # Default: a 'test' directory next to the checkpoint's parent dir.
        ckpt_path = args.load_ckpt if args.load_ckpt else args.load_detectron
        args.output_dir = os.path.join(
            os.path.dirname(os.path.dirname(ckpt_path)), 'test')
        logger.info('Automatically set output directory to %s', args.output_dir)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # The config file is derived from the backbone choice and the few-shot group.
    if args.close_fpn:
        args.cfg_file = "configs/few_shot/e2e_mask_rcnn_R-50-C4_1x_{}.yaml".format(args.group)
    else:
        args.cfg_file = "configs/few_shot/e2e_mask_rcnn_R-50-FPN_1x_{}.yaml".format(args.group)
    if args.cfg_file is not None:
        merge_cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        merge_cfg_from_list(args.set_cfgs)

    cfg.VIS = args.vis
    cfg.SEEN = args.seen
    if args.close_co_atten:
        cfg.CO_ATTEN = False
    if args.close_relation_rcnn:
        cfg.RELATION_RCNN = False
    # Box/mask head implementations depend on whether FPN is enabled.
    if not args.close_fpn:
        cfg.FAST_RCNN.ROI_BOX_HEAD = 'fast_rcnn_heads.roi_2mlp_head'
        cfg.MRCNN.ROI_MASK_HEAD = 'mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs'
    else:
        cfg.FAST_RCNN.ROI_BOX_HEAD = 'torchResNet.ResNet_roi_conv5_head'
        cfg.MRCNN.ROI_MASK_HEAD = 'mask_rcnn_heads.mask_rcnn_fcn_head_v0upshare'
    if args.deform_conv:
        cfg.MODEL.USE_DEFORM = True

    if args.dataset == "fis_cell":
        cfg.TEST.DATASETS = ('fis_cell_test',)
        cfg.MODEL.NUM_CLASSES = 2
    elif args.dataset == "coco2017":
        cfg.TEST.DATASETS = ('coco_2017_val',)
        cfg.MODEL.NUM_CLASSES = 81
    elif args.dataset == "keypoints_coco2017":
        cfg.TEST.DATASETS = ('keypoints_coco_2017_val',)
        cfg.MODEL.NUM_CLASSES = 2
    else:  # For subprocess call
        assert cfg.TEST.DATASETS, 'cfg.TEST.DATASETS shouldn\'t be empty'
    assert_and_infer_cfg()
    #logger.info('Testing with config:')
    #logger.info(pprint.pformat(cfg))

    # manually set args.cuda
    args.cuda = True

    timer_for_ds = defaultdict(Timer)

    ### Dataset ###
    timer_for_ds['roidb'].tic()
    imdb, roidb, ratio_list, ratio_index, query, cat_list = combined_roidb(
        cfg.TEST.DATASETS, False)
    timer_for_ds['roidb'].toc()
    roidb_size = len(roidb)
    logger.info('{:d} roidb entries'.format(roidb_size))
    logger.info('Takes %.2f sec(s) to construct roidb',
                timer_for_ds['roidb'].average_time)

    # Deterministic order (shuffle=False) so entries line up with ratio_index.
    batchSampler = BatchSampler(
        sampler=MinibatchSampler(ratio_list, ratio_index, shuffle=False),
        batch_size=1,
        drop_last=False
    )
    dataset = RoiDataLoader(
        roidb, ratio_list, ratio_index, query,
        cfg.MODEL.NUM_CLASSES,
        training=False, cat_list=cat_list, shot=args.checkshot)

    ### Model ###
    model = initialize_model_from_cfg(args, gpu_id=0)

    # Metric accumulators: one list per metric, appended once per query draw.
    all_results = OrderedDict({
        'box': OrderedDict([
            ('AP', []), ('AP50', []), ('AP75', []),
            ('APs', []), ('APm', []), ('APl', []),
        ]),
        'mask': OrderedDict([
            ('AP', []), ('AP50', []), ('AP75', []),
            ('APs', []), ('APm', []), ('APl', []),
        ])
    })

    timer_for_total = defaultdict(Timer)
    timer_for_total['total_test_time'].tic()

    for avg in range(args.average):
        # Each pass uses a different query draw for the few-shot support set.
        dataset.query_position = avg
        dataloader = torch.utils.data.DataLoader(
            dataset,
            batch_sampler=batchSampler,
            num_workers=cfg.DATA_LOADER.NUM_THREADS,
            collate_fn=collate_minibatch)
        dataiterator = iter(dataloader)

        num_images = len(ratio_index)
        num_cats = imdb.num_classes
        all_boxes, all_segms, all_keyps = empty_results(num_cats, num_images)
        # total quantity of testing images
        num_detect = len(ratio_index)
        timers = defaultdict(Timer)
        post_fix = '%dshot_g%d_seen%d_%d' % (args.checkshot, args.group, args.seen, avg)

        for i, index in enumerate(ratio_index):
            input_data = next(dataiterator)
            catgory = input_data['choice']
            entry = dataset._roidb[dataset.ratio_index[i]]
            if cfg.TEST.PRECOMPUTED_PROPOSALS:
                # The roidb may contain ground-truth rois (for example, if the roidb
                # comes from the training or val split). We only want to evaluate
                # detection on the *non*-ground-truth rois. We select only the rois
                # that have the gt_classes field set to 0, which means there's no
                # ground truth.
                box_proposals = entry['boxes'][entry['gt_classes'] == 0]
                if len(box_proposals) == 0:
                    continue
            else:
                # Faster R-CNN type models generate proposals on-the-fly with an
                # in-network RPN; 1-stage models don't require proposals.
                box_proposals = None

            #im = cv2.imread(entry['image'])
            im = imread(entry['image'])
            if len(im.shape) == 2:
                # Grayscale input: replicate to 3 channels.
                im = im[:, :, np.newaxis]
                im = np.concatenate((im, im, im), axis=2)

            alpha = 2.2  # Simple contrast control
            beta = 0.5   # Simple brightness control
            # Category-specific tinting used only for the saved visualizations.
            im_colored = im.copy()
            if catgory[0].item() == 7:
                im_colored[:, :, 0] = 0
                im_colored[:, :, 1] = 0
                im_colored = cv2.convertScaleAbs(im_colored, alpha=alpha, beta=beta)
            elif catgory[0].item() == 8:
                im_colored[:, :, 1] = 0
                im_colored[:, :, 2] = 0
                im_colored = cv2.convertScaleAbs(im_colored, alpha=alpha, beta=beta)
            elif catgory[0].item() == 6:
                im_colored[:, :, 0] = 0
                im_colored[:, :, 2] = 0
                im_colored = cv2.convertScaleAbs(im_colored, alpha=alpha, beta=beta)

            # Detection runs on the untinted image; the tinted copy is for display.
            cls_boxes_i, cls_segms_i, cls_keyps_i = im_detect_all(
                model, im, input_data['query'], input_data['query_type'],
                catgory, num_cats, box_proposals, timers)
            im = im_colored

            extend_results(i, all_boxes, cls_boxes_i)
            if cls_segms_i is not None:
                extend_results(i, all_segms, cls_segms_i)
            if cls_keyps_i is not None:
                extend_results(i, all_keyps, cls_keyps_i)

            if i % 10 == 0:  # Reduce log file size
                ave_total_time = np.sum([t.average_time for t in timers.values()])
                eta_seconds = ave_total_time * (num_images - i - 1)
                eta = str(datetime.timedelta(seconds=int(eta_seconds)))
                det_time = (
                    timers['im_detect_bbox'].average_time +
                    timers['im_detect_mask'].average_time +
                    timers['im_detect_keypoints'].average_time
                )
                misc_time = (
                    timers['misc_bbox'].average_time +
                    timers['misc_mask'].average_time +
                    timers['misc_keypoints'].average_time
                )
                logger.info(
                    (
                        'im_detect: range [{:d}, {:d}] of {:d}: '
                        '{:d}/{:d} {:.3f}s + {:.3f}s (eta: {})'
                    ).format(
                        1, num_detect, num_detect, i + 1, num_detect,
                        det_time, misc_time, eta
                    )
                )

            if cfg.VIS:
                im_name = entry['image']
                class_name = im_name.split('/')[-4]
                file_name = im_name.split('/')[-3]
                im_target = im.copy()
                to_tensor = transforms.ToTensor()
                o_querys = []
                # FIX: the original reused `i` as this inner loop variable,
                # shadowing the outer image index; every filename built below with
                # `i` then referred to the last shot index instead of the current
                # image, making outputs from different images overwrite each other.
                for shot_idx in range(args.checkshot):
                    # Undo ImageNet normalization to recover a displayable image.
                    o_query = input_data['query'][0][shot_idx][0].permute(
                        1, 2, 0).contiguous().cpu().numpy()
                    o_query *= [0.229, 0.224, 0.225]
                    o_query += [0.485, 0.456, 0.406]
                    o_query *= 255
                    o_query_colored = o_query.copy()
                    if catgory[0].item() == 7:
                        o_query_colored[:, :, 0] = 0
                        o_query_colored[:, :, 1] = 0
                        o_query_colored = cv2.convertScaleAbs(o_query_colored, alpha=alpha, beta=beta)
                    elif catgory[0].item() == 8:
                        o_query_colored[:, :, 1] = 0
                        o_query_colored[:, :, 2] = 0
                        o_query_colored = cv2.convertScaleAbs(o_query_colored, alpha=alpha, beta=beta)
                    elif catgory[0].item() == 6:
                        o_query_colored[:, :, 0] = 0
                        o_query_colored[:, :, 2] = 0
                        o_query_colored = cv2.convertScaleAbs(o_query_colored, alpha=alpha, beta=beta)
                    o_query = o_query_colored
                    o_query = Image.fromarray(o_query.astype(np.uint8))
                    o_querys.append(to_tensor(o_query))

                # Tile all support shots into one grid, centered on a white canvas
                # the same size as the target image.
                o_querys_grid = make_grid(o_querys, nrow=args.checkshot // 2,
                                          normalize=True, scale_each=True, pad_value=1)
                o_querys_grid = transforms.ToPILImage()(o_querys_grid).convert("RGB")
                query_w, query_h = o_querys_grid.size
                query_bg = Image.new('RGB', (im_target.shape[1], im_target.shape[0]),
                                     (255, 255, 255))
                bg_w, bg_h = query_bg.size
                offset = ((bg_w - query_w) // 2, (bg_h - query_h) // 2)
                query_bg.paste(o_querys_grid, offset)
                query = np.asarray(query_bg)
                im_pair = np.concatenate((im_target, query), axis=1)

                im_output_dir = os.path.join(args.output_dir, 'vis', post_fix, class_name)
                if not os.path.exists(im_output_dir):
                    os.makedirs(im_output_dir)
                sample_output_dir = os.path.join(
                    im_output_dir, os.path.basename('{:d}_{:s}'.format(i, file_name)))
                if not os.path.exists(sample_output_dir):
                    os.makedirs(sample_output_dir)

                target_save_name = os.path.join(
                    sample_output_dir,
                    os.path.basename('{:d}_{:s}'.format(i, file_name)) + '_target.pdf')
                target = Image.fromarray(im_target.astype(np.uint8))
                target.save(target_save_name, "pdf")

                query_save_name = os.path.join(
                    sample_output_dir,
                    os.path.basename('{:d}_{:s}'.format(i, file_name)) + '_query.pdf')
                query = Image.fromarray(query.astype(np.uint8))
                query.save(query_save_name, "pdf")

                pred_save_name = os.path.join(
                    sample_output_dir,
                    os.path.basename('{:d}_{:s}'.format(i, file_name)) + '_pred.pdf')
                vis_utils.save_one_image(
                    im,
                    pred_save_name,
                    cls_boxes_i,
                    segms=cls_segms_i,
                    keypoints=cls_keyps_i,
                    thresh=cfg.VIS_TH,
                    box_alpha=0.6,
                    dataset=imdb,
                    show_class=False
                )
                im_det = vis_utils.vis_one_image(
                    im,
                    '{:d}_det_{:s}'.format(i, file_name),
                    os.path.join(args.output_dir, 'vis', post_fix),
                    cls_boxes_i,
                    segms=cls_segms_i,
                    keypoints=cls_keyps_i,
                    thresh=cfg.VIS_TH,
                    box_alpha=0.6,
                    dataset=imdb,
                    show_class=False,
                    class_name=class_name,
                    draw_bbox=False
                )
                gt_save_name = os.path.join(
                    sample_output_dir,
                    os.path.basename('{:d}_{:s}'.format(i, file_name)) + '_gt.pdf')
                vis_utils.save_one_image_gt(
                    im, entry['id'], gt_save_name, dataset=imdb)
                im_gt = vis_utils.vis_one_image_gt(
                    im, entry['id'],
                    '{:d}_gt_{:s}'.format(i, file_name),
                    os.path.join(args.output_dir, 'vis', post_fix),
                    dataset=imdb, class_name=class_name)

                # Final 2x2-style panel: (target | queries) over (gt | detections).
                im_det = np.asarray(im_det)
                im_gt = np.asarray(im_gt)
                im2draw = np.concatenate((im_gt, im_det), axis=1)
                im2show = np.concatenate((im_pair, im2draw), axis=0)
                im_save_name = os.path.basename('{:d}_{:s}'.format(i, file_name)) + '.png'
                cv2.imwrite(os.path.join(im_output_dir, '{}'.format(im_save_name)),
                            cv2.cvtColor(im2show, cv2.COLOR_RGB2BGR))

        cfg_yaml = yaml.dump(cfg)
        #det_name = 'detections.pkl'
        det_file = os.path.join(args.output_dir, 'detections_' + post_fix + '.pkl')
        save_object(
            dict(
                all_boxes=all_boxes,
                all_segms=all_segms,
                all_keyps=all_keyps,
                cfg=cfg_yaml
            ), det_file
        )
        logger.info('Wrote detections to: {}'.format(os.path.abspath(det_file)))

        results = task_evaluation.evaluate_all(
            imdb, all_boxes, all_segms, all_keyps, args.output_dir
        )
        task_evaluation.check_expected_results(
            results,
            atol=cfg.EXPECTED_RESULTS_ATOL,
            rtol=cfg.EXPECTED_RESULTS_RTOL
        )
        # Collect this draw's metrics for averaging after the loop.
        for task, metrics in all_results.items():
            metric_names = metrics.keys()
            for metric_name in metric_names:
                all_results[task][metric_name].append(results[imdb.name][task][metric_name])
        #task_evaluation.log_copy_paste_friendly_results(results)

    # Average each metric over the args.average independent query draws.
    for task, metrics in all_results.items():
        metric_names = metrics.keys()
        for metric_name in metric_names:
            values = all_results[task][metric_name]
            all_results[task][metric_name] = sum(values) / len(values)

    post_fix = '%dshot_g%d_seen%d' % (args.checkshot, args.group, args.seen)
    avg_results_path = os.path.join(args.output_dir,
                                    ('avg_cocoeval_' + post_fix + '_results.json'))
    with open(avg_results_path, 'w') as f:
        f.write(json.dumps(all_results))

    timer_for_total['total_test_time'].toc()
    logger.info('Total inference time: {:.3f}s'.format(
        timer_for_total['total_test_time'].average_time))
def main():
    """Train the V-COCO human-object-interaction model.

    Parses command-line arguments, overrides the config, adapts batch-size
    dependent settings (LR, solver steps), builds the dataset/loader, the
    Generalized_RCNN model and its optimizer (with separate param groups for
    HOI-head vs. Faster R-CNN weights), optionally restores a checkpoint, and
    hands off to ``train_val``.
    """
    args = parse_args()
    print('Called with args:')
    print(args)

    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the code.")

    if args.cuda or cfg.NUM_GPUS > 0:
        cfg.CUDA = True
    else:
        raise ValueError("Need Cuda device to run !")

    if args.dataset == "coco2017":
        cfg.TRAIN.DATASETS = ('coco_2017_train', )
        cfg.MODEL.NUM_CLASSES = 81
    elif args.dataset == "coco2014":
        cfg.TRAIN.DATASETS = ('coco_2014_train', )
        cfg.MODEL.NUM_CLASSES = 81
    elif args.dataset == 'vcoco_trainval':
        cfg.TRAIN.DATASETS = ('vcoco_trainval', )
        cfg.MODEL.NUM_CLASSES = 81
    elif args.dataset == 'vcoco_train':
        cfg.TRAIN.DATASETS = ('vcoco_train', )
        cfg.MODEL.NUM_CLASSES = 81
    elif args.dataset == 'vcoco_val':
        cfg.TRAIN.DATASETS = ('vcoco_val', )
        cfg.MODEL.NUM_CLASSES = 81
    elif args.dataset == 'keypoints_coco2014':
        cfg.TRAIN.DATASETS = ('keypoints_coco_2014_train', )
        cfg.MODEL.NUM_CLASSES = 2
    elif args.dataset == "keypoints_coco2017":
        cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train', )
        cfg.MODEL.NUM_CLASSES = 2
    else:
        raise ValueError("Unexpected args.dataset: {}".format(args.dataset))

    cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)
    if args.vcoco_kp_on:
        cfg.VCOCO.KEYPOINTS_ON = True

    # Mirror the command-line knobs into the config (printed for the log).
    cfg.NETWORK_NAME = args.net_name  # network name
    print('Network name:', args.net_name)
    cfg.MODEL.CONV_BODY = args.conv_body  # backbone network name
    print('Conv_body name:', args.conv_body)
    cfg.TRAIN.FG_THRESH = args.fg_thresh
    print('Train fg thresh:', args.fg_thresh)
    cfg.RESNETS.FREEZE_AT = args.freeze_at
    print('Freeze at: ', args.freeze_at)
    cfg.VCOCO.MLP_HEAD_DIM = args.mlp_head_dim
    print('MLP head dim: ', args.mlp_head_dim)
    cfg.SOLVER.MAX_ITER = args.max_iter
    print('MAX iter: ', args.max_iter)
    cfg.TRAIN.SNAPSHOT_ITERS = args.snapshot
    print('Snapshot Iters: ', args.snapshot)
    if args.solver_steps is not None:
        cfg.SOLVER.STEPS = args.solver_steps
    print('Solver_steps: ', cfg.SOLVER.STEPS)
    cfg.VCOCO.TRIPLETS_NUM_PER_IM = args.triplets_num_per_im
    print('triplets_num_per_im: ', cfg.VCOCO.TRIPLETS_NUM_PER_IM)
    cfg.VCOCO.HEATMAP_KERNEL_SIZE = args.heatmap_kernel_size
    print('heatmap_kernel_size: ', cfg.VCOCO.HEATMAP_KERNEL_SIZE)
    cfg.VCOCO.PART_CROP_SIZE = args.part_crop_size
    print('part_crop_size: ', cfg.VCOCO.PART_CROP_SIZE)
    print('use use_kps17 for part Align: ', args.use_kps17)
    if args.use_kps17:
        cfg.VCOCO.USE_KPS17 = True
    else:
        cfg.VCOCO.USE_KPS17 = False
    print('MULTILEVEL_ROIS: ', cfg.FPN.MULTILEVEL_ROIS)
    if args.vcoco_use_spatial:
        cfg.VCOCO.USE_SPATIAL = True
    if args.vcoco_use_union_feat:
        cfg.VCOCO.USE_UNION_FEAT = True
    if args.use_precomp_box:
        cfg.VCOCO.USE_PRECOMP_BOX = True
        cfg.DEBUG_TEST_WITH_GT = True
    if args.lr is not None:
        cfg.SOLVER.BASE_LR = args.lr

    ### Adaptively adjust some configs ###
    original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH  # 16
    original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH
    original_num_gpus = cfg.NUM_GPUS
    if args.batch_size is None:
        args.batch_size = original_batch_size
    cfg.NUM_GPUS = torch.cuda.device_count()
    assert (args.batch_size % cfg.NUM_GPUS) == 0, \
        'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS)
    cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS
    effective_batch_size = args.iter_size * args.batch_size
    print('effective_batch_size = batch_size * iter_size = %d * %d' %
          (args.batch_size, args.iter_size))
    print('Adaptive config changes:')
    print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size))
    print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS))
    print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH))
    print(' FG_THRESH: ', cfg.TRAIN.FG_THRESH)

    ### Adjust learning based on batch size change linearly
    # For iter_size > 1, gradients are `accumulated`, so lr is scaled based
    # on batch_size instead of effective_batch_size
    old_base_lr = cfg.SOLVER.BASE_LR
    cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size
    print('Adjust BASE_LR linearly according to batch_size change:\n'
          ' BASE_LR: {} --> {}'.format(old_base_lr, cfg.SOLVER.BASE_LR))

    ### Adjust solver steps
    step_scale = original_batch_size / effective_batch_size
    old_solver_steps = cfg.SOLVER.STEPS
    old_max_iter = cfg.SOLVER.MAX_ITER
    cfg.SOLVER.STEPS = list(
        map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS))
    cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5)
    cfg.SOLVER.VAL_ITER = int(cfg.SOLVER.VAL_ITER * step_scale + 0.5)
    cfg.TRAIN.SNAPSHOT_ITERS = int(cfg.TRAIN.SNAPSHOT_ITERS * step_scale + 0.5)
    print(
        'Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n'
        ' SOLVER.STEPS: {} --> {}\n'
        ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS,
                                             old_max_iter, cfg.SOLVER.MAX_ITER))

    # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function
    # of `collect_and_distribute_fpn_rpn_proposals.py`
    #
    # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5)
    if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN:
        cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch
        print(
            'Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n'
            ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format(
                cfg.FPN.RPN_COLLECT_SCALE))

    if args.num_workers is not None:
        cfg.DATA_LOADER.NUM_THREADS = args.num_workers
    print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS)
    # ipdb.set_trace()

    ### Overwrite some solver settings from command line arguments
    if args.optimizer is not None:
        cfg.SOLVER.TYPE = args.optimizer
    if args.lr_decay_gamma is not None:
        cfg.SOLVER.GAMMA = args.lr_decay_gamma
    assert_and_infer_cfg()

    timers = defaultdict(Timer)

    ### Dataset ###
    timers['roidb'].tic()
    roidb, ratio_list, ratio_index = combined_roidb_for_training(
        cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    timers['roidb'].toc()
    roidb_size = len(roidb)
    logger.info('{:d} roidb entries'.format(roidb_size))
    logger.info('Takes %.2f sec(s) to construct roidb',
                timers['roidb'].average_time)

    # Effective training sample size for one epoch
    train_size = roidb_size // args.batch_size * args.batch_size

    # ToDo: shuffle?
    batchSampler = BatchSampler(sampler=MinibatchSampler(
        ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True)
    dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=batchSampler,
        num_workers=cfg.DATA_LOADER.NUM_THREADS,
        collate_fn=collate_minibatch)
    # dataiterator = iter(dataloader)

    ### Model ###
    from modeling.model_builder import Generalized_RCNN
    maskRCNN = Generalized_RCNN()
    if cfg.CUDA:
        maskRCNN.cuda()

    ### Optimizer ###
    # Parameters are partitioned four ways: (bias vs. non-bias) x (HOI head
    # vs. Faster R-CNN backbone), so each group can get its own LR multiplier.
    bias_hoi_params = []
    bias_hoi_param_names = []
    bias_faster_params = []
    bias_faster_param_names = []
    nobias_hoi_params = []
    nobias_hoi_param_names = []
    nobias_faster_params = []
    nobias_faster_param_names = []
    # bias_params = []
    # bias_param_names = []
    # nonbias_params = []
    # nonbias_param_names = []
    #base_model = torch.load('Outputs/baseline/baseline_512_32_nogt_1o3/ckpt/model_step47999.pth')
    nograd_param_names = []
    for key, value in maskRCNN.named_parameters():
        #if key in base_model['model'].keys():
        #    value.requires_grad = False
        #print('the key xxx:', key)
        # Fix RPN module same as the paper
        # ToDo: or key.startswith('Box')
        # if 'affinity' not in key:
        #     value.requires_grad = False
        print(key, value.size(), value.requires_grad)
        if value.requires_grad:
            if 'bias' in key:
                if 'HOI_Head' in key:
                    bias_hoi_params.append(value)
                    bias_hoi_param_names.append(key)
                else:
                    bias_faster_params.append(value)
                    bias_faster_param_names.append(key)
            else:
                if 'HOI_Head' in key:
                    nobias_hoi_params.append(value)
                    nobias_hoi_param_names.append(key)
                else:
                    nobias_faster_params.append(value)
                    nobias_faster_param_names.append(key)
        else:
            nograd_param_names.append(key)
    #del base_model
    #ipdb.set_trace()

    # Learning rate of 0 is a dummy value to be set properly at the start of training
    params = [
        {
            'params': nobias_hoi_params,
            'lr': 0,
            'weight_decay': cfg.SOLVER.WEIGHT_DECAY
        },
        {
            'params': nobias_faster_params,
            'lr': 0 * cfg.SOLVER.FASTER_RCNN_WEIGHT,
            'weight_decay': cfg.SOLVER.WEIGHT_DECAY
        },
        {
            'params': bias_hoi_params,
            'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1),
            'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0
        },
        {
            'params': bias_faster_params,
            'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1) * cfg.SOLVER.FASTER_RCNN_WEIGHT,
            'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0
        },
    ]
    if cfg.SOLVER.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM)
    elif cfg.SOLVER.TYPE == "Adam":
        optimizer = torch.optim.Adam(params)

    ### Load checkpoint
    if args.load_ckpt:
        load_name = args.load_ckpt
        # Use the module logger (was logging.info) for consistency with the
        # other log statements in this function.
        logger.info("loading checkpoint %s", load_name)
        checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage)
        if args.krcnn_from_faster:
            net_utils.load_krcnn_from_faster(maskRCNN, checkpoint['model'])
        else:
            net_utils.load_ckpt(maskRCNN, checkpoint['model'])
        print('Original model loaded....')
        if args.resume:
            print('Resume, loaded step\n\n\n: ', checkpoint['step'])
            args.start_step = checkpoint['step'] + 1
            if 'train_size' in checkpoint:  # For backward compatibility
                if checkpoint['train_size'] != train_size:
                    print(
                        'train_size value: %d different from the one in checkpoint: %d'
                        % (train_size, checkpoint['train_size']))
            # reorder the params in optimizer checkpoint's params_groups if needed
            # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint)
            # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1.
            # However it's fixed on master.
            optimizer.load_state_dict(checkpoint['optimizer'])
            # misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer'])
        del checkpoint
        torch.cuda.empty_cache()

    if args.load_detectron:  #TODO resume for detectron weights (load sgd momentum values)
        logger.info("loading Detectron weights %s", args.load_detectron)
        load_detectron_weight(maskRCNN, args.load_detectron)

    lr = optimizer.param_groups[0][
        'lr']  # lr of non-bias parameters, for commmand line outputs.

    maskRCNN = mynn.DataParallel(maskRCNN,
                                 cpu_keywords=['im_info', 'roidb'],
                                 minibatch=True)

    ### Training Setups ###
    args.run_name = misc_utils.get_run_name() + '_step'
    #output_dir = misc_utils.get_output_dir(args, args.run_name)
    output_dir = os.path.join('Outputs', args.expDir, args.expID)
    os.makedirs(output_dir, exist_ok=True)
    args.cfg_filename = os.path.basename(args.cfg_file)
    tblogger = None
    if not args.no_save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        blob = {'cfg': yaml.dump(cfg), 'args': args}
        with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f:
            pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL)
        if args.use_tfboard:
            from tensorboardX import SummaryWriter
            # Set the Tensorboard logger
            tblogger = SummaryWriter(output_dir)

    # Idiomatic membership test (was args.expID.__contains__('base')).
    if 'base' in args.expID:
        os.environ['FABRICATOR'] = 'base'
    else:
        os.environ['FABRICATOR'] = 'fcl'
    print('log', os.environ['FABRICATOR'])

    ### Training Loop ###
    train_val(maskRCNN, args, optimizer, lr, dataloader, train_size,
              output_dir, tblogger)
def main():
    """Train the ApolloScape Car3D model (3D car instance understanding).

    Sets up the config for the Car3D dataset, scales LR/solver steps by the
    effective batch size, builds dataset/model/optimizer, optionally restores
    a checkpoint, then runs the step-based training loop with LR warm-up,
    step decay, gradient accumulation over ``args.iter_size``, and periodic
    checkpointing (plus a checkpoint on exception).
    """
    args = parse_args()
    print('Called with args:')
    print(args)

    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the code.")

    if args.cuda or cfg.NUM_GPUS > 0:
        cfg.CUDA = True
    else:
        raise ValueError("Need Cuda device to run !")

    merge_cfg_from_file(args.cfg_file)

    # Some manual adjustment for the ApolloScape dataset parameters here
    cfg.OUTPUT_DIR = args.output_dir
    cfg.TRAIN.DATASETS = 'Car3D'
    cfg.MODEL.NUM_CLASSES = 8
    if cfg.CAR_CLS.SIM_MAT_LOSS:
        cfg.MODEL.NUMBER_CARS = 79
    else:
        # Loss is only cross entropy, hence, we detect only car categories in the training set.
        cfg.MODEL.NUMBER_CARS = 34
    cfg.TRAIN.MIN_AREA = 196  # 14*14
    cfg.TRAIN.USE_FLIPPED = False  # Currently I don't know how to handle the flipped case
    cfg.TRAIN.IMS_PER_BATCH = 1
    cfg.NUM_GPUS = torch.cuda.device_count()
    effective_batch_size = cfg.TRAIN.IMS_PER_BATCH * cfg.NUM_GPUS * args.iter_size

    ### Adaptively adjust some configs ###
    original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH
    original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH
    original_num_gpus = cfg.NUM_GPUS
    if args.batch_size is None:
        args.batch_size = original_batch_size
    assert (args.batch_size % cfg.NUM_GPUS) == 0, 'batch_size: %d, NUM_GPUS: %d' % (
        args.batch_size, cfg.NUM_GPUS)
    print('effective_batch_size = batch_size * iter_size = %d * %d' %
          (args.batch_size, args.iter_size))
    print('Adaptive config changes:')
    print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size))
    print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS))
    print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH))

    ### Adjust learning based on batch size change linearly
    # For iter_size > 1, gradients are `accumulated`, so lr is scaled based
    # on batch_size instead of effective_batch_size
    old_base_lr = cfg.SOLVER.BASE_LR
    cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size
    print('Adjust BASE_LR linearly according to batch_size change:\n BASE_LR: {} --> {}'
          .format(old_base_lr, cfg.SOLVER.BASE_LR))

    ### Adjust solver steps
    step_scale = original_batch_size / effective_batch_size
    old_solver_steps = cfg.SOLVER.STEPS
    old_max_iter = cfg.SOLVER.MAX_ITER
    cfg.SOLVER.STEPS = list(
        map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS))
    cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5)
    print(
        'Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n'
        ' SOLVER.STEPS: {} --> {}\n'
        ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS,
                                             old_max_iter, cfg.SOLVER.MAX_ITER))

    # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function
    # of `collect_and_distribute_fpn_rpn_proposals.py`
    #
    # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5)
    if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN:
        cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch
        print(
            'Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n'
            ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format(
                cfg.FPN.RPN_COLLECT_SCALE))

    if args.num_workers is not None:
        cfg.DATA_LOADER.NUM_THREADS = args.num_workers
    print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS)

    ### Overwrite some solver settings from command line arguments
    if args.optimizer is not None:
        cfg.SOLVER.TYPE = args.optimizer
    if args.lr is not None:
        cfg.SOLVER.BASE_LR = args.lr
    if args.lr_decay_gamma is not None:
        cfg.SOLVER.GAMMA = args.lr_decay_gamma
    assert_and_infer_cfg()

    timers = defaultdict(Timer)

    ### Dataset ###
    timers['roidb'].tic()
    # The 3D->2D projection loss needs the dataset handle (camera intrinsics).
    if cfg.MODEL.LOSS_3D_2D_ON:
        roidb, ratio_list, ratio_index, ds = combined_roidb_for_training(
            cfg.TRAIN.DATASETS, args.dataset_dir)
    else:
        roidb, ratio_list, ratio_index = combined_roidb_for_training(
            cfg.TRAIN.DATASETS, args.dataset_dir)
    timers['roidb'].toc()
    roidb_size = len(roidb)
    logger.info('{:d} roidb entries'.format(roidb_size))
    logger.info('Takes %.2f sec(s) to construct roidb',
                timers['roidb'].average_time)

    # Effective training sample size for one epoch
    train_size = roidb_size // args.batch_size * args.batch_size

    sampler = MinibatchSampler(ratio_list, ratio_index)
    dataset = RoiDataLoader(roidb,
                            cfg.MODEL.NUM_CLASSES,
                            training=True,
                            valid_keys=[
                                'has_visible_keypoints', 'boxes', 'seg_areas',
                                'gt_classes', 'gt_overlaps',
                                'box_to_gt_ind_map', 'is_crowd',
                                'car_cat_classes', 'poses', 'quaternions',
                                'im_info'
                            ])
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        drop_last=True,
        sampler=sampler,
        num_workers=cfg.DATA_LOADER.NUM_THREADS,
        collate_fn=collate_minibatch)
    dataiterator = iter(dataloader)

    ### Model ###
    if cfg.MODEL.LOSS_3D_2D_ON:
        maskRCNN = Generalized_RCNN(ds.Car3D)
    else:
        maskRCNN = Generalized_RCNN()
    if cfg.CUDA:
        maskRCNN.cuda()

    ### Optimizer ###
    # Split params into bias / non-bias groups so biases can get a doubled LR
    # and optional zero weight decay.
    bias_params = []
    bias_param_names = []
    nonbias_params = []
    nonbias_param_names = []
    for key, value in dict(maskRCNN.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                bias_params.append(value)
                bias_param_names.append(key)
            else:
                nonbias_params.append(value)
                nonbias_param_names.append(key)
    # Learning rate of 0 is a dummy value to be set properly at the start of training
    params = [{
        'params': nonbias_params,
        'lr': 0,
        'weight_decay': cfg.SOLVER.WEIGHT_DECAY
    }, {
        'params': bias_params,
        'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1),
        'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0
    }]
    if cfg.SOLVER.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM)
    elif cfg.SOLVER.TYPE == "Adam":
        optimizer = torch.optim.Adam(params)

    ### Load checkpoint
    if args.load_ckpt:
        load_name = args.load_ckpt
        # Use the module logger (was logging.info) for consistency.
        logger.info("loading checkpoint %s", load_name)
        checkpoint = torch.load(load_name,
                                map_location=lambda storage, loc: storage)
        net_utils.load_ckpt(maskRCNN, checkpoint['model'],
                            ignore_list=args.ckpt_ignore_head)
        if args.resume:
            args.start_step = checkpoint['step'] + 1
            if 'train_size' in checkpoint:  # For backward compatibility
                if checkpoint['train_size'] != train_size:
                    print(
                        'train_size value: %d different from the one in checkpoint: %d'
                        % (train_size, checkpoint['train_size']))
            # reorder the params in optimizer checkpoint's params_groups if needed
            # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint)
            # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1.
            # However it's fixed on master.
            # optimizer.load_state_dict(checkpoint['optimizer'])
            misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer'])
        del checkpoint
        torch.cuda.empty_cache()

    if args.load_detectron:  # TODO resume for detectron weights (load sgd momentum values)
        logger.info("loading Detectron weights %s", args.load_detectron)
        load_detectron_weight(maskRCNN, args.load_detectron)

    lr = optimizer.param_groups[0][
        'lr']  # lr of non-bias parameters, for commmand line outputs.

    maskRCNN = mynn.DataParallel(maskRCNN,
                                 cpu_keywords=['im_info', 'roidb'],
                                 minibatch=True)

    ### Training Setups ###
    args.run_name = misc_utils.get_run_name() + '_step'
    # output_dir = os.path.join('/media/SSD_1TB/zzy/ApolloScape/ECCV2018_apollo/train', args.run_name)
    output_dir = misc_utils.get_output_dir(args, args.run_name)
    args.cfg_filename = os.path.basename(args.cfg_file)
    # Initialize so the `finally` clause below can never hit an unbound name.
    tblogger = None
    if not args.no_save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        blob = {'cfg': yaml.dump(cfg), 'args': args}
        with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f:
            pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL)
        if args.use_tfboard:
            from tensorboardX import SummaryWriter
            # Set the Tensorboard logger
            tblogger = SummaryWriter(output_dir)

    ### Training Loop ###
    maskRCNN.train()

    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    # Set index for decay steps
    decay_steps_ind = None
    for i in range(1, len(cfg.SOLVER.STEPS)):
        if cfg.SOLVER.STEPS[i] >= args.start_step:
            decay_steps_ind = i
            break
    if decay_steps_ind is None:
        decay_steps_ind = len(cfg.SOLVER.STEPS)

    training_stats = TrainingStats(
        args, args.disp_interval,
        tblogger if args.use_tfboard and not args.no_save else None)

    # FIX: this initialization was commented out, but LogIterStats() below reads
    # warmup_factor_trans on every iteration; with WARM_UP_METHOD 'constant'
    # (or when resuming past warm-up) it was never assigned -> NameError.
    warmup_factor_trans = 1.0
    try:
        logger.info('Training starts !')
        step = args.start_step
        for step in range(args.start_step, cfg.SOLVER.MAX_ITER):
            # Warm up
            if step < cfg.SOLVER.WARM_UP_ITERS:
                method = cfg.SOLVER.WARM_UP_METHOD
                if method == 'constant':
                    warmup_factor = cfg.SOLVER.WARM_UP_FACTOR
                elif method == 'linear':
                    alpha = step / cfg.SOLVER.WARM_UP_ITERS
                    warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha
                    # warmup_factor_trans = cfg.SOLVER.WARM_UP_FACTOR_TRANS * (1 - alpha) + alpha
                    # warmup_factor_trans *= cfg.TRANS_HEAD.LOSS_BETA
                    warmup_factor_trans = 1.0
                else:
                    raise KeyError(
                        'Unknown SOLVER.WARM_UP_METHOD: {}'.format(method))
                lr_new = cfg.SOLVER.BASE_LR * warmup_factor
                net_utils.update_learning_rate(optimizer, lr, lr_new)
                lr = optimizer.param_groups[0]['lr']
                assert lr == lr_new
            elif step == cfg.SOLVER.WARM_UP_ITERS:
                net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR)
                lr = optimizer.param_groups[0]['lr']
                assert lr == cfg.SOLVER.BASE_LR

            # Learning rate decay
            if decay_steps_ind < len(
                    cfg.SOLVER.STEPS
            ) and step == cfg.SOLVER.STEPS[decay_steps_ind]:
                logger.info('Decay the learning on step %d', step)
                lr_new = lr * cfg.SOLVER.GAMMA
                net_utils.update_learning_rate(optimizer, lr, lr_new)
                lr = optimizer.param_groups[0]['lr']
                assert lr == lr_new
                decay_steps_ind += 1

            training_stats.IterTic()
            optimizer.zero_grad()
            # Gradient accumulation: backward() per inner iter, one step() total.
            for inner_iter in range(args.iter_size):
                try:
                    input_data = next(dataiterator)
                except StopIteration:
                    # End of epoch: restart the loader.
                    dataiterator = iter(dataloader)
                    input_data = next(dataiterator)

                for key in input_data:
                    if key != 'roidb':  # roidb is a list of ndarrays with inconsistent length
                        input_data[key] = list(map(Variable, input_data[key]))

                net_outputs = maskRCNN(**input_data)
                net_outputs['losses'][
                    'loss_car_cls'] *= cfg.CAR_CLS.CAR_CLS_LOSS_BETA
                net_outputs['losses']['loss_rot'] *= cfg.CAR_CLS.ROT_LOSS_BETA
                if cfg.MODEL.TRANS_HEAD_ON:
                    net_outputs['losses'][
                        'loss_trans'] *= cfg.TRANS_HEAD.TRANS_LOSS_BETA
                training_stats.UpdateIterStats_car_3d(net_outputs)
                # start training
                # loss_car_cls: 2.233790, loss_rot: 0.296853, loss_trans: ~100
                loss = net_outputs['losses']['loss_car_cls'] + net_outputs[
                    'losses']['loss_rot']
                if cfg.MODEL.TRANS_HEAD_ON:
                    loss += net_outputs['losses']['loss_trans']
                if cfg.MODEL.LOSS_3D_2D_ON:
                    loss += net_outputs['losses']['UV_projection_loss']
                if not cfg.TRAIN.FREEZE_CONV_BODY and not cfg.TRAIN.FREEZE_RPN and not cfg.TRAIN.FREEZE_FPN:
                    loss += net_outputs['total_loss_conv']
                loss.backward()
            optimizer.step()
            training_stats.IterToc()

            training_stats.LogIterStats(step, lr, warmup_factor_trans)

            if (step + 1) % CHECKPOINT_PERIOD == 0:
                save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer)

        # ---- Training ends ----
        # Save last checkpoint
        save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer)

    except (RuntimeError, KeyboardInterrupt):
        del dataiterator
        logger.info('Save ckpt on exception ...')
        save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer)
        logger.info('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)

    finally:
        if args.use_tfboard and not args.no_save:
            tblogger.close()