def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
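# For context, a minimal sketch of how a train() function like the one above is
# typically invoked from a launcher script (argparse flags, distributed init, config
# merging). The flag names and the synchronize() helper are assumptions based on the
# usual maskrcnn_benchmark layout; they are not taken from this section.
import argparse
import os

import torch
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.utils.comm import synchronize


def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file")
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()

    # one process per GPU; torch.distributed.launch sets WORLD_SIZE for us
    num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    distributed = num_gpus > 1
    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.freeze()

    train(cfg, args.local_rank, distributed)


if __name__ == "__main__":
    main()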
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.deprecated.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments)

    return model
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank
            # this should be removed if we update BatchNorm stats
            # broadcast_buffers=False, find_unused_parameters=True
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, local_rank)

    return model
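# A hedged sketch (not taken from this repo) of the usual apex.amp backward step that
# pairs with the amp.initialize(...) call above. do_train itself is not shown in this
# section, and the names below (images, targets, loss_dict) are assumptions.
from apex import amp


def training_step(model, images, targets, optimizer, scheduler):
    loss_dict = model(images, targets)
    losses = sum(loss for loss in loss_dict.values())

    optimizer.zero_grad()
    # scale the loss so FP16 gradients do not underflow; apex unscales before the step
    with amp.scale_loss(losses, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
    scheduler.step()
    return loss_dict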
def train(cfg, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    warmup_layers = tuple(x for x in cfg.SOLVER.WARMUP_LAYERS if len(x) != 0)
    warmup_iters = cfg.SOLVER.WARMUP_ITERS

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, warmup_layers, warmup_iters)

    return model
def fit(self, train_data, test_data, classes):
    self.classes = classes

    optimizer = make_optimizer(self.cfg, self.model)
    scheduler = make_lr_scheduler(self.cfg, optimizer)

    arguments = {}
    arguments["iteration"] = 0

    self.checkpointer.classes = classes
    self.checkpointer.optimizer = optimizer
    self.checkpointer.scheduler = scheduler

    train_data_loader = make_data_loader(
        self.cfg,
        train_data,
        classes,
        is_train=True,
        is_distributed=False,
        start_iter=arguments["iteration"],
    )
    test_data_loader = make_data_loader(
        self.cfg,
        test_data,
        classes,
        is_train=True,
        is_distributed=False,
        start_iter=arguments["iteration"],
    )

    self.train_meter, self.test_meter = do_train(
        self.model,
        train_data_loader,
        test_data_loader,
        optimizer,
        scheduler,
        self.device,
        arguments,
    )
def train(cfg, local_rank, distributed, random_number_generator=None):
    # guard against missing private JIT hooks on other PyTorch builds
    if hasattr(torch._C, '_jit_set_profiling_executor'):
        torch._C._jit_set_profiling_executor(False)
    if hasattr(torch._C, '_jit_set_profiling_mode'):
        torch._C._jit_set_profiling_mode(False)

    # Model logging
    log_event(key=constants.GLOBAL_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH)
    log_event(key=constants.NUM_IMAGE_CANDIDATES, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # Initialize mixed-precision training
    is_fp16 = (cfg.DTYPE == "float16")
    if is_fp16:
        # convert model to FP16
        model.half()

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    log_event(key=constants.OPT_NAME, value="sgd_with_momentum")
    log_event(key=constants.OPT_BASE_LR, value=cfg.SOLVER.BASE_LR)
    log_event(key=constants.OPT_LR_WARMUP_STEPS, value=cfg.SOLVER.WARMUP_ITERS)
    log_event(key=constants.OPT_LR_WARMUP_FACTOR, value=cfg.SOLVER.WARMUP_FACTOR)
    log_event(key=constants.OPT_LR_DECAY_FACTOR, value=cfg.SOLVER.GAMMA)
    log_event(key=constants.OPT_LR_DECAY_STEPS, value=cfg.SOLVER.STEPS)
    log_event(key=constants.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN[0])
    log_event(key=constants.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)

    scheduler = make_lr_scheduler(cfg, optimizer)

    # disable the garbage collection
    gc.disable()

    if distributed:
        model = DDP(model, delay_allreduce=True)

    arguments = {}
    arguments["iteration"] = 0
    arguments["nhwc"] = cfg.NHWC

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, cfg.NHWC)
    arguments.update(extra_checkpoint_data)

    if is_fp16:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

    log_end(key=constants.INIT_STOP)
    barrier()
    log_start(key=constants.RUN_START)
    barrier()

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
        random_number_generator=random_number_generator,
    )
    log_event(key=constants.TRAIN_SAMPLES, value=len(data_loader))

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=distributed,
            min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
            min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    else:
        per_iter_callback_fn = None

    start_train_time = time.time()

    success = do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        cfg.DISABLE_REDUCED_LOGGING,
        per_iter_start_callback_fn=functools.partial(mlperf_log_epoch_start,
                                                     iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print(
        "&&&& MLPERF METRIC THROUGHPUT={:.4f} iterations / s".format(
            (arguments["iteration"] * cfg.SOLVER.IMS_PER_BATCH) / total_training_time
        )
    )

    return model, success
def train(cfg, local_rank, distributed, ft_flag):
    # build the model
    model = build_detection_model(cfg)
    # get the device the experiment runs on
    device = torch.device(cfg.MODEL.DEVICE)
    # print('device', device)
    model.to(device)

    # build the optimizer and lr scheduler
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0

    # if ft_flag:
    #     scheduler = None
    if ft_flag:
        checkpointer = DetectronCheckpointer(
            cfg, model, optimizer, None, output_dir, save_to_disk
        )
    else:
        checkpointer = DetectronCheckpointer(
            cfg, model, optimizer, scheduler, output_dir, save_to_disk
        )

    # print('cfg.MODEL.WEIGHT:', cfg.MODEL.WEIGHT)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)
    # print('extra_checkpoint_data:', extra_checkpoint_data.state_dict())

    # arguments["iteration"] = 50000
    # when fine-tuning, restart the iteration counter from 0
    arguments["iteration"] = 0 if ft_flag else arguments["iteration"]

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments)

    return model
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, None, None, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    # arguments.update(extra_checkpoint_data)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    logger = logging.getLogger("maskrcnn_benchmark.trainer")

    if cfg.MODEL.META_ARCHITECTURE == 'AdaptionRCNN':
        logger.info('AdaptionRCNN trainer is adapted!')
        cross_do_train(cfg, model, optimizer, scheduler, checkpointer, device,
                       checkpoint_period, arguments, distributed)
    elif cfg.MODEL.META_ARCHITECTURE == 'GeneralizedRCNN':
        logger.info('GeneralizedRCNN trainer is adapted!')
        data_loader = make_data_loader(
            cfg,
            is_train=True,
            is_distributed=distributed,
            start_iter=arguments["iteration"],
        )
        do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer, device,
                 checkpoint_period, arguments, distributed)

    return model
def train(total_cfg, local_rank, distributed):
    total_model = []
    for i in reversed(range(len(total_cfg))):
        model = build_detection_model(total_cfg[i])
        device = torch.device(total_cfg[i].MODEL.DEVICE)
        model.to(device)

        if total_cfg[i].MODEL.USE_SYNCBN:
            assert is_pytorch_1_1_0_or_later(), \
                "SyncBatchNorm is only available in pytorch >= 1.1.0"
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

        optimizer = make_optimizer(total_cfg[i], model)
        scheduler = make_lr_scheduler(total_cfg[i], optimizer)

        if distributed:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank,
                # this should be removed if we update BatchNorm stats
                broadcast_buffers=False,
            )

        arguments = {}
        arguments["iteration"] = 0

        output_dir = total_cfg[i].OUTPUT_DIR
        save_to_disk = get_rank() == 0
        checkpointer = DetectronCheckpointer(
            total_cfg[i], model, optimizer, scheduler, output_dir, save_to_disk
        )
        extra_checkpoint_data = checkpointer.load(total_cfg[i].MODEL.WEIGHT)
        if i == 0:
            arguments.update(extra_checkpoint_data)
        total_model.append(model)

    # note: `cfg` here and `args` below are assumed to be module-level names in the original script
    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    test_period = total_cfg[0].SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(total_cfg[0], is_train=False,
                                           is_distributed=distributed, is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = total_cfg[0].SOLVER.CHECKPOINT_PERIOD

    if len(total_model) > 1:
        params = sum([np.prod(p.size()) for p in total_model[1].parameters()])
        print('Number of Parameters:{:5f}M'.format(params / 1e6))
        params = sum([np.prod(p.size()) for p in total_model[0].parameters()])
        print('teacher_model Number of Parameters:{:5f}M'.format(params / 1e6))
    else:
        params = sum([np.prod(p.size()) for p in total_model[0].parameters()])
        print('Number of Parameters:{:5f}M'.format(params / 1e6))

    do_train(
        total_cfg,
        total_model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        test_period,
        arguments,
        args,
    )

    return total_model[1]
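# Side note (hypothetical helper, not part of the snippet above): the np.prod(p.size())
# reduction used for the parameter counts is equivalent to summing p.numel() over
# model.parameters(), which avoids the NumPy dependency.
def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

# e.g. print('Number of Parameters:{:5f}M'.format(count_parameters(total_model[0]) / 1e6))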
def train(cfg, local_rank, distributed):
    # build the model
    model = build_detection_model(cfg)
    # _C.MODEL.DEVICE = "cuda"
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # lr schedule, including learning-rate warmup
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    """
    opt_level takes one of four values: "O0", "O1", "O2", "O3".
    O0 is plain single-precision training.
    O1 runs most ops in half precision but keeps all model parameters in single
    precision, and keeps a few precision-sensitive ops (e.g. softmax) in single precision.
    O2 additionally casts the model parameters to half precision.
    O3 is essentially full half-precision computation.
    Note that whatever precision is used during optimization, the saved model is always
    stored in single precision, so it can be used normally in other applications.
    """
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    # multi-GPU training
    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    # load the checkpoint file
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    # build the datasets and return the data loaders
    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None

    # _C.SOLVER.CHECKPOINT_PERIOD = 2500
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer,
             device, checkpoint_period, test_period, arguments)

    return model
def train(cfg, cfg_origial, local_rank, distributed):
    ## The one with modified number of classes
    model = build_detection_model(cfg)

    # cfg_origial = cfg.clone()
    # cfg_origial.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81
    # original_model = build_detection_model(cfg_origial)  ## Original model with 81 classes
    #
    # ## Let's load weights for old class!
    # save_dir = cfg.OUTPUT_DIR
    # checkpointer = DetectronCheckpointer(cfg_origial, original_model, save_dir=save_dir)
    # checkpointer.load(cfg_origial.MODEL.WEIGHT)
    #
    # # pretrained_model_pth = "/network/home/bhattdha/.torch/models/_detectron_35861795_12_2017_baselines_e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT_output_train_coco_2014_train%3Acoco_2014_valminusminival_generalized_rcnn_model_final.pkl"
    #
    # # These keys are to be removed which forms final layers of the network
    # removal_keys = ['roi_heads.box.predictor.cls_score.weight', 'roi_heads.box.predictor.cls_score.bias',
    #                 'roi_heads.box.predictor.bbox_pred.weight', 'roi_heads.box.predictor.bbox_pred.bias',
    #                 'roi_heads.mask.predictor.mask_fcn_logits.weight', 'roi_heads.mask.predictor.mask_fcn_logits.bias']
    # model = _transfer_pretrained_weights(new_model, original_model, removal_keys)

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # # Initialize mixed-precision training
    # use_mixed_precision = cfg.DTYPE == "float16"
    # amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    # model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    # cfg.MODEL.WEIGHT = '/network/home/bhattdha/exp.pth'  ## Model stored through surgery
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments)

    return model
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0
    arguments['phase'] = 1
    arguments['plot_median'], arguments['plot_global_avg'] = defaultdict(list), defaultdict(list)

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    if arguments['phase'] == 1:
        data_loader = make_data_loader(
            cfg,
            is_train=True,
            is_distributed=distributed,
            start_iter=arguments["iteration"],
            phase=1,
        )
        do_train(cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer,
                 device, checkpoint_period, test_period, arguments, training_phase=1)
        arguments["iteration"] = 0
        arguments["phase"] = 2

    data_loader_phase2 = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
        phase=2,
    )
    do_train(cfg, model, data_loader_phase2, data_loader_val, optimizer, scheduler, checkpointer,
             device, checkpoint_period, test_period, arguments, training_phase=2)

    return model
def train(cfg, local_rank, distributed, resume, config_file):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    out_split = output_dir.split("/")
    tensorboard_dir = os.path.join("tensorboard",
                                   out_split[-1] if len(out_split[-1]) > 0 else out_split[-2])
    print("tensorboard_dir:", tensorboard_dir)
    if not os.path.isdir(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    else:
        if len(os.listdir(tensorboard_dir)) > 0:
            print("Remove previous tensorboard events...")
            os.system("rm " + tensorboard_dir + "/*")
    result_writer = SummaryWriter(tensorboard_dir)

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    # print("resume:", resume)
    extra_checkpoint_data = checkpointer.load(resume == "True", cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)
    if resume == "False":
        arguments["iteration"] = 0
    # print("arguments:", arguments.keys())

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments,
             results_recorder=result_writer, config_file=config_file)

    return model
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    if cfg.MODEL.USE_SYNCBN:
        assert is_pytorch_1_1_0_or_later(), \
            "SyncBatchNorm is only available in pytorch >= 1.1.0"
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer,
             device, checkpoint_period, test_period, arguments)

    return model
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    logger = logging.getLogger("maskrcnn_benchmark.train")
    logger.info("The train model: \n {}".format(model))
    device = torch.device(cfg.MODEL.DEVICE)
    if cfg.SOLVER.USE_SYNC_BN:
        model = apex.parallel.convert_syncbn_model(model)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O0")
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        # model = torch.nn.parallel.DistributedDataParallel(
        #     model, device_ids=[local_rank], output_device=local_rank,
        #     # this should be removed if we update BatchNorm stats
        #     # broadcast_buffers=False,
        # )
        model = DDP(model, delay_allreduce=True)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        mode=0,
        resolution=None,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    data_loader.collate_fn.special_deal = False

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    writer, arch_writer = setup_writer(output_dir, get_rank())
    if arch_writer is not None:
        arch_writer.write('Genotype: {}\n'.format(cfg.SEARCH.DECODER.CONFIG))
        arch_writer.close()

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, writer)

    return model
def train(cfg, local_rank, distributed):
    # Model logging
    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST, value=cfg.TEST.IMS_PER_BATCH)
    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION, value=cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD, value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD, value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD, value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS, value=cfg.MODEL.RPN.ASPECT_RATIOS)
    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)
    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY, value=cfg.SOLVER.WEIGHT_DECAY)

    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=distributed,
            min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
            min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    else:
        per_iter_callback_fn = None

    start_train_time = time.time()

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        per_iter_start_callback_fn=functools.partial(mlperf_log_epoch_start,
                                                     iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print("&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format(
        (arguments["iteration"] * 1.0) / total_training_time))

    return model
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)  # where it all begins
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # freeze the backbone parameters beyond layer 60
    for name, value in model.backbone.body.network.named_children():
        if int(name) > 60:
            for param in value.parameters():
                param.requires_grad = False

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"  # switch DTYPE to float16 for a speed-up
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer,
             device, checkpoint_period, test_period, arguments)

    return model
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # if use_amp:
    #     # Initialize mixed-precision training
    #     use_mixed_precision = cfg.DTYPE == "float16"
    #     amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)
    #     # wrap the optimizer for mixed precision
    #     if cfg.SOLVER.ACCUMULATE_GRAD:
    #         # also specify number of steps to accumulate over
    #         optimizer = amp_handle.wrap_optimizer(optimizer, num_loss=cfg.SOLVER.ACCUMULATE_STEPS)
    #     else:
    #         optimizer = amp_handle.wrap_optimizer(optimizer)
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    if distributed:
        if use_apex_ddp:
            model = DDP(model, delay_allreduce=True)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank,
                # this should be removed if we update BatchNorm stats
                broadcast_buffers=False,
            )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if 1 == 1:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=distributed,
            min_bbox_map=cfg.MIN_BBOX_MAP,
            min_segm_map=cfg.MIN_MASK_MAP)
    else:
        per_iter_callback_fn = None

    # use_amp and use_apex_ddp are assumed to be module-level flags in the original script
    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, use_amp, cfg,
             per_iter_end_callback_fn=per_iter_callback_fn)

    return model
def train(cfg, local_rank, distributed, use_tensorboard=False, logger=None):
    arguments = {"iteration": 0}

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    if cfg.SOLVER.UNFREEZE_CONV_BODY:
        for p in model.backbone.parameters():
            p.requires_grad = True

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir,
                                         save_to_disk, logger=logger)
    print(cfg.TRAIN.IGNORE_LIST)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, ignore_list=cfg.TRAIN.IGNORE_LIST)
    arguments.update(extra_checkpoint_data)

    if cfg.SOLVER.KEEP_LR:
        optimizer = make_optimizer(cfg, model)
        scheduler = make_lr_scheduler(cfg, optimizer)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    tensorboard_logdir = cfg.OUTPUT_DIR
    tensorboard_exp_name = cfg.TENSORBOARD_EXP_NAME
    snapshot = cfg.SOLVER.SNAPSHOT_ITERS

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, snapshot, tensorboard_logdir,
             tensorboard_exp_name, use_tensorboard=use_tensorboard)

    return model
def train(cfg, random_number_generator, local_rank, distributed, args, fp16=False):
    data_loader = make_data_loader(cfg, is_train=True, is_distributed=distributed)

    # todo sharath - uncomment log below after package is updated
    # print_mlperf(key=mlperf_log.INPUT_SIZE, value=len(data_loader.dataset))
    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE, value=cfg.DATALOADER.IMAGES_PER_BATCH_TRAIN)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST, value=cfg.DATALOADER.IMAGES_PER_BATCH_TEST)
    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION, value=cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD, value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD, value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD, value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS, value=cfg.MODEL.RPN.ASPECT_RATIOS)
    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)
    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)

    model = build_detection_model(cfg)
    load_from_pretrained_checkpoint(cfg, model)

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY, value=cfg.SOLVER.WEIGHT_DECAY)

    scheduler = make_lr_scheduler(cfg, optimizer)
    max_iter = cfg.SOLVER.MAX_ITER

    if use_apex_amp:
        amp_handle = amp.init(enabled=fp16, verbose=False)
        if cfg.SOLVER.ACCUMULATE_GRAD:
            # also specify number of steps to accumulate over
            optimizer = amp_handle.wrap_optimizer(optimizer, num_loss=cfg.SOLVER.ACCUMULATE_STEPS)
        else:
            optimizer = amp_handle.wrap_optimizer(optimizer)

    if distributed:
        if use_apex_ddp:
            model = DDP(model, delay_allreduce=True)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank)

    arguments = {}
    arguments["iteration"] = 0
    arguments["use_amp"] = use_apex_amp

    output_dir = cfg.OUTPUT_DIR

    if cfg.SAVE_CHECKPOINTS:
        checkpoint_file = cfg.CHECKPOINT
        checkpointer = Checkpoint(model, optimizer, scheduler, output_dir, local_rank)
        if checkpoint_file:
            extra_checkpoint_data = checkpointer.load(checkpoint_file)
            arguments.update(extra_checkpoint_data)
    else:
        checkpointer = None

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        max_iter,
        device,
        distributed,
        arguments,
        cfg,
        args,
        random_number_generator,
    )

    return model
def train(cfg, local_rank, distributed, model_config=None, use_tensorboard=True):
    model = build_detection_model(cfg, model_config)
    if get_rank() == 0:
        if 'search' in cfg.MODEL.BACKBONE.CONV_BODY:
            print('backbone search space:', blocks_key)
        else:
            print('backbone:', cfg.MODEL.BACKBONE)
        if 'search' in cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR or 'search' in cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH:
            print('head search space:', head_ss_keys)
        else:
            print('head:', cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR, cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH)
        if 'search' in cfg.MODEL.INTER_MODULE.NAME:
            print('inter search space:', inter_ss_keys)
        else:
            print('inter:', cfg.MODEL.INTER_MODULE.NAME)
        print(model)

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer, lr_dict = make_optimizer(cfg, model)
    if get_rank() == 0:
        for item in lr_dict:
            print(item)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    if not ('search' in cfg.MODEL.BACKBONE.CONV_BODY
            or 'search' in cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR
            or 'search' in cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH):
        use_mixed_precision = cfg.DTYPE == "float16"
        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False, find_unused_parameters=True)

    # if 'search' in cfg.MODEL.BACKBONE.CONV_BODY:
    #     def forward_hook(module: Module, inp: (Tensor,)):
    #         if module.weight is not None:
    #             module.weight.requires_grad = True
    #         if module.bias is not None:
    #             module.bias.requires_grad = True
    #     all_modules = (nn.Conv2d, nn.Linear, nn.BatchNorm2d, nn.GroupNorm, )  # GroupNorm needs updating too!
    #     for m in model.modules():
    #         if isinstance(m, all_modules):
    #             m.register_forward_pre_hook(forward_hook)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    if use_tensorboard:
        meters = TensorboardLogger(cfg=cfg,
                                   log_dir=cfg.TENSORBOARD_EXPERIMENT,
                                   start_iter=arguments['iteration'],
                                   delimiter=" ")
    else:
        meters = MetricLogger(delimiter=" ")

    do_train(cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer,
             device, checkpoint_period, test_period, arguments, meters)

    return model
def train(cfg, local_rank, distributed, use_tensorboard=False):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    if use_tensorboard:
        meters = TensorboardLogger(log_dir=cfg.TENSORBOARD_EXPERIMENT,
                                   stage='train',
                                   start_iter=arguments['iteration'],
                                   delimiter=" ")
        meters_val = TensorboardLogger(log_dir=cfg.TENSORBOARD_EXPERIMENT,
                                       stage='val',
                                       start_iter=arguments['iteration'],
                                       delimiter=" ")
    else:
        meters = MetricLogger(delimiter=" ")
        meters_val = MetricLogger(delimiter=" ")

    do_train(cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer,
             device, checkpoint_period, test_period, arguments, meters, meters_val)

    return model
def train(cfg, local_rank, distributed):
    # Model logging
    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST, value=cfg.TEST.IMS_PER_BATCH)
    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION, value=cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD, value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD, value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD, value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS, value=cfg.MODEL.RPN.ASPECT_RATIOS)
    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)
    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)

    # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/modeling/detector/detectors.py
    # builds the bare model without doing anything else
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY, value=cfg.SOLVER.WEIGHT_DECAY)

    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    print("output_dir " + str(output_dir))
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    # no such SAVE_CHECKPOINTS option in this config
    # arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS
    arguments["save_checkpoints"] = False

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"]
    )
    print("SSY iters_per_epoch " + str(iters_per_epoch))
    # print("SSY iters_per_epoch change to 100 ")
    # iters_per_epoch = 100

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    # SSY: I already added PER_EPOCH_EVAL and MIN_BBOX_MAP / MIN_SEGM_MAP to
    # ./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml but they still cannot be found,
    # so I set them manually here.
    # if cfg.PER_EPOCH_EVAL:
    #     per_iter_callback_fn = functools.partial(
    #         mlperf_test_early_exit,
    #         iters_per_epoch=iters_per_epoch,
    #         tester=functools.partial(test, cfg=cfg),
    #         model=model,
    #         distributed=distributed,
    #         min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
    #         min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    # else:
    #     per_iter_callback_fn = None
    per_iter_callback_fn = functools.partial(
        mlperf_test_early_exit,
        iters_per_epoch=iters_per_epoch,
        # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/engine/tester.py
        tester=functools.partial(test, cfg=cfg),
        model=model,
        distributed=distributed,
        min_bbox_map=0.377,
        min_segm_map=0.339)

    start_train_time = time.time()

    # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/engine/trainer.py
    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        per_iter_start_callback_fn=functools.partial(mlperf_log_epoch_start,
                                                     iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print(
        "&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format(
            (arguments["iteration"] * 1.0) / total_training_time
        )
    )

    return model
def train(cfg, local_rank, distributed, fp16, dllogger):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    use_amp = False
    if fp16:
        use_amp = True
    else:
        use_amp = cfg.DTYPE == "float16"

    if distributed:
        if use_apex_ddp:
            model = DDP(model, delay_allreduce=True)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank,
                # this should be removed if we update BatchNorm stats
                broadcast_buffers=False,
            )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg, dllogger=dllogger),
            model=model,
            distributed=distributed,
            min_bbox_map=cfg.MIN_BBOX_MAP,
            min_segm_map=cfg.MIN_MASK_MAP)
    else:
        per_iter_callback_fn = None

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, use_amp, cfg, dllogger,
             per_iter_end_callback_fn=per_iter_callback_fn)

    return model, iters_per_epoch
def train(cfg, local_rank, distributed):
    # original = torch.load('/home/zoey/nas/zoey/github/maskrcnn-benchmark/checkpoints/renderpy150000/model_0025000.pth')
    # new = {"model": original["model"]}
    # torch.save(new, '/home/zoey/nas/zoey/github/maskrcnn-benchmark/checkpoints/finetune/model_0000000.pth')

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    # if cfg.MODEL.DEPTH_ON == True:
    #     model_depth = build_detection_model(cfg)
    #     device = torch.device(cfg.MODEL.DEVICE)
    #     model_depth.to(device)
    #     optimizer_depth = make_optimizer(cfg, model_depth)
    #     scheduler_depth = make_lr_scheduler(cfg, optimizer_depth)
    #     model_depth, optimizer_depth = amp.initialize(model_depth, optimizer_depth, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir,
                                         save_to_disk, logger=None, isrgb=True, isdepth=True)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    # extra_checkpoint_data = checkpointer.load('/home/zoey/nas/zoey/github/maskrcnn-benchmark/checkpoints/renderpy150000/model_0025000.pth')
    arguments.update(extra_checkpoint_data)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments)

    return model
def train(cfg, local_rank, distributed, test_while_training):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # ipdb.set_trace()
    scheduler = make_lr_scheduler(cfg, optimizer)
    logger = logging.getLogger("train_main_script")

    arguments = {}
    arguments["iteration"] = 0
    arguments['start_save_ckpt'] = cfg.SOLVER.START_SAVE_CHECKPOINT

    ## define the output dir
    output_dir = cfg.OUTPUT_DIR
    checkpoint_output_dir = os.path.join(output_dir, 'checkpoints')
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         checkpoint_output_dir, save_to_disk)
    arguments['instance_id'] = output_dir.split('/')[-1]

    if len(cfg.MODEL.USE_DET_PRETRAIN) > 0:
        checkpointer.load_weight_partially(cfg.MODEL.USE_DET_PRETRAIN)
    elif len(cfg.MODEL.WEIGHT) > 0:
        extra_checkpoint_data, ckpt_name = checkpointer.load(cfg.MODEL.WEIGHT)
        arguments.update(extra_checkpoint_data)
    # logger.info(str(model))

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True,
        )

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    if test_while_training:
        logger.info("test_while_training on ")
        val_data_loader = make_data_loader(cfg, is_train=False, is_distributed=distributed)
    else:
        logger.info("test_while_training off ")
        val_data_loader = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, distributed, val_data_loader)

    return model
def train(cfg, args, DatasetCatalog=None):
    if len(cfg.DATASETS.TRAIN) == 0 or not args.train:
        return None

    local_rank = args.local_rank
    distributed = args.distributed

    model = build_detection_model(cfg)
    # for key, value in model.named_parameters():
    #     print(key, value.requires_grad)

    if hasattr(args, 'train_last_layer'):
        if args.train_last_layer:
            listofkeys = [
                'cls_score.bias', 'cls_score.weight',
                'bbox_pred.bias', 'bbox_pred.weight',
                'mask_fcn_logits.bias', 'mask_fcn_logits.weight'
            ]
            # freeze everything, then re-enable only the final-layer parameters
            for key, value in model.named_parameters():
                value.requires_grad = False
                for k in listofkeys:
                    if k in key:
                        value.requires_grad = True
            # for key, value in model.named_parameters():
            #     print(key, value.requires_grad)

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    if cfg.MODEL.DEVICE == 'cuda':
        use_mixed_precision = cfg.DTYPE == "float16"
        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         cfg.OUTPUT_DIR, save_to_disk)
    extra_checkpoint_data = checkpointer.load(
        cfg.MODEL.WEIGHT,
        force_load_external_checkpoint=False,
        copy_weight_from_head_box=args.copy_weight_from_head_box)

    # note: assumes the loaded checkpoint data always provides an "iteration" entry
    arguments = {}
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        args,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
        DatasetCatalog=DatasetCatalog,
    )
    if cfg.SOLVER.TEST_PERIOD > 0:
        data_loader_val = make_data_loader(
            cfg,
            args,
            is_train=False,
            is_distributed=distributed,
            is_for_period=True,
            start_iter=arguments["iteration"],
            DatasetCatalog=DatasetCatalog,
        )
    else:
        data_loader_val = None

    do_train(
        model,
        cfg,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        cfg.SOLVER.CHECKPOINT_PERIOD,
        cfg.SOLVER.TEST_PERIOD,
        arguments,
        cfg.OUTPUT_DIR,
        args.visualize_loss,
        args.vis_title,
        args.iters_per_epoch,
    )

    return model
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # load a reid model
    reid_model = build_reid_model(cfg)
    reid_model.to(device)
    print('#######loading from {}#######'.format(cfg.REID.TEST.WEIGHT))
    f = torch.load(cfg.REID.TEST.WEIGHT, map_location=torch.device("cpu"))
    if 'model' in f:
        load_state_dict(reid_model, f['model'])
    else:
        reid_model.load_state_dict(f, strict=False)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # model, optimizer = amp.initialize(model, optimizer, opt_level="O0")

    if distributed:
        model = DDP(model, delay_allreduce=True)
        # model = torch.nn.parallel.DistributedDataParallel(
        #     model, device_ids=[local_rank], output_device=local_rank,
        #     # this should be removed if we update BatchNorm stats
        #     broadcast_buffers=False,
        # )

    arguments = {}
    arguments["iteration"] = 0

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    output_dir = os.path.join(cfg.OUTPUT_DIR, cfg.SUBDIR,
                              'GPU' + str(num_gpus) + '_LR' + str(cfg.SOLVER.BASE_LR))
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(reid_model, model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments)

    return model
def train(cfg, local_rank, distributed, random_number_generator=None):
    # Model logging
    mlperf_print(key=constants.GLOBAL_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH)
    mlperf_print(key=constants.NUM_IMAGE_CANDIDATES, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    # mlperf_print(key=constants.OPT_NAME, value="sgd_with_momentum")
    mlperf_print(key=constants.OPT_BASE_LR, value=cfg.SOLVER.BASE_LR)
    mlperf_print(key=constants.OPT_LR_WARMUP_STEPS, value=cfg.SOLVER.WARMUP_ITERS)
    mlperf_print(key=constants.OPT_LR_WARMUP_FACTOR, value=cfg.SOLVER.WARMUP_FACTOR)

    scheduler = make_lr_scheduler(cfg, optimizer)

    # disable the garbage collection
    gc.disable()

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    # amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)  # , verbose=cfg.AMP_VERBOSE)

    if distributed:
        model = DDP(model, delay_allreduce=True)

    arguments = {}
    arguments["iteration"] = 0
    arguments["nhwc"] = cfg.NHWC

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, cfg.NHWC)
    arguments.update(extra_checkpoint_data)

    # At this point we've loaded relevant checkpoint(s) and can now cast
    # FrozenBatchNorm2d layers to half() if necessary.
    # This allows us to move parameter casting logic out of the BN code itself,
    # which was preventing us from annotating bn.forward with @script_method, which
    # was preventing the cross-module fusion of BN with ReLU / Add-ReLU.
    if use_mixed_precision:
        model = cast_frozen_bn_to_half(model)

    mlperf_print(key=constants.INIT_STOP, sync=True)
    mlperf_print(key=constants.RUN_START, sync=True)
    barrier()

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
        random_number_generator=random_number_generator,
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=distributed,
            min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
            min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    else:
        per_iter_callback_fn = None

    start_train_time = time.time()

    success = do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        cfg.DISABLE_REDUCED_LOGGING,
        per_iter_start_callback_fn=functools.partial(mlperf_log_epoch_start,
                                                     iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print("&&&& MLPERF METRIC THROUGHPUT={:.4f} iterations / s".format(
        (arguments["iteration"] * cfg.SOLVER.IMS_PER_BATCH) / total_training_time))

    return model, success
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.deprecated.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    logger = logging.getLogger("Training")

    with tools.TimerBlock("Loading Experimental setups", logger) as block:
        exp_name = cfg.EXP.NAME
        output_dir = tools.get_exp_output_dir(exp_name, cfg.OUTPUT_DIR)
        checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
        validation_period = cfg.SOLVER.VALIDATION_PERIOD

    with tools.TimerBlock("Loading Checkpoints...", logger) as block:
        arguments = {}
        save_to_disk = local_rank == 0
        checkpointer = Checkpointer(
            model,
            save_dir=output_dir,
            save_to_disk=save_to_disk,
            num_class=cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES,
        )
        extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
        arguments.update(extra_checkpoint_data)
        arguments["iteration"] = 0

    with tools.TimerBlock("Initializing DAVIS Datasets", logger) as block:
        logger.info("Loading training set...")
        data_loader_train = make_data_loader(
            cfg,
            is_train=True,
            is_distributed=distributed,
            start_iter=arguments["iteration"],
        )
        logger.info("Loading valid set...")
        data_loaders_valid = make_data_loader(
            cfg,
            is_train=False,
            is_distributed=distributed,
        )

    do_train(
        model,
        data_loader_train,
        data_loaders_valid[0],
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        validation_period,
        arguments,
        exp_name,
    )

    return model
def train(cfg, local_rank, distributed):
    model_det = build_detection_model(cfg)
    model_G = model_det.backbone
    # model_G = copy.deepcopy(model_det).backbone
    # model_D = define_D(256, 64, which_model_netD='det', n_layers_D=5)
    model_D = DEBUG_DESC(256, 64, n_layers=5, use_sigmoid=True)
    models = [model_det, model_G, model_D]

    device = torch.device(cfg.MODEL.DEVICE)
    for model in models:
        model.to(device)

    optimizer_det = make_optimizer(cfg, model_det)
    scheduler_det = make_lr_scheduler(cfg, optimizer_det)

    # optimizer_G = make_optimizer_Adam(cfg, model_det.backbone)
    optimizer_G = make_optimizer(cfg, model_det.backbone)
    scheduler_G = None

    optimizer_D = make_optimizer_Adam(cfg, model_D)
    # optimizer_D = make_optimizer(cfg, model_D)
    scheduler_D = None

    optimizers = [optimizer_det, optimizer_D, optimizer_G]
    schedulers = [scheduler_det, scheduler_D, scheduler_G]

    if distributed:
        for i, model in enumerate(models):
            models[i] = torch.nn.parallel.deprecated.DistributedDataParallel(
                models[i], device_ids=[local_rank], output_device=local_rank,
                # this should be removed if we update BatchNorm stats
                broadcast_buffers=False,
            )

    arguments = {}
    manual_iter = 0
    print("WARNING! MANUAL ITERATION IS", manual_iter)
    arguments["iteration"] = manual_iter

    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer_det = DetectronCheckpointer(
        cfg, model_det, optimizer_det, scheduler_det, output_dir, save_to_disk
    )
    checkpointer_D = Checkpointer(
        model_D, optimizer_D, None, output_dir, save_to_disk
    )
    checkpointers = [checkpointer_det, checkpointer_D]

    print('WARNING! REMOVED "iteration" from train_net.py')
    extra_checkpoint_data = checkpointer_det.load(cfg.MODEL.WEIGHT)
    extra_checkpoint_data = {"iteration": 0}
    arguments.update(extra_checkpoint_data)

    data_loaders = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)

    arguments["need_adapt"] = False
    arguments["need_train_A"] = False
    arguments["need_train_B"] = True

    do_train(
        models,
        data_loaders,
        data_loaders_val,
        optimizers,
        schedulers,
        checkpointers,
        device,
        checkpoint_period,
        arguments,
    )

    # note: `model` here is whatever the loops above last bound it to (the discriminator),
    # not model_det
    return model