def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True) for ds in dataset ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val eval_cfg = cfg.get('evaluation', {}) if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook( CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg)) else: dataset_type = DATASETS.get(val_dataset_cfg.type) if issubclass(dataset_type, datasets.CocoDataset): runner.register_hook( CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg)) else: runner.register_hook( DistEvalmAPHook(val_dataset_cfg, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from, resume_optimizer=cfg.resume_optimizer) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, shuffle=True, replace=getattr(cfg.data, 'sampling_replace', False), seed=cfg.seed, drop_last=getattr(cfg.data, 'drop_last', False)) for ds in dataset ] # put model on gpus model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner( model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register custom hooks for hook in cfg.get('custom_hooks', ()): if hook.type == 'DeepClusterHook': common_params = dict(dist_mode=True, data_loaders=data_loaders) else: common_params = dict(dist_mode=True) runner.register_hook(build_hook(hook, common_params)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, seed=cfg.seed) for ds in dataset ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val eval_cfg = cfg.get('evaluation', {}) runner.register_hook(DistEvalHook(val_dataset_cfg, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False, **kwargs): logger = get_root_logger(cfg.log_level) # prepare data loaders data_loaders = [ build_dataloader( dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, repeat_samples=cfg.train_cfg.repeat_samples, **kwargs) ] # put model on gpus model = MMDistributedDataParallel(model.cuda(), device_ids=[torch.cuda.current_device()]) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner( model, #batch_processor=batch_processor, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger) #runner = Runner(model, batch_processor, optimizer, cfg.work_dir, # logger) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook(CocoDistEvalRecallHook(val_dataset_cfg)) else: dataset_type = getattr(datasets, val_dataset_cfg.type) if issubclass(dataset_type, datasets.CocoDataset): runner.register_hook(CocoDistEvalmAPHook(val_dataset_cfg)) else: runner.register_hook(DistEvalmAPHook(val_dataset_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train_runner(model, trainDataset, valDataset, cfg, validate=False) -> Runner: # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = valDataset eval_cfg = cfg.get('evaluation', {}) if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook( CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg)) else: dataset_type = getattr(mmdetDatasets, val_dataset_cfg.type) if issubclass(dataset_type, CocoDataset): runner.register_hook( CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg)) else: runner.register_hook( DistEvalmAPHook(val_dataset_cfg, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) return runner
def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ build_dataloader(dataset, cfg.data.videos_per_gpu, cfg.data.workers_per_gpu, dist=True) ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner # Hard-coding type of optimizer for now print('Training #Params: ', len(list(filter(lambda p: p.requires_grad, model.parameters())))) optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=cfg.optimizer.lr, momentum=cfg.optimizer.momentum, weight_decay=cfg.optimizer.weight_decay) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: if cfg.data.val.type in ['RawFramesDataset', 'VideoDataset']: runner.register_hook( DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5))) if cfg.data.val.type == 'AVADataset': runner.register_hook(AVADistEvalmAPHook(cfg.data.val)) # if validate: # if isinstance(model.module, RPN): # # TODO: implement recall hooks for other datasets # runner.register_hook(CocoDistEvalRecallHook(cfg.data.val)) # else: # if cfg.data.val.type == 'CocoDataset': # runner.register_hook(CocoDistEvalmAPHook(cfg.data.val)) # else: # runner.register_hook(DistEvalmAPHook(cfg.data.val)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ build_dataloader( dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, ) ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner runner = Runner( model, batch_processor, cfg.optimizer, cfg.work_dir, cfg.log_level, ) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks( cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, ) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook(CocoDistEvalRecallHook(cfg.data.val)) else: if cfg.data.val.type == "CocoDataset": runner.register_hook(CocoDistEvalmAPHook(cfg.data.val)) else: runner.register_hook(DistEvalmAPHook(cfg.data.val)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) else: runner.load_checkpoint(tmp) runner.run(data_loaders, cfg.workflow, cfg.total_epochs) shutil.copyfile(tmp, bw)
def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ build_dataloader(dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True) ] # put model on gpus find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) # model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook(CocoDistEvalRecallHook(val_dataset_cfg)) else: dataset_type = getattr(datasets, val_dataset_cfg.type) if issubclass(dataset_type, datasets.CocoDataset): runner.register_hook(CocoDistEvalmAPHook(val_dataset_cfg)) else: runner.register_hook(DistEvalmAPHook(val_dataset_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ build_dataloader(dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True) ] # data_loaders = [ # build_dataloader( # ds, # cfg.data.imgs_per_gpu, # cfg.data.workers_per_gpu, # dist=True) # for ds in dataset # ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook(CocoDistEvalRecallHook(val_dataset_cfg)) else: dataset_type = getattr(datasets, val_dataset_cfg.type) if issubclass(dataset_type, datasets.CocoDataset): runner.register_hook(CocoDistEvalmAPHook(val_dataset_cfg)) else: runner.register_hook(DistEvalmAPHook(val_dataset_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, data_loaders, batch_processor, cfg): # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ build_dataloader( dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True) ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level, mean_teacher=cfg.mean_teacher) # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val eval_cfg = cfg.get('evaluation', {}) eval_hook = eval_cfg.pop('eval_hook', 'CocoDistEvalmAPHook') EvalHook = getattr(core, eval_hook) runner.register_hook(EvalHook(val_dataset_cfg, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) if cfg.mean_teacher: runner.load_mean_teacher_checkpoint(cfg) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders try: pad_size = cfg.data.pad_size print("using padding size") except: pad_size = None data_loaders = [ build_dataloader(dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, pad_size=pad_size, dist=True) ] rank = int(os.environ['RANK']) num_gpus = torch.cuda.device_count() # put model on gpus model = MMDistributedDataParallel(model.cuda(rank % num_gpus)) torch.cuda.empty_cache() # build runner runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir, cfg.log_level) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook(CocoDistEvalRecallHook(cfg.data.val)) else: if cfg.data.val.type == 'CocoDataset' or cfg.data.val.type == 'CocoZipDataset': runner.register_hook(CocoDistEvalmAPHook(cfg.data.val)) else: runner.register_hook(DistEvalmAPHook(cfg.data.val)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ build_dataloader(dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True) ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # with torch.no_grad(): # for j in range(2): # print(j) # for i, data_batch in enumerate(data_loaders[0]): # _ = model(**data_batch) # # break # build runner runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir, cfg.log_level) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook(CocoDistEvalRecallHook(cfg.data.val)) else: if cfg.data.val.type == 'CocoDataset': runner.register_hook(CocoDistEvalmAPHook(cfg.data.val)) else: runner.register_hook(DistEvalmAPHook(cfg.data.val)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ build_dataloader(dataset, cfg.data.videos_per_gpu, cfg.data.workers_per_gpu, dist=True) ] # put model on gpus model = MMDistributedDataParallel(model.cuda(), device_ids=[torch.cuda.current_device()]) # build runner runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir, cfg.log_level) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: if cfg.data.val.type in ['RawFramesDataset', 'VideoDataset']: runner.register_hook( DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5))) if cfg.data.val.type == 'AVADataset': runner.register_hook(AVADistEvalmAPHook(cfg.data.val)) # if validate: # if isinstance(model.module, RPN): # # TODO: implement recall hooks for other datasets # runner.register_hook(CocoDistEvalRecallHook(cfg.data.val)) # else: # if cfg.data.val.type == 'CocoDataset': # runner.register_hook(CocoDistEvalmAPHook(cfg.data.val)) # else: # runner.register_hook(DistEvalmAPHook(cfg.data.val)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def to_ddp(self, device_ids, dim, broadcast_buffers, find_unused_parameters, **kwargs): """Wrap models with separate MMDistributedDataParallel. It only wraps the modules with parameters. """ for name, module in self.module._modules.items(): if next(module.parameters(), None) is None: module = module.cuda() elif all(not p.requires_grad for p in module.parameters()): module = module.cuda() else: module = MMDistributedDataParallel( module.cuda(), device_ids=device_ids, dim=dim, broadcast_buffers=broadcast_buffers, find_unused_parameters=find_unused_parameters, **kwargs) self.module._modules[name] = module
def _dist_train(model, datasets, cfg, validate=False, logger=None): # prepare data loaders data_loaders = [ build_dataloader( dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True) for dataset in datasets ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner runner = NASRunner(model, batch_processor, None, cfg.work_dir, cfg.log_level, cfg=cfg, logger=logger) # register hooks weight_optim_config = DistOptimizerHook(**cfg.optimizer.weight_optim.optimizer_config) arch_optim_config = ArchDistOptimizerHook(**cfg.optimizer.arch_optim.optimizer_config) runner.register_training_hooks(cfg.lr_config, weight_optim_config, arch_optim_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook(CocoDistEvalRecallHook(cfg.data.val)) else: if cfg.dataset_type == 'CocoDataset': # runner.register_hook(CocoDistEvalmAPHook_(datasets[1])) runner.register_hook(CocoDistEvalmAPHook(cfg.data.val_)) else: runner.register_hook(DistEvalmAPHook(cfg.data.val)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs, cfg.arch_update_epoch)
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # if just swa training is performed, # skip building the runner for the traditional training if not cfg.get('only_swa_training', False): # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs) else: # if just swa training is performed, there should be a starting model assert cfg.swa_resume_from is not None or cfg.swa_load_from is not None # perform swa training # build swa training runner if not cfg.get('swa_training', False): return from mmdet.core import SWAHook logger.info('Start SWA training') swa_optimizer = build_optimizer(model, cfg.swa_optimizer) swa_runner = EpochBasedRunner(model, optimizer=swa_optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same swa_runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: swa_optimizer_config = Fp16OptimizerHook(**cfg.swa_optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.swa_optimizer_config: swa_optimizer_config = OptimizerHook(**cfg.swa_optimizer_config) else: swa_optimizer_config = cfg.swa_optimizer_config # register hooks swa_runner.register_training_hooks(cfg.swa_lr_config, swa_optimizer_config, cfg.swa_checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: swa_runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook swa_runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) swa_eval = True swa_eval_hook = eval_hook(val_dataloader, **eval_cfg) else: swa_eval = False swa_eval_hook = None # register swa hook swa_hook = SWAHook(swa_eval=swa_eval, eval_hook=swa_eval_hook) swa_runner.register_hook(swa_hook, priority='LOW') # register user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) swa_runner.register_hook(hook, priority=priority) if cfg.swa_resume_from: swa_runner.resume(cfg.swa_resume_from) elif cfg.swa_load_from: # use the best pretrained model as the starting model for swa training if cfg.swa_load_from == 'best_bbox_mAP.pth': best_model_path = os.path.join(cfg.work_dir, cfg.swa_load_from) assert os.path.exists(best_model_path) # avoid the best pretrained model being overwritten new_best_model_path = os.path.join(cfg.work_dir, 'best_bbox_mAP_pretrained.pth') os.rename(best_model_path, new_best_model_path) cfg.swa_load_from = new_best_model_path swa_runner.load_checkpoint(cfg.swa_load_from) swa_runner.run(data_loaders, cfg.workflow, cfg.swa_total_epochs)
def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """Train model entry function. Args: model (nn.Module): The model to be trained. dataset (:obj:`Dataset`): Train dataset. cfg (dict): The config dict for training. distributed (bool): Whether to use distributed training. Default: False. validate (bool): Whether to do evaluation. Default: False. timestamp (str | None): Local time for runner. Default: None. meta (dict | None): Meta dict to record some important information. Default: None """ logger = get_root_logger(log_level=cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', {}), workers_per_gpu=cfg.data.get('workers_per_gpu', {}), # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) dataloader_setting = dict(dataloader_setting, **cfg.data.get('train_dataloader', {})) data_loaders = [ build_dataloader(ds, **dataloader_setting) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) if validate: eval_cfg = cfg.get('evaluation', {}) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', {}), workers_per_gpu=cfg.data.get('workers_per_gpu', {}), # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('val_dataloader', {})) val_dataloader = build_dataloader(val_dataset, **dataloader_setting) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_model(model, dataset, cfg, distributed=False, validate=False, test=dict(test_best=False, test_last=False), timestamp=None, meta=None): """Train model entry function. Args: model (nn.Module): The model to be trained. dataset (:obj:`Dataset`): Train dataset. cfg (dict): The config dict for training. distributed (bool): Whether to use distributed training. Default: False. validate (bool): Whether to do evaluation. Default: False. test (dict): The testing option, with two keys: test_last & test_best. The value is True or False, indicating whether to test the corresponding checkpoint. Default: dict(test_best=False, test_last=False). timestamp (str | None): Local time for runner. Default: None. meta (dict | None): Meta dict to record some important information. Default: None """ logger = get_root_logger(log_level=cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=cfg.data.get('workers_per_gpu', 1), num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) dataloader_setting = dict(dataloader_setting, **cfg.data.get('train_dataloader', {})) if cfg.omnisource: # The option can override videos_per_gpu train_ratio = cfg.data.get('train_ratio', [1] * len(dataset)) omni_videos_per_gpu = cfg.data.get('omni_videos_per_gpu', None) if omni_videos_per_gpu is None: dataloader_settings = [dataloader_setting] * len(dataset) else: dataloader_settings = [] for videos_per_gpu in omni_videos_per_gpu: this_setting = cp.deepcopy(dataloader_setting) this_setting['videos_per_gpu'] = videos_per_gpu dataloader_settings.append(this_setting) data_loaders = [ build_dataloader(ds, **setting) for ds, setting in zip(dataset, dataloader_settings) ] else: data_loaders = [ build_dataloader(ds, **dataloader_setting) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) Runner = OmniSourceRunner if cfg.omnisource else EpochBasedRunner runner = Runner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: if cfg.omnisource: runner.register_hook(OmniSourceDistSamplerSeedHook()) else: runner.register_hook(DistSamplerSeedHook()) # precise bn setting if cfg.get('precise_bn', False): precise_bn_dataset = build_dataset(cfg.data.train) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=1, # save memory and time num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) data_loader_precise_bn = build_dataloader(precise_bn_dataset, **dataloader_setting) precise_bn_hook = PreciseBNHook(data_loader_precise_bn, **cfg.get('precise_bn')) runner.register_hook(precise_bn_hook) if validate: eval_cfg = cfg.get('evaluation', {}) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=cfg.data.get('workers_per_gpu', 1), # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('val_dataloader', {})) val_dataloader = build_dataloader(val_dataset, **dataloader_setting) eval_hook = DistEvalHook(val_dataloader, **eval_cfg) if distributed \ else EvalHook(val_dataloader, **eval_cfg) runner.register_hook(eval_hook) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner_kwargs = dict() if cfg.omnisource: runner_kwargs = dict(train_ratio=train_ratio) runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **runner_kwargs) if test['test_last'] or test['test_best']: best_ckpt_path = None if test['test_best']: if hasattr(eval_hook, 'best_ckpt_path'): best_ckpt_path = eval_hook.best_ckpt_path if best_ckpt_path is None or not osp.exists(best_ckpt_path): test['test_best'] = False if best_ckpt_path is None: runner.logger.info('Warning: test_best set as True, but ' 'is not applicable ' '(eval_hook.best_ckpt_path is None)') else: runner.logger.info('Warning: test_best set as True, but ' 'is not applicable (best_ckpt ' f'{best_ckpt_path} not found)') if not test['test_last']: return test_dataset = build_dataset(cfg.data.test, dict(test_mode=True)) gpu_collect = cfg.get('evaluation', {}).get('gpu_collect', False) tmpdir = cfg.get('evaluation', {}).get('tmpdir', osp.join(cfg.work_dir, 'tmp')) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=cfg.data.get('workers_per_gpu', 1), num_gpus=len(cfg.gpu_ids), dist=distributed, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('test_dataloader', {})) test_dataloader = build_dataloader(test_dataset, **dataloader_setting) names, ckpts = [], [] if test['test_last']: names.append('last') ckpts.append(None) if test['test_best']: names.append('best') ckpts.append(best_ckpt_path) for name, ckpt in zip(names, ckpts): if ckpt is not None: runner.load_checkpoint(ckpt) outputs = multi_gpu_test(runner.model, test_dataloader, tmpdir, gpu_collect) rank, _ = get_dist_info() if rank == 0: out = osp.join(cfg.work_dir, f'{name}_pred.pkl') test_dataset.dump_results(outputs, out) eval_cfg = cfg.get('evaluation', {}) for key in [ 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 'rule', 'by_epoch', 'broadcast_bn_buffers' ]: eval_cfg.pop(key, None) eval_res = test_dataset.evaluate(outputs, **eval_cfg) runner.logger.info(f'Testing results of the {name} checkpoint') for metric_name, val in eval_res.items(): runner.logger.info(f'{metric_name}: {val:.04f}')
def _dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, seed=cfg.seed) for ds in dataset ] # put model on gpus find_unused_parameters = cfg.get('find_unused_parameters', True) # False # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=True, shuffle=False) eval_cfg = cfg.get('evaluation', {}) runner.register_hook(DistEvalHook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_model(model, dataset, cfg, distributed=False, validate=False, test=dict(test_best=False, test_last=False), timestamp=None, meta=None): """Train model entry function. Args: model (nn.Module): The model to be trained. dataset (:obj:`Dataset`): Train dataset. cfg (dict): The config dict for training. distributed (bool): Whether to use distributed training. Default: False. validate (bool): Whether to do evaluation. Default: False. test (dict): The testing option, with two keys: test_last & test_best. The value is True or False, indicating whether to test the corresponding checkpoint. Default: dict(test_best=False, test_last=False). timestamp (str | None): Local time for runner. Default: None. meta (dict | None): Meta dict to record some important information. Default: None """ logger = get_root_logger(log_level=cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=cfg.data.get('workers_per_gpu', 1), persistent_workers=cfg.data.get('persistent_workers', False), num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) dataloader_setting = dict(dataloader_setting, **cfg.data.get('train_dataloader', {})) if cfg.omnisource: # The option can override videos_per_gpu train_ratio = cfg.data.get('train_ratio', [1] * len(dataset)) omni_videos_per_gpu = cfg.data.get('omni_videos_per_gpu', None) if omni_videos_per_gpu is None: dataloader_settings = [dataloader_setting] * len(dataset) else: dataloader_settings = [] for videos_per_gpu in omni_videos_per_gpu: this_setting = cp.deepcopy(dataloader_setting) this_setting['videos_per_gpu'] = videos_per_gpu dataloader_settings.append(this_setting) data_loaders = [ build_dataloader(ds, **setting) for ds, setting in zip(dataset, dataloader_settings) ] else: data_loaders = [ build_dataloader(ds, **dataloader_setting) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model, device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) Runner = OmniSourceRunner if cfg.omnisource else EpochBasedRunner runner = Runner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None), custom_hooks_config=cfg.get( 'custom_hooks', None)) # multigrid setting multigrid_cfg = cfg.get('multigrid', None) if multigrid_cfg is not None: from mmaction.utils.multigrid import LongShortCycleHook multigrid_scheduler = LongShortCycleHook(cfg) runner.register_hook(multigrid_scheduler) logger.info('Finish register multigrid hook') # subbn3d aggregation is HIGH, as it should be done before # saving and evaluation from mmaction.utils.multigrid import SubBatchNorm3dAggregationHook subbn3d_aggre_hook = SubBatchNorm3dAggregationHook() runner.register_hook(subbn3d_aggre_hook, priority='VERY_HIGH') logger.info('Finish register subbn3daggre hook') # precise bn setting if cfg.get('precise_bn', False): precise_bn_dataset = build_dataset(cfg.data.train) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=1, # save memory and time persistent_workers=cfg.data.get('persistent_workers', False), num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) data_loader_precise_bn = build_dataloader(precise_bn_dataset, **dataloader_setting) precise_bn_hook = PreciseBNHook(data_loader_precise_bn, **cfg.get('precise_bn')) runner.register_hook(precise_bn_hook, priority='HIGHEST') logger.info('Finish register precisebn hook') if distributed: if cfg.omnisource: runner.register_hook(OmniSourceDistSamplerSeedHook()) else: runner.register_hook(DistSamplerSeedHook()) if validate: eval_cfg = cfg.get('evaluation', {}) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=cfg.data.get('workers_per_gpu', 1), persistent_workers=cfg.data.get('persistent_workers', False), # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('val_dataloader', {})) val_dataloader = build_dataloader(val_dataset, **dataloader_setting) eval_hook = DistEvalHook(val_dataloader, **eval_cfg) if distributed \ else EvalHook(val_dataloader, **eval_cfg) runner.register_hook(eval_hook) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner_kwargs = dict() if cfg.omnisource: runner_kwargs = dict(train_ratio=train_ratio) runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **runner_kwargs) if distributed: dist.barrier() time.sleep(5) if test['test_last'] or test['test_best']: best_ckpt_path = None if test['test_best']: ckpt_paths = [x for x in os.listdir(cfg.work_dir) if 'best' in x] ckpt_paths = [x for x in ckpt_paths if x.endswith('.pth')] if len(ckpt_paths) == 0: runner.logger.info('Warning: test_best set, but no ckpt found') test['test_best'] = False if not test['test_last']: return elif len(ckpt_paths) > 1: epoch_ids = [ int(x.split('epoch_')[-1][:-4]) for x in ckpt_paths ] best_ckpt_path = ckpt_paths[np.argmax(epoch_ids)] else: best_ckpt_path = ckpt_paths[0] if best_ckpt_path: best_ckpt_path = osp.join(cfg.work_dir, best_ckpt_path) test_dataset = build_dataset(cfg.data.test, dict(test_mode=True)) gpu_collect = cfg.get('evaluation', {}).get('gpu_collect', False) tmpdir = cfg.get('evaluation', {}).get('tmpdir', osp.join(cfg.work_dir, 'tmp')) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', 1), workers_per_gpu=cfg.data.get('workers_per_gpu', 1), persistent_workers=cfg.data.get('persistent_workers', False), num_gpus=len(cfg.gpu_ids), dist=distributed, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('test_dataloader', {})) test_dataloader = build_dataloader(test_dataset, **dataloader_setting) names, ckpts = [], [] if test['test_last']: names.append('last') ckpts.append(None) if test['test_best'] and best_ckpt_path is not None: names.append('best') ckpts.append(best_ckpt_path) for name, ckpt in zip(names, ckpts): if ckpt is not None: runner.load_checkpoint(ckpt) outputs = multi_gpu_test(runner.model, test_dataloader, tmpdir, gpu_collect) rank, _ = get_dist_info() if rank == 0: out = osp.join(cfg.work_dir, f'{name}_pred.pkl') test_dataset.dump_results(outputs, out) eval_cfg = cfg.get('evaluation', {}) for key in [ 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 'rule', 'by_epoch', 'broadcast_bn_buffers' ]: eval_cfg.pop(key, None) eval_res = test_dataset.evaluate(outputs, **eval_cfg) runner.logger.info(f'Testing results of the {name} checkpoint') for metric_name, val in eval_res.items(): runner.logger.info(f'{metric_name}: {val:.04f}')
class DistPersonExtractor(object): def __init__(self, weight_path): # build model weights = torch.load(weight_path, map_location='cpu') self.model = resnet50_person(weights) self.model = MMDistributedDataParallel( self.model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) self.model.eval() def batch_extract(self, imglist, img_prefix, imgs_per_gpu=1, workers_per_gpu=4): # build dataset dataset = PersonDataset(imglist, img_prefix=img_prefix) # get dist info rank, world_size = get_dist_info() # build data loader sampler = DistributedSampler(dataset, world_size, rank, shuffle=False) data_loader = DataLoader(dataset, batch_size=imgs_per_gpu, sampler=sampler, num_workers=workers_per_gpu, collate_fn=partial( collate, samples_per_gpu=imgs_per_gpu), pin_memory=False) results = self.multi_gpu_test(data_loader) return results def multi_gpu_test(self, data_loader): results = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) for i, data in enumerate(data_loader): with torch.no_grad(): result = self.model(data) results.append(result.detach().cpu()) if rank == 0: batch_size = data.size(0) for _ in range(batch_size * world_size): prog_bar.update() # collect results from all ranks results = self.collect_results(torch.cat(results), len(dataset)) return results def collect_results(self, result_part, size, tmpdir=None): rank, world_size = get_dist_info() # create a tmp dir if it is not specified if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8, device='cuda') if rank == 0: tmpdir = tempfile.mkdtemp() # pylint: disable=not-callable tmpdir = torch.tensor(bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') # pylint: enable=not-callable dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir # mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank))) torch.save(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank))) dist.barrier() # collect all parts if rank != 0: return None else: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i)) # part_list.append(mmcv.load(part_file)) part_list.append(torch.load(part_file)) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.append(torch.stack(res)) # the dataloader may pad some samples ordered_results = torch.cat(ordered_results) ordered_results = ordered_results[:size] # remove tmp dir shutil.rmtree(tmpdir) return ordered_results
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(log_level=cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu runner_type = 'EpochBasedRunner' if 'runner' not in cfg else cfg.runner[ 'type'] data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # `num_gpus` will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed, runner_type=runner_type) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) if 'runner' not in cfg: cfg.runner = { 'type': 'EpochBasedRunner', 'max_epochs': cfg.total_epochs } warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) else: if 'total_epochs' in cfg: assert cfg.total_epochs == cfg.runner.max_epochs runner = build_runner( cfg.runner, default_args=dict( model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: if isinstance(runner, EpochBasedRunner): runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = DistEvalHook if distributed else EvalHook # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'. runner.register_hook( eval_hook(val_dataloader, **eval_cfg), priority='LOW') # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow)
def train_segmentor(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """Launch segmentor training.""" logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed, drop_last=True) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', True) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) # model.ddp = model else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # print(model) # build runner optimizer = build_optimizer(model, cfg.optimizer) if cfg.get('runner') is None: cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters} warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) runner = build_runner(cfg.runner, default_args=dict(model=model, batch_processor=None, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # print(cfg.optimizer) # print(cfg.optimizer_config) optimizer_config = OptimizerHookLW(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow)
def main(): args = parse_args() if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = mmcv.Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.data.test.test_mode = True # pass arg of fcn testing if args.fcn_testing: cfg.model.update({'fcn_testing': True}) cfg.model['cls_head'].update({'fcn_testing': True}) # for regular testing if cfg.data.test.oversample == 'three_crop': cfg.model.spatial_temporal_module.spatial_size = 8 dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True)) if args.launcher == 'none': raise NotImplementedError("By default, we use distributed testing, so that launcher should be pytorch") else: distributed = True init_dist(args.launcher, **cfg.dist_params) model = build_recognizer(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=1, dist=distributed, shuffle=False) load_checkpoint(model, args.checkpoint, map_location='cpu') model = MMDistributedDataParallel(model.cuda()) outputs = multi_test(model, data_loader) rank, _ = get_dist_info() if args.out and rank == 0: print('writing results to {}'.format(args.out)) mmcv.dump(outputs, args.out) gt_labels = [] for i in range(len(dataset)): ann = dataset.get_ann_info(i) gt_labels.append(ann['label']) if args.use_softmax: print("Averaging score over {} clips with softmax".format(outputs[0].shape[0])) results = [softmax(res, dim=1).mean(axis=0) for res in outputs] else: print("Averaging score over {} clips without softmax (ie, raw)".format(outputs[0].shape[0])) results = [res.mean(axis=0) for res in outputs] top1, top5 = top_k_accuracy(results, gt_labels, k=(1,5)) mean_acc = mean_class_accuracy(results, gt_labels) print("Mean Class Accuracy = {:.02f}".format(mean_acc * 100)) print("Top-1 Accuracy = {:.02f}".format(top1 * 100)) print("Top-5 Accuracy = {:.02f}".format(top5 * 100))
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, shuffle=True, replace=getattr(cfg.data, 'sampling_replace', False), seed=cfg.seed, drop_last=getattr(cfg.data, 'drop_last', False), # my repeat=getattr(cfg.data, 'repeat', 1)) for ds in dataset ] optimizer = build_optimizer(model, cfg.optimizer) if 'use_fp16' in cfg and cfg.use_fp16 == True: model, optimizer = apex.amp.initialize(model.cuda(), optimizer, opt_level="O1") print_log('**** Initializing mixed precision done. ****') # put model on gpus model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, # my find_unused_parameters=True) # # build runner # my global Runner if cfg.workflow[0][0] is 'search': Runner = SearchRunner # runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register custom hooks for hook in cfg.get('custom_hooks', ()): priority = hook.pop('priority', None) if hook.type == 'DeepClusterHook': common_params = dict(dist_mode=True, data_loaders=data_loaders) else: common_params = dict(dist_mode=True) if priority is None: runner.register_hook(build_hook(hook, common_params)) else: runner.register_hook(build_hook(hook, common_params), priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner print(cfg) print("-"*20) print(cfg.optimizer) optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.wandb: runner.register_hook( WandbLoggerHook(init_kwargs=dict( project=cfg.project, name=cfg.exp_name, config=cfg))) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # defrost backbone hook when_defrost = cfg.get('when_defrost') if when_defrost is not None: if when_defrost < 0: raise RuntimeError('when_defrost < 0') frozen_stages = cfg.get('frozen_stages', -1) defrost_backbone = DefrostBackbone(when_defrost, frozen_stages) runner.register_hook(defrost_backbone) # log hook custom_log = CustomLog(cfg.data.samples_per_gpu, when_defrost, os.path.join(cfg.work_dir, 'log.txt')) runner.register_hook(custom_log) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume( cfg.resume_from, map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0])) elif cfg.load_from: runner.load_checkpoint( cfg.load_from, map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0])) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """Train model entry function. Args: model (nn.Module): The model to be trained. dataset (:obj:`Dataset`): Train dataset. cfg (dict): The config dict for training. distributed (bool): Whether to use distributed training. Default: False. validate (bool): Whether to do evaluation. Default: False. timestamp (str | None): Local time for runner. Default: None. meta (dict | None): Meta dict to record some important information. Default: None """ logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), samples_per_epoch=cfg.data.get('samples_per_epoch', None), dist=distributed, seed=cfg.seed, persistent_workers=cfg.data.get('persistent_workers', False)) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) if find_unused_parameters: logger.info('set find_unused_parameters = True in DDP') # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model, device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) optimizer_config = cfg.optimizer_config if 'type' not in cfg.optimizer_config: optimizer_config.type = 'Fp16OptimizerHook' \ if fp16_cfg else 'OptimizerHook' if fp16_cfg: optimizer_config.update(fp16_cfg) if 'Fp16' in optimizer_config.type: optimizer_config.update(distributed=distributed) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, persistent_workers=cfg.data.get('persistent_workers', False)) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)