def _non_dist_train(model, dataset, cfg, validate=False): if validate: raise NotImplementedError('Built-in validation is not implemented ' 'yet in not-distributed training. Use ' 'distributed training or test.py and ' '*eval.py scripts instead.') # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) for ds in dataset ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
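For context, a minimal sketch of the config fields the fp16 branch above consumes; the exact values are illustrative assumptions rather than settings taken from any particular config in this collection.

optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# The presence of an `fp16` dict routes training through Fp16OptimizerHook
# (mixed precision); omit it to fall back to the plain optimizer_config above.
fp16 = dict(loss_scale=512.)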
def _non_dist_train(model, dataset, cfg, validate=False): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) for ds in dataset ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config if cfg.lr_config.get('customized', False): ### cfg.lr_config = eval(cfg.lr_config['policy'])(**cfg.lr_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] # make sure dataset is a list or tuple # template_dataset = template_dataset if isinstance(template_dataset, (list, tuple)) else [template_dataset] # make sure template_dataset is a list or tuple # load the data batch by batch data_loaders = [ # each call returns a mini-batch of images per iteration build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) for ds in dataset ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # wrap the model and move it onto the GPUs # build runner optimizer = build_optimizer(model, cfg.optimizer) # build the optimizer runner = Runner(model, batch_processor, optimizer, cfg.work_dir, # build the Runner that drives training; its train() is called internally cfg.log_level) # fp16 setting # mixed-precision (fp16) training settings fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: # either resume training or load a pretrained checkpoint runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs) # call runner.run to start training
def _non_dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ build_dataloader(dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) ] # put model on gpus model = MMDataParallel(model, device_ids=range( cfg.gpus)).cuda() #for multiple GPU # refer : torch.nn.DataParallel(model) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # runner : https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/runner.py # fp16 setting fp16_cfg = cfg.get('fp16', None) # print('fp16_cfg:',fp16_cfg) # None if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: #default! optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) for ds in dataset ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config if cfg.model.type in ['TTFNet', 'CenterNet']: runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.bbox_head_hist_config, cfg.log_config) model_type = cfg.bbox_head_hist_config.model_type # ['ConvModule', 'DeformConvPack'] sub_module = cfg.bbox_head_hist_config.sub_modules # ['bbox_head'] save_steps = cfg.bbox_head_hist_config.save_every_n_steps class ActivationHook(): def __init__(self, runner, every_steps=1): self.runner = runner self.every_steps = every_steps self.steps = 0 self.activation = {} def hook(self, model, input, output): if (self.steps % self.every_steps) == 0: self.runner.module_name.append(self) self.runner.features_in_hook.append(input[0]) self.runner.features_out_hook.append(output) self.steps += 1 runner_hook = ActivationHook(runner, every_steps=save_steps) def module_to_list(module, lst): # remove sequence_and_modulelist mod_name = type(module).__name__ if isinstance(module, nn.ModuleList): return [module_to_list(item, lst) for item in module] else: if isinstance(module, nn.Sequential) and type(module[0]).__name__ in lst: module.register_forward_hook(save_feature_maps) for module in sub_module: if getattr(model.module, module, None): mod = getattr(model.module, module, None) for k, v in mod._modules.items(): sub_mod = getattr(mod, k) out = module_to_list(sub_mod, model_type) else: runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
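The ActivationHook / module_to_list logic above is incomplete as written (for example, `save_feature_maps` is never defined and the hook relies on runner attributes that are not set up). A minimal, self-contained sketch of the underlying PyTorch forward-hook mechanism it relies on; all names below are chosen for illustration only.

import torch
import torch.nn as nn


class FeatureCapture:
    """Store the inputs/outputs of a hooked module every `every_steps` calls."""

    def __init__(self, every_steps=1):
        self.every_steps = every_steps
        self.steps = 0
        self.features_in = []
        self.features_out = []

    def __call__(self, module, inputs, output):
        if self.steps % self.every_steps == 0:
            self.features_in.append(inputs[0].detach().cpu())
            self.features_out.append(output.detach().cpu())
        self.steps += 1


model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU())
capture = FeatureCapture(every_steps=1)
handle = model[0].register_forward_hook(capture)  # hook only the conv layer
model(torch.randn(1, 3, 32, 32))
handle.remove()
print(len(capture.features_out))  # -> 1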
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # defrost backbone hook when_defrost = cfg.get('when_defrost') if when_defrost is not None: if when_defrost < 0: raise RuntimeError('when_defrost < 0') frozen_stages = cfg.get('frozen_stages', -1) defrost_backbone = DefrostBackbone(when_defrost, frozen_stages) runner.register_hook(defrost_backbone) # log hook custom_log = CustomLog(cfg.data.samples_per_gpu, when_defrost, os.path.join(cfg.work_dir, 'log.txt')) runner.register_hook(custom_log) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume( cfg.resume_from, map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0])) elif cfg.load_from: runner.load_checkpoint( cfg.load_from, map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0])) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, seed=cfg.seed) for ds in dataset ] # put model on gpus find_unused_parameters = cfg.get('find_unused_parameters', True) # False # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=True, shuffle=False) eval_cfg = cfg.get('evaluation', {}) runner.register_hook(DistEvalHook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'use_img_sampling' not in cfg.data: cfg.data.update({'use_img_sampling': False}) if 'use_sample_out' not in cfg.data: cfg.data.update({'use_sample_out': False}) print('--Dist-train--IS:{}--ISout:{}'.format(cfg.data.use_img_sampling, cfg.data.use_sample_out)) data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, use_img_sampling=cfg.data.use_img_sampling, use_sample_out=cfg.data.use_sample_out) for ds in dataset ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # Add for LVIS by LiYu import logging runner.logger.setLevel(logging.INFO) # ==================== # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val eval_cfg = cfg.get('evaluation', {}) if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook( CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg)) else: dataset_type = DATASETS.get(val_dataset_cfg.type) if issubclass(dataset_type, datasets.CocoDataset): runner.register_hook( CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg)) else: runner.register_hook( DistEvalmAPHook(val_dataset_cfg, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): if validate: raise NotImplementedError('Built-in validation is not implemented ' 'yet in not-distributed training. Use ' 'distributed training or test.py and ' '*eval.py scripts instead.') # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False, seed=cfg.seed) for ds in dataset ] #print("im data loader ^^^^^^^*********$$$$$$, ",data_loaders) #print("data len is ", len(data_loaders[0])) #for i,d in enumerate(data_loaders[0]): # if i > 20: # print("ok break$$$$$$$$$") # break # print(d) # print("current i " , i) # #print("OK ,fine") # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True) for ds in dataset ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner finetune_htc = cfg.get('finetune_htc', False) freeze_htc = cfg.get('freeze_htc', False) freeze_backbone = cfg.get('freeze_backbone', False) optimizer = build_optimizer(model, cfg.optimizer, finetune_htc=finetune_htc, freeze_htc=freeze_htc, freeze_backbone=freeze_backbone) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) #for name, param in model.named_parameters(): # print(name, param.requires_grad) # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val eval_cfg = cfg.get('evaluation', {}) if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook( CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg)) else: dataset_type = DATASETS.get(val_dataset_cfg.type) if issubclass(dataset_type, datasets.CocoDataset): runner.register_hook( CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg)) else: runner.register_hook( DistEvalmAPHook(val_dataset_cfg, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, runner_attr_dict=dict()): if validate: raise NotImplementedError('Built-in validation is not implemented ' 'yet in not-distributed training. Use ' 'distributed training or test.py and ' '*eval.py scripts instead.') # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) for ds in dataset ] # build runner runner_attr_dict.update({ 'imgs_per_gpu': cfg.data.imgs_per_gpu, 'initial_lr': cfg.optimizer['lr'] }) if hasattr(dataset, 'CLASSES'): runner_attr_dict.update({'classes': dataset.CLASSES}) optimizer = build_optimizer(model, cfg.optimizer) search_optimizer = getattr(getattr(cfg, 'search_config', {}), 'search_optimizer', None) assert search_optimizer is None, "Not support" runner = Runner(model, batch_processor, optimizer, search_optimizer, cfg.work_dir, logger=logger, runner_attr_dict=runner_attr_dict) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config) # register eval hooks if validate: if isinstance(model.module, RPN): runner.register_hook(CocoDistEvalRecallHook(cfg.data.val)) else: if cfg.data.val.type == 'CocoDataset': runner.register_hook(CocoDistEvalmAPHook(cfg.data.val)) else: runner.register_hook(DistEvalmAPHook(cfg.data.val)) register_hooks(runner, cfg) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, runner_attr_dict=dict()): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True) for ds in dataset ] # build runner runner_attr_dict.update({ 'imgs_per_gpu': cfg.data.imgs_per_gpu, 'initial_lr': cfg.optimizer['lr'] }) if hasattr(dataset, 'CLASSES'): runner_attr_dict.update({'classes': dataset.CLASSES}) optimizer = build_optimizer(model, cfg.optimizer) search_optimizer = getattr(cfg, 'search_config', {}).pop('search_optimizer', None) runner = Runner(model, batch_processor, optimizer, search_optimizer, cfg.work_dir, logger=logger, runner_attr_dict=runner_attr_dict) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config) runner.register_hook(DistSamplerSeedHook()) if search_optimizer is not None: runner.register_hook(DistSearchOptimizerHook()) runner.register_hook(SearchHook(**cfg.search_config)) # register eval hooks if validate: val_dataset_cfg = cfg.data.val eval_cfg = cfg.get('evaluation', {}) if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook( CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg)) else: dataset_type = DATASETS.get(val_dataset_cfg.type) if issubclass(dataset_type, datasets.CocoDataset): runner.register_hook( CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg)) else: runner.register_hook( DistEvalmAPHook(val_dataset_cfg, **eval_cfg)) register_hooks(runner, cfg) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, save_random_weights=False): if validate: raise NotImplementedError('Built-in validation is not implemented ' 'yet in not-distributed training. Use ' 'distributed training or test.py and ' '*eval.py scripts instead.') # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) for ds in dataset ] # Load Weights with different Modes if 'pretrain_weights' in cfg: model = partial_load(model, cfg.pretrain_weights, ignore_clf=cfg.ignore_clf, samenclasses=cfg.same_nclasses, convert_dict=cfg.convert_dict if 'convert_dict' in cfg else {}) elif 'trained_weights' in cfg: model.load_state_dict(torch.load(cfg['trained_weights'])['state_dict']) if 'freeze_vars' in cfg: model = freeze_model_partially(model, cfg.freeze_vars) # Save Full config copy in checkpoint dir if not os.path.exists(cfg.work_dir): os.makedirs(cfg.work_dir) config_file = open(os.path.join(cfg.work_dir, 'config.txt'), 'w') for key, value in cfg.items(): config_file.write('%s:%s\n'%(key, value)) repo = Repo('./') config_file.write('\nRepo Branch: %s, Commit: %s\n'%(repo.active_branch, repo.head.commit.hexsha)) config_file.close() # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner( model, batch_processor, optimizer, cfg.work_dir, logger=logger) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_smpl_detector_fuse(model, datasets, cfg, **kwargs): # prepare data loaders data_loaders = [ build_dataloader_fuse(dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) for dataset in datasets ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config # TODO: Build up a logger here that inherit the hook class runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) val_dataset_cfg = cfg.data.val eval_cfg = cfg.get('evaluation', {}) # No need to add hook for validation. # We shall return the losses inside the loss function. # I won't re-use the code for the Evaluation Hook. # The evaluation result will be passed to log only. # To crete a `latest.pth` file to run recursively if kwargs.get('create_dummy', False): print('Create a dummy checkpoint for recursive training') runner._epoch -= 1 runner.save_checkpoint(cfg.work_dir, filename_tmpl='dummy_{}.pth') runner._epoch += 1 return pretrain_path = kwargs.get('load_pretrain', None) if kwargs.get('load_pretrain', None): print(f"Load pretrained model from {pretrain_path}") runner._epoch -= 1 runner.load_checkpoint(pretrain_path) runner.save_checkpoint(cfg.work_dir, filename_tmpl='pretrained_{}.pth') runner._epoch += 1 return if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) val_every = cfg.data.train.get('val_every', None) val_loader = None if val_every and val_every > 0: val_loader = build_dataloader_fuse(kwargs.get('val_dataset'), cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) if kwargs.get('debug', None): import ipdb ipdb.set_trace() runner.run(data_loaders, cfg.workflow, cfg.total_epochs, val_every=val_every, val_loader=val_loader, time_limit=getattr(cfg, 'time_limit', None))
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None): if validate: raise NotImplementedError('Built-in validation is not implemented ' 'yet in not-distributed training. Use ' 'distributed training or test.py and ' '*eval.py scripts instead.') # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) for ds in dataset ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # # if model.module.bbox_head.freeze_solov2_and_train_combonly: # if model.module.bbox_head.optimize_list is not None: # for (key, param) in model.named_parameters(): # # if 'kernel_convs_convcomb' not in key and 'context_fusion_convs' not in key and 'learned_weight' not in key: # if not any(s in key for s in model.module.bbox_head.optimize_list): # param.requires_grad=False # else: # # print('optimize {}'.format(key)) # logger.info('optimize {}'.format(key)) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner( model, batch_processor, optimizer, cfg.work_dir, logger=logger) # an ugly workaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs) ## add test after training if cfg.data.test.ann_file != 'data/lvis/lvis_v0.5_val_lvis_freqset.json': # if val set is lvis freq, only eval on lvis-freq val set cfg.data.test.test_mode = True dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) model_orig=model.module model = MMDataParallel(model, device_ids=[0]).cuda() data_loader.dataset.img_infos = data_loader.dataset.img_infos[:100] outputs = single_gpu_test(model, data_loader) print('\nwriting results to {}'.format('xxx')) # mmcv.dump(outputs, 'xxx') eval_types = ['segm'] if eval_types: print('Starting evaluate {}'.format(' and '.join(eval_types))) if eval_types == ['proposal_fast']: result_file = 'xxx' coco_eval(result_file, eval_types, dataset.coco) else: if not isinstance(outputs[0], dict): result_files = results2json_segm(dataset, outputs, 'xxx', dump=False) coco_eval(result_files, eval_types, dataset.coco) else: for name in outputs[0]: print('\nEvaluating {}'.format(name)) outputs_ = [out[name] for out in outputs] result_file = 'xxx' + '.{}'.format(name) result_files = results2json(dataset, outputs_, result_file, dump=False) coco_eval(result_files, eval_types, dataset.coco) ##eval on lvis-77###### cfg.data.test.ann_file = 'data/lvis/lvis_v0.5_val_cocofied.json' cfg.data.test.img_prefix = 'data/lvis/val2017/' cfg.data.test.test_mode = True dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) # model_orig=model.module # model = MMDataParallel(model, device_ids=[0]).cuda() data_loader.dataset.img_infos = data_loader.dataset.img_infos[:100] outputs = single_gpu_test(model, data_loader) print('\nwriting results to {}'.format('xxx')) # mmcv.dump(outputs, 'xxx') eval_types = ['segm'] if eval_types: print('Starting evaluate {}'.format(' and '.join(eval_types))) if eval_types == ['proposal_fast']: result_file = 'xxx' coco_eval(result_file, eval_types, dataset.coco) else: if not isinstance(outputs[0], dict): result_files = results2json_segm(dataset, outputs, 'xxx', dump=False) from lvis import LVISEval lvisEval = LVISEval('data/lvis/lvis_v0.5_val_cocofied.json', result_files, 'segm') lvisEval.run() lvisEval.print_results() # fix lvis api eval iou_thr error, should be 0.9 but was 0.8999 lvisEval.params.iou_thrs[8] = 0.9 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]: print('AP at iou {}: {}'.format(iou, lvisEval._summarize('ap', iou_thr=iou))) else: for name in outputs[0]: print('\nEvaluating {}'.format(name)) outputs_ = [out[name] for out in outputs] result_file = 'xxx' + '.{}'.format(name) result_files = results2json(dataset, outputs_, result_file, dump=False) coco_eval(result_files, eval_types, dataset.coco) else: ##eval on lvis-freq###### cfg.data.test.test_mode = True dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) # model_orig=model.module # model = MMDataParallel(model, device_ids=[0]).cuda() data_loader.dataset.img_infos = data_loader.dataset.img_infos[:100] outputs = single_gpu_test(model, data_loader) print('\nwriting results to {}'.format('xxx')) # mmcv.dump(outputs, 'xxx') eval_types = ['segm'] if eval_types: print('Starting evaluate {}'.format(' and '.join(eval_types))) if eval_types == ['proposal_fast']: result_file = 'xxx' coco_eval(result_file, eval_types, dataset.coco) else: if not isinstance(outputs[0], dict): result_files = results2json_segm(dataset, outputs, 'xxx', dump=False) from lvis import LVISEval lvisEval = LVISEval(cfg.data.test.ann_file, result_files, 'segm') lvisEval.run() lvisEval.print_results() # fix lvis api eval iou_thr error, should be 0.9 but was 0.8999 lvisEval.params.iou_thrs[8] = 0.9 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]: print('AP at iou {}: {}'.format(iou, lvisEval._summarize('ap', iou_thr=iou))) else: for name in outputs[0]: print('\nEvaluating {}'.format(name)) outputs_ = [out[name] for out in outputs] result_file = 'xxx' + '.{}'.format(name) result_files = results2json(dataset, outputs_, result_file, dump=False) coco_eval(result_files, eval_types, dataset.coco)
def train_adv_smpl_detector(model, datasets, cfg, **kwargs): # prepare data loaders data_loaders = [ build_dataloader_fuse(dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) for dataset in datasets ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() adv_model = nn.DataParallel(Discriminator()).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) adv_optimizer = build_optimizer(adv_model, cfg.adv_optimizer) global runner runner = AdvRunner(adv_model, adv_optimizer, model, adv_batch_processor, optimizer, cfg.work_dir, cfg.log_level) # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: assert NotImplementedError( "AdvOptimizer is not implemented for fp16 yet") optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config # TODO: Build up a logger here that inherit the hook class # import mmcv.runner.hooks.logger as mmcv_logger # mmcv_logger.LoggerHook runner.register_training_hooks(cfg.adv_optimizer_config, cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) val_dataset_cfg = cfg.data.val eval_cfg = cfg.get('evaluation', {}) # No need to add hook for validation. # We shall return the losses inside the loss function. # I won't re-use the code for the Evaluation Hook. # The evaluation result will be passed to log only. # To crete a `latest.pth` file to run recursively if kwargs.get('create_dummy', False): print('Create a dummy checkpoint for recursive training') runner._epoch -= 1 runner.save_checkpoint(cfg.work_dir, filename_tmpl='dummy_{}.pth') runner._epoch += 1 return pretrain_path = kwargs.get('load_pretrain', None) if kwargs.get('load_pretrain', None): print(f"Load pretrained model from {pretrain_path}") runner._epoch -= 1 runner.load_checkpoint(pretrain_path) adv_pretrain_path = osp.join(*osp.split(pretrain_path)[:-1], 'adv_' + osp.split(pretrain_path)[-1]) if osp.isfile(adv_pretrain_path): runner.load_adv_checkpoint(adv_pretrain_path) else: print('No adversarial checkpoint is found.') runner.save_checkpoint(cfg.work_dir, filename_tmpl='pretrained_{}.pth') runner._epoch += 1 return if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) val_every = cfg.data.train.get('val_every', None) val_loader = None if val_every and val_every > 0: val_loader = build_dataloader_fuse(kwargs.get('val_dataset'), cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) if kwargs.get('debug', None): import ipdb ipdb.set_trace() re_weight = cfg.re_weight if hasattr(cfg, 're_weight') else dict() mosh_path = cfg.common_train_cfg.mosh_path mosh_data = np.load(mosh_path) mosh = { 'shape': mosh_data['shape'].copy(), 'pose': mosh_data['pose'].copy() } runner.run(data_loaders, cfg.workflow, cfg.total_epochs, val_every=val_every, val_loader=val_loader, time_limit=getattr(cfg, 'time_limit', None), re_weight=re_weight, mosh=mosh, log_grad=cfg.get('log_grad', False))
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] map_location = 'default' if torch.cuda.is_available(): if distributed: # put model on gpus find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) else: model = MMDataCPU(model) map_location = 'cpu' # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) add_logging_on_first_and_last_iter(runner) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from, map_location=map_location) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): if validate: raise NotImplementedError('Built-in validation is not implemented ' 'yet in not-distributed training. Use ' 'distributed training or test.py and ' '*eval.py scripts instead.') # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False, seed=cfg.seed) for ds in dataset ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) # runner = Runner( # model, # batch_processor, # optimizer, # cfg.work_dir, # logger=logger, # meta=meta) runner = Runner( # change for prune sparsity-regularization train model, batch_processor, optimizer, cfg, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks( cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) # Register default hooks for training if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False, seed=cfg.seed) for ds in dataset ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if validate: cfg.data.test.test_mode = True distributed = False dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) checkpoint_path = os.path.join(cfg.work_dir, 'latest.pth') checkpoint = load_checkpoint(model, checkpoint_path, map_location='cpu') if 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader) results = dataset.evaluate(outputs, 'keypoints') path = os.path.join(cfg.work_dir, 'keypoints.json') with open(path, 'a') as f: json.dump(results, f) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.wandb: runner.register_hook( WandbLoggerHook(init_kwargs=dict( project=cfg.project, name=cfg.exp_name, config=cfg))) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector_m(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] params_b = [] """for key, value in dict(model.backbone.named_parameters()).items(): if value.requires_grad: params_b += [{'params': [value]}] for key, value in dict(model.rpn_head.named_parameters()).items(): if value.requires_grad: params_b += [{'params': [value]}]""" for key, value in dict( model.roi_head.shared_head.named_parameters()).items(): if value.requires_grad: params_b += [{'params': [value]}] for key, value in dict( model.roi_head.bbox_head.named_parameters()).items(): if value.requires_grad: params_b += [{'params': [value]}] optimizer_b = torch.optim.SGD(params_b, lr=cfg.optimizer.lr, momentum=cfg.optimizer.momentum, weight_decay=cfg.optimizer.weight_decay) optimizer_g = None optimizer_d = None if model.roi_head.with_fsr_generator: params_g = [] for key, value in dict( model.roi_head.fsr_generator.named_parameters()).items(): if value.requires_grad: params_g += [{'params': [value]}] for key, value in dict( model.roi_head.shared_head.named_parameters()).items(): if value.requires_grad: params_g += [{'params': [value], 'lr': cfg.optimizer_b.lr}] for key, value in dict( model.roi_head.bbox_head.named_parameters()).items(): if value.requires_grad: params_g += [{'params': [value], 'lr': cfg.optimizer_b.lr}] optimizer_g = torch.optim.SGD( params_g, lr=cfg.optimizer_g.lr, momentum=cfg.optimizer_g.momentum, weight_decay=cfg.optimizer_g.weight_decay) if model.roi_head.with_dis_head: params_d = [] for key, value in dict( model.roi_head.dis_head.named_parameters()).items(): if value.requires_grad: params_d += [{'params': [value]}] optimizer_d = torch.optim.SGD( params_d, lr=cfg.optimizer_d.lr, momentum=cfg.optimizer_d.momentum, weight_decay=cfg.optimizer_d.weight_decay) # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', True) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner runner = MultiOptimRunner(model, optimizer_b, optimizer_g, optimizer_d, cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config_b = Fp16OptimizerHook(**cfg.optimizer_config_b, **fp16_cfg, distributed=distributed) optimizer_config_g = Fp16OptimizerHook(**cfg.optimizer_config_g, **fp16_cfg, distributed=distributed) optimizer_config_d = Fp16OptimizerHook(**cfg.optimizer_config_d, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config_b: optimizer_config_b = OptimHookB(**cfg.optimizer_config_b) optimizer_config_g = OptimHookG(**cfg.optimizer_config_g) optimizer_config_d = OptimHookD(**cfg.optimizer_config_d) else: optimizer_config_b = cfg.optimizer_config_b optimizer_config_g = cfg.optimizer_config_g optimizer_config_d = cfg.optimizer_config_d # register hooks runner.register_training_hooks(cfg.lr_config_b, cfg.lr_config_g, cfg.lr_config_d, optimizer_config_b, optimizer_config_g, optimizer_config_d, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train_runner(model, trainDataset, valDataset, cfg, validate=False) -> Runner: # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, cfg.log_level) # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) before = toSingleGPUModeBefore after = toSingleGPUModeAfter # register eval hooks if validate: val_dataset_cfg = valDataset eval_cfg = cfg.get('evaluation', {}) if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook( HookWrapper( CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg), before, after)) else: dataset_type = getattr(mmdetDatasets, val_dataset_cfg.type) if issubclass(dataset_type, mmdetDatasets.CocoDataset): runner.register_hook( HookWrapper( CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg), before, after)) else: runner.register_hook( HookWrapper(DistEvalmAPHook(val_dataset_cfg, **eval_cfg), before, after)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: weightsPath = cfg.load_from if weightsPath is not None: if weightsPath.startswith("open-mmlab://"): cfgName = weightsPath[len("open-mmlab://"):] weightsPath = checkpoint_registry.getPath(cfgName) if cfg.resetHeads: torchHome = torch.hub._get_torch_home() chpName = os.path.basename(weightsPath)[0:(-1) * len(".pth")] noHeadWeightsPath = os.path.join( torchHome, f"checkpoints/nohead/{chpName}_nohead.pth") if not os.path.exists(noHeadWeightsPath): if isURL(weightsPath): weights = model_zoo.load_url(weightsPath) else: weights = torch.load(weightsPath) weights['state_dict'] = { k: v for k, v in weights['state_dict'].items() if not k.startswith('bbox_head') and not k.startswith('mask_head') } weightsDir = os.path.dirname(noHeadWeightsPath) if not os.path.exists(weightsDir): os.mkdir(weightsDir) torch.save(weights, noHeadWeightsPath) weightsPath = noHeadWeightsPath runner.load_checkpoint(weightsPath) return runner
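The head-stripping branch above caches a `*_nohead.pth` copy of the checkpoint before loading. A shorter, hedged sketch of the same idea; the helper name and default prefixes are assumptions, not the repository's API.

import torch

def strip_heads(ckpt_path, out_path, prefixes=('bbox_head', 'mask_head')):
    # Drop detector-head weights so the backbone/neck can be reused with
    # freshly initialised heads, mirroring the filtering done above.
    ckpt = torch.load(ckpt_path, map_location='cpu')
    ckpt['state_dict'] = {k: v for k, v in ckpt['state_dict'].items()
                          if not k.startswith(prefixes)}
    torch.save(ckpt, out_path)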
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, len(cfg.gpu_ids), dist=False, seed=cfg.seed) for ds in dataset ] # put model on gpus model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner if cfg.optimizer.type == 'SGD_GC': optimizer = SGD_GC(model.parameters(), cfg.optimizer.lr, momentum=cfg.optimizer.momentum, weight_decay=cfg.optimizer.weight_decay) else: optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) eval_cfg = cfg.get('evaluation', {}) runner.register_hook(EvalHook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) classes_rearrange = cfg.get('classes_rearrange', False) if classes_rearrange: runner.model = rearrange_classes(runner.model, cfg.classes, cfg.dataset_type) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
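The `custom_hooks` loop above expects a list of hook configs in the experiment file; an illustrative fragment is shown below (the hook types and the priority value are assumptions about a typical setup, not taken from this code).

custom_hooks = [
    dict(type='NumClassCheckHook'),
    dict(type='EmptyCacheHook', priority='LOW'),
]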
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): # if validate: # raise NotImplementedError('Built-in validation is not implemented ' # 'yet in not-distributed training. Use ' # 'distributed training or test.py and ' # '*eval.py scripts instead.') # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False, seed=cfg.seed) for ds in dataset ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=True, shuffle=False) eval_cfg = cfg.get('evaluation', {}) runner.register_hook(DistEvalHook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True) for ds in dataset ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val eval_cfg = cfg.get('evaluation', {}) if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook( CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg)) else: dataset_type = DATASETS.get(val_dataset_cfg.type) if issubclass(dataset_type, datasets.CocoDataset): runner.register_hook( CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg)) else: runner.register_hook( DistEvalmAPHook(val_dataset_cfg, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False, multitask=False, vis=False): # prepare data loaders data_loaders = [[ build_dataloader(d, cfg.data.imgs_per_gpu // 2 if issubclass(d.__class__, datasets.BDDVideo) else cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu // 2 if issubclass(d.__class__, datasets.BDDVideo) else cfg.data.workers_per_gpu, dist=True) for d in dataset ]] if multitask else [ build_dataloader(dataset, cfg.data.imgs_per_gpu // 2 if issubclass(dataset.__class__, datasets.BDDVideo) else cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu // 2 if issubclass(dataset.__class__, datasets.BDDVideo) else cfg.data.workers_per_gpu, dist=True) ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) bp = batch_processor_with_vis if vis else batch_processor runner = MTLRunner(model, bp, optimizer, cfg.work_dir, cfg.log_level) if multitask else \ Runner(model, bp, optimizer, cfg.work_dir, cfg.log_level) # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val eval_cfg = cfg.get('evaluation', {}) if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook( CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg)) else: if not type(val_dataset_cfg) == list: val_dataset_cfg = [val_dataset_cfg] for _cfg in val_dataset_cfg: dataset_type = getattr(datasets, _cfg.type) if issubclass(dataset_type, datasets.BddStreet) or \ issubclass(dataset_type, datasets.BddSemanticSeg): runner.register_hook(BddSegEvalHook(_cfg, **eval_cfg)) elif issubclass(dataset_type, datasets.CocoDataset): runner.register_hook(CocoDistEvalmAPHook(_cfg, **eval_cfg)) elif issubclass(dataset_type, datasets.BDDVideo): runner.register_hook(BDDEvalHook(_cfg)) else: runner.register_hook(DistEvalmAPHook(_cfg, **eval_cfg)) runner.register_logger_hooks(cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) if cfg.get('init_asso_head', False): ori_key = cfg.init_asso_head[0] new_key = cfg.init_asso_head[1] for _key in model.module.state_dict().keys(): if 'asso' in _key: exist_key = _key.replace(ori_key, new_key) if exist_key in model.module.state_dict().keys(): if dist.get_rank() == 0: print('Init "{}" with "{}"'.format(_key, exist_key)) model.module.state_dict()[_key].copy_( model.module.state_dict()[exist_key]) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
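Every variant above ends with runner.run(data_loaders, cfg.workflow, cfg.total_epochs); a hedged sketch of the config fields that call reads (the values are illustrative).

workflow = [('train', 1)]  # one training epoch per cycle; a ('val', 1) entry would interleave validation
total_epochs = 12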