def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if validate:
        if isinstance(model.module, RPN):
            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
        elif cfg.data.val.type == 'CocoDataset':
            runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
        elif cfg.data.val.type in ['KittiLiDAR', 'KittiRGB']:
            runner.register_hook(
                KittiEvalmAPHook(cfg.data.val, interval=cfg.eval_interval))
        else:
            runner.register_hook(DistEvalmAPHook(cfg.data.val))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

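# The fp16 branch above only activates when the config defines an `fp16`
# dict. A minimal illustrative config fragment (the values are assumptions,
# following the usual mmdetection convention):
#
#   fp16 = dict(loss_scale=512.)
#   optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
#
# Fp16OptimizerHook then receives grad_clip from optimizer_config and
# loss_scale from fp16, and wraps the optimizer step in mixed precision.
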
def _non_dist_train(model, dataset_names, cfg, validate=False, **kwargs):
    # prepare data loaders
    # note: despite the name, `dataset_names` holds dataset objects
    data_loaders = [
        build_dataloader(dataset, cfg.data.tasks_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False,
                         customized_sampler=not dataset.test_mode)
        for dataset in dataset_names
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processors[cfg.batch_processor_type],
                    cfg.optimizer, cfg.work_dir, cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # resume
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **kwargs)

def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            len(cfg.gpus.train),
            dist=False)
    ]
    print('dataloader built')
    # put model on gpus
    model = MMDataParallel(model, device_ids=cfg.gpus.train).cuda()
    print('model paralleled')
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model, dataset_train, dataset_val, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset_train, cfg.imgs_per_gpu,
                         cfg.workers_per_gpu, len(cfg.gpus), dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=cfg.gpus).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if validate:
        print('registering non-distributed validation hook')
        runner.register_hook(NonDistEvalHook(dataset_val, cfg))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def train(
        work_dir,
        model_cfg,
        loss_cfg,
        dataset_cfg,
        optimizer_cfg,
        total_epochs,
        training_hooks,
        batch_size=None,
        gpu_batch_size=None,
        workflow=[('train', 1)],
        gpus=-1,
        log_level=0,
        workers=4,
        resume_from=None,
        load_from=None,
):
    # calculate batch size
    if gpus < 0:
        gpus = torch.cuda.device_count()
    if (batch_size is None) and (gpu_batch_size is not None):
        batch_size = gpu_batch_size * gpus
    assert batch_size is not None, \
        'Please specify batch_size or gpu_batch_size.'

    # prepare data loaders
    if isinstance(dataset_cfg, dict):
        dataset_cfg = [dataset_cfg]
    data_loaders = [
        torch.utils.data.DataLoader(dataset=call_obj(**d),
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=workers,
                                    drop_last=True) for d in dataset_cfg
    ]

    # put model on gpus
    if isinstance(model_cfg, list):
        model = [call_obj(**c) for c in model_cfg]
        model = torch.nn.Sequential(*model)
    else:
        model = call_obj(**model_cfg)
    model.apply(weights_init)
    model = MMDataParallel(model, device_ids=range(gpus)).cuda()
    loss = call_obj(**loss_cfg)

    # build runner
    optimizer = call_obj(params=model.parameters(), **optimizer_cfg)
    runner = Runner(model, batch_processor, optimizer, work_dir, log_level)
    runner.register_training_hooks(**training_hooks)

    if resume_from:
        runner.resume(resume_from)
    elif load_from:
        runner.load_checkpoint(load_from)

    # run
    workflow = [tuple(w) for w in workflow]
    runner.run(data_loaders, workflow, total_epochs, loss=loss)

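# Hypothetical invocation of the generic `train` entry point above; every
# `type` value is an assumption about what `call_obj` can resolve in the
# surrounding codebase, so treat this as a sketch rather than a working
# command.
#
#   train(
#       work_dir='./work_dir',
#       model_cfg=dict(type='models.MyModel', num_classes=10),
#       loss_cfg=dict(type='torch.nn.CrossEntropyLoss'),
#       dataset_cfg=dict(type='datasets.MyDataset', split='train'),
#       optimizer_cfg=dict(type='torch.optim.SGD', lr=0.1, momentum=0.9),
#       total_epochs=80,
#       training_hooks=dict(
#           lr_config=dict(policy='step', step=[40, 60]),
#           optimizer_config=dict(grad_clip=None),
#           checkpoint_config=dict(interval=5),
#           log_config=dict(interval=100,
#                           hooks=[dict(type='TextLoggerHook')])),
#       gpu_batch_size=16)
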
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # build a plain SGD optimizer (hard-coded instead of using cfg.optimizer)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9,
                                weight_decay=0.0001)
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    get_root_logger(cfg.log_level))
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model, dataset, cfg, validate=False):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in non-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        # dist=False here: a distributed sampler would require an initialized
        # process group, which non-distributed training does not set up
        build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu,
                         cfg.gpus, dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level, cfg=cfg)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model, dataset, cfg, validate=False, **kwargs):
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            # repeat_samples=cfg.train_cfg.repeat_samples,
            **kwargs)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    logger = _init_logger(log_dir=cfg.work_dir,
                          level=getattr(logging, cfg.log_level))
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, dist=True)
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    # register hooks
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
        else:
            if cfg.data.val.type == 'CocoDataset':
                runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
            else:
                runner.register_hook(DistEvalmAPHook(cfg.data.val))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if validate:
        val_dataset_cfg = cfg.data.val
        dataset_type = getattr(datasets, val_dataset_cfg.type)
        if issubclass(dataset_type, datasets.iMaterialistDataset):
            from mmdet.core import iMaterialistEvalmAPHook
            runner.register_hook(iMaterialistEvalmAPHook(val_dataset_cfg))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(CfgSaverHook(cfg), priority='VERY_LOW')
    if validate:
        # TODO: implement both dist and non-dist versions of evaluation
        if isinstance(model.module, RPN):
            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
        else:
            if cfg.data.val.type == 'CocoDataset':
                runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
            else:
                runner.register_hook(NonDistEvalmAPHook(cfg.data.val))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu, dist=True)
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # register hooks
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        if cfg.data.val.type in ['RawFramesDataset']:
            runner.register_hook(
                DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5)))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model, train_dataset, cfg, validate=False):
    def build_fn(dataset, cfg_dict):
        return build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            balanced=cfg_dict.get('balanced', False))

    datasets = [train_dataset]
    data_loaders = [build_fn(train_dataset, cfg.data.train)]
    if validate:
        val_dataset = get_dataset(cfg.data.val)
        datasets.append(val_dataset)
        data_loaders.append(build_fn(val_dataset, cfg.data.val))
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # load checkpoint
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    # training
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _dist_train(model, dataset_train, dataset_val, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset_train, cfg.imgs_per_gpu,
                         cfg.workers_per_gpu, dist=True)
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda(),
                                      find_unused_parameters=True)
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    print('cfg work dir is', cfg.work_dir)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # register hooks
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        print('registering distributed validation hook')
        interval = cfg.get('validate_interval', 1)
        runner.register_hook(DistEvalMonoHook(dataset_val, interval, cfg))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model, dataset, cfg, validate=False):
    # ipdb.set_trace()
    # prepare data loaders
    # build the dataloader iterators: the dataset is wrapped with PyTorch's
    # DataLoader
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    # the device ids are passed as a range rather than a list; note that for
    # a single GPU this is range(0, 1), which only yields device 0
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    # this only hands the arguments over; nothing is built or run yet, just
    # like building a model only registers the modules without wiring them
    # together
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    # register hooks
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # resume from a checkpoint or load weights from a file
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model,
        batch_processor,
        optimizer,
        cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=False,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(EvalHook(val_dataloader, **eval_cfg))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

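# The EvalHook registered above takes its keyword arguments from
# cfg.evaluation. A minimal illustrative config fragment (the metric key is
# an assumption; COCO-style detection configs typically use 'bbox'):
#
#   evaluation = dict(interval=1, metric='bbox')
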
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in non-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu,
                         cfg.gpus, dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    logger=logger)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # register any extra hooks declared in the config
    if hasattr(cfg, 'extra_hooks'):
        import detector_utils.pytorch.utils.mmcv_custom_hooks
        for hook_args in cfg.extra_hooks:
            hook_type_name = hook_args.pop('type')
            assert hasattr(detector_utils.pytorch.utils.mmcv_custom_hooks,
                           hook_type_name), \
                f'Unknown hook name: {hook_type_name}'
            hook_type = getattr(
                detector_utils.pytorch.utils.mmcv_custom_hooks,
                hook_type_name)
            hook = hook_type(**hook_args)
            runner.register_hook(hook)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False,
                         shuffle=True,
                         replace=getattr(cfg.data, 'sampling_replace', False),
                         seed=cfg.seed,
                         drop_last=getattr(cfg.data, 'drop_last', False))
        for ds in dataset
    ]
    if 'use_fp16' in cfg and cfg.use_fp16:
        raise NotImplementedError('apex does not support non_dist_train!')
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # register custom hooks
    for hook in cfg.get('custom_hooks', ()):
        priority = hook.pop('priority', None)
        if hook.type == 'DeepClusterHook':
            common_params = dict(dist_mode=False, data_loaders=data_loaders)
        else:
            common_params = dict(dist_mode=False)
        if priority is None:
            runner.register_hook(build_hook(hook, common_params))
        else:
            runner.register_hook(build_hook(hook, common_params), priority)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

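# Sketch of the cfg.custom_hooks list consumed by the loop above; the hook
# types and arguments are assumptions. DeepClusterHook is special-cased
# because it needs access to the training data loaders:
#
#   custom_hooks = [
#       dict(type='DeepClusterHook', interval=1, priority='VERY_HIGH'),
#       dict(type='ValidateHook', initial=True, interval=10),
#   ]
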
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None,
                meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True,
                         seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        # training sometimes errors without this; enable it when DDP reports
        # unused parameters
        find_unused_parameters=True)
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(DistEvalHook(val_dataset_cfg, **eval_cfg))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def train_smpl_detector_fuse(model, datasets, cfg, **kwargs):
    # prepare data loaders
    data_loaders = [
        build_dataloader_fuse(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for dataset in datasets
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    # TODO: build a logger here that inherits from the hook class
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    val_dataset_cfg = cfg.data.val
    eval_cfg = cfg.get('evaluation', {})
    # No need to add a hook for validation: the losses are returned inside
    # the loss function, so the evaluation hook code is not reused here and
    # the evaluation result is passed to the log only.

    # create a dummy `latest.pth` file so the job can be run recursively
    if kwargs.get('create_dummy', False):
        print('Create a dummy checkpoint for recursive training')
        runner._epoch -= 1
        runner.save_checkpoint(cfg.work_dir, filename_tmpl='dummy_{}.pth')
        runner._epoch += 1
        return

    pretrain_path = kwargs.get('load_pretrain', None)
    if pretrain_path:
        print(f'Load pretrained model from {pretrain_path}')
        runner._epoch -= 1
        runner.load_checkpoint(pretrain_path)
        runner.save_checkpoint(cfg.work_dir,
                               filename_tmpl='pretrained_{}.pth')
        runner._epoch += 1
        return

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs,
               time_limit=getattr(cfg, 'time_limit', None))

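# Sketch of how the create_dummy / load_pretrain switches above might be
# driven from a caller (the checkpoint path is illustrative):
#
#   train_smpl_detector_fuse(model, datasets, cfg, create_dummy=True)
#   train_smpl_detector_fuse(model, datasets, cfg,
#                            load_pretrain='work_dirs/pretrained.pth')
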
def _dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    try:
        selsa_imgs = cfg.data.selsa_imgs
    except Exception:
        selsa_imgs = cfg.data.imgs_per_gpu
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         selsa_imgs=selsa_imgs,
                         dist=True) for ds in dataset
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())
    mod = cfg.pop('mod', False)
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer, mod)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(
                CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg))
        else:
            dataset_type = DATASETS.get(val_dataset_cfg.type)
            if issubclass(dataset_type, datasets.CocoDataset):
                runner.register_hook(
                    CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
            else:
                runner.register_hook(
                    DistEvalmAPHook(val_dataset_cfg, **eval_cfg))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True,
                         shuffle=True,
                         replace=getattr(cfg.data, 'sampling_replace', False),
                         seed=cfg.seed,
                         drop_last=getattr(cfg.data, 'drop_last', False),
                         prefetch=cfg.prefetch,
                         img_norm_cfg=cfg.img_norm_cfg) for ds in dataset
    ]
    optimizer = build_optimizer(model, cfg.optimizer)
    if 'use_fp16' in cfg and cfg.use_fp16:
        model, optimizer = apex.amp.initialize(model.cuda(), optimizer,
                                               opt_level='O1')
        print_log('**** Initializing mixed precision done. ****')
    # put model on gpus
    model = MMDistributedDataParallel(
        model if next(model.parameters()).is_cuda else model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False)
    # build runner
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register custom hooks
    for hook in cfg.get('custom_hooks', ()):
        if hook.type == 'DeepClusterHook':
            common_params = dict(dist_mode=True, data_loaders=data_loaders)
        else:
            common_params = dict(dist_mode=True)
        runner.register_hook(build_hook(hook, common_params))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _dist_train(model, dataset, cfg, validate=False, logger=None,
                timestamp=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            dist=True) for ds in dataset
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model, batch_processor, optimizer, cfg.work_dir, logger=logger)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(
                CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg))
        else:
            dataset_type = DATASETS.get(val_dataset_cfg.type)
            if issubclass(dataset_type,
                          (datasets.CocoDataset, datasets.Acoustic)):
                runner.register_hook(
                    CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
            else:
                runner.register_hook(
                    DistEvalmAPHook(val_dataset_cfg, **eval_cfg))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            raise NotImplementedError
        else:
            dataset_type = getattr(datasets, val_dataset_cfg.type)
            if issubclass(dataset_type, datasets.CocoDataset):
                runner.register_hook(
                    CocoNonDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
            elif issubclass(dataset_type, datasets.SeqVIDDataset):
                runner.register_hook(
                    NonDistSeqEvalmAPHook(val_dataset_cfg, **eval_cfg))
            elif issubclass(dataset_type, (datasets.PairVIDDataset,
                                           datasets.TwinVIDDataset)):
                runner.register_hook(
                    NonDistPairEvalmAPHook(val_dataset_cfg, **eval_cfg))
            else:
                runner.register_hook(
                    NonDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

def train(
        work_dir,
        model_cfg,
        loss_cfg,
        dataset_cfg,
        optimizer_cfg,
        batch_size,
        total_epochs,
        training_hooks,
        workflow=[('train', 1)],
        gpus=1,
        log_level=0,
        workers=2,
        resume_from=None,
        load_from=None,
):
    # prepare data loaders
    if isinstance(dataset_cfg, dict):
        dataset_cfg = [dataset_cfg]
    data_loaders = [
        torch.utils.data.DataLoader(dataset=call_obj(**d),
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=workers,
                                    drop_last=True) for d in dataset_cfg
    ]

    # put model on gpus
    if isinstance(model_cfg, list):
        model = [call_obj(**c) for c in model_cfg]
        model = torch.nn.Sequential(*model)
    else:
        model = call_obj(**model_cfg)
    model.apply(weights_init)
    print('Model size: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))
    model = MMDataParallel(model, device_ids=range(gpus)).cuda()
    loss = call_obj(**loss_cfg)

    # build runner
    optimizer = call_obj(params=model.parameters(), **optimizer_cfg)
    runner = Runner(model, batch_processor, optimizer, work_dir, log_level)
    runner.register_training_hooks(**training_hooks)

    if resume_from:
        runner.resume(resume_from)
    elif load_from:
        runner.load_checkpoint(load_from)

    # run
    workflow = [tuple(w) for w in workflow]
    runner.run(data_loaders, workflow, total_epochs, loss=loss)
    # `writer` is assumed to be a module-level tensorboardX SummaryWriter
    writer.export_scalars_to_json('./all_scalars.json')
    writer.close()

def _dist_train(model, dataset, cfg, validate=False):
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer,
                                cfg.get('optimizer_exclude_arch'))
    arch_name = None
    optimizer_arch = None
    if 'optimizer_arch' in cfg:
        raise NotImplementedError
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    optimizer_arch,
                    cfg.work_dir,
                    cfg.log_level,
                    arch_name=arch_name)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    optimizer_arch_config = DistOptimizerArchHook(**cfg.optimizer_config)
    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   optimizer_arch_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(DistEvalHook(val_dataset_cfg, **eval_cfg))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    if 'optimizer_arch' in cfg:
        raise NotImplementedError
    else:
        data_loaders = [
            build_dataloader(dataset, cfg.data.imgs_per_gpu,
                             cfg.data.workers_per_gpu, dist=True)
        ]
        runner.run(data_loaders, None, cfg.workflow, cfg.total_epochs)

def train(cfg_path, dataset_class):
    """Train with mmcv's Runner framework, using its hooks for lr updates
    and loss computation.

    1. The dataset outputs img/gt_bbox/label packed in DataContainers.
    2. The DataLoader's default_collate is replaced with a custom collate
       so the dataset can return multiple data types.
    3. The DataParallel wrapper is replaced with the custom MMDataParallel
       so DataContainers are supported.
    """
    # two defaults
    distributed = False
    parallel = True
    # get cfg
    cfg = Config.fromfile(cfg_path)
    # set backends
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # get logger
    logger = get_root_logger(cfg.log_level)
    logger.info('Distributed training: {}'.format(distributed))
    logger.info('DataParallel training: {}'.format(parallel))
    # build model & detector
    model = M2detDetector(cfg)
    if not parallel:
        model = model.cuda()
    else:
        model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # prepare data & dataloader
    # Runner expects the dataloaders in a list: one dataloader per phase in
    # the workflow
    dataset = get_dataset(cfg.data.train, dataset_class)
    batch_size = cfg.gpus * cfg.data.imgs_per_gpu
    num_workers = cfg.gpus * cfg.data.workers_per_gpu
    dataloader = [
        DataLoader(dataset,
                   batch_size=batch_size,
                   sampler=GroupSampler(dataset, cfg.data.imgs_per_gpu),
                   num_workers=num_workers,
                   collate_fn=partial(collate,
                                      samples_per_gpu=cfg.data.imgs_per_gpu),
                   pin_memory=False)
    ]
    # define runner and running type (1. resume, 2. load, 3. train/test)
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        # resume training, e.g. from './work_dirs/ssd300_voc/latest.pth'
        runner.resume(cfg.resume_from,
                      map_location=lambda storage, loc: storage)
    elif cfg.load_from:
        # load weights for testing
        runner.load_checkpoint(cfg.load_from)
    # start training: the workflow decides between train and test phases
    runner.run(dataloader, cfg.workflow, cfg.total_epochs)

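# Hypothetical call of the entry point above; the config path and dataset
# class are assumptions about the surrounding project:
#
#   train('config/cfg_m2det512_voc.py', VOCDataset)
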
def _non_dist_train(model, dataset, cfg, validate=False):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in non-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer,
                                cfg.get('optimizer_exclude_arch'))
    arch_name = None
    optimizer_arch = None
    if 'optimizer_arch' in cfg:
        raise NotImplementedError
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    optimizer_arch,
                    cfg.work_dir,
                    cfg.log_level,
                    arch_name=arch_name)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    optimizer_arch_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   optimizer_arch_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    if 'optimizer_arch' in cfg:
        raise NotImplementedError
    else:
        data_loaders = [
            build_dataloader(dataset, cfg.data.imgs_per_gpu,
                             cfg.data.workers_per_gpu, cfg.gpus, dist=False)
        ]
        runner.run(data_loaders, None, cfg.workflow, cfg.total_epochs)