def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, shuffle=True, replace=getattr(cfg.data, 'sampling_replace', False), seed=cfg.seed, drop_last=getattr(cfg.data, 'drop_last', False)) for ds in dataset ] # put model on gpus model = MMDistributedDataParallel(model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register custom hooks for hook in cfg.get('custom_hooks', ()): if hook.type == 'DeepClusterHook': common_params = dict(dist_mode=True, data_loaders=data_loaders) else: common_params = dict(dist_mode=True) runner.register_hook(build_hook(hook, common_params)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None): if validate: raise NotImplementedError('Built-in validation is not implemented ' 'yet in not-distributed training. Use ' 'distributed training or test.py and ' '*eval.py scripts instead.') # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False, seed=cfg.seed) for ds in dataset ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True) for ds in dataset ] # put model on gpus model = MMDistributedDataParallel(model.cuda()) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val eval_cfg = cfg.get('evaluation', {}) if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook( CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg)) else: dataset_type = DATASETS.get(val_dataset_cfg.type) if issubclass(dataset_type, datasets.CocoDataset): runner.register_hook( CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg)) else: runner.register_hook( DistEvalmAPHook(val_dataset_cfg, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # defrost backbone hook when_defrost = cfg.get('when_defrost') if when_defrost is not None: if when_defrost < 0: raise RuntimeError('when_defrost < 0') frozen_stages = cfg.get('frozen_stages', -1) defrost_backbone = DefrostBackbone(when_defrost, frozen_stages) runner.register_hook(defrost_backbone) # log hook custom_log = CustomLog(cfg.data.samples_per_gpu, when_defrost, os.path.join(cfg.work_dir, 'log.txt')) runner.register_hook(custom_log) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume( cfg.resume_from, map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0])) elif cfg.load_from: runner.load_checkpoint( cfg.load_from, map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0])) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, seed=cfg.seed) for ds in dataset ] # put model on gpus find_unused_parameters = cfg.get('find_unused_parameters', True) # False # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg) else: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=True, shuffle=False) eval_cfg = cfg.get('evaluation', {}) runner.register_hook(DistEvalHook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.wandb: runner.register_hook( WandbLoggerHook(init_kwargs=dict( project=cfg.project, name=cfg.exp_name, config=cfg))) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, shuffle=True, replace=getattr(cfg.data, 'sampling_replace', False), seed=cfg.seed, drop_last=getattr(cfg.data, 'drop_last', False), # my repeat=getattr(cfg.data, 'repeat', 1)) for ds in dataset ] optimizer = build_optimizer(model, cfg.optimizer) if 'use_fp16' in cfg and cfg.use_fp16 == True: model, optimizer = apex.amp.initialize(model.cuda(), optimizer, opt_level="O1") print_log('**** Initializing mixed precision done. ****') # put model on gpus model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, # my find_unused_parameters=True) # # build runner # my global Runner if cfg.workflow[0][0] is 'search': Runner = SearchRunner # runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register custom hooks for hook in cfg.get('custom_hooks', ()): priority = hook.pop('priority', None) if hook.type == 'DeepClusterHook': common_params = dict(dist_mode=True, data_loaders=data_loaders) else: common_params = dict(dist_mode=True) if priority is None: runner.register_hook(build_hook(hook, common_params)) else: runner.register_hook(build_hook(hook, common_params), priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, len(cfg.gpu_ids), dist=False, seed=cfg.seed) for ds in dataset ] # put model on gpus model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner if cfg.optimizer.type == 'SGD_GC': optimizer = SGD_GC(model.parameters(), cfg.optimizer.lr, momentum=cfg.optimizer.momentum, weight_decay=cfg.optimizer.weight_decay) else: optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) eval_cfg = cfg.get('evaluation', {}) runner.register_hook(EvalHook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader(ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False, seed=cfg.seed) for ds in dataset ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if validate: cfg.data.test.test_mode = True distributed = False dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) checkpoint_path = os.path.join(cfg.work_dir, 'latest.pth') checkpoint = load_checkpoint(model, checkpoint_path, map_location='cpu') if 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader) results = dataset.evaluate(outputs, 'keypoints') path = os.path.join(cfg.work_dir, 'keypoints.json') with open(path, 'a') as f: json.dump(results, f) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None): if validate: raise NotImplementedError('Built-in validation is not implemented ' 'yet in not-distributed training. Use ' 'distributed training or test.py and ' '*eval.py scripts instead.') # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) for ds in dataset ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # # if model.module.bbox_head.freeze_solov2_and_train_combonly: # if model.module.bbox_head.optimize_list is not None: # for (key, param) in model.named_parameters(): # # if 'kernel_convs_convcomb' not in key and 'context_fusion_convs' not in key and 'learned_weight' not in key: # if not any(s in key for s in model.module.bbox_head.optimize_list): # param.requires_grad=False # else: # # print('optimize {}'.format(key)) # logger.info('optimize {}'.format(key)) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner( model, batch_processor, optimizer, cfg.work_dir, logger=logger) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs) ## add test after training if cfg.data.test.ann_file != 'data/lvis/lvis_v0.5_val_lvis_freqset.json': # if val set is lvis freq, only eval on lvis-freq val set cfg.data.test.test_mode = True dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) model_orig=model.module model = MMDataParallel(model, device_ids=[0]).cuda() data_loader.dataset.img_infos = data_loader.dataset.img_infos[:100] outputs = single_gpu_test(model, data_loader) print('\nwriting results to {}'.format('xxx')) # mmcv.dump(outputs, 'xxx') eval_types = ['segm'] if eval_types: print('Starting evaluate {}'.format(' and '.join(eval_types))) if eval_types == ['proposal_fast']: result_file = 'xxx' coco_eval(result_file, eval_types, dataset.coco) else: if not isinstance(outputs[0], dict): result_files = results2json_segm(dataset, outputs, 'xxx', dump=False) coco_eval(result_files, eval_types, dataset.coco) else: for name in outputs[0]: print('\nEvaluating {}'.format(name)) outputs_ = [out[name] for out in outputs] result_file = 'xxx' + '.{}'.format(name) result_files = results2json(dataset, outputs_, result_file, dump=False) coco_eval(result_files, eval_types, dataset.coco) ##eval on lvis-77###### cfg.data.test.ann_file = 'data/lvis/lvis_v0.5_val_cocofied.json' cfg.data.test.img_prefix = 'data/lvis/val2017/' cfg.data.test.test_mode = True dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) # model_orig=model.module # model = MMDataParallel(model, device_ids=[0]).cuda() data_loader.dataset.img_infos = data_loader.dataset.img_infos[:100] outputs = single_gpu_test(model, data_loader) print('\nwriting results to {}'.format('xxx')) # mmcv.dump(outputs, 'xxx') eval_types = ['segm'] if eval_types: print('Starting evaluate {}'.format(' and '.join(eval_types))) if eval_types == ['proposal_fast']: result_file = 'xxx' coco_eval(result_file, eval_types, dataset.coco) else: if not isinstance(outputs[0], dict): result_files = results2json_segm(dataset, outputs, 'xxx', dump=False) from lvis import LVISEval lvisEval = LVISEval('data/lvis/lvis_v0.5_val_cocofied.json', result_files, 'segm') lvisEval.run() lvisEval.print_results() # fix lvis api eval iou_thr error, should be 0.9 but was 0.8999 lvisEval.params.iou_thrs[8] = 0.9 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]: print('AP at iou {}: {}'.format(iou, lvisEval._summarize('ap', iou_thr=iou))) else: for name in outputs[0]: print('\nEvaluating {}'.format(name)) outputs_ = [out[name] for out in outputs] result_file = 'xxx' + '.{}'.format(name) result_files = results2json(dataset, outputs_, result_file, dump=False) coco_eval(result_files, eval_types, dataset.coco) else: ##eval on lvis-freq###### cfg.data.test.test_mode = True dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) # model_orig=model.module # model = MMDataParallel(model, device_ids=[0]).cuda() data_loader.dataset.img_infos = data_loader.dataset.img_infos[:100] outputs = single_gpu_test(model, data_loader) print('\nwriting results to {}'.format('xxx')) # mmcv.dump(outputs, 'xxx') eval_types = ['segm'] if eval_types: print('Starting evaluate {}'.format(' and '.join(eval_types))) if eval_types == ['proposal_fast']: result_file = 'xxx' coco_eval(result_file, eval_types, dataset.coco) else: if not isinstance(outputs[0], dict): result_files = results2json_segm(dataset, outputs, 'xxx', dump=False) from lvis import LVISEval lvisEval = LVISEval(cfg.data.test.ann_file, result_files, 'segm') lvisEval.run() lvisEval.print_results() # fix lvis api eval iou_thr error, should be 0.9 but was 0.8999 lvisEval.params.iou_thrs[8] = 0.9 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]: print('AP at iou {}: {}'.format(iou, lvisEval._summarize('ap', iou_thr=iou))) else: for name in outputs[0]: print('\nEvaluating {}'.format(name)) outputs_ = [out[name] for out in outputs] result_file = 'xxx' + '.{}'.format(name) result_files = results2json(dataset, outputs_, result_file, dump=False) coco_eval(result_files, eval_types, dataset.coco)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): # use batch size instead of per-gpu batch size if getattr(cfg.data, 'batch_size', False): cfg.data.imgs_per_gpu = int(cfg.data.batch_size) print_log( f"Using {cfg.data.imgs_per_gpu} per gpu for batch size {cfg.data.batch_size}") # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False, shuffle=True, replace=getattr(cfg.data, 'sampling_replace', False), seed=cfg.seed, drop_last=getattr(cfg.data, 'drop_last', False), prefetch=cfg.prefetch, img_norm_cfg=cfg.img_norm_cfg) for ds in dataset ] if 'use_fp16' in cfg and cfg.use_fp16 == True: raise NotImplementedError('apex do not support non_dist_train!') # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) optimizer_config = NonDistOptimizerHook(**cfg.optimizer_config) if not cfg.get('by_iter', False): runner = Runner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) else: runner = IterBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) # register custom hooks for hook in cfg.get('custom_hooks', ()): if hook.type == 'DeepClusterHook': common_params = dict(dist_mode=False, data_loaders=data_loaders) else: common_params = dict(dist_mode=False) runner.register_hook(build_hook(hook, common_params)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) if not cfg.get('by_iter', False): runner.run(data_loaders, cfg.workflow, cfg.total_epochs) else: runner.run(data_loaders, cfg.workflow, cfg.total_iters)
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] # use batch size instead of per-gpu batch size if getattr(cfg.data, 'batch_size', False): num_gpus = torch.cuda.device_count() cfg.data.imgs_per_gpu = int(cfg.data.batch_size // num_gpus) print_log( f"Using {cfg.data.imgs_per_gpu} per gpu for batch size {cfg.data.batch_size}") data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, shuffle=True, replace=getattr(cfg.data, 'sampling_replace', False), seed=cfg.seed, drop_last=getattr(cfg.data, 'drop_last', False), prefetch=cfg.prefetch, img_norm_cfg=cfg.img_norm_cfg) for ds in dataset ] optimizer = build_optimizer(model, cfg.optimizer) if 'use_fp16' in cfg and cfg.use_fp16: model, optimizer = apex.amp.initialize( model.cuda(), optimizer, opt_level="O1") print_log('**** Initializing mixed precision done. ****') # put model on gpus model = MMDistributedDataParallel( model if next(model.parameters()).is_cuda else model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) # build runner if not cfg.get('by_iter', False): runner = Runner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) else: runner = IterBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if not cfg.get('by_iter', False): runner.register_hook(DistSamplerSeedHook()) # register custom hooks for hook in cfg.get('custom_hooks', ()): if hook.type == 'DeepClusterHook': common_params = dict(dist_mode=True, data_loaders=data_loaders) else: common_params = dict(dist_mode=True) runner.register_hook(build_hook(hook, common_params)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) if not cfg.get('by_iter', False): runner.run(data_loaders, cfg.workflow, cfg.total_epochs) else: runner.run(data_loaders, cfg.workflow, cfg.total_iters)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, save_random_weights=False): if validate: raise NotImplementedError('Built-in validation is not implemented ' 'yet in not-distributed training. Use ' 'distributed training or test.py and ' '*eval.py scripts instead.') # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) for ds in dataset ] # Load Weights with different Modes if 'pretrain_weights' in cfg: model = partial_load(model, cfg.pretrain_weights, ignore_clf=cfg.ignore_clf, samenclasses=cfg.same_nclasses, convert_dict=cfg.convert_dict if 'convert_dict' in cfg else {}) elif 'trained_weights' in cfg: model.load_state_dict(torch.load(cfg['trained_weights'])['state_dict']) if 'freeze_vars' in cfg: model = freeze_model_partially(model, cfg.freeze_vars) # Save Full config copy in checkpoint dir if not os.path.exists(cfg.work_dir): os.makedirs(cfg.work_dir) config_file = open(os.path.join(cfg.work_dir, 'config.txt'), 'w') for key, value in cfg.items(): config_file.write('%s:%s\n'%(key, value)) repo = Repo('./') config_file.write('\nRepo Branch: %s, Commit: %s\n'%(repo.active_branch, repo.head.commit.hexsha)) config_file.close() # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = Runner( model, batch_processor, optimizer, cfg.work_dir, logger=logger) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=False) else: optimizer_config = cfg.optimizer_config runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)