def main():
    """Test a model from a single checkpoint and report its metrics.

    Parses the command-line arguments, loads the config, builds the test
    dataset, dataloader and model, runs single-GPU or multi-GPU testing
    depending on ``args.launcher``, and finally (on rank 0 only) prints the
    metrics returned by ``dataset.evaluate`` and optionally dumps the raw
    outputs to ``args.out``.
    """
    args = parse_args()
    cfg = mmcv.Config.fromfile(args.config)

    # turn on torch.backends.cudnn.benchmark only when the config asks for it
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None

    # the distributed env is initialised before anything reads the dist info
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)
    rank, _ = get_dist_info()

    # seed every process, but announce it only once
    if args.seed is not None:
        if rank == 0:
            print('set random seed to', args.seed)
        set_random_seed(args.seed, deterministic=args.deterministic)

    # build the dataloader
    # TODO: support multiple images per gpu (only minor changes are needed)
    dataset = build_dataset(cfg.data.test)

    # later updates win: cfg.data.workers_per_gpu < fixed test defaults
    # < cfg.data.test_dataloader
    loader_cfg = {}
    if 'workers_per_gpu' in cfg.data:
        loader_cfg['workers_per_gpu'] = cfg.data['workers_per_gpu']
    loader_cfg.update(
        samples_per_gpu=1, drop_last=False, shuffle=False, dist=distributed)
    loader_cfg.update(cfg.data.get('test_dataloader', {}))
    data_loader = build_dataloader(dataset, **loader_cfg)

    # build the model and load checkpoint
    model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)

    # images are saved exactly when a save path was given
    args.save_image = args.save_path is not None
    empty_cache = cfg.get('empty_cache', False)

    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        model = DistributedDataParallelWrapper(
            model,
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)

        device_id = torch.cuda.current_device()

        def _to_current_gpu(storage, loc):
            # map every loaded storage onto this process' current device
            return storage.cuda(device_id)

        _ = load_checkpoint(
            model, args.checkpoint, map_location=_to_current_gpu)
        outputs = multi_gpu_test(
            model,
            data_loader,
            args.tmpdir,
            args.gpu_collect,
            save_path=args.save_path,
            save_image=args.save_image,
            empty_cache=empty_cache)
    else:
        _ = load_checkpoint(model, args.checkpoint, map_location='cpu')
        model = MMDataParallel(model, device_ids=[0])
        outputs = single_gpu_test(
            model,
            data_loader,
            save_path=args.save_path,
            save_image=args.save_image)

    # only rank 0 reports and stores the results
    if rank != 0:
        return

    print('')
    # print metrics
    stats = dataset.evaluate(outputs)
    for name in stats:
        print('Eval-{}: {}'.format(name, stats[name]))

    # save result pickle
    if args.out:
        print('writing results to {}'.format(args.out))
        mmcv.dump(outputs, args.out)
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None,
                meta=None):
    """Distributed training function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset.
        cfg (dict): The config dict for training.
        validate (bool): Whether to do evaluation. Default: False.
        logger (logging.Logger | None): Logger for training. Default: None.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    # step 1: give default values and override (if exist) from cfg.data
    # The sources are merged with a dict display so that later entries
    # override earlier ones. Passing them all as keyword arguments to
    # ``dict(...)`` raises ``TypeError`` (multiple values for a keyword
    # argument) as soon as cfg.data defines a key that also has a default
    # here, e.g. 'seed' or 'drop_last'.
    loader_cfg = {
        **dict(seed=cfg.get('seed'), drop_last=False, dist=True),
        **({} if torch.__version__ != 'parrots' else dict(
            prefetch_num=2,
            pin_memory=False,
        )),
        **dict((k, cfg.data[k]) for k in [
            'samples_per_gpu',
            'workers_per_gpu',
            'shuffle',
            'seed',
            'drop_last',
            'prefetch_num',
            'pin_memory',
        ] if k in cfg.data),
    }

    # step 2: cfg.data.train_dataloader has highest priority
    train_loader_cfg = dict(loader_cfg, **cfg.data.get('train_dataloader', {}))

    data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]

    # put model on gpus
    find_unused_parameters = cfg.get('find_unused_parameters', False)
    model = DistributedDataParallelWrapper(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)

    # build runner
    optimizer = build_optimizers(model, cfg.optimizers)
    runner = IterBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)

    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # register hooks
    runner.register_training_hooks(
        cfg.lr_config,
        checkpoint_config=cfg.checkpoint_config,
        log_config=cfg.log_config)

    # visual hook: its output directory is placed under the work directory
    if cfg.get('visual_config', None) is not None:
        cfg.visual_config['output_dir'] = os.path.join(
            cfg.work_dir, cfg.visual_config['output_dir'])
        runner.register_hook(mmcv.build_from_cfg(cfg.visual_config, HOOKS))

    # evaluation hook
    if validate and cfg.get('evaluation', None) is not None:
        val_dataset = build_dataset(cfg.data.val)

        if ('val_samples_per_gpu' in cfg.data
                or 'val_workers_per_gpu' in cfg.data):
            warnings.warn('"val_samples_per_gpu/val_workers_per_gpu" have '
                          'been deprecated. Please use '
                          '"val_dataloader=dict(samples_per_gpu=1)" instead. '
                          'Details see '
                          'https://github.com/open-mmlab/mmediting/pull/201')

        # Priority, lowest to highest: shared loader_cfg, fixed validation
        # defaults, deprecated cfg.data.val_* options, cfg.data.val_dataloader.
        # Merged with a dict display for the same reason as loader_cfg above:
        # a key given by more than one source must override, not raise.
        val_loader_cfg = {
            **loader_cfg,
            **dict(shuffle=False, drop_last=False),
            **dict((newk, cfg.data[oldk]) for oldk, newk in [
                ('val_samples_per_gpu', 'samples_per_gpu'),
                ('val_workers_per_gpu', 'workers_per_gpu'),
            ] if oldk in cfg.data),
            **cfg.data.get('val_dataloader', {}),
        }

        data_loader = build_dataloader(val_dataset, **val_loader_cfg)
        save_path = osp.join(cfg.work_dir, 'val_visuals')
        runner.register_hook(
            DistEvalIterHook(
                data_loader, save_path=save_path, **cfg.evaluation))

    # resuming takes precedence over merely loading weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None,
                meta=None):
    """Train ``model`` in distributed mode with an iteration-based runner.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset, or a list/tuple of them.
        cfg (dict): The config dict for training.
        validate (bool): Whether to do evaluation. Default: False.
        logger (logging.Logger | None): Logger for training. Default: None.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    # one dataloader per training dataset
    train_sets = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = []
    for train_set in train_sets:
        loader = build_dataloader(
            train_set,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            dist=True,
            drop_last=cfg.data.get('drop_last', False),
            seed=cfg.seed)
        data_loaders.append(loader)

    # wrap the model for the current device
    unused_params = cfg.get('find_unused_parameters', False)
    model = DistributedDataParallelWrapper(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=unused_params)

    # optimizers and runner
    optimizer = build_optimizers(model, cfg.optimizers)
    runner = IterBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)

    # keep the .log and .log.json filenames identical
    runner.timestamp = timestamp

    # standard training hooks
    runner.register_training_hooks(
        cfg.lr_config,
        checkpoint_config=cfg.checkpoint_config,
        log_config=cfg.log_config)

    # optional visualisation hook, writing below the work directory
    if cfg.get('visual_config', None) is not None:
        visual_dir = os.path.join(cfg.work_dir,
                                  cfg.visual_config['output_dir'])
        cfg.visual_config['output_dir'] = visual_dir
        runner.register_hook(mmcv.build_from_cfg(cfg.visual_config, HOOKS))

    # optional evaluation hook
    if validate and cfg.get('evaluation', None) is not None:
        val_set = build_dataset(cfg.data.val)
        # validation-specific settings fall back to the training ones
        val_samples = cfg.data.get('val_samples_per_gpu',
                                   cfg.data.samples_per_gpu)
        val_workers = cfg.data.get('val_workers_per_gpu',
                                   cfg.data.workers_per_gpu)
        val_loader = build_dataloader(
            val_set,
            samples_per_gpu=val_samples,
            workers_per_gpu=val_workers,
            dist=True,
            shuffle=False)
        visuals_dir = osp.join(cfg.work_dir, 'val_visuals')
        eval_hook = DistEvalIterHook(
            val_loader, save_path=visuals_dir, **cfg.evaluation)
        runner.register_hook(eval_hook)

    # resuming takes precedence over merely loading weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
def main():
    """Evaluate every checkpoint found in ``args.checkpoint_dir``.

    Every directory entry whose name contains ``'.pth'`` is treated as a
    checkpoint. For each one the test dataset, dataloader and model are
    rebuilt, the weights are loaded and ``multi_gpu_test`` is run. On rank 0
    the metrics returned by ``dataset.evaluate`` are printed and appended, one
    line per checkpoint, to ``eval_result_new.txt`` inside the checkpoint
    directory; if ``args.out`` is set the raw outputs are dumped there too.
    """
    args = parse_args()

    # os.listdir returns entries in arbitrary order; sort them so that the
    # evaluation order and the order of lines in the result file are
    # reproducible.
    checkpoint_list = sorted(os.listdir(args.checkpoint_dir))
    print(checkpoint_list)

    checkpoints = [name for name in checkpoint_list if '.pth' in name]
    if not checkpoints:
        # nothing to evaluate, so do not touch the config or the dist env
        return

    cfg = mmcv.Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None

    # init distributed env first, since logger depends on the dist info.
    # This is done once, before the loop: the distributed environment must
    # not be initialised again for every checkpoint.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)

    rank, _ = get_dist_info()

    result_path = os.path.join(args.checkpoint_dir, 'eval_result_new.txt')

    for checkpoint in checkpoints:
        # re-seed before every checkpoint so each evaluation starts from the
        # same random state
        if args.seed is not None:
            if rank == 0:
                print('set random seed to', args.seed)
            set_random_seed(args.seed, deterministic=args.deterministic)

        # build the dataloader
        # TODO: support multiple images per gpu (only minor changes are needed)
        dataset = build_dataset(cfg.data.test)
        data_loader = build_dataloader(
            dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.get('val_workers_per_gpu',
                                         cfg.data.workers_per_gpu),
            dist=distributed,
            shuffle=False)

        # build the model and load checkpoint
        model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)

        # images are saved exactly when a save path was given
        args.save_image = args.save_path is not None

        # distributed test
        # NOTE(review): the model is wrapped for distributed testing even when
        # args.launcher == 'none' -- confirm that this is intended.
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        model = DistributedDataParallelWrapper(
            model,
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)

        device_id = torch.cuda.current_device()
        _ = load_checkpoint(
            model,
            os.path.join(args.checkpoint_dir, checkpoint),
            map_location=lambda storage, loc: storage.cuda(device_id))
        outputs = multi_gpu_test(
            model,
            data_loader,
            args.tmpdir,
            args.gpu_collect,
            save_path=args.save_path,
            save_image=args.save_image)

        if rank != 0:
            continue

        # print metrics and append them as a single line to the result file;
        # the context manager closes the file even if formatting or writing
        # raises
        stats = dataset.evaluate(outputs)
        with open(result_path, 'a') as write_file:
            for stat in stats:
                print('{}: Eval-{}: {}'.format(checkpoint, stat, stats[stat]))
                write_file.write('{}: Eval-{}: {} '.format(
                    checkpoint, stat, stats[stat]))
            write_file.write('\n')

        # save result pickle
        # NOTE(review): every checkpoint dumps to the same args.out path, so
        # only the last evaluated checkpoint's outputs remain -- confirm.
        if args.out:
            print('writing results to {}'.format(args.out))
            mmcv.dump(outputs, args.out)