def train(model, datasets, cfg, rank):
    data_loaders = []
    for ds in datasets:
        data_loaders.append(get_loader(ds, cfg, 'train'))

    # build runner for training
    if cfg.get('total_iters', None) is not None:
        runner = IterBasedRunner(model=model,
                                 optimizers_cfg=cfg.optimizers,
                                 work_dir=cfg.work_dir)
        total_iters_or_epochs = cfg.total_iters
    else:
        runner = EpochBasedRunner(model=model,
                                  optimizers_cfg=cfg.optimizers,
                                  work_dir=cfg.work_dir)
        assert cfg.get('total_epochs', None) is not None
        total_iters_or_epochs = cfg.total_epochs

    # resume and create optimizers
    if cfg.resume_from is not None:
        # resume the previous training run (including model parameters and optimizer state)
        runner.resume(cfg.resume_from, cfg.get('resume_optim', False))
    elif cfg.load_from is not None:
        # load pretrained weights but start training as if from scratch: the rank-0 process
        # loads the parameters, then every process creates its optimizers; the model
        # parameters are synchronized automatically when the optimizers are initialized
        runner.load_checkpoint(cfg.load_from, load_optim=False)
        runner.create_optimizers()
    else:
        # load no parameters; every process directly creates its optimizers
        runner.create_optimizers()

    # register hooks
    runner.register_training_hooks(lr_config=cfg.lr_config,
                                   checkpoint_config=cfg.checkpoint_config,
                                   log_config=cfg.log_config)

    # visual hook
    if cfg.get('visual_config', None) is not None:
        cfg.visual_config['output_dir'] = os.path.join(
            cfg.work_dir, cfg.visual_config['output_dir'])
        runner.register_hook(build_from_cfg(cfg.visual_config, HOOKS))

    # evaluation hook
    if cfg.get('evaluation', None) is not None:
        dataset = build_dataset(cfg.data.eval)
        save_path = os.path.join(cfg.work_dir, 'eval_visuals')
        log_path = cfg.work_dir
        runner.register_hook(
            EvalIterHook(get_loader(dataset, cfg, 'eval'),
                         save_path=save_path,
                         log_path=log_path,
                         **cfg.evaluation))

    runner.run(data_loaders, cfg.workflow, total_iters_or_epochs)
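# For reference, a minimal, hypothetical config stub that exercises the iteration/epoch
# switch in train() above. The real project presumably uses an mmcv-style Config object;
# DummyCfg, build_example_cfg and all field values here are illustrative assumptions,
# not the actual config of this repository.
class DummyCfg(dict):
    """Dict-backed config with attribute access, loosely mimicking mmcv's Config."""

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)


def build_example_cfg():
    return DummyCfg(
        total_iters=300000,   # presence of total_iters selects IterBasedRunner;
                              # drop it and set total_epochs to use EpochBasedRunner
        optimizers=dict(type='Adam', lr=1e-4),
        work_dir='./work_dirs/example',
        resume_from=None,     # checkpoint path to resume from, or None
        load_from=None,       # pretrained weights to load, or None
        lr_config=dict(policy='Step', step=[200000], gamma=0.5),
        checkpoint_config=dict(interval=5000),
        log_config=dict(interval=100),
        workflow=[('train', 1)],
    )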
def test(model, datasets, cfg, rank):
    data_loaders = [get_loader(ds, cfg) for ds in datasets]
    runner = EpochBasedRunner(model=model,
                              optimizers_cfg=cfg.optimizers,
                              work_dir=cfg.work_dir)
    runner.load_checkpoint(cfg.load_from, load_optim=False)
    runner.run(data_loaders, cfg.workflow, 1)
def test(model, datasets, cfg, rank):
    data_loaders = []
    for ds in datasets:
        data_loaders.append(get_loader(ds, cfg))

    # build epoch runner for test
    runner = EpochBasedRunner(model=model,
                              optimizers_cfg=cfg.optimizers,
                              work_dir=cfg.work_dir)

    # load from
    if cfg.load_from is not None:
        runner.load_checkpoint(cfg.load_from, load_optim=False)
        runner.create_optimizers()
    else:
        raise RuntimeError("cfg.load_from should not be None for test")

    runner.run(data_loaders, cfg.workflow, 8 if cfg.ensemble else 1)
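# A hypothetical invocation sketch for test(). Config.fromfile, build_model and the config
# path are assumptions standing in for the project's real entry point; build_dataset and
# test() come from the code above.
from mmcv import Config  # assumption: classic mmcv-style config loading


def main_test():
    cfg = Config.fromfile('configs/example_test.py')
    assert cfg.load_from is not None, 'test() requires cfg.load_from'
    model = build_model(cfg.model)              # hypothetical model factory
    datasets = [build_dataset(cfg.data.test)]
    test(model, datasets, cfg, rank=0)          # single-process testing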
def train(model, datasets, cfg, rank):
    data_loaders = [get_loader(ds, cfg, 'train') for ds in datasets]
    runner = EpochBasedRunner(model=model,
                              optimizers_cfg=cfg.optimizers,
                              work_dir=cfg.work_dir)
    # every process creates its own GradManager and optimizers; both are attributes of the model
    runner.create_gradmanager_and_optimizers()

    if cfg.resume_from is not None:
        # resume the previous training run at its epoch counter (including model parameters
        # and optimizer state). With multi-GPU training only the rank-0 process loads the
        # model parameters (they are synchronized later); if resume_optim is set, every
        # process loads the optimizer state.
        runner.resume(cfg.resume_from, cfg.get('resume_optim', True))
    elif cfg.load_from is not None:
        # load pretrained parameters but train as if from scratch. With multi-GPU training
        # only the rank-0 process loads the parameters (they are synchronized later).
        runner.load_checkpoint(cfg.load_from, load_optim=False)
    else:
        # load no parameters; train from scratch
        pass

    # synchronize model parameters across processes
    runner.sync_model_params()

    # register some useful hooks
    runner.register_training_hooks(lr_config=cfg.lr_config,
                                   checkpoint_config=cfg.checkpoint_config,
                                   log_config=cfg.log_config)

    # register evaluation hook
    if cfg.get('evaluation', None) is not None:
        dataset = build_dataset(cfg.data.eval)
        save_path = os.path.join(cfg.work_dir, 'eval_visuals')
        log_path = os.path.join(cfg.work_dir, 'eval.log')
        runner.register_hook(
            EvalIterHook(get_loader(dataset, cfg, 'eval'),
                         save_path=save_path,
                         log_path=log_path,
                         **cfg.evaluation))

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
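# A rough multi-process entry-point sketch for the MegEngine variant of train() above.
# dist.launcher and dist.get_rank() are MegEngine distributed APIs; Config.fromfile and
# build_model are assumptions, not part of the code shown here.
import megengine.distributed as dist


@dist.launcher
def main_train():
    cfg = Config.fromfile('configs/example_train.py')   # assumed config loader
    rank = dist.get_rank()
    model = build_model(cfg.model)                       # hypothetical model factory
    datasets = [build_dataset(cfg.data.train)]
    # rank 0 loads any checkpoint; sync_model_params() inside train() then broadcasts
    # the parameters to the other ranks before optimization starts
    train(model, datasets, cfg, rank)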