def setup_after_launch(cfg, output_dir, runner):
    """
    Set things up after entering DDP, including
    - creating working directory
    - setting up logger
    - logging environment
    - initializing runner
    """
    create_dir_on_global_main_process(output_dir)
    comm.synchronize()
    setup_loggers(output_dir)
    cfg.freeze()
    if cfg.OUTPUT_DIR != output_dir:
        with temp_defrost(cfg):
            logger.warning(
                "Override cfg.OUTPUT_DIR ({}) to be the same as output_dir {}".format(
                    cfg.OUTPUT_DIR, output_dir
                )
            )
            cfg.OUTPUT_DIR = output_dir
    logger.info("Initializing runner ...")
    runner = initialize_runner(runner, cfg)
    log_info(cfg, runner)
    dump_cfg(cfg, os.path.join(output_dir, "config.yaml"))
    auto_scale_world_size(cfg, new_world_size=comm.get_world_size())
def setup_after_launch(cfg: CN, output_dir: str, runner):
    _setup_after_launch(cfg, output_dir, runner)
    logger.info("Initializing runner ...")
    runner = initialize_runner(runner, cfg)
    log_info(cfg, runner)
    auto_scale_world_size(cfg, new_world_size=comm.get_world_size())
def test_not_scale_for_zero_world_size(self):
    """
    When the reference world size is 0, no scaling should happen.
    """
    cfg = GeneralizedRCNNRunner().get_default_cfg()
    self.assertEqual(cfg.SOLVER.REFERENCE_WORLD_SIZE, 8)
    cfg.SOLVER.REFERENCE_WORLD_SIZE = 0
    batch_size_x8 = cfg.SOLVER.IMS_PER_BATCH
    auto_scale_world_size(cfg, new_world_size=1)
    self.assertEqual(cfg.SOLVER.REFERENCE_WORLD_SIZE, 0)
    self.assertEqual(cfg.SOLVER.IMS_PER_BATCH, batch_size_x8)
def test_8gpu_to_1gpu(self):
    """
    When scaling an 8-GPU config to a 1-GPU one, the batch size is reduced by 8x.
    """
    cfg = GeneralizedRCNNRunner().get_default_cfg()
    self.assertEqual(cfg.SOLVER.REFERENCE_WORLD_SIZE, 8)
    batch_size_x8 = cfg.SOLVER.IMS_PER_BATCH
    assert batch_size_x8 % 8 == 0, "default batch size is not a multiple of 8"
    auto_scale_world_size(cfg, new_world_size=1)
    self.assertEqual(cfg.SOLVER.REFERENCE_WORLD_SIZE, 1)
    self.assertEqual(cfg.SOLVER.IMS_PER_BATCH * 8, batch_size_x8)
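# For reference, the two tests above pin down the contract of
# auto_scale_world_size: values tied to batch size scale linearly by
# new_world_size / REFERENCE_WORLD_SIZE, and a reference of 0 disables
# scaling. Below is a minimal sketch of that rule, reusing temp_defrost
# from the snippets above; the real implementation also rescales other
# solver values (learning rate, iteration counts, etc.) omitted here.
def _sketch_auto_scale_world_size(cfg, new_world_size: int) -> None:
    ref = cfg.SOLVER.REFERENCE_WORLD_SIZE
    if ref == 0 or ref == new_world_size:
        return  # a reference of 0 means "never scale"
    factor = new_world_size / ref
    with temp_defrost(cfg):
        # batch size scales linearly with the number of workers
        cfg.SOLVER.IMS_PER_BATCH = int(round(cfg.SOLVER.IMS_PER_BATCH * factor))
        cfg.SOLVER.REFERENCE_WORLD_SIZE = new_world_size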
def setup_after_launch(
    cfg: CfgNode,
    output_dir: str,
    runner: Optional[BaseRunner] = None,
    _scale_world_size: bool = True,  # HACK: temporarily allow lightning_train_net to bypass this.
):
    """
    Binary-level setup after entering DDP, including
    - creating working directory
    - setting up logger
    - logging environment
    - printing and dumping config
    - (optional) initializing runner
    """
    create_dir_on_global_main_process(output_dir)
    setup_loggers(output_dir)
    log_system_info()

    cfg.freeze()
    maybe_override_output_dir(cfg, output_dir)
    logger.info("Running with full config:\n{}".format(cfg))
    dump_cfg(cfg, os.path.join(output_dir, "config.yaml"))

    if runner:
        logger.info("Initializing runner ...")
        runner = initialize_runner(runner, cfg)
        logger.info("Running with runner: {}".format(runner))

    # save the diff config
    if runner:
        default_cfg = runner.get_default_cfg()
        dump_cfg(
            get_diff_cfg(default_cfg, cfg),
            os.path.join(output_dir, "diff_config.yaml"),
        )
    else:
        # TODO: support getting default_cfg without a runner.
        pass

    # scale the config after dumping, so that the dumped config files keep the
    # original world size
    if _scale_world_size:
        auto_scale_world_size(cfg, new_world_size=comm.get_world_size())
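# For context, a minimal sketch of a per-process entry point that drives
# setup_after_launch; MyRunner and the training calls below are hypothetical
# stand-ins for project-specific pieces, not part of the API shown above.
def _sketch_worker_main(cfg: CfgNode, output_dir: str) -> None:
    runner = MyRunner()  # hypothetical BaseRunner subclass
    setup_after_launch(cfg, output_dir, runner=runner)
    # At this point cfg is frozen, logged, dumped to config.yaml and
    # diff_config.yaml, and scaled to the actual world size.
    model = runner.build_model(cfg)  # hypothetical runner methods
    runner.do_train(cfg, model, resume=True)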
def main(
    cfg: CfgNode,
    output_dir: str,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching a training run with the Lightning trainer.

    Args:
        cfg: D2go config node
        num_machines: Number of nodes used for distributed training
        num_processes: Number of processes on each node
        eval_only: True to run evaluation only
    """
    # FIXME: make comm.get_world_size() work properly.
    setup_after_launch(cfg, output_dir, _scale_world_size=False)
    auto_scale_world_size(cfg, new_world_size=num_machines * num_processes)

    task = task_cls.from_config(cfg, eval_only)
    trainer_params = get_trainer_params(cfg, num_machines, num_processes)

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from the checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=trainer_params["logger"].log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
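# This version delegates the Trainer arguments to get_trainer_params. A
# hedged sketch of what such a helper plausibly returns, mirroring the inline
# dict kept in the variant below; treat this as an assumption, not the actual
# helper's body.
def _sketch_get_trainer_params(cfg, num_machines: int, num_processes: int) -> dict:
    return {
        "max_epochs": 10**8,  # the loop is bounded by max_steps instead
        "max_steps": cfg.SOLVER.MAX_ITER,
        "num_nodes": num_machines,
        "num_processes": num_processes,
        "logger": TensorBoardLogger(save_dir=cfg.OUTPUT_DIR),
        "num_sanity_val_steps": 0,
    }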
def main(
    cfg: CfgNode,
    output_dir: Optional[str] = None,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_gpus: int = 0,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching a training run with the Lightning trainer.

    Args:
        cfg: D2go config node
        num_machines: Number of nodes used for distributed training
        num_gpus: Number of GPUs to train on each node
        num_processes: Number of processes on each node. NOTE: this is
            automatically set to the number of GPUs when using DDP; set a
            value greater than 1 to mimic distributed training on CPUs.
        eval_only: True to run evaluation only
    """
    assert (
        num_processes == 1 or num_gpus == 0
    ), "Only set num_processes > 1 when training on CPUs"

    auto_scale_world_size(cfg, num_machines * num_gpus)
    maybe_override_output_dir(cfg, output_dir)

    task = task_cls.from_config(cfg, eval_only)

    tb_logger = TensorBoardLogger(save_dir=cfg.OUTPUT_DIR)
    trainer_params = {
        # the training loop is bounded by max_steps; use a large max_epochs to
        # make sure max_steps is reached first
        "max_epochs": 10**8,
        "max_steps": cfg.SOLVER.MAX_ITER,
        "val_check_interval": cfg.TEST.EVAL_PERIOD
        if cfg.TEST.EVAL_PERIOD > 0
        else cfg.SOLVER.MAX_ITER,
        "num_nodes": num_machines,
        "gpus": num_gpus,
        "num_processes": num_processes,
        "accelerator": get_accelerator(cfg.MODEL.DEVICE),
        "callbacks": _get_trainer_callbacks(cfg),
        "logger": tb_logger,
        "num_sanity_val_steps": 0,
        "progress_bar_refresh_rate": 10,
    }

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from the checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=tb_logger.log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
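# As a usage note, a hedged sketch of wiring main() into a command-line
# entry point; load_config is a hypothetical helper standing in for however
# the project builds its CfgNode from a YAML file.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file", required=True)
    parser.add_argument("--num-gpus", type=int, default=0)
    parser.add_argument("--eval-only", action="store_true")
    args = parser.parse_args()

    cfg = load_config(args.config_file)  # hypothetical config loader
    out = main(cfg, eval_only=args.eval_only, num_gpus=args.num_gpus)
    print(f"Finished; outputs written to {out.output_dir}")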