def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across gpus according to the cfg.

    Args:
        cfg -- VISSL yaml configuration
        node_id -- node_id for this node
        engine_name -- what engine to run: train or extract_features
        hook_generator -- Callback to generate all the ClassyVision hooks
            for this engine
    """
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)
    copy_to_local(cfg)

    # given the checkpoint folder, we check that there isn't already a final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint to load from. The get_resume_checkpoint function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and PathManager.exists(
        symlink_checkpoint_path
    ):
        checkpoint_path = symlink_checkpoint_path
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder
        )

    try:
        if world_size > 1:
            # multiple workers on this node: spawn one process per gpu
            torch.multiprocessing.spawn(
                _distributed_worker,
                nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
                args=(
                    cfg,
                    node_id,
                    dist_run_id,
                    engine_name,
                    checkpoint_path,
                    checkpoint_folder,
                    hook_generator,
                ),
                daemon=False,
            )
        else:
            # single worker: run it directly in the current process
            _distributed_worker(
                local_rank=0,
                cfg=cfg,
                node_id=node_id,
                dist_run_id=dist_run_id,
                engine_name=engine_name,
                checkpoint_path=checkpoint_path,
                checkpoint_folder=checkpoint_folder,
                hook_generator=hook_generator,
            )
    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error("Wrapping up, caught exception: %s", e)
        if isinstance(e, RuntimeError):
            raise e
    finally:
        cleanup_local_dir(cfg)

    logging.info("All Done!")

def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across gpus of the current node according to the cfg.

    If more than 1 node is needed for training, this function should be called on each
    of the different nodes, each time with a unique node_id in the range [0..N-1],
    where N is the total number of nodes taking part in training. Alternatively, you
    can use SLURM or any cluster management system to run this function for you.

    Configures the node_id and dist_run_id, and sets up the environment variables.

    Args:
        cfg (AttrDict): VISSL yaml configuration
        node_id (int): node_id for this node
        engine_name (str): what engine to run: train or extract_features
        hook_generator (Callable): Callback to generate all the ClassyVision hooks
            for this engine
    """
    setup_logging(__name__)
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)

    # If using gpus, check that the user has requested no more gpus than are
    # available on this system.
    if cfg.MACHINE.DEVICE == "gpu":
        assert cfg.DISTRIBUTED.NUM_PROC_PER_NODE <= torch.cuda.device_count(), (
            f"User system doesn't have requested {cfg.DISTRIBUTED.NUM_PROC_PER_NODE} gpus "
            f"available. Number of gpus found on user system={torch.cuda.device_count()}. "
            "Please set DISTRIBUTED.NUM_PROC_PER_NODE properly."
        )

    # set the environment variables including local rank, node id etc.
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)

    # given the checkpoint folder, check whether a final checkpoint already exists
    # and, if it does, exit unless the user has chosen to ignore it
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint to resume from. The get_resume_checkpoint function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and g_pathmgr.exists(
        symlink_checkpoint_path
    ):
        checkpoint_path = symlink_checkpoint_path
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder
        )

    # assert that if the user set PARAMS_FILE, it must exist and be valid.
    # we only use the PARAMS_FILE init if the checkpoint doesn't exist for the
    # given training. This ensures that if the same training resumes, it
    # resumes from the checkpoint and not the weight init.
    if checkpoint_path is None and cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]:
        params_file = cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]
        error_message = f"Specified PARAMS_FILE does NOT exist: {params_file}"
        assert g_pathmgr.exists(params_file), error_message

    # copy the data to local storage if the user wants. This can speed up dataloading.
    _copy_to_local(cfg)

    try:
        torch.multiprocessing.spawn(
            _distributed_worker,
            nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
            args=(
                cfg,
                node_id,
                dist_run_id,
                engine_name,
                checkpoint_path,
                checkpoint_folder,
                hook_generator,
            ),
            daemon=False,
        )
    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error("Wrapping up, caught exception: %s", e)
        if isinstance(e, RuntimeError):
            raise e
    finally:
        _cleanup_local_dir(cfg)

    logging.info("All Done!")
    shutdown_logging()

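# NOTE (illustrative sketch, not part of the original source): per the docstring of
# launch_distributed above, a multi-node job runs this function once per node with a
# unique node_id in [0..NUM_NODES-1]. A minimal per-node driver might look like the
# function below. Reading the node id from a NODE_ID environment variable and the
# build_cfg_and_hooks() helper are assumptions made only for this sketch; in practice
# the cfg (AttrDict) and hook generator come from the surrounding training script.
def _example_per_node_launch():
    import os

    # hypothetical helper: returns the loaded VISSL AttrDict cfg and a
    # Callable[[Any], List[ClassyHook]] hook generator
    cfg, hook_generator = build_cfg_and_hooks()
    launch_distributed(
        cfg=cfg,
        node_id=int(os.environ.get("NODE_ID", "0")),  # unique per node, in [0..NUM_NODES-1]
        engine_name="train",
        hook_generator=hook_generator,
    )
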
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across gpus of the current node according to the cfg.

    If more than 1 node is needed for training, this function should be called on each
    of the different nodes, each time with a unique node_id in the range [0..N-1],
    where N is the total number of nodes taking part in training. Alternatively, you
    can use SLURM or any cluster management system to run this function for you.

    Configures the node_id and dist_run_id, and sets up the environment variables.

    Args:
        cfg (AttrDict): VISSL yaml configuration
        node_id (int): node_id for this node
        engine_name (str): what engine to run: train or extract_features
        hook_generator (Callable): Callback to generate all the ClassyVision hooks
            for this engine
    """
    setup_logging(__name__)
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)
    _copy_to_local(cfg)

    # given the checkpoint folder, we check that there isn't already a final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint to load from. The get_resume_checkpoint function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and PathManager.exists(
        symlink_checkpoint_path
    ):
        checkpoint_path = symlink_checkpoint_path
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder
        )

    try:
        if world_size > 1:
            # multiple workers on this node: spawn one process per gpu
            torch.multiprocessing.spawn(
                _distributed_worker,
                nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
                args=(
                    cfg,
                    node_id,
                    dist_run_id,
                    engine_name,
                    checkpoint_path,
                    checkpoint_folder,
                    hook_generator,
                ),
                daemon=False,
            )
        else:
            # single worker: run it directly in the current process
            _distributed_worker(
                local_rank=0,
                cfg=cfg,
                node_id=node_id,
                dist_run_id=dist_run_id,
                engine_name=engine_name,
                checkpoint_path=checkpoint_path,
                checkpoint_folder=checkpoint_folder,
                hook_generator=hook_generator,
            )
    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error("Wrapping up, caught exception: %s", e)
        if isinstance(e, RuntimeError):
            raise e
    finally:
        _cleanup_local_dir(cfg)

    logging.info("All Done!")
    shutdown_logging()

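# NOTE (illustrative sketch, not part of the original source): hook_generator is typed
# above as Callable[[Any], List[ClassyHook]], i.e. it receives the cfg and returns the
# list of ClassyVision hooks to attach to the engine. A minimal generator might look
# like the function below; it reuses the module's Any/List/ClassyHook imports and uses
# ClassyVision's LossLrMeterLoggingHook purely as an example of a ClassyHook subclass.
def _example_hook_generator(cfg: Any) -> List[ClassyHook]:
    from classy_vision.hooks import LossLrMeterLoggingHook

    # log loss, lr, and meter values every 10 iterations (arbitrary choice for this sketch)
    return [LossLrMeterLoggingHook(log_freq=10)]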