def __init__(
    self,
    use_gpu: Optional[bool] = None,
    num_dataloader_workers: int = 0,
    dataloader_mp_context: Optional[str] = None,
):
    """Constructor for DistributedTrainer.

    Args:
        use_gpu: If true, then use GPU for training (each worker uses the CUDA
            device matching its local rank). If None, then check if we have GPUs
            available, if we do then use GPU for training.
        num_dataloader_workers: Number of CPU processes doing dataloading
            per GPU. If 0, then dataloading is done on main thread.
        dataloader_mp_context: Determines how to launch
            new processes for dataloading. Must be one of "fork", "forkserver",
            "spawn". If None, process launching is inherited from parent.
    """
    super().__init__(
        use_gpu=use_gpu,
        num_dataloader_workers=num_dataloader_workers,
        dataloader_mp_context=dataloader_mp_context,
    )

    _init_env_vars()
    _init_distributed(self.use_gpu)
    logging.info(
        f"Done setting up distributed process_group with rank {get_rank()}"
        + f", world_size {get_world_size()}"
    )
    local_rank = int(os.environ["LOCAL_RANK"])
    if self.use_gpu:
        logging.info("Using GPU, CUDA device index: {}".format(local_rank))
        set_cuda_device_index(local_rank)
    else:
        logging.info("Using CPU")
        set_cpu_device()
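# Hedged usage sketch (an addition, not part of the source above): inside a worker process
# the trainer can be constructed directly. `build_task` exists in classy_vision.tasks, but
# the config dict here is only a placeholder for whatever task configuration a project uses.
from classy_vision.tasks import build_task
from classy_vision.trainer import DistributedTrainer

my_config = {...}  # placeholder: dataset / model / loss / optimizer configuration
task = build_task(my_config)
trainer = DistributedTrainer(use_gpu=None, num_dataloader_workers=2)  # auto-detects GPU
trainer.train(task)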
def __init__(
    self,
    use_gpu,
    num_dataloader_workers,
    elastic_coordinator,
    input_args,
    local_rank,
    dataloader_mp_context=None,
):
    super().__init__(
        use_gpu=use_gpu,
        num_dataloader_workers=num_dataloader_workers,
        dataloader_mp_context=dataloader_mp_context,
    )

    pid = os.getpid()
    if use_gpu:
        set_cuda_device_index(local_rank)
        device_idx = torch.cuda.current_device()
        log.info(f"initialized worker {local_rank} (pid={pid}, gpu={device_idx})")
        device_properties = torch.cuda.get_device_properties(device_idx)
        log.info(f"gpu device properties: {device_properties}")
    else:  # cpu
        set_cpu_device()
        log.info(f"initialized worker {local_rank} (pid={pid}, cpu)")

    self.elastic_coordinator = elastic_coordinator
    self.input_args = input_args
def __init__(
    self,
    use_gpu: Optional[bool] = None,
    num_dataloader_workers: int = 0,
    dataloader_mp_context: Optional[str] = None,
):
    """Constructor for LocalTrainer.

    Args:
        use_gpu: If true, then use GPU 0 for training.
            If None, then check if we have GPUs available, if we do
            then use GPU for training.
        num_dataloader_workers: Number of CPU processes doing dataloading
            per GPU. If 0, then dataloading is done on main thread.
        dataloader_mp_context: Determines how to launch
            new processes for dataloading. Must be one of "fork", "forkserver",
            "spawn". If None, process launching is inherited from parent.
    """
    super().__init__(
        use_gpu=use_gpu,
        num_dataloader_workers=num_dataloader_workers,
        dataloader_mp_context=dataloader_mp_context,
    )

    if self.use_gpu:
        logging.info("Using GPU, CUDA device index: {}".format(0))
        set_cuda_device_index(0)
    else:
        logging.info("Using CPU")
        set_cpu_device()
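# Hedged usage sketch for the single-process trainer above: same shape as the distributed
# example earlier, but no launcher or environment variables are involved.
from classy_vision.trainer import LocalTrainer

task = build_task(my_config)  # my_config: placeholder config, as in the earlier sketch
trainer = LocalTrainer(use_gpu=None, num_dataloader_workers=2)
trainer.train(task)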
def train(self, task):
    if task.use_gpu:
        logging.info("Using GPU, CUDA device index: {}".format(0))
        set_cuda_device_index(0)
    else:
        logging.info("Using CPU")
        set_cpu_device()

    super().train(task)
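# Hedged note on the variant above: the device choice now travels with the task rather than
# the trainer. Assuming a task type that exposes a setter for it, such as set_use_gpu on
# recent ClassyVision ClassificationTask (an assumption here), usage would look like:
#
#   task.set_use_gpu(torch.cuda.is_available())
#   LocalTrainer().train(task)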
def train(self, task):
    _init_env_vars(task.use_gpu)
    _init_distributed(task.use_gpu)
    logging.info(
        f"Done setting up distributed process_group with rank {get_rank()}"
        + f", world_size {get_world_size()}"
    )
    local_rank = int(os.environ["LOCAL_RANK"])
    if task.use_gpu:
        logging.info("Using GPU, CUDA device index: {}".format(local_rank))
        set_cuda_device_index(local_rank)
    else:
        logging.info("Using CPU")
        set_cpu_device()

    super().train(task)
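# Hedged launch note (an addition): LOCAL_RANK is read from the environment above, so this
# train() is meant to run under a per-process launcher that sets it, for example
#
#   torchrun --nnodes=1 --nproc_per_node=8 your_training_script.py
#
# where your_training_script.py is a placeholder; the _init_env_vars / _init_distributed
# calls at the top handle the rest of the process-group setup.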
def setup_distributed(self, use_gpu: bool):
    """
    Set up distributed training. VISSL supports both GPU and CPU-only training.

    (1) Initialize the process group with torch.distributed.init_process_group if it
        is not already initialized. The init_method and backend are specified by the
        user in the yaml config file. See vissl/defaults.yaml for a description of how
        to set init_method and backend.
    (2) Set the global device index, using torch.cuda.set_device for GPU runs or the
        cpu device otherwise.
    """
    # we overwrite the distributed trainer setup here with our config options
    distributed_world_size = int(os.environ["WORLD_SIZE"])
    assert distributed_world_size % self.cfg.DISTRIBUTED.NUM_NODES == 0
    init_method = f"{self.cfg.DISTRIBUTED.INIT_METHOD}://{self.dist_run_id}"
    logging.info(
        f"Using Distributed init method: {init_method}, "
        f"world_size: {distributed_world_size}, rank: {self.distributed_rank}"
    )

    if not torch.distributed.is_initialized():
        torch.distributed.init_process_group(
            backend=self.cfg.DISTRIBUTED.BACKEND,
            init_method=init_method,
            world_size=distributed_world_size,
            rank=self.distributed_rank,
        )
    else:
        logging.warning(
            "Torch distributed has already been initialized, "
            "reusing existing configuration"
        )

    logging.info(
        "| initialized host {} as rank {} ({})".format(
            socket.gethostname(),
            self.distributed_rank,
            torch.distributed.get_rank(),
        )
    )

    if use_gpu:
        set_cuda_device_index(self.local_rank)
        # perform a dummy all-reduce to initialize the NCCL communicator
        if torch.cuda.is_available() and (self.cfg.DISTRIBUTED.BACKEND == "nccl"):
            dist.all_reduce(torch.zeros(1).cuda())
    else:
        set_cpu_device()
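# Hedged illustration (an addition): the config attributes consumed above, written out as a
# plain dict. VISSL actually reads them from the YAML configuration (see vissl/defaults.yaml);
# the values here are examples only.
distributed_cfg = {
    "DISTRIBUTED": {
        "BACKEND": "nccl",     # "gloo" would be the CPU-only choice
        "INIT_METHOD": "tcp",  # joined with dist_run_id into e.g. "tcp://<host>:<port>"
        "NUM_NODES": 1,        # WORLD_SIZE must be divisible by this (see the assert)
    }
}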