def _infer_slurm_init(cfg: DistributedTrainingConfig, num_pipelines_per_node):
    node_list = os.environ.get("SLURM_STEP_NODELIST")
    if node_list is None:
        node_list = os.environ.get("SLURM_JOB_NODELIST")
    if node_list is not None:
        try:
            hostnames = subprocess.check_output(
                ["scontrol", "show", "hostnames", node_list]
            )
            cfg.distributed_init_method = "tcp://{host}:{port}".format(
                host=hostnames.split()[0].decode("utf-8"),
                port=cfg.distributed_port,
            )
            nnodes = int(os.environ.get("SLURM_NNODES"))
            ntasks_per_node = os.environ.get("SLURM_NTASKS_PER_NODE")
            if ntasks_per_node is not None:
                ntasks_per_node = int(ntasks_per_node)
            else:
                ntasks = int(os.environ.get("SLURM_NTASKS"))
                nnodes = int(os.environ.get("SLURM_NNODES"))
                assert ntasks % nnodes == 0
                ntasks_per_node = int(ntasks / nnodes)
            if ntasks_per_node == 1:
                gpus_per_node = torch.cuda.device_count()
                node_id = int(os.environ.get("SLURM_NODEID"))
                cfg.distributed_rank = node_id * gpus_per_node
                cfg.distributed_world_size = nnodes * gpus_per_node
            elif cfg.pipeline_model_parallel:
                assert ntasks_per_node == num_pipelines_per_node, (
                    "SLURM --ntasks-per-node must match number of pipelines per "
                    "node (={})".format(num_pipelines_per_node)
                )
                cfg.distributed_no_spawn = True
                # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on
                # the first node, [2, 3] on the second node, etc. This
                # matches torch.distributed.launch.
                node_id = int(os.environ.get("SLURM_NODEID"))
                local_id = int(os.environ.get("SLURM_LOCALID"))
                cfg.distributed_rank = node_id * num_pipelines_per_node + local_id
                # In the above example, device_id will always be in [0, 1],
                # which also matches torch.distributed.launch.
                cfg.device_id = local_id
                # We also want to set distributed_world_size to be the total
                # number of pipelines across all nodes.
                cfg.distributed_world_size = nnodes * num_pipelines_per_node
            else:
                assert ntasks_per_node == cfg.distributed_world_size // nnodes
                cfg.distributed_no_spawn = True
                cfg.distributed_rank = int(os.environ.get("SLURM_PROCID"))
                cfg.device_id = int(os.environ.get("SLURM_LOCALID"))
        except subprocess.CalledProcessError as e:  # scontrol failed
            raise e
        except FileNotFoundError:  # Slurm is not installed
            pass
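

# Illustrative sketch (not part of the original module): the rank layout that
# _infer_slurm_init produces for the common "one Slurm task per node" launch
# style, where each node's single task is later spawned into one process per
# GPU. The function name and default values here are hypothetical; only the
# arithmetic mirrors the code above.
def _example_slurm_one_task_per_node_layout(nnodes=2, gpus_per_node=8):
    """Return ([(node_id, starting_rank), ...], world_size) as computed above
    when SLURM_NTASKS_PER_NODE == 1."""
    # each node's task gets a contiguous block of ranks starting at
    # node_id * gpus_per_node, exactly as in the ntasks_per_node == 1 branch
    starting_ranks = [(node_id, node_id * gpus_per_node) for node_id in range(nnodes)]
    world_size = nnodes * gpus_per_node
    return starting_ranks, world_size


# e.g. _example_slurm_one_task_per_node_layout(nnodes=2, gpus_per_node=8)
# -> ([(0, 0), (1, 8)], 16): node 0 hosts ranks 0-7, node 1 hosts ranks 8-15.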
def _infer_torch_distributed_launch_init(cfg: DistributedTrainingConfig):
    cfg.distributed_init_method = "env://"
    cfg.distributed_world_size = int(os.environ["WORLD_SIZE"])
    cfg.distributed_rank = int(os.environ["RANK"])
    # processes are created by torch.distributed.launch
    cfg.distributed_no_spawn = True
    if os.environ.get("CUDA_VISIBLE_DEVICES"):
        # map the local rank onto the devices exposed via CUDA_VISIBLE_DEVICES
        cfg.device_id = int(
            os.environ["CUDA_VISIBLE_DEVICES"].split(",")[cfg.local_rank]
        )
    else:
        cfg.device_id = cfg.local_rank
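

# Illustrative sketch (not part of the original module): how the fields set by
# _infer_torch_distributed_launch_init would typically be consumed when
# initializing the process group. The backend choice and function name are
# assumptions for illustration, not the library's actual call site.
def _example_init_process_group_from_env(cfg):
    import torch.distributed as dist  # local import keeps the sketch self-contained

    torch.cuda.set_device(cfg.device_id)
    dist.init_process_group(
        backend="nccl",
        init_method=cfg.distributed_init_method,  # "env://" reads MASTER_ADDR/MASTER_PORT
        world_size=cfg.distributed_world_size,
        rank=cfg.distributed_rank,
    )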
def _pipeline_parallel_post_init(
    cfg: DistributedTrainingConfig, num_pipeline_devices, num_pipelines_per_node
):
    if not cfg.distributed_no_spawn:
        # When distributed_no_spawn is False, we expect distributed_rank and
        # distributed_world_size to be based on the total number of GPUs, so
        # we need to correct them to be based on the number of pipelines.
        assert cfg.distributed_world_size % num_pipeline_devices == 0
        cfg.distributed_world_size = (
            cfg.distributed_world_size // num_pipeline_devices
        )
        # In the case of 4-way MP on nodes with 8 GPUs, we want
        # distributed_rank to be the starting GPU index for each pipeline
        # i.e., 0, 2, ...
        gpus_per_node = torch.cuda.device_count()
        assert cfg.distributed_rank % gpus_per_node == 0
        assert cfg.distributed_rank % num_pipeline_devices == 0
        with open_dict(cfg):
            cfg.distributed_rank = cfg.distributed_rank // num_pipeline_devices
            # launch one process per pipeline
            cfg.distributed_num_procs = num_pipelines_per_node

    # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0
    # and 4, indicating the starting device IDs for each pipeline
    cfg.device_id *= num_pipeline_devices

    if cfg.device_id > 0:
        # if there are multiple pipelines on a node (e.g., 4-way MP on an 8
        # GPU node), we need to adjust pipeline_devices accordingly
        logger.debug(
            "setting CUDA device={} on rank {}".format(
                cfg.device_id, cfg.distributed_rank
            )
        )
        torch.cuda.set_device(cfg.device_id)
        with open_dict(cfg):
            cfg.pipeline_devices = [cfg.device_id + d for d in cfg.pipeline_devices]
        logger.info(
            "setting pipeline_devices={} on rank {}".format(
                cfg.pipeline_devices, cfg.distributed_rank
            )
        )
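

# Illustrative sketch (not part of the original module): the device/rank
# arithmetic that _pipeline_parallel_post_init performs for 4-way model
# parallelism on an 8-GPU node (num_pipeline_devices=4, two pipelines per
# node). The function name and inputs are hypothetical.
def _example_pipeline_device_offsets(local_id, pipeline_devices=(0, 1, 2, 3)):
    num_pipeline_devices = len(set(pipeline_devices))
    # the second pipeline on the node (local_id=1) starts at GPU 4 ...
    device_id = local_id * num_pipeline_devices
    # ... and its per-stage devices shift from [0, 1, 2, 3] to [4, 5, 6, 7]
    shifted = [device_id + d for d in pipeline_devices]
    return device_id, shifted


# e.g. _example_pipeline_device_offsets(0) -> (0, [0, 1, 2, 3])
#      _example_pipeline_device_offsets(1) -> (4, [4, 5, 6, 7])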
def infer_init_method(cfg: DistributedTrainingConfig, force_distributed=False):
    if cfg.distributed_init_method is not None or cfg.tpu:
        return

    num_pipeline_devices, num_pipelines_per_node = None, None
    if cfg.pipeline_model_parallel:
        balance_exists = (
            cfg.pipeline_balance is not None
            or cfg.pipeline_encoder_balance is not None
            or cfg.pipeline_decoder_balance is not None
        )
        devices_exist = (
            cfg.pipeline_devices is not None
            or cfg.pipeline_encoder_devices is not None
            or cfg.pipeline_decoder_devices is not None
        )
        if not balance_exists:
            raise ValueError(
                "--pipeline-balance is currently required for pipeline model parallelism"
            )
        if not devices_exist:
            raise ValueError(
                "--pipeline-devices is currently required for pipeline model parallelism"
            )

        cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance, type=int)
        if cfg.pipeline_devices is not None:
            cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices, type=int)
            num_pipeline_devices = len(set(cfg.pipeline_devices))
        else:
            cfg.pipeline_encoder_devices = utils.eval_str_list(
                cfg.pipeline_encoder_devices, type=int
            )
            cfg.pipeline_decoder_devices = utils.eval_str_list(
                cfg.pipeline_decoder_devices, type=int
            )
            num_pipeline_devices = len(
                set(cfg.pipeline_encoder_devices + cfg.pipeline_decoder_devices)
            )
        gpus_per_node = torch.cuda.device_count()
        assert (
            gpus_per_node >= num_pipeline_devices
            and gpus_per_node % num_pipeline_devices == 0
        ), (
            "the number of unique device IDs in --pipeline-devices must evenly divide "
            "the number of GPUs per node (multi-node pipelining is not yet supported)"
        )
        num_pipelines_per_node = gpus_per_node // num_pipeline_devices

    if all(
        key in os.environ
        for key in ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK"]
    ):
        # support torch.distributed.launch
        _infer_torch_distributed_launch_init(cfg)
    elif cfg.distributed_port > 0:
        # we can determine the init method automatically for Slurm
        _infer_slurm_init(cfg, num_pipelines_per_node)
    elif cfg.distributed_world_size > 1 or force_distributed:
        # fallback for single node with multiple GPUs
        assert (
            cfg.distributed_world_size <= torch.cuda.device_count()
        ), f"world size is {cfg.distributed_world_size} but have {torch.cuda.device_count()} available devices"
        port = random.randint(10000, 20000)
        cfg.distributed_init_method = "tcp://localhost:{port}".format(port=port)

    if cfg.pipeline_model_parallel:
        _pipeline_parallel_post_init(cfg, num_pipeline_devices, num_pipelines_per_node)
    elif not cfg.distributed_no_spawn:
        with open_dict(cfg):
            cfg.distributed_num_procs = min(
                torch.cuda.device_count(), cfg.distributed_world_size
            )
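

# Illustrative usage sketch (not part of the original module): infer the init
# method and then either spawn worker processes or run in-process. The launch
# flow shown here is an assumption about how the inferred fields could be
# consumed, not the library's actual entry point; main_fn is a hypothetical,
# picklable top-level function taking (device_index, cfg).
def _example_infer_then_launch(cfg, main_fn):
    import torch.multiprocessing as mp  # local import keeps the sketch self-contained

    infer_init_method(cfg, force_distributed=False)
    if cfg.distributed_init_method is not None and not cfg.distributed_no_spawn:
        # one process per GPU (or per pipeline, see _pipeline_parallel_post_init)
        mp.spawn(fn=main_fn, args=(cfg,), nprocs=cfg.distributed_num_procs)
    else:
        # single process, or processes already created by the external launcher
        main_fn(cfg.device_id, cfg)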