def _get_nccl_p2p_communicator(self, comm_key, my_gpu_idx, peer_rank, peer_gpu_idx):
    """Create or fetch the NCCL communicator used for p2p tasks.

    Note(Hao): this function is not thread-safe now.

    Args:
        comm_key (str): communicator key.
        my_gpu_idx (int): the gpu index on the current process.
        peer_rank (int): the rank of the destination process.
        peer_gpu_idx (int): the gpu index on the peer process.

    Returns:
        communicator
    """
    if not comm_key:
        raise RuntimeError("Got empty communicator key.")

    # TODO(Hao): lock the _dev_comm_map here.
    cached = self._dev_comm_map.get(comm_key)
    if cached is not None:
        return cached

    # Note (Hao): This is a bit complex so I decide to take a note here.
    # Here we need to consider three cases:
    # Case 1: src_rank != dst_rank, hence the send and recv happen on
    # different process (actors/tasks); each process makes independent
    # collective calls and manages corresponding communicators.
    # Case 2: src_rank == dst_rank, src_gpu_idx == dst_gpu_idx; for
    # this case, we simply throw a RuntimeError;
    # Case 3: src_rank == dst_rank, src_gpu_idx != dst_gpu_idx, which
    # means the send and recv will be called on the same process. We
    # DO NOT support this case for now. We need to properly scope:
    # (1) communicators creation, and
    # (2) send/recv calls
    # using groupStart() and groupEnd() calls to avoid deadlocks.
    if self.rank == peer_rank:
        raise RuntimeError(
            "Send and recv happens on the same process! "
            "ray.util.collective does not support this case as of now. "
            "Alternatively, consider doing GPU to GPU memcpy?")
    my_p2p_rank = 0 if self.rank < peer_rank else 1

    group_key = self._generate_group_key(comm_key)
    if my_p2p_rank == 0:
        # The lower-rank side generates and publishes the NCCL unique id.
        nccl_uid = self._generate_nccl_uid(group_key)
    else:
        # The higher-rank side waits on the rendezvous to fetch it.
        rendezvous = Rendezvous(group_key)
        rendezvous.meet()
        nccl_uid = rendezvous.get_nccl_id()

    # Create the p2p communicator (world size 2) on the local device.
    with nccl_util.Device(my_gpu_idx):
        comm = nccl_util.create_nccl_communicator(2, nccl_uid, my_p2p_rank)

    stream = cupy.cuda.Stream.null  # Stream(non_blocking=True)
    self._dev_comm_map[comm_key] = [comm]
    self._dev_streams_map[comm_key] = [stream]
    return [comm]
def _sync_streams(device_list, events, streams):
    """Let NCCL streams wait for current streams for every device."""
    # TODO(Fu): recordStream besides calling this function?
    if not ENV.NCCL_USE_MULTISTREAM.val:
        return
    for idx, dev in enumerate(device_list):
        with nccl_util.Device(dev):
            # Record where the current stream is, then make the NCCL
            # stream wait on that point before proceeding.
            events[idx].record(cupy.cuda.get_current_stream())
            streams[idx].wait_event(events[idx])
def _get_nccl_collective_communicator(self, comm_key, device_list):
    """Create or retrieve an NCCL communicator from cache.

    If the communicator is found in cache, return the communicator. If not,
    a communicator and a stream will be created and put in cache.

    TODO(Hao): this function is not thread-safe now.

    Args:
        comm_key (str): the key to query the communicator cache.
        device_list (List): a list of GPU devices of the current process
            that participates into the collective.

    Returns:
        communicator: the NCCL communicator corresponded to the devices.
    """
    if not comm_key:
        raise RuntimeError("Got empty communicator key.")
    self._used_gpu_indices.update(device_list)

    # TODO(Hao): lock the _dev_comm_map here.
    if comm_key in self._dev_comm_map:
        return self._dev_comm_map[comm_key]

    group_key = self._generate_group_key(comm_key)
    if self.rank == 0:
        # Rank 0 generates and publishes the NCCL unique id.
        nccl_uid = self._generate_nccl_uid(group_key)
    else:
        # Everyone else waits at the rendezvous and fetches it.
        rendezvous = Rendezvous(group_key)
        rendezvous.meet()
        nccl_uid = rendezvous.get_nccl_id()

    # Now create the communicators: one per local device, ranked so that
    # each (process, device) pair owns a unique slot in the actual world.
    n_devices = len(device_list)
    actual_world_size = n_devices * self.world_size
    comms = []
    streams = []
    events = []
    nccl_util.groupStart()
    for offset, device in enumerate(device_list):
        actual_rank = self.rank * n_devices + offset
        with nccl_util.Device(device):
            comms.append(
                nccl_util.create_nccl_communicator(actual_world_size, nccl_uid, actual_rank)
            )
            # request a stream from the pool
            # note the device_idx is absolute index.
            streams.append(get_stream_pool(device).get_stream())
            # TODO(Fu): double check the parameters
            events.append(cupy.cuda.Event())
    nccl_util.groupEnd()

    # TODO(Fu): lock
    self._dev_comm_map[comm_key] = comms
    self._dev_streams_map[comm_key] = streams
    self._dev_event_map[comm_key] = events
    return comms
def _init_once(self):
    """Initialize the stream pool only for once.

    Fills ``self._pool`` with ``NCCL_STREAM_POOL_SIZE`` streams on
    ``self.device_idx`` and sets ``self._init_flag``. When multistream
    is disabled, every slot aliases the null (default) stream.
    """
    # Hoisted out of the loop: the env flag is loop-invariant, and
    # logging it per-iteration spammed the debug log once per slot.
    use_multistream = ENV.NCCL_USE_MULTISTREAM.val
    if use_multistream:
        logger.debug("NCCL multistream enabled.")
    else:
        logger.debug("NCCL multistream disabled.")
    with nccl_util.Device(self.device_idx):
        for i in range(NCCL_STREAM_POOL_SIZE):
            # this is the only place where self._pool will be written.
            if use_multistream:
                self._pool[i] = cupy.cuda.Stream(null=False, non_blocking=False)
            else:
                self._pool[i] = cupy.cuda.Stream.null
    self._init_flag = True
def barrier(self, barrier_options=BarrierOptions()):
    """Blocks until all processes reach this barrier.

    Args:
        barrier_options: barrier options.

    Returns:
        None
    """
    # Prefer the GPUs this process has already used for collectives;
    # fall back to every visible GPU otherwise.
    if self._used_gpu_indices:
        devices = list(self._used_gpu_indices)
    else:
        devices = list(range(nccl_util.get_num_gpus()))

    # One tiny tensor per device; the allreduce over them is the barrier.
    barrier_tensors = []
    for dev in devices:
        with nccl_util.Device(dev):
            barrier_tensors.append(cupy.array([1]))
    self.allreduce(barrier_tensors)
def _flatten_for_scatter_gather(tensor_list, copy=False):
    """Flatten the tensor for gather/scatter operations.

    Args:
        tensor_list: the list of tensors to be scattered/gathered.
        copy: whether to copy the tensors in tensor_list into the buffer.

    Returns:
        The flattened tensor buffer.
    """
    if not tensor_list:
        raise RuntimeError("Received an empty list.")
    first = tensor_list[0]
    # note we need a cupy dtype here.
    cupy_dtype = nccl_util.get_cupy_tensor_dtype(first)
    # Buffer holds one slot per tensor, all shaped like the first one.
    buffer_shape = [len(tensor_list)] + nccl_util.get_tensor_shape(first)
    with nccl_util.Device(nccl_util.get_tensor_device(first)):
        buffer = cupy.empty(buffer_shape, dtype=cupy_dtype)
    if copy:
        for slot, tensor in enumerate(tensor_list):
            nccl_util.copy_tensor(buffer[slot], tensor)
    return buffer