def _point2point(self, tensors, p2p_fn, peer_rank: int, peer_gpu_idx: int):
    """A method to encapsulate all peer-to-peer calls (i.e., send/recv).

    Args:
        tensors: a list containing the tensor to send or receive.
        p2p_fn: the point-to-point function to call on each tensor.
        peer_rank (int): the rank of the peer process.
        peer_gpu_idx (int): the index of the GPU on the peer process.

    Returns:
        None
    """
    # Check send/recv availability.
    if nccl_util.get_nccl_runtime_version() < 2704:
        raise RuntimeError("P2P send/recv requires NCCL >= 2.7.4. "
                           "Got '{}'.".format(
                               nccl_util.get_nccl_runtime_version()))
    _check_gpu_tensors(tensors)

    # We currently only support single-device to single-device send/recv.
    assert len(tensors) == 1
    my_gpu_idx = nccl_util.get_tensor_device(tensors[0])
    comm_key = _get_comm_key_send_recv(self.rank, my_gpu_idx, peer_rank,
                                       peer_gpu_idx)
    comms = self._get_nccl_p2p_communicator(comm_key, my_gpu_idx, peer_rank,
                                            peer_gpu_idx)
    streams = self._dev_streams_map[comm_key]

    # TODO(Hao): sync streams and events
    self._sync_streams()

    # We have made sure that self.rank != peer_rank during the API check.
    peer_p2p_rank = 0 if self.rank > peer_rank else 1
    for i, tensor in enumerate(tensors):
        p2p_fn(tensors[i], comms[i], streams[i], peer_p2p_rank)
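# Illustrative sketch (not part of the original source): one plausible way a
# `send` wrapper could dispatch into `_point2point`, building a `p2p_fn`
# closure around the peer-to-peer send of the underlying NCCL communicator.
# The argument names (`dst_rank`, `dst_gpu_index`) and the helpers
# `get_tensor_ptr`, `get_tensor_n_elements`, and `get_nccl_tensor_dtype`
# are assumptions for illustration, not the library's confirmed API.
def send(self, tensors, dst_rank: int, dst_gpu_index: int = 0):
    """Send a tensor to a peer process (sketch only)."""

    def p2p_fn(tensor, comm, stream, peer):
        # `comm` is assumed to be a cupy.cuda.nccl.NcclCommunicator;
        # its send() takes (buf, count, datatype, peer, stream).
        comm.send(
            nccl_util.get_tensor_ptr(tensor),
            nccl_util.get_tensor_n_elements(tensor),
            nccl_util.get_nccl_tensor_dtype(tensor),
            peer,
            stream.ptr)

    self._point2point(tensors, p2p_fn, dst_rank, dst_gpu_index)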
def __init__(self, world_size, rank, group_name):
    """Init an NCCL collective group."""
    super(NCCLGroup, self).__init__(world_size, rank, group_name)

    # Communicator and stream caches.
    # TODO(Hao): we need a lock here...
    self._dev_comm_map = {}
    self._dev_streams_map = {}

    # Record the used GPU indices.
    self._used_gpu_indices = set()

    if nccl_util.get_nccl_build_version() < 2000:
        raise RuntimeError("NCCL in Ray requires NCCL >= 2.0.")
    if nccl_util.get_nccl_runtime_version() < 2704:
        logger.warning("NCCL send/recv calls require NCCL >= 2.7.4.")
def __init__(self, world_size, rank, group_name):
    """Init an NCCL collective group."""
    super(NCCLGroup, self).__init__(world_size, rank, group_name)
    self._nccl_uid = None

    # TODO(Hao): change this to be a cache.
    self._nccl_comm = None

    if nccl_util.get_nccl_build_version() < 2000:
        raise RuntimeError("NCCL in Ray requires NCCL >= 2.0.")
    # TODO(Hao): check version here.
    if nccl_util.get_nccl_runtime_version() < 2704:
        logger.warning("NCCL send/recv calls require NCCL >= 2.7.4.")

    self._rendezvous = Rendezvous(self.group_name)
    self._rendezvous.meet()

    # Set up the NCCL unique ID using the store.
    self._init_nccl_unique_id()

    # Set up a tensor for barrier calls.
    self._barrier_tensor = cupy.array([1])
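# Illustrative sketch (not part of the original source): the `_barrier_tensor`
# created above suggests a barrier built on top of an allreduce. Assuming the
# class exposes an `allreduce` method over a list of tensors, a minimal
# version could look like this:
def barrier(self):
    """Block until all processes in the group reach this call (sketch only)."""
    # Reducing a one-element tensor forces every rank to join the same
    # collective, which acts as a synchronization point.
    self.allreduce([self._barrier_tensor])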