def allgather(self,
              tensor_list,
              tensor,
              allgather_options=AllGatherOptions()):
    """Allgather tensors across the group into a list of tensors.

    Args:
        tensor_list: the tensor list to store the results.
        tensor: the tensor to be allgather-ed across the group.
        allgather_options: allgather options.

    Returns:
        None
    """
    _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list)
    comm = self._get_nccl_communicator()
    stream = self._get_cuda_stream()
    dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    send_ptr = nccl_util.get_tensor_ptr(tensor)
    n_elems = nccl_util.get_tensor_n_elements(tensor)
    # Gather into one contiguous buffer, then copy each per-rank chunk
    # back into the corresponding output tensor.
    flattened = _flatten_for_scatter_gather(tensor_list, copy=False)
    recv_ptr = nccl_util.get_tensor_ptr(flattened)
    comm.allGather(send_ptr, recv_ptr, n_elems, dtype, stream.ptr)
    for i, t in enumerate(tensor_list):
        nccl_util.copy_tensor(t, flattened[i])
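# A minimal usage sketch for allgather (illustrative only; `group` and `rank`
# are assumed names, not defined in this module). With a 2-rank group and
# cupy tensors, every rank ends up with both ranks' inputs, ordered by rank:
#
#   import cupy as cp
#   results = [cp.zeros((4,), dtype=cp.float32) for _ in range(2)]
#   group.allgather(results, cp.ones((4,), dtype=cp.float32) * rank)
#   # results[0] holds rank 0's tensor, results[1] holds rank 1's tensor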
def collective_fn(input_tensor, output_tensor, comm, stream):
    # `reduce_options` and `root_rank` are captured from the enclosing scope.
    comm.reduce(
        nccl_util.get_tensor_ptr(input_tensor),
        nccl_util.get_tensor_ptr(output_tensor),
        nccl_util.get_tensor_n_elements(input_tensor),
        nccl_util.get_nccl_tensor_dtype(input_tensor),
        nccl_util.get_nccl_reduce_op(reduce_options.reduceOp),
        root_rank, stream.ptr)
def reducescatter(self,
                  tensor,
                  tensor_list,
                  reducescatter_options=ReduceScatterOptions()):
    """Reducescatter a list of tensors across the group.

    Args:
        tensor: the output tensor after the reducescatter (could be unspecified).
        tensor_list: the list of tensors to be reduced and scattered.
        reducescatter_options: reducescatter options.

    Returns:
        None
    """
    _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list)
    comm = self._get_nccl_communicator()
    stream = self._get_cuda_stream()
    dtype = nccl_util.get_nccl_tensor_dtype(tensor_list[0])
    n_elems = nccl_util.get_tensor_n_elements(tensor_list[0])
    reduce_op = nccl_util.get_nccl_reduce_op(
        reducescatter_options.reduceOp)
    # Flatten the input list into one contiguous send buffer.
    flattened = _flatten_for_scatter_gather(tensor_list, copy=True)
    send_ptr = nccl_util.get_tensor_ptr(flattened)
    recv_ptr = nccl_util.get_tensor_ptr(tensor)
    comm.reduceScatter(send_ptr, recv_ptr, n_elems, dtype, reduce_op,
                       stream.ptr)
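# A minimal usage sketch for reducescatter (illustrative; `group` and `rank`
# are assumed names). Each rank supplies world_size input tensors; after the
# call, the output tensor holds the reduction of the rank-th chunk across all
# ranks:
#
#   import cupy as cp
#   inputs = [cp.ones((4,), dtype=cp.float32) for _ in range(2)]  # world_size == 2
#   out = cp.zeros((4,), dtype=cp.float32)
#   group.reducescatter(out, inputs)  # with SUM, out == 2 * inputs[rank]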
def collective_fn(input_tensor, output_tensor, comm, stream):
    # `root_rank` is captured from the enclosing scope.
    comm.broadcast(
        nccl_util.get_tensor_ptr(input_tensor),
        nccl_util.get_tensor_ptr(output_tensor),
        nccl_util.get_tensor_n_elements(input_tensor),
        nccl_util.get_nccl_tensor_dtype(input_tensor),
        root_rank, stream.ptr)
def collective_fn(input_tensor, output_tensor, comm, stream):
    # `reducescatter_options` is captured from the enclosing scope; the element
    # count and dtype come from the (smaller) output tensor.
    comm.reduceScatter(
        nccl_util.get_tensor_ptr(input_tensor),
        nccl_util.get_tensor_ptr(output_tensor),
        nccl_util.get_tensor_n_elements(output_tensor),
        nccl_util.get_nccl_tensor_dtype(output_tensor),
        nccl_util.get_nccl_reduce_op(reducescatter_options.reduceOp),
        stream.ptr)
def collective_fn(input_tensor, output_tensor, comm, stream):
    comm.allGather(
        nccl_util.get_tensor_ptr(input_tensor),
        nccl_util.get_tensor_ptr(output_tensor),
        nccl_util.get_tensor_n_elements(input_tensor),
        nccl_util.get_nccl_tensor_dtype(input_tensor),
        stream.ptr,
    )
def p2p_fn(tensor, comm, stream, peer):
    # `recv_options` is captured from the enclosing scope; receive either the
    # requested number of elements or, if unspecified, the whole tensor.
    comm.recv(
        nccl_util.get_tensor_ptr(tensor),
        recv_options.n_elements if recv_options.n_elements > 0 else
        nccl_util.get_tensor_n_elements(tensor),
        nccl_util.get_nccl_tensor_dtype(tensor),
        peer,
        stream.ptr,
    )
def broadcast(self, tensor, broadcast_options=BroadcastOptions()):
    """Broadcast tensor to all other processes following options.

    Args:
        tensor: the tensor to be broadcasted.
        broadcast_options: broadcast options.

    Returns:
        None
    """
    comm = self._get_nccl_communicator()
    stream = self._get_cuda_stream()
    dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    ptr = nccl_util.get_tensor_ptr(tensor)
    n_elems = nccl_util.get_tensor_n_elements(tensor)
    # in-place broadcast
    comm.broadcast(ptr, ptr, n_elems, dtype, broadcast_options.root_rank,
                   stream.ptr)
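# A minimal usage sketch for broadcast (illustrative; `group` and `rank` are
# assumed names). The root's tensor is copied in place into every other rank's
# tensor:
#
#   import cupy as cp
#   t = cp.ones((4,), dtype=cp.float32) * rank
#   opts = BroadcastOptions()
#   opts.root_rank = 0
#   group.broadcast(t, opts)  # on every rank, t now equals rank 0's tensor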
def reduce(self, tensor, reduce_options=ReduceOptions()):
    """Reduce tensor to a destination process following options.

    Args:
        tensor: the tensor to be reduced.
        reduce_options: reduce options.

    Returns:
        None
    """
    comm = self._get_nccl_communicator()
    stream = self._get_cuda_stream()
    dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    ptr = nccl_util.get_tensor_ptr(tensor)
    n_elems = nccl_util.get_tensor_n_elements(tensor)
    reduce_op = nccl_util.get_nccl_reduce_op(reduce_options.reduceOp)
    # in-place reduce
    comm.reduce(ptr, ptr, n_elems, dtype, reduce_op,
                reduce_options.root_rank, stream.ptr)
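# A minimal usage sketch for reduce (illustrative; `group` is an assumed name).
# The reduction is in place and only the root rank's tensor is guaranteed to
# hold the reduced result afterwards:
#
#   import cupy as cp
#   t = cp.ones((4,), dtype=cp.float32)
#   opts = ReduceOptions()
#   opts.root_rank = 0
#   group.reduce(t, opts)  # on rank 0, every element of t equals world_size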
def allreduce(self, tensor, allreduce_options=AllReduceOptions()):
    """AllReduce the tensor across the collective group following options.

    Args:
        tensor: the tensor to be reduced; it is expected to reside on a GPU.
        allreduce_options: allreduce options.

    Returns:
        None
    """
    # obtain the communicator
    comm = self._get_nccl_communicator()
    # obtain the stream: using default stream by now
    # TODO(Hao): implement a simple stream manager here
    stream = self._get_cuda_stream()
    dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    ptr = nccl_util.get_tensor_ptr(tensor)
    n_elems = nccl_util.get_tensor_n_elements(tensor)
    reduce_op = nccl_util.get_nccl_reduce_op(allreduce_options.reduceOp)
    # in-place allreduce
    comm.allReduce(ptr, ptr, n_elems, dtype, reduce_op, stream.ptr)
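# A minimal usage sketch for allreduce (illustrative; `group` is an assumed
# name for an instance of this class). The reduction happens in place, so the
# input tensor holds the result on every rank:
#
#   import cupy as cp
#   t = cp.ones((4,), dtype=cp.float32)
#   group.allreduce(t)  # with the default SUM op, every element of t equals world_size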
def p2p_fn(tensor, comm, stream, peer):
    comm.recv(
        nccl_util.get_tensor_ptr(tensor),
        nccl_util.get_tensor_n_elements(tensor),
        nccl_util.get_nccl_tensor_dtype(tensor),
        peer, stream.ptr)