Example #1
def _check_gpu_tensors(tensors):
    """Check all tensors are distributed on different GPUs."""
    if not tensors or not isinstance(tensors, list):
        raise RuntimeError("'tensors' must be a nonempty list.")
    if len(tensors) > nccl_util.get_num_gpus():
        raise RuntimeError("Tensor list cannot be larger than the number"
                           "of available GPUs. Got {} > {}.".format(
                               len(tensors), nccl_util.get_num_gpus()))
    t0 = tensors[0]
    dt = nccl_util.get_nccl_tensor_dtype(t0)
    s = nccl_util.get_tensor_shape(t0)
    d = nccl_util.get_tensor_device(t0)
    for i, t in enumerate(tensors):
        if i == 0:
            continue
        # We need to check the following:
        # (1) tensor is cuda (already checked during API)
        # (2) tensor dtype
        # (3) tensor shape match
        # (4) each tensor is on a different GPU
        dtype = nccl_util.get_nccl_tensor_dtype(t)
        if dt != dtype:
            raise RuntimeError(
                "Tensors must have identical dtype. Got: '{}'.".format(dtype))
        shape = nccl_util.get_tensor_shape(t)
        if s != shape:
            raise RuntimeError(
                "Tensor must have identical shape. Got: '{}'.".format(shape))
        device = nccl_util.get_tensor_device(t)
        if device == d:
            raise RuntimeError("Tensor must be on distinct GPUs.")
Example #2
    def allgather(self,
                  tensor_list,
                  tensor,
                  allgather_options=AllGatherOptions()):
        """Allgather tensors across the group into a list of  tensors.

        Args:
            tensor_list: the tensor list to store the results.
            tensor: the tensor to be allgather-ed across the group.
            allgather_options: allgather options.

        Returns:
            None
        """

        _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list)
        comm = self._get_nccl_communicator()
        stream = self._get_cuda_stream()

        dtype = nccl_util.get_nccl_tensor_dtype(tensor)
        send_ptr = nccl_util.get_tensor_ptr(tensor)
        n_elems = nccl_util.get_tensor_n_elements(tensor)
        flattened = _flatten_for_scatter_gather(tensor_list, copy=False)
        recv_ptr = nccl_util.get_tensor_ptr(flattened)
        comm.allGather(send_ptr, recv_ptr, n_elems, dtype, stream.ptr)
        for i, t in enumerate(tensor_list):
            nccl_util.copy_tensor(t, flattened[i])
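_flatten_for_scatter_gather is not shown here; judging from how it is used, it allocates one contiguous buffer shaped [len(tensor_list), *shape] on the first tensor's device so NCCL can read or write the whole gather in a single call. The following is only a hedged sketch of such a helper, assuming PyTorch tensors, not the library's actual implementation.

import torch

def _flatten_for_scatter_gather_sketch(tensor_list, copy=False):
    """Allocate a contiguous [len(tensor_list), *shape] buffer on t0's device."""
    t0 = tensor_list[0]
    buffer = torch.empty(
        (len(tensor_list),) + tuple(t0.shape), dtype=t0.dtype, device=t0.device)
    if copy:
        # Copy the inputs in when the flattened buffer is the send side
        # (e.g. reducescatter); for allgather it is only a receive target.
        for i, t in enumerate(tensor_list):
            buffer[i].copy_(t)
    return buffer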
Example #3
 def collective_fn(input_tensor, output_tensor, comm, stream):
     comm.reduce(nccl_util.get_tensor_ptr(input_tensor),
                 nccl_util.get_tensor_ptr(output_tensor),
                 nccl_util.get_tensor_n_elements(input_tensor),
                 nccl_util.get_nccl_tensor_dtype(input_tensor),
                 nccl_util.get_nccl_reduce_op(reduce_options.reduceOp),
                 root_rank, stream.ptr)
Example #4
    def reducescatter(self,
                      tensor,
                      tensor_list,
                      reducescatter_options=ReduceScatterOptions()):
        """Reducescatter a list of tensors across the group.

        Args:
            tensor: the output after reducescatter (could be unspecified).
            tensor_list: the list of tensors to be reduced and scattered.
            reducescatter_options: reducescatter options.

        Returns:
            None
        """
        _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list)

        comm = self._get_nccl_communicator()
        stream = self._get_cuda_stream()
        dtype = nccl_util.get_nccl_tensor_dtype(tensor_list[0])
        n_elems = nccl_util.get_tensor_n_elements(tensor_list[0])
        reduce_op = nccl_util.get_nccl_reduce_op(
            reducescatter_options.reduceOp)

        # get the send_ptr
        flattened = _flatten_for_scatter_gather(tensor_list, copy=True)
        send_ptr = nccl_util.get_tensor_ptr(flattened)
        recv_ptr = nccl_util.get_tensor_ptr(tensor)
        comm.reduceScatter(send_ptr, recv_ptr, n_elems, dtype, reduce_op,
                           stream.ptr)
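To make the data movement concrete, the single-process sketch below simulates reduce-scatter semantics with NumPy: every rank contributes one chunk per destination, chunk r is summed elementwise across ranks, and rank r keeps only that reduced chunk. NumPy and the variable names are assumptions for illustration; no NCCL call is involved.

import numpy as np

world_size = 3
# inputs[rank] plays the role of that rank's tensor_list: one chunk per destination.
inputs = [[np.full(2, rank * 10 + dst, dtype=np.float32) for dst in range(world_size)]
          for rank in range(world_size)]
# After reduce-scatter (sum), rank r holds chunk r summed across all ranks.
outputs = [sum(inputs[rank][r] for rank in range(world_size))
           for r in range(world_size)]
print(outputs[0])  # [30. 30.] -- chunk 0 from ranks 0, 1, 2 summed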
Example #5
 def collective_fn(input_tensor, output_tensor, comm, stream):
     comm.broadcast(
         nccl_util.get_tensor_ptr(input_tensor),
         nccl_util.get_tensor_ptr(output_tensor),
         nccl_util.get_tensor_n_elements(input_tensor),
         nccl_util.get_nccl_tensor_dtype(input_tensor), root_rank,
         stream.ptr)
Example #6
 def collective_fn(input_tensor, output_tensor, comm, stream):
     comm.reduceScatter(
         nccl_util.get_tensor_ptr(input_tensor),
         nccl_util.get_tensor_ptr(output_tensor),
         nccl_util.get_tensor_n_elements(output_tensor),
         nccl_util.get_nccl_tensor_dtype(output_tensor),
         nccl_util.get_nccl_reduce_op(reducescatter_options.reduceOp),
         stream.ptr)
Example #7
 def collective_fn(input_tensor, output_tensor, comm, stream):
     comm.allGather(
         nccl_util.get_tensor_ptr(input_tensor),
         nccl_util.get_tensor_ptr(output_tensor),
         nccl_util.get_tensor_n_elements(input_tensor),
         nccl_util.get_nccl_tensor_dtype(input_tensor),
         stream.ptr,
     )
Example #8
 def p2p_fn(tensor, comm, stream, peer):
     comm.recv(
         nccl_util.get_tensor_ptr(tensor),
         recv_options.n_elements if recv_options.n_elements > 0 else
         nccl_util.get_tensor_n_elements(tensor),
         nccl_util.get_nccl_tensor_dtype(tensor),
         peer,
         stream.ptr,
     )
Example #9
def _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list):
    """Check the compatibility between tensor input and tensor list inputs."""
    if not tensor_list:
        raise RuntimeError("Got empty list of tensors.")
    dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    shape = nccl_util.get_tensor_shape(tensor)
    for t in tensor_list:
        # check dtype
        dt = nccl_util.get_nccl_tensor_dtype(t)
        if dt != dtype:
            raise RuntimeError("All tensor operands to scatter/gather must "
                               "have the same dtype. Got '{}' and '{}'"
                               "".format(dt, dtype))
        # Note: CCL libraries typically only require that the tensors have the
        # same number of elements; here we are stricter and require an exact
        # shape match.
        if nccl_util.get_tensor_shape(t) != shape:
            raise RuntimeError("All tensor operands to scatter/gather must "
                               "have the same shape.")
Example #10
def _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists):
    """Check the compatibility between tensor input and tensor list input."""
    if not tensors or not isinstance(tensors, list):
        raise RuntimeError("The first argument 'tensors' expects a list of tensors.")
    if not tensor_lists or not isinstance(tensor_lists, list):
        raise RuntimeError(
            "The second argument 'tensor_lists' " "expects a list of tensor list."
        )
    dtype = nccl_util.get_nccl_tensor_dtype(tensors[0])
    shape = nccl_util.get_tensor_shape(tensors[0])
    for i, tensor_list in enumerate(tensor_lists):
        # check that all tensors in `tensors` match.
        dt = nccl_util.get_nccl_tensor_dtype(tensors[i])
        if dt != dtype:
            raise RuntimeError(
                "All tensor operands to scatter/gather must "
                "have the same dtype. Got '{}' and '{}'.".format(dt, dtype)
            )
        # Note: CCL libraries typically only require that the tensors have the
        # same number of elements; here we are stricter and require an exact
        # shape match.
        s = nccl_util.get_tensor_shape(tensors[i])
        if s != shape:
            raise RuntimeError(
                "All tensor operands to scatter/gather must "
                "have the same shape. Got '{}' and '{}'.".format(s, shape)
            )
        # check all tensors in `tensor_lists` match.
        for t in tensor_lists[i]:
            # check dtype
            dt = nccl_util.get_nccl_tensor_dtype(t)
            if dt != dtype:
                raise RuntimeError(
                    "All tensor operands to scatter/gather must "
                    "have the same dtype. Got '{}' and '{}'.".format(dt, dtype)
                )
            s = nccl_util.get_tensor_shape(t)
            if s != shape:
                raise RuntimeError(
                    "All tensor operands to scatter/gather must "
                    "have the same shape. Got '{}' and '{}'.".format(s, shape)
                )
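This variant validates the multi-GPU calling convention: tensors holds one tensor per local GPU, and tensor_lists[i] is the list paired with tensors[i], with one slot per participant in the group. Below is a hedged sketch of well-formed arguments, assuming PyTorch and two local GPUs; it is illustrative only, and nothing is sent through NCCL here.

import torch

world_size, num_local_gpus = 4, 2
tensors = [torch.zeros(8, device="cuda:{}".format(i))
           for i in range(num_local_gpus)]
# tensor_lists[i] pairs with tensors[i]; every entry shares its dtype and shape.
tensor_lists = [
    [torch.zeros(8, device="cuda:{}".format(i)) for _ in range(world_size)]
    for i in range(num_local_gpus)
]
# _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists)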
Example #11
    def broadcast(self, tensor, broadcast_options=BroadcastOptions()):
        """Broadcast tensor to all other processes following options.

        Args:
            tensor: the tensor to be broadcast.
            broadcast_options: broadcast options.

        Returns:
            None
        """
        comm = self._get_nccl_communicator()
        stream = self._get_cuda_stream()

        dtype = nccl_util.get_nccl_tensor_dtype(tensor)
        ptr = nccl_util.get_tensor_ptr(tensor)
        n_elems = nccl_util.get_tensor_n_elements(tensor)
        # in-place broadcast
        comm.broadcast(ptr, ptr, n_elems, dtype, broadcast_options.root_rank,
                       stream.ptr)
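Because the same pointer is passed as both the send and receive buffer, this is an in-place broadcast: the root rank's buffer is left as-is and every other rank's buffer is overwritten with the root's data. A single-process simulation of that outcome, with plain Python lists standing in for the per-rank GPU buffers:

buffers = {0: [1.0, 2.0, 3.0], 1: [0.0, 0.0, 0.0], 2: [0.0, 0.0, 0.0]}
root_rank = 0
for rank in buffers:
    if rank != root_rank:
        buffers[rank][:] = buffers[root_rank]  # non-root buffers take the root's data
print(buffers)  # every rank now holds [1.0, 2.0, 3.0]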
Example #12
    def reduce(self, tensor, reduce_options=ReduceOptions()):
        """Reduce tensor to a destination process following options.

        Args:
            tensor: the tensor to be reduced.
            reduce_options: reduce options.

        Returns:
            None
        """
        comm = self._get_nccl_communicator()
        stream = self._get_cuda_stream()

        dtype = nccl_util.get_nccl_tensor_dtype(tensor)
        ptr = nccl_util.get_tensor_ptr(tensor)
        n_elems = nccl_util.get_tensor_n_elements(tensor)
        reduce_op = nccl_util.get_nccl_reduce_op(reduce_options.reduceOp)

        # in-place reduce
        comm.reduce(ptr, ptr, n_elems, dtype, reduce_op,
                    reduce_options.root_rank, stream.ptr)
Example #13
    def allreduce(self, tensor, allreduce_options=AllReduceOptions()):
        """AllReduce the tensor across the collective group following options.

        Args:
            tensor: the tensor to be reduced; each tensor is located on a GPU.
            allreduce_options: allreduce options.

        Returns:
            None
        """
        # obtain the communicator
        comm = self._get_nccl_communicator()
        # obtain the stream: use the default stream for now
        # TODO(Hao): implement a simple stream manager here
        stream = self._get_cuda_stream()

        dtype = nccl_util.get_nccl_tensor_dtype(tensor)
        ptr = nccl_util.get_tensor_ptr(tensor)
        n_elems = nccl_util.get_tensor_n_elements(tensor)
        reduce_op = nccl_util.get_nccl_reduce_op(allreduce_options.reduceOp)

        # in-place allreduce
        comm.allReduce(ptr, ptr, n_elems, dtype, reduce_op, stream.ptr)
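The allreduce is likewise in-place: after the call, every rank's buffer holds the reduction (here, a sum) of all ranks' original buffers. A NumPy simulation of that outcome, again with no NCCL involved:

import numpy as np

buffers = [np.array([1.0, 2.0]), np.array([10.0, 20.0]), np.array([100.0, 200.0])]
reduced = sum(buffers)    # elementwise sum across "ranks"
for buf in buffers:
    buf[:] = reduced      # every rank's buffer is overwritten in place
print(buffers[1])         # [111. 222.]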
Example #14
 def p2p_fn(tensor, comm, stream, peer):
     comm.recv(nccl_util.get_tensor_ptr(tensor),
               nccl_util.get_tensor_n_elements(tensor),
               nccl_util.get_nccl_tensor_dtype(tensor), peer,
               stream.ptr)