def _check_gpu_tensors(tensors): """Check all tensors are distributed on different GPUs.""" if not tensors or not isinstance(tensors, list): raise RuntimeError("'tensors' must be a nonempty list.") if len(tensors) > nccl_util.get_num_gpus(): raise RuntimeError("Tensor list cannot be larger than the number" "of available GPUs. Got {} > {}.".format( len(tensors), nccl_util.get_num_gpus())) t0 = tensors[0] dt = nccl_util.get_nccl_tensor_dtype(t0) s = nccl_util.get_tensor_shape(t0) d = nccl_util.get_tensor_device(t0) for i, t in enumerate(tensors): if i == 0: continue # We need to check the following: # (1) tensor is cuda (already checked during API) # (2) tensor dtype # (3) tensor shape match # (4) each tensor is on a different GPU dtype = nccl_util.get_nccl_tensor_dtype(t) if dt != dtype: raise RuntimeError( "Tensors must have identical dtype. Got: '{}'.".format(dtype)) shape = nccl_util.get_tensor_shape(t) if s != shape: raise RuntimeError( "Tensor must have identical shape. Got: '{}'.".format(shape)) device = nccl_util.get_tensor_device(t) if device == d: raise RuntimeError("Tensor must be on distinct GPUs.")
def allgather(self,
              tensor_list,
              tensor,
              allgather_options=AllGatherOptions()):
    """Allgather `tensor` across the group and store the results.

    The collective writes into a single buffer obtained from
    `_flatten_for_scatter_gather(tensor_list, copy=False)`; afterwards the
    i-th entry of that buffer is copied into the i-th tensor of
    `tensor_list`.

    Args:
        tensor_list: the list of tensors that receives the results.
        tensor: the tensor to be allgather-ed across the group.
        allgather_options: allgather options.

    Returns:
        None
    """
    _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list)

    communicator = self._get_nccl_communicator()
    cuda_stream = self._get_cuda_stream()

    # Describe the send side.
    nccl_dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    src_ptr = nccl_util.get_tensor_ptr(tensor)
    count = nccl_util.get_tensor_n_elements(tensor)

    # Describe the receive side.
    gathered = _flatten_for_scatter_gather(tensor_list, copy=False)
    dst_ptr = nccl_util.get_tensor_ptr(gathered)

    communicator.allGather(src_ptr, dst_ptr, count, nccl_dtype,
                           cuda_stream.ptr)

    # Hand each gathered entry back to the caller-provided tensors.
    for index, target in enumerate(tensor_list):
        nccl_util.copy_tensor(target, gathered[index])
def collective_fn(input_tensor, output_tensor, comm, stream):
    """Issue `comm.reduce` from `input_tensor` into `output_tensor`.

    The element count and dtype are taken from `input_tensor`.
    `reduce_options` and `root_rank` are free variables resolved from the
    surrounding scope.
    """
    send_ptr = nccl_util.get_tensor_ptr(input_tensor)
    recv_ptr = nccl_util.get_tensor_ptr(output_tensor)
    count = nccl_util.get_tensor_n_elements(input_tensor)
    nccl_dtype = nccl_util.get_nccl_tensor_dtype(input_tensor)
    nccl_op = nccl_util.get_nccl_reduce_op(reduce_options.reduceOp)
    comm.reduce(send_ptr, recv_ptr, count, nccl_dtype, nccl_op, root_rank,
                stream.ptr)
def reducescatter(self,
                  tensor,
                  tensor_list,
                  reducescatter_options=ReduceScatterOptions()):
    """Reduce-scatter `tensor_list` across the group into `tensor`.

    The send buffer is obtained from
    `_flatten_for_scatter_gather(tensor_list, copy=True)`; the element count
    and dtype passed to the communicator come from `tensor_list[0]`.

    Args:
        tensor: the output after reducescatter (could be unspecified).
        tensor_list: the list of tensors to be reduced and scattered.
        reducescatter_options: reducescatter options; its `reduceOp` selects
            the reduction.

    Returns:
        None
    """
    _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list)

    communicator = self._get_nccl_communicator()
    cuda_stream = self._get_cuda_stream()

    nccl_dtype = nccl_util.get_nccl_tensor_dtype(tensor_list[0])
    count = nccl_util.get_tensor_n_elements(tensor_list[0])
    nccl_op = nccl_util.get_nccl_reduce_op(reducescatter_options.reduceOp)

    # Build the send buffer, then resolve both raw pointers.
    send_buffer = _flatten_for_scatter_gather(tensor_list, copy=True)
    src_ptr = nccl_util.get_tensor_ptr(send_buffer)
    dst_ptr = nccl_util.get_tensor_ptr(tensor)

    communicator.reduceScatter(src_ptr, dst_ptr, count, nccl_dtype, nccl_op,
                               cuda_stream.ptr)
def collective_fn(input_tensor, output_tensor, comm, stream):
    """Issue `comm.broadcast` from `input_tensor` into `output_tensor`.

    The element count and dtype are taken from `input_tensor`. `root_rank`
    is a free variable resolved from the surrounding scope.
    """
    send_ptr = nccl_util.get_tensor_ptr(input_tensor)
    recv_ptr = nccl_util.get_tensor_ptr(output_tensor)
    count = nccl_util.get_tensor_n_elements(input_tensor)
    nccl_dtype = nccl_util.get_nccl_tensor_dtype(input_tensor)
    comm.broadcast(send_ptr, recv_ptr, count, nccl_dtype, root_rank,
                   stream.ptr)
def collective_fn(input_tensor, output_tensor, comm, stream):
    """Issue `comm.reduceScatter` from `input_tensor` into `output_tensor`.

    Unlike the send-sized collectives, the element count and dtype here are
    taken from `output_tensor`. `reducescatter_options` is a free variable
    resolved from the surrounding scope.
    """
    send_ptr = nccl_util.get_tensor_ptr(input_tensor)
    recv_ptr = nccl_util.get_tensor_ptr(output_tensor)
    recv_count = nccl_util.get_tensor_n_elements(output_tensor)
    nccl_dtype = nccl_util.get_nccl_tensor_dtype(output_tensor)
    nccl_op = nccl_util.get_nccl_reduce_op(reducescatter_options.reduceOp)
    comm.reduceScatter(send_ptr, recv_ptr, recv_count, nccl_dtype, nccl_op,
                       stream.ptr)
def collective_fn(input_tensor, output_tensor, comm, stream):
    """Issue `comm.allGather` from `input_tensor` into `output_tensor`.

    The element count and dtype are taken from `input_tensor`.
    """
    send_ptr = nccl_util.get_tensor_ptr(input_tensor)
    recv_ptr = nccl_util.get_tensor_ptr(output_tensor)
    send_count = nccl_util.get_tensor_n_elements(input_tensor)
    nccl_dtype = nccl_util.get_nccl_tensor_dtype(input_tensor)
    comm.allGather(send_ptr, recv_ptr, send_count, nccl_dtype, stream.ptr)
def p2p_fn(tensor, comm, stream, peer):
    """Issue `comm.recv` from `peer` into `tensor`.

    A positive `recv_options.n_elements` is used as the element count;
    otherwise the count falls back to the number of elements of `tensor`.
    `recv_options` is a free variable resolved from the surrounding scope.
    """
    recv_ptr = nccl_util.get_tensor_ptr(tensor)
    if recv_options.n_elements > 0:
        count = recv_options.n_elements
    else:
        count = nccl_util.get_tensor_n_elements(tensor)
    nccl_dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    comm.recv(recv_ptr, count, nccl_dtype, peer, stream.ptr)
def _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list): """Check the compatibility between tensor input and tensor list inputs.""" if not tensor_list: raise RuntimeError("Got empty list of tensors.") dtype = nccl_util.get_nccl_tensor_dtype(tensor) shape = nccl_util.get_tensor_shape(tensor) for t in tensor_list: # check dtype dt = nccl_util.get_nccl_tensor_dtype(t) if dt != dtype: raise RuntimeError("All tensor operands to scatter/gather must " "have the same dtype. Got '{}' and '{}'" "".format(dt, dtype)) # Note: typically CCL libraries only requires they have the same # number of elements; # Here we make it more strict -- we require exact shape match. if nccl_util.get_tensor_shape(t) != shape: raise RuntimeError("All tensor operands to scatter/gather must " "have the same shape.")
def _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists): """Check the compatibility between tensor input and tensor list input.""" if not tensors or not isinstance(tensors, list): raise RuntimeError("The first argument 'tensors' expects a list of tensors.") if not tensor_lists or not isinstance(tensor_lists, list): raise RuntimeError( "The second argument 'tensor_lists' " "expects a list of tensor list." ) dtype = nccl_util.get_nccl_tensor_dtype(tensors[0]) shape = nccl_util.get_tensor_shape(tensors[0]) for i, tensor_list in enumerate(tensor_lists): # check all tensor in `tensors` match. dt = nccl_util.get_nccl_tensor_dtype(tensors[i]) if dt != dtype: raise RuntimeError( "All tensor operands to scatter/gather must " "have the same dtype. Got '{}' and '{}'.".format(dt, dtype) ) # Note: typically CCL libraries only requires they have the same # number of elements; Here we make it more strict -- we require # exact shape match. s = nccl_util.get_tensor_shape(tensors[i]) if s != shape: raise RuntimeError( "All tensor operands to scatter/gather must " "have the same shape. Got '{}' and '{}'.".format(s, shape) ) # check all tensors in `tensor_lists` match. for t in tensor_lists[i]: # check dtype dt = nccl_util.get_nccl_tensor_dtype(t) if dt != dtype: raise RuntimeError( "All tensor operands to scatter/gather must " "have the same dtype. Got '{}' and '{}'.".format(dt, dtype) ) s = nccl_util.get_tensor_shape(t) if s != shape: raise RuntimeError( "All tensor operands to scatter/gather must " "have the same shape. Got '{}' and '{}'.".format(s, shape) )
def broadcast(self, tensor, broadcast_options=BroadcastOptions()):
    """Broadcast `tensor` in place across the group.

    The same pointer is passed as both the send and the receive buffer.

    Args:
        tensor: the tensor to be broadcasted.
        broadcast_options: broadcast options; its `root_rank` is forwarded
            to the communicator.

    Returns:
        None
    """
    communicator = self._get_nccl_communicator()
    cuda_stream = self._get_cuda_stream()

    nccl_dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    buffer_ptr = nccl_util.get_tensor_ptr(tensor)
    count = nccl_util.get_tensor_n_elements(tensor)

    # Send and receive buffers are the same: the broadcast is in place.
    communicator.broadcast(
        buffer_ptr,
        buffer_ptr,
        count,
        nccl_dtype,
        broadcast_options.root_rank,
        cuda_stream.ptr,
    )
def reduce(self, tensor, reduce_options=ReduceOptions()):
    """Reduce `tensor` in place to a destination process.

    The same pointer is passed as both the send and the receive buffer.

    Args:
        tensor: the tensor to be reduced.
        reduce_options: reduce options; its `reduceOp` selects the reduction
            and its `root_rank` is forwarded to the communicator.

    Returns:
        None
    """
    communicator = self._get_nccl_communicator()
    cuda_stream = self._get_cuda_stream()

    nccl_dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    buffer_ptr = nccl_util.get_tensor_ptr(tensor)
    count = nccl_util.get_tensor_n_elements(tensor)
    nccl_op = nccl_util.get_nccl_reduce_op(reduce_options.reduceOp)

    # Send and receive buffers are the same: the reduce is in place.
    communicator.reduce(
        buffer_ptr,
        buffer_ptr,
        count,
        nccl_dtype,
        nccl_op,
        reduce_options.root_rank,
        cuda_stream.ptr,
    )
def allreduce(self, tensor, allreduce_options=AllReduceOptions()):
    """AllReduce `tensor` in place across the collective group.

    The same pointer is passed as both the send and the receive buffer.

    Args:
        tensor: the tensor to be reduced, each tensor locates on a GPU.
        allreduce_options: allreduce options; its `reduceOp` selects the
            reduction.

    Returns:
        None
    """
    # Communicator for this group.
    communicator = self._get_nccl_communicator()

    # Stream to launch on: using default stream by now.
    # TODO(Hao): implement a simple stream manager here
    cuda_stream = self._get_cuda_stream()

    nccl_dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    buffer_ptr = nccl_util.get_tensor_ptr(tensor)
    count = nccl_util.get_tensor_n_elements(tensor)
    nccl_op = nccl_util.get_nccl_reduce_op(allreduce_options.reduceOp)

    # Send and receive buffers are the same: the allreduce is in place.
    communicator.allReduce(
        buffer_ptr,
        buffer_ptr,
        count,
        nccl_dtype,
        nccl_op,
        cuda_stream.ptr,
    )
def p2p_fn(tensor, comm, stream, peer):
    """Issue `comm.recv` from `peer` into `tensor`.

    The element count and dtype are taken from `tensor` itself.
    """
    recv_ptr = nccl_util.get_tensor_ptr(tensor)
    count = nccl_util.get_tensor_n_elements(tensor)
    nccl_dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    comm.recv(recv_ptr, count, nccl_dtype, peer, stream.ptr)