def collective_fn(input_tensor, output_tensor, comm, stream):
    # `reduce_options` and `root_rank` are free variables captured from the
    # enclosing reduce() method.
    comm.reduce(
        nccl_util.get_tensor_ptr(input_tensor),
        nccl_util.get_tensor_ptr(output_tensor),
        nccl_util.get_tensor_n_elements(input_tensor),
        nccl_util.get_nccl_tensor_dtype(input_tensor),
        nccl_util.get_nccl_reduce_op(reduce_options.reduceOp),
        root_rank,
        stream.ptr)
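# A minimal sketch (not part of the snippet above) of how this closure is presumably
# driven: the enclosing reduce() method captures `reduce_options` and `root_rank`,
# then hands the closure to a dispatch helper along with the same tensor as both
# input and output for an in-place reduce. The helper name `_collective` and its
# signature are assumptions made for illustration only.
def reduce(self, tensor, reduce_options=ReduceOptions()):
    root_rank = reduce_options.root_rank

    def collective_fn(input_tensor, output_tensor, comm, stream):
        ...  # NCCL reduce call as shown above

    self._collective(tensor, tensor, collective_fn)  # assumed dispatcher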
def reducescatter(self,
                  tensor,
                  tensor_list,
                  reducescatter_options=ReduceScatterOptions()):
    """Reducescatter a list of tensors across the group.

    Args:
        tensor: the output tensor after reducescatter (could be unspecified).
        tensor_list: the list of tensors to be reduced and scattered.
        reducescatter_options: reducescatter options.

    Returns:
        None
    """
    _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list)
    comm = self._get_nccl_communicator()
    stream = self._get_cuda_stream()
    dtype = nccl_util.get_nccl_tensor_dtype(tensor_list[0])
    n_elems = nccl_util.get_tensor_n_elements(tensor_list[0])
    reduce_op = nccl_util.get_nccl_reduce_op(
        reducescatter_options.reduceOp)

    # Flatten the tensor list into a contiguous send buffer and get the send_ptr.
    flattened = _flatten_for_scatter_gather(tensor_list, copy=True)
    send_ptr = nccl_util.get_tensor_ptr(flattened)
    recv_ptr = nccl_util.get_tensor_ptr(tensor)
    comm.reduceScatter(send_ptr, recv_ptr, n_elems, dtype, reduce_op,
                       stream.ptr)
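# Usage sketch for the method above. Assumptions (not stated in the snippet):
# `group` is an already-initialized instance of this NCCL group class with
# world_size participants, and torch is the tensor backend.
import torch

world_size = 2
# One input shard per participant, plus one output buffer of shard size.
tensor_list = [torch.ones(4, device="cuda") for _ in range(world_size)]
output = torch.zeros(4, device="cuda")
group.reducescatter(output, tensor_list)
# `output` now holds the reduced (by default, summed) shard assigned to this rank.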
def collective_fn(input_tensor, output_tensor, comm, stream):
    comm.reduceScatter(
        nccl_util.get_tensor_ptr(input_tensor),
        nccl_util.get_tensor_ptr(output_tensor),
        nccl_util.get_tensor_n_elements(output_tensor),
        nccl_util.get_nccl_tensor_dtype(output_tensor),
        nccl_util.get_nccl_reduce_op(reducescatter_options.reduceOp),
        stream.ptr)
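# Shape relationship this closure relies on, illustrated with assumed torch tensors:
# the flattened input buffer holds world_size contiguous shards, while each rank
# receives exactly one reduced shard, so the element count and dtype passed to
# reduceScatter are taken from the *output* tensor.
import torch

world_size = 4
per_rank_elems = 1024
input_tensor = torch.empty(world_size * per_rank_elems, device="cuda")  # flattened send buffer
output_tensor = torch.empty(per_rank_elems, device="cuda")              # per-rank recv buffer
assert input_tensor.numel() == world_size * output_tensor.numel()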
def reduce(self, tensor, reduce_options=ReduceOptions()):
    """Reduce tensor to a destination process following options.

    Args:
        tensor: the tensor to be reduced.
        reduce_options: reduce options.

    Returns:
        None
    """
    comm = self._get_nccl_communicator()
    stream = self._get_cuda_stream()
    dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    ptr = nccl_util.get_tensor_ptr(tensor)
    n_elems = nccl_util.get_tensor_n_elements(tensor)
    reduce_op = nccl_util.get_nccl_reduce_op(reduce_options.reduceOp)

    # in-place reduce
    comm.reduce(ptr, ptr, n_elems, dtype, reduce_op,
                reduce_options.root_rank, stream.ptr)
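# Usage sketch for reduce(). Assumptions: `group` is an initialized instance of this
# group class, ReduceOptions exposes the `root_rank` field used above, and torch is
# the tensor backend.
import torch

opts = ReduceOptions()
opts.root_rank = 0                 # rank that receives the reduced result
t = torch.ones(16, device="cuda")
group.reduce(t, opts)
# Since the call is in-place, only the root rank's `t` is overwritten with the
# reduction; NCCL leaves the buffers on non-root ranks untouched.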
def allreduce(self, tensor, allreduce_options=AllReduceOptions()):
    """AllReduce the tensor across the collective group following options.

    Args:
        tensor: the tensor to be reduced; it should reside on a GPU.
        allreduce_options: allreduce options.

    Returns:
        None
    """
    # obtain the communicator
    comm = self._get_nccl_communicator()
    # obtain the stream: using default stream by now
    # TODO(Hao): implement a simple stream manager here
    stream = self._get_cuda_stream()
    dtype = nccl_util.get_nccl_tensor_dtype(tensor)
    ptr = nccl_util.get_tensor_ptr(tensor)
    n_elems = nccl_util.get_tensor_n_elements(tensor)
    reduce_op = nccl_util.get_nccl_reduce_op(allreduce_options.reduceOp)

    # in-place allreduce
    comm.allReduce(ptr, ptr, n_elems, dtype, reduce_op, stream.ptr)
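# Usage sketch for allreduce(). Assumptions: `group` is an initialized instance of
# this group class and torch is the tensor backend; values are illustrative only.
import torch

t = torch.ones(16, device="cuda")
group.allreduce(t)
# In-place: after the call every rank's `t` holds the element-wise sum (the default
# reduce op) of all participants' tensors.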