def _check_gpu_tensors(tensors):
    """Check all tensors are distributed on different GPUs."""
    if not tensors or not isinstance(tensors, list):
        raise RuntimeError("'tensors' must be a nonempty list.")
    if len(tensors) > nccl_util.get_num_gpus():
        raise RuntimeError("Tensor list cannot be larger than the number "
                           "of available GPUs. Got {} > {}.".format(
                               len(tensors), nccl_util.get_num_gpus()))
    t0 = tensors[0]
    dt = nccl_util.get_nccl_tensor_dtype(t0)
    s = nccl_util.get_tensor_shape(t0)
    d = nccl_util.get_tensor_device(t0)
    for i, t in enumerate(tensors):
        if i == 0:
            continue
        # We need to check the following:
        # (1) tensor is cuda (already checked during API)
        # (2) tensor dtype
        # (3) tensor shape match
        # (4) each tensor is on a different GPU
        dtype = nccl_util.get_nccl_tensor_dtype(t)
        if dt != dtype:
            raise RuntimeError(
                "Tensors must have identical dtype. Got: '{}'.".format(dtype))
        shape = nccl_util.get_tensor_shape(t)
        if s != shape:
            raise RuntimeError(
                "Tensors must have identical shape. Got: '{}'.".format(shape))
        device = nccl_util.get_tensor_device(t)
        if device == d:
            raise RuntimeError("Tensors must be on distinct GPUs.")
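

# Hedged usage sketch (illustrative only, not part of the original module):
# assumes PyTorch is installed and at least two GPUs are visible. The helper
# below is hypothetical; it just demonstrates the contract _check_gpu_tensors
# enforces: one tensor per distinct GPU, with identical dtype and shape.
def _example_check_gpu_tensors():
    import torch  # assumed available in this sketch

    # One tensor per GPU: identical shape/dtype, distinct devices -> passes.
    good = [torch.ones(4, device="cuda:0"), torch.ones(4, device="cuda:1")]
    _check_gpu_tensors(good)

    # Two tensors on the same GPU -> raises RuntimeError.
    try:
        _check_gpu_tensors([good[0], torch.ones(4, device="cuda:0")])
    except RuntimeError as err:
        print("expected failure:", err)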
def barrier(self, barrier_options=BarrierOptions()):
    """Blocks until all processes reach this barrier.

    Args:
        barrier_options: barrier options.

    Returns:
        None
    """
    # Get the device list.
    if self._used_gpu_indices:
        devices = list(self._used_gpu_indices)
    else:
        devices = list(range(nccl_util.get_num_gpus()))
    barrier_tensors = [None] * len(devices)
    for i, d in enumerate(devices):
        with nccl_util.Device(d):
            barrier_tensors[i] = cupy.array([1])
    self.allreduce(barrier_tensors)
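

# Hedged sketch (illustrative only): how the barrier's per-device tensors can
# be built directly with CuPy. `cupy.cuda.Device` is CuPy's device context
# manager; `nccl_util.Device` in the method above is assumed to wrap something
# equivalent. Allreducing one small tensor per GPU forces every participating
# rank to synchronize, which is what implements the barrier.
def _example_barrier_tensors(devices):
    import cupy  # assumed available in this sketch

    tensors = []
    for d in devices:
        with cupy.cuda.Device(d):
            # One-element tensor allocated on GPU `d`.
            tensors.append(cupy.array([1]))
    return tensors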
def report_num_gpus(self):
    n_gpus = get_num_gpus()
    return n_gpus