def __init__(self, mpi_comm, batched_copy=False):
    super(NonCudaAwareCommunicator, self).__init__(mpi_comm)
    if not nccl._available:
        raise RuntimeError(
            'NCCL is not available. '
            'Please confirm that NCCL is enabled in CuPy.')
    if nccl.get_version() < 2302:
        warnings.warn('NCCL 2.2 and older versions are deprecated.',
                      DeprecationWarning)

    # We have to delay the initialization of communicators. This is because
    # NCCL's communicators use the current CUDA devices at the time of
    # initialization. Therefore, we have to initialize NCCL communicators
    # after users set the devices to use.
    self.inter_mpi_comm = None
    self.intra_nccl_comm = None

    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()
    self.cpu_buffer_a = _memory_utility.HostPinnedMemory()
    self.cpu_buffer_b = _memory_utility.HostPinnedMemory()

    self.batched_copy = batched_copy
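# A minimal sketch, not the library's implementation, of the staging pattern
# that the GPU/pinned-host buffer pairs above suggest: gradients are packed
# into one GPU buffer, copied to host memory (pinned in the real communicator),
# allreduced with plain MPI on the CPU, and copied back to the GPU.
# `cpu_allreduce_sketch` and its packing layout are hypothetical.
import cupy as cp
import numpy as np
from mpi4py import MPI


def cpu_allreduce_sketch(mpi_comm, gpu_grads):
    # Pack gradients into one contiguous GPU buffer (hypothetical layout).
    gpu_buf = cp.concatenate([g.ravel() for g in gpu_grads])
    # Stage through host memory; cp.asnumpy performs the device-to-host copy.
    cpu_send = cp.asnumpy(gpu_buf)
    cpu_recv = np.empty_like(cpu_send)
    # Plain MPI allreduce on the host buffers, then average over workers.
    mpi_comm.Allreduce(cpu_send, cpu_recv, op=MPI.SUM)
    cpu_recv /= mpi_comm.size
    # Copy the reduced values back to the GPU.
    return cp.asarray(cpu_recv)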
def __init__(self, mpi_comm, allreduce_grad_dtype=None):
    super(PureNcclCommunicator, self).__init__(mpi_comm, True)
    if nccl.get_version() < 2000:
        raise RuntimeError(
            'PureNcclCommunicator is only supported on NCCL 2.0+')
    self._init_ranks()

    self.inter_mpi_comm = None
    self.intra_mpi_comm = None
    self.intra_nccl_comm = None
    self.nccl_comm = None

    self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
    self.gpu_allreduce_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_allreduce_buffer_b = _memory_utility.DeviceMemory()

    if allreduce_grad_dtype is not None:
        self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
        if self.allreduce_grad_dtype.kind != 'f':
            raise ValueError(
                'allreduce_grad_dtype must be '
                'numpy.float16, numpy.float32, '
                'numpy.float64, or None.')
    else:
        self.allreduce_grad_dtype = None

    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.div_by_size = None
def __init__(self, mpi_comm, dynamic=False, debug=False):
    super(PureNCCLCommunicator, self).__init__(
        mpi_comm, True, dynamic, debug)
    if nccl.get_version() < 2000:
        raise RuntimeError(
            'PureNcclCommunicator is only supported on NCCL 2.0+')
    self._init_ranks()
def __init__(self, mpi_comm, allreduce_grad_dtype=None):
    super(PureNcclCommunicator, self).__init__(mpi_comm)
    if not nccl._available or nccl.get_version() < 2000:
        raise RuntimeError(
            'PureNcclCommunicator is only supported on NCCL 2.0+')

    # We have to delay the initialization of communicators. This is because
    # NCCL's communicators use the current CUDA devices at the time of
    # initialization. Therefore, we have to initialize NCCL communicators
    # after users set the devices to use.
    self.nccl_comm = None

    self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()

    if allreduce_grad_dtype is not None:
        self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
        if self.allreduce_grad_dtype.kind != 'f':
            raise ValueError(
                'allreduce_grad_dtype must be '
                'numpy.float16, numpy.float32, '
                'numpy.float64, or None.')
    else:
        self.allreduce_grad_dtype = None

    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.div_by_size = None
def __init__(self, mpi_comm):
    super(PureNcclCommunicator, self).__init__(mpi_comm)
    if not nccl._available:
        raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                           'but NCCL is not available.')
    if nccl.get_build_version() < 2000:
        raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                           'but found {}.'.format(nccl.get_build_version()))
    if nccl.get_version() < 2302:
        warnings.warn('NCCL 2.2 and older versions are deprecated.',
                      DeprecationWarning)

    # We have to delay the initialization of communicators. This is because
    # NCCL's communicators use the current CUDA devices at the time of
    # initialization. Therefore, we have to initialize NCCL communicators
    # after users set the devices to use.
    self.nccl_comm = None

    self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()

    with self.config_scope():
        self.allreduce_grad_dtype = None
    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.params_data = None
def __init__(self, mpi_comm, allreduce_grad_dtype=None):
    super(PureNcclCommunicator, self).__init__(mpi_comm)
    if not nccl._available or nccl.get_version() < 2000:
        raise RuntimeError(
            'PureNcclCommunicator is only supported on NCCL 2.0+')

    # We have to delay the initialization of communicators. This is because
    # NCCL's communicators use the current CUDA devices at the time of
    # initialization. Therefore, we have to initialize NCCL communicators
    # after users set the devices to use.
    self.nccl_comm = None

    self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()

    if allreduce_grad_dtype is not None:
        self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
        if self.allreduce_grad_dtype.kind != 'f':
            raise ValueError(
                'allreduce_grad_dtype must be '
                'numpy.float16, numpy.float32, '
                'numpy.float64, or None.')
    else:
        self.allreduce_grad_dtype = None

    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.div_by_size = None
def create_communicator(communicator_class, mpi_comm, use_gpu):
    use_nccl = (communicator_class == PureNcclCommunicator)

    if use_gpu and not use_nccl and nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    communicator = communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
def setup_gpu(self, device=None):
    if nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')
    self.comm = chainermn.create_communicator('pure_nccl')
    device = self.comm.intra_rank
    chainer.cuda.get_device_from_id(device).use()
    self.target = DynamicExampleModel()
    self.target.to_gpu()
    self.target.a.W.data[:] = self.comm.rank
    self.target.b.W.data[:] = self.comm.rank + 1
    self.target.a.W.grad[:] = 0
    self.target.b.W.grad[:] = 0
    self.actual_optimizer = chainer.GradientMethod()
    self.actual_optimizer.create_update_rule = mock.MagicMock
def __init__(self, mpi_comm):
    super(PureNcclCommunicator, self).__init__(mpi_comm, True)
    if nccl.get_version() < 2000:
        raise RuntimeError(
            'PureNcclCommunicator is only supported on NCCL 2.0+')
    self._init_ranks()

    self.inter_mpi_comm = None
    self.intra_mpi_comm = None
    self.intra_nccl_comm = None
    self.nccl_comm = None

    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device(communicator.intra_rank).use()

    return communicator
def setUp(self):
    self.mpi_comm = mpi4py.MPI.COMM_WORLD

    if not self.multi_node:
        ranks = _communication_utility.init_ranks(self.mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            raise nose.plugins.skip.SkipTest()

    if (hasattr(self, 'nccl1') and not self.nccl1
            and nccl.get_version() < 2000):
        raise nose.plugins.skip.SkipTest()

    self.communicator = self.communicator_class(self.mpi_comm)

    if hasattr(self.communicator, 'intra_rank'):
        chainer.cuda.get_device(self.communicator.intra_rank).use()
def init_comms(mpi_comm, intra_rank, intra_size, inter_rank, use_nccl=True):
    intra_mpi_comm = mpi_comm.Split(inter_rank, intra_rank)
    inter_mpi_comm = mpi_comm.Split(intra_rank, inter_rank)
    if use_nccl:
        from chainermn import nccl
        intra_nccl_comm_id = intra_mpi_comm.bcast(nccl.get_unique_id())
        intra_nccl_comm = nccl.NcclCommunicator(
            intra_size, intra_nccl_comm_id, intra_rank)
        if nccl.get_version() >= 2000:
            nccl_comm_id = mpi_comm.bcast(nccl.get_unique_id())
            nccl_comm = nccl.NcclCommunicator(
                mpi_comm.size, nccl_comm_id, mpi_comm.rank)
        else:
            nccl_comm = None
        return intra_mpi_comm, inter_mpi_comm, intra_nccl_comm, nccl_comm
    else:
        return intra_mpi_comm, inter_mpi_comm
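# A hedged sketch of one way the (intra_rank, intra_size, inter_rank)
# arguments to init_comms above could be derived: group MPI ranks by host
# name so that processes on the same node share the intra-node communicator.
# `guess_ranks` is a hypothetical helper; the real
# _communication_utility.init_ranks may compute these values differently.
from mpi4py import MPI


def guess_ranks(mpi_comm):
    host = MPI.Get_processor_name()
    # Every rank learns every other rank's host name.
    all_hosts = mpi_comm.allgather(host)
    nodes = sorted(set(all_hosts))
    inter_rank = nodes.index(host)                   # which node this rank is on
    same_node = [r for r, h in enumerate(all_hosts) if h == host]
    intra_rank = same_node.index(mpi_comm.rank)      # index within the node
    intra_size = len(same_node)
    return intra_rank, intra_size, inter_rank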
def __init__(self, mpi_comm):
    super(HierarchicalCommunicator, self).__init__(mpi_comm)
    if not nccl._available:
        raise RuntimeError(
            'NCCL is not available. '
            'Please confirm that NCCL is enabled in CuPy.')
    if nccl.get_version() < 2302:
        warnings.warn('NCCL 2.2 and older versions are deprecated.',
                      DeprecationWarning)

    # We have to delay the initialization of communicators. This is because
    # NCCL's communicators use the current CUDA devices at the time of
    # initialization. Therefore, we have to initialize NCCL communicators
    # after users set the devices to use.
    self.inter_mpi_comm = None
    self.intra_nccl_comm = None

    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()
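# A conceptual sketch, not the library code, of the hierarchical allreduce
# that the inter_mpi_comm / intra_nccl_comm members above enable:
# 1) reduce within each node, 2) allreduce across node leaders,
# 3) broadcast within each node. Plain MPI sub-communicators stand in here
# for the intra-node NCCL calls used by the real communicator.
import numpy as np
from mpi4py import MPI


def hierarchical_allreduce_sketch(mpi_comm, intra_rank, inter_rank, array):
    intra_comm = mpi_comm.Split(inter_rank, intra_rank)  # ranks on one node
    inter_comm = mpi_comm.Split(intra_rank, inter_rank)  # one leader per node

    node_sum = np.empty_like(array)
    intra_comm.Reduce(array, node_sum, op=MPI.SUM, root=0)  # intra-node reduce
    if intra_rank == 0:
        total = np.empty_like(array)
        inter_comm.Allreduce(node_sum, total, op=MPI.SUM)   # leaders allreduce
        node_sum = total
    intra_comm.Bcast(node_sum, root=0)                      # intra-node bcast
    return node_sum / mpi_comm.size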
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    if param.communicator_class is PureNcclCommunicator:
        communicator = param.communicator_class(
            mpi_comm,
            allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
def __init__(self, mpi_comm):
    super(SingleNodeCommunicator, self).__init__(mpi_comm)

    if self.inter_size != 1:
        raise ValueError('SingleNodeCommunicator cannot be used under '
                         'multi-node settings')
    if not nccl._available:
        raise RuntimeError('NCCL is not available. '
                           'Please confirm that NCCL is enabled in CuPy.')
    if nccl.get_version() < 2302:
        warnings.warn('NCCL 2.2 and older versions are deprecated.',
                      DeprecationWarning)

    # We have to delay the initialization of communicators. This is because
    # NCCL's communicators use the current CUDA devices at the time of
    # initialization. Therefore, we have to initialize NCCL communicators
    # after users set the devices to use.
    self.intra_nccl_comm = None

    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()
def __init__(self, mpi_comm, allreduce_grad_dtype=None,
             batched_copy=False):
    super(PureNcclCommunicator, self).__init__(mpi_comm)
    if not nccl._available:
        raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                           'but NCCL is not available.')
    if nccl.get_build_version() < 2000:
        raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                           'but found {}.'.format(nccl.get_build_version()))
    if nccl.get_version() < 2302:
        warnings.warn('NCCL 2.2 and older versions are deprecated.',
                      DeprecationWarning)

    # We have to delay the initialization of communicators. This is because
    # NCCL's communicators use the current CUDA devices at the time of
    # initialization. Therefore, we have to initialize NCCL communicators
    # after users set the devices to use.
    self.nccl_comm = None

    self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()

    if allreduce_grad_dtype is not None:
        self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
        if self.allreduce_grad_dtype.kind != 'f':
            raise ValueError(
                'allreduce_grad_dtype must be '
                'numpy.float16, numpy.float32, '
                'numpy.float64, or None.')
    else:
        self.allreduce_grad_dtype = None

    self.batched_copy = batched_copy
    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.params_data = None
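# A hedged usage sketch for the constructor above, going through the usual
# chainermn.create_communicator factory rather than instantiating the class
# directly. In the ChainerMN versions these snippets come from, passing
# allreduce_grad_dtype selects a reduced-precision allreduce; the device is
# chosen from intra_rank, as in the test helpers above. Treat the keyword
# arguments as assumptions about that version of the API.
import numpy as np
import chainer
import chainermn

comm = chainermn.create_communicator(
    'pure_nccl', allreduce_grad_dtype=np.float16)
chainer.cuda.get_device_from_id(comm.intra_rank).use()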
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    if param.allreduce_grad_dtype is not None:
        dtype = param.allreduce_grad_dtype
        communicator = param.communicator_class(
            mpi_comm, allreduce_grad_dtype=dtype)
    else:
        communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
def setUp(self):
    if nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')
    self.mpi_comm = mpi4py.MPI.COMM_WORLD