    def __init__(self, mpi_comm,
                 batched_copy=False):

        super(NonCudaAwareCommunicator, self).__init__(mpi_comm)
        if not nccl._available:
            raise RuntimeError(
                'NCCL is not available. '
                'Please confirm that NCCL is enabled in CuPy.'
            )
        if nccl.get_version() < 2302:
            warnings.warn('NCCL 2.2 and older versions are deprecated.',
                          DeprecationWarning)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.inter_mpi_comm = None
        self.intra_nccl_comm = None

        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()
        self.cpu_buffer_a = _memory_utility.HostPinnedMemory()
        self.cpu_buffer_b = _memory_utility.HostPinnedMemory()

        self.batched_copy = batched_copy
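
As the comment above notes, NCCL communicator creation is deferred, so user code is expected to select its CUDA device right after constructing the communicator and before the first collective call. A minimal usage sketch, assuming the communicator is obtained through chainermn.create_communicator and that this class is registered under the name 'non_cuda_aware':

# Sketch only: the registry name 'non_cuda_aware' is an assumption here.
import chainer
import chainermn

comm = chainermn.create_communicator('non_cuda_aware')
# Pick the GPU for this process before the first collective call, so the
# lazily created NCCL communicators bind to the intended device.
chainer.cuda.get_device_from_id(comm.intra_rank).use()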
Example #2
    def __init__(self, mpi_comm, allreduce_grad_dtype=None):
        super(PureNcclCommunicator, self).__init__(mpi_comm, True)
        if nccl.get_version() < 2000:
            raise RuntimeError(
                'PureNcclCommunicator is only supported on NCCL 2.0+')
        self._init_ranks()

        self.inter_mpi_comm = None
        self.intra_mpi_comm = None
        self.intra_nccl_comm = None
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_allreduce_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_allreduce_buffer_b = _memory_utility.DeviceMemory()

        if allreduce_grad_dtype is not None:
            self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
            if self.allreduce_grad_dtype.kind != 'f':
                raise ValueError('allreduce_grad_dtype must be '
                                 'numpy.float16, numpy.float32, '
                                 'numpy.float64, or None.')
        else:
            self.allreduce_grad_dtype = None
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.div_by_size = None
Example #3
    def __init__(self, mpi_comm, dynamic=False, debug=False):
        super(PureNCCLCommunicator, self).__init__(mpi_comm, True, dynamic,
                                                   debug)
        if nccl.get_version() < 2000:
            raise RuntimeError(
                'PureNcclCommunicator is only supported on NCCL 2.0+')
        self._init_ranks()
Example #4
    def __init__(self, mpi_comm, allreduce_grad_dtype=None):
        super(PureNcclCommunicator, self).__init__(mpi_comm)
        if not nccl._available or nccl.get_version() < 2000:
            raise RuntimeError(
                'PureNcclCommunicator is only supported on NCCL 2.0+')

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()

        if allreduce_grad_dtype is not None:
            self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
            if self.allreduce_grad_dtype.kind != 'f':
                raise ValueError('allreduce_grad_dtype must be '
                                 'numpy.float16, numpy.float32, '
                                 'numpy.float64, or None.')
        else:
            self.allreduce_grad_dtype = None
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.div_by_size = None
Example #5
    def __init__(self, mpi_comm):
        super(PureNcclCommunicator, self).__init__(mpi_comm)
        if not nccl._available:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but NCCL is not available.')
        if nccl.get_build_version() < 2000:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but found {}.'.format(
                                   nccl.get_build_version()))

        if nccl.get_version() < 2302:
            warnings.warn('NCCL 2.2 and older versions are deprecated.',
                          DeprecationWarning)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()

        with self.config_scope():
            self.allreduce_grad_dtype = None
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.params_data = None
Example #6
    def __init__(self, mpi_comm, allreduce_grad_dtype=None):
        super(PureNcclCommunicator, self).__init__(mpi_comm)
        if not nccl._available or nccl.get_version() < 2000:
            raise RuntimeError(
                'PureNcclCommunicator is only supported on NCCL 2.0+')

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()

        if allreduce_grad_dtype is not None:
            self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
            if self.allreduce_grad_dtype.kind != 'f':
                raise ValueError(
                    'allreduce_grad_dtype must be '
                    'numpy.float16, numpy.float32, '
                    'numpy.float64, or None.')
        else:
            self.allreduce_grad_dtype = None
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.div_by_size = None
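
The allreduce_grad_dtype argument validated above is what lets gradients be all-reduced in a narrower float type than the model parameters. A minimal sketch of passing it through the public factory, assuming a ChainerMN release whose create_communicator still forwards this keyword to PureNcclCommunicator:

# Sketch only: forwarding of allreduce_grad_dtype by create_communicator is
# assumed for this ChainerMN release.
import numpy as np
import chainermn

# All-reduce gradients in FP16 to halve communication volume; the casting
# kernels initialized above convert to and from the model's gradient dtype.
comm = chainermn.create_communicator('pure_nccl',
                                     allreduce_grad_dtype=np.float16)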
Example #7
def create_communicator(communicator_class, mpi_comm, use_gpu):
    if PureNcclCommunicator == communicator_class:
        use_nccl = True
    else:
        use_nccl = False

    if use_gpu and not use_nccl and nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')
    communicator = communicator_class(mpi_comm)
    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
Example #8
    def setup_gpu(self, device=None):
        if nccl.get_version() < 2000:
            pytest.skip('This test requires NCCL version >= 2.0')
        self.comm = chainermn.create_communicator('pure_nccl')
        device = self.comm.intra_rank
        chainer.cuda.get_device_from_id(device).use()
        self.target = DynamicExampleModel()
        self.target.to_gpu()
        self.target.a.W.data[:] = self.comm.rank
        self.target.b.W.data[:] = self.comm.rank + 1
        self.target.a.W.grad[:] = 0
        self.target.b.W.grad[:] = 0
        self.actual_optimizer = chainer.GradientMethod()
        self.actual_optimizer.create_update_rule = mock.MagicMock
Example #9
    def setup_gpu(self, device=None):
        if nccl.get_version() < 2000:
            pytest.skip('This test requires NCCL version >= 2.0')
        self.comm = chainermn.create_communicator('pure_nccl')
        device = self.comm.intra_rank
        chainer.cuda.get_device_from_id(device).use()
        self.target = DynamicExampleModel()
        self.target.to_gpu()
        self.target.a.W.data[:] = self.comm.rank
        self.target.b.W.data[:] = self.comm.rank + 1
        self.target.a.W.grad[:] = 0
        self.target.b.W.grad[:] = 0
        self.actual_optimizer = chainer.GradientMethod()
        self.actual_optimizer.create_update_rule = mock.MagicMock
Example #10
    def __init__(self, mpi_comm):
        super(PureNcclCommunicator, self).__init__(mpi_comm, True)
        if nccl.get_version() < 2000:
            raise RuntimeError(
                'PureNcclCommunicator is only supported on NCCL 2.0+')
        self._init_ranks()

        self.inter_mpi_comm = None
        self.intra_mpi_comm = None
        self.intra_nccl_comm = None
        self.nccl_comm = None

        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()
Example #11
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device(communicator.intra_rank).use()

    return communicator
Example #12
    def setUp(self):
        self.mpi_comm = mpi4py.MPI.COMM_WORLD

        if not self.multi_node:
            ranks = _communication_utility.init_ranks(self.mpi_comm)
            inter_size = ranks[4]
            if inter_size > 1:
                raise nose.plugins.skip.SkipTest()
        if hasattr(self, 'nccl1') and not self.nccl1 \
           and nccl.get_version() < 2000:
            raise nose.plugins.skip.SkipTest()

        self.communicator = self.communicator_class(self.mpi_comm)

        if hasattr(self.communicator, 'intra_rank'):
            chainer.cuda.get_device(self.communicator.intra_rank).use()
Example #13
def init_comms(mpi_comm, intra_rank, intra_size, inter_rank, use_nccl=True):
    intra_mpi_comm = mpi_comm.Split(inter_rank, intra_rank)
    inter_mpi_comm = mpi_comm.Split(intra_rank, inter_rank)
    if use_nccl:
        from chainermn import nccl
        intra_nccl_comm_id = intra_mpi_comm.bcast(nccl.get_unique_id())
        intra_nccl_comm = nccl.NcclCommunicator(
            intra_size, intra_nccl_comm_id, intra_rank)
        if nccl.get_version() >= 2000:
            nccl_comm_id = mpi_comm.bcast(nccl.get_unique_id())
            nccl_comm = nccl.NcclCommunicator(
                mpi_comm.size, nccl_comm_id, mpi_comm.rank)
        else:
            nccl_comm = None
        return intra_mpi_comm, inter_mpi_comm, intra_nccl_comm, nccl_comm
    else:
        return intra_mpi_comm, inter_mpi_comm
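
The arguments init_comms expects are the values the test helpers above read out of _communication_utility.init_ranks. A call sketch, assuming init_ranks returns [global_rank, intra_rank, intra_size, inter_rank, inter_size] (consistent with ranks[4] being used as inter_size above), that the import path chainermn.communicators._communication_utility is available, and that the init_comms defined above is in scope:

# Sketch only: the return layout of init_ranks and the import path are
# assumptions; the current CUDA device must already be selected, since the
# NCCL communicators are created against it inside init_comms.
import mpi4py.MPI
from chainermn.communicators import _communication_utility

mpi_comm = mpi4py.MPI.COMM_WORLD
ranks = _communication_utility.init_ranks(mpi_comm)
_, intra_rank, intra_size, inter_rank, _ = ranks

intra_mpi_comm, inter_mpi_comm, intra_nccl_comm, nccl_comm = init_comms(
    mpi_comm, intra_rank, intra_size, inter_rank, use_nccl=True)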
Example #14
    def __init__(self, mpi_comm):
        super(HierarchicalCommunicator, self).__init__(mpi_comm)
        if not nccl._available:
            raise RuntimeError(
                'NCCL is not available. '
                'Please confirm that NCCL is enabled in CuPy.'
            )
        if nccl.get_version() < 2302:
            warnings.warn('NCCL 2.2 and older versions are deprecated.',
                          DeprecationWarning)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.inter_mpi_comm = None
        self.intra_nccl_comm = None

        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()
Example #15
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    if param.communicator_class is PureNcclCommunicator:
        communicator = param.communicator_class(
            mpi_comm, allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
Example #16
    def __init__(self, mpi_comm):
        super(SingleNodeCommunicator, self).__init__(mpi_comm)

        if self.inter_size != 1:
            raise ValueError('SingleNodeCommunicator cannot be used under '
                             'multi-node settings')
        if not nccl._available:
            raise RuntimeError('NCCL is not available. '
                               'Please confirm that NCCL is enabled in CuPy.')
        if nccl.get_version() < 2302:
            warnings.warn('NCCL 2.2 and older versions are deprecated.',
                          DeprecationWarning)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.intra_nccl_comm = None

        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()
Example #17
    def __init__(self,
                 mpi_comm,
                 allreduce_grad_dtype=None,
                 batched_copy=False):
        super(PureNcclCommunicator, self).__init__(mpi_comm)
        if not nccl._available:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but NCCL is not available.')
        if nccl.get_build_version() < 2000:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but found {}.'.format(
                                   nccl.get_build_version()))

        if nccl.get_version() < 2302:
            warnings.warn('NCCL 2.2 and older versions are deprecated.',
                          DeprecationWarning)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()

        if allreduce_grad_dtype is not None:
            self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
            if self.allreduce_grad_dtype.kind != 'f':
                raise ValueError('allreduce_grad_dtype must be '
                                 'numpy.float16, numpy.float32, '
                                 'numpy.float64, or None.')
        else:
            self.allreduce_grad_dtype = None
        self.batched_copy = batched_copy
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.params_data = None
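
Once constructed, a communicator like this is normally handed to a multi-node optimizer, which drives the gradient all-reduce that the buffers and casting kernels above support. A usage sketch, assuming create_communicator in this release forwards the batched_copy flag to PureNcclCommunicator:

# Sketch only: forwarding of batched_copy through create_communicator is an
# assumption about this particular ChainerMN release.
import chainer
import chainermn

comm = chainermn.create_communicator('pure_nccl', batched_copy=True)
chainer.cuda.get_device_from_id(comm.intra_rank).use()

model = chainer.links.Linear(10, 10)  # any chainer.Link would do
model.to_gpu()

# The multi-node optimizer invokes the communicator's gradient all-reduce
# on every update, exercising the buffers set up in __init__ above.
optimizer = chainermn.create_multi_node_optimizer(
    chainer.optimizers.Adam(), comm)
optimizer.setup(model)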
Example #18
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    if param.allreduce_grad_dtype is not None:
        dtype = param.allreduce_grad_dtype
        communicator = \
            param.communicator_class(mpi_comm,
                                     allreduce_grad_dtype=dtype)
    else:
        communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
Example #19
    def setUp(self):
        if nccl.get_version() < 2000:
            pytest.skip('This test requires NCCL version >= 2.0')
        self.mpi_comm = mpi4py.MPI.COMM_WORLD
Example #20
    def setUp(self):
        if nccl.get_version() < 2000:
            pytest.skip('This test requires NCCL version >= 2.0')
        self.mpi_comm = mpi4py.MPI.COMM_WORLD
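
The NCCL >= 2.0 guard above is repeated verbatim across the setUp and setup_gpu fixtures. A small helper that could factor it out (a sketch only, not part of the original tests):

import pytest
from chainermn import nccl


def skip_unless_nccl2():
    # Same guard as the fixtures above: skip when NCCL is missing or too old.
    if not nccl._available or nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')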