def _init_ranks(self):
    # Delegate the rank layout computation to the shared utility and
    # cache the intra-/inter-node components on this communicator.
    ranks = _communication_utility.init_ranks(self.mpi_comm)
    # ranks[0] is the global rank; it must agree with the MPI communicator.
    assert ranks[0] == self.mpi_comm.rank
    self._intra_rank, self._intra_size = ranks[1], ranks[2]
    self._inter_rank, self._inter_size = ranks[3], ranks[4]
def _init_ranks(self):
    """Compute and cache this process's intra-/inter-node rank layout."""
    layout = _communication_utility.init_ranks(self.mpi_comm)
    # Sanity check: element 0 is the global rank reported by MPI.
    assert layout[0] == self.mpi_comm.rank
    self._intra_rank = layout[1]
    self._intra_size = layout[2]
    self._inter_rank = layout[3]
    self._inter_size = layout[4]
def create_communicator(param, use_gpu):
    """Build a communicator for the given test parameter and verify its
    set_config/get_config behavior, skipping unrunnable environments."""
    if not param.multi_node:
        # init_ranks(...)[4] is inter_size; >1 means a multi-node run.
        if _communication_utility.init_ranks(mpi_comm)[4] > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    communicator = param.communicator_class(mpi_comm)

    # A set value must round-trip through get_config unchanged.
    communicator.set_config('batched_copy', param.batched_copy)
    assert communicator.get_config('batched_copy') == param.batched_copy

    # Unknown config keys must be rejected.
    with pytest.raises(ValueError):
        communicator.set_config('blah blah blah')

    if param.communicator_class is PureNcclCommunicator:
        communicator.set_config('allreduce_grad_dtype',
                                param.allreduce_grad_dtype)
        assert (communicator.get_config('allreduce_grad_dtype')
                == param.allreduce_grad_dtype)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()
    return communicator
def test_deprecation_single():
    # Single-node-only test: skip when more than one node participates.
    inter_size = _communication_utility.init_ranks(mpi_comm)[4]
    if inter_size > 1:
        pytest.skip('This test is for single node only')
    # The 'single_node' communicator name is deprecated and must warn.
    with chainer.testing.assert_warns(DeprecationWarning):
        chainermn.create_communicator('single_node')
def test_deprecation_single():
    """Creating a 'single_node' communicator must emit DeprecationWarning."""
    ranks = _communication_utility.init_ranks(mpi_comm)
    if ranks[4] > 1:  # ranks[4] is inter_size (number of nodes)
        pytest.skip('This test is for single node only')
    with chainer.testing.assert_warns(DeprecationWarning):
        chainermn.create_communicator('single_node')
def setUp(self):
    """Create the communicator under test on COMM_WORLD, skipping
    single-node-only cases when run across multiple nodes."""
    self.mpi_comm = mpi4py.MPI.COMM_WORLD
    if not self.multi_node:
        ranks = _communication_utility.init_ranks(self.mpi_comm)
        inter_size = ranks[4]  # number of participating nodes
        if inter_size > 1:
            raise nose.plugins.skip.SkipTest()
    self.communicator = self.communicator_class(self.mpi_comm)
    if hasattr(self.communicator, 'intra_rank'):
        # chainer.cuda.get_device() is deprecated; use get_device_from_id
        # as the other helpers in this file already do.
        chainer.cuda.get_device_from_id(self.communicator.intra_rank).use()
def create_communicator(param, use_gpu):
    """Build a communicator for the given test parameter, skipping
    configurations the current environment cannot run."""
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]  # number of participating nodes
        if inter_size > 1:
            pytest.skip('This test is for single node only')
    # Check the build-time NCCL version, consistent with the other
    # create_communicator variants in this file (was nccl.get_version()).
    if use_gpu and not param.nccl1 and nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        # chainer.cuda.get_device() is deprecated; use get_device_from_id
        # as the sibling helpers in this file do.
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()
    return communicator
def create_communicator(param, use_gpu):
    """Instantiate the communicator under test, skipping environments
    that cannot run this parameter combination."""
    if not param.multi_node:
        # init_ranks(...)[4] is inter_size, i.e. the number of nodes.
        if _communication_utility.init_ranks(mpi_comm)[4] > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    if param.communicator_class is PureNcclCommunicator:
        # PureNccl accepts extra dtype/batched-copy options.
        communicator = param.communicator_class(
            mpi_comm,
            allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()
    return communicator
def create_communicator(param, use_gpu):
    """Create the communicator described by ``param``; skip when the
    node layout or NCCL version cannot support the test."""
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]  # how many nodes are participating
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    nccl_too_old = (use_gpu and not param.nccl1
                    and nccl.get_build_version() < 2000)
    if nccl_too_old:
        pytest.skip('This test requires NCCL version >= 2.0')

    comm_class = param.communicator_class
    if comm_class is PureNcclCommunicator:
        communicator = comm_class(
            mpi_comm,
            allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = comm_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()
    return communicator
def check_allreduce_grad_mixed_dtype(param, model, use_gpu):
    # Checks the actual allreduce communication is performed
    # in the correct data type (FP16 or FP32)
    comm_class = param.communicator_class

    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]  # number of participating nodes
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if comm_class is PureNcclCommunicator:
        communicator = comm_class(
            mpi_comm, allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = comm_class(mpi_comm)

    mpi_comm.barrier()

    # answer type: see the document of `create_communicator`
    global_dtype = param.global_dtype
    allreduce_dtype = param.allreduce_grad_dtype

    # assert test configuration.
    assert chainer.get_dtype() == global_dtype

    # Resolve the expected communication dtype: an explicit
    # allreduce_grad_dtype wins; otherwise fall back to the global dtype
    # (anything non-FP32 is expected to communicate in FP16).
    answer_dtype = None
    if allreduce_dtype == np.float16:
        answer_dtype = np.float16
    elif allreduce_dtype == np.float32:
        answer_dtype = np.float32
    else:
        if global_dtype == np.float32:
            answer_dtype = np.float32
        else:
            answer_dtype = np.float16

    if use_gpu:
        model.to_gpu()

    # Seed each parameter's gradient with a rank-dependent constant so
    # the allreduce average below is predictable.
    model.a.W.grad[:] = communicator.rank
    model.b.W.grad[:] = communicator.rank + 1
    model.c.b.grad[:] = communicator.rank + 2

    if isinstance(communicator, PureNcclCommunicator):
        communicator._init_comms()
        # Wrap the NCCL communicator so we can inspect the dtype actually
        # passed to allReduce without changing its behavior.
        with mock.patch.object(communicator, 'nccl_comm',
                               wraps=communicator.nccl_comm) as mc:
            answer_dtype = _communication_utility._get_nccl_type_id(
                answer_dtype)

            communicator.allreduce_grad(model)

            # dtype that was used in the actual communication,
            # which is nccl_comm.allReduce
            call_args = mc.allReduce.call_args[0]
            actual_dtype = call_args[3]
            assert answer_dtype == actual_dtype
    else:
        # For other MPI-based communicators,
        # all communication should happen in FP32 as of now, so
        # here we just check the results are correct for
        # 16-32 mixed models.
        communicator.allreduce_grad(model)
        # Average over ranks 0..size-1 of (rank + offset) is base + offset.
        base = (communicator.size - 1.0) / 2
        chainer.testing.assert_allclose(model.a.W.grad,
                                        (base + 0) * np.ones((3, 2)))
        chainer.testing.assert_allclose(model.b.W.grad,
                                        (base + 1) * np.ones((4, 3)))

    mpi_comm.barrier()
    destroy_communicator(communicator)
def check_allreduce_grad_mixed_dtype(param, model, use_gpu):
    # Checks the actual allreduce communication is performed
    # in the correct data type (FP16 or FP32)
    comm_class = param.communicator_class

    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]  # number of participating nodes
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if comm_class is PureNcclCommunicator:
        communicator = comm_class(
            mpi_comm, allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = comm_class(mpi_comm)

    mpi_comm.barrier()

    # answer type: see the document of `create_communicator`
    global_dtype = param.global_dtype
    allreduce_dtype = param.allreduce_grad_dtype

    # assert test configuration.
    assert chainer.get_dtype() == global_dtype

    # Expected communication dtype: an explicit allreduce_grad_dtype
    # takes precedence; otherwise derive it from the global dtype
    # (non-FP32 globals are expected to communicate in FP16).
    answer_dtype = None
    if allreduce_dtype == np.float16:
        answer_dtype = np.float16
    elif allreduce_dtype == np.float32:
        answer_dtype = np.float32
    else:
        if global_dtype == np.float32:
            answer_dtype = np.float32
        else:
            answer_dtype = np.float16

    if use_gpu:
        model.to_gpu()

    # Fill gradients with rank-dependent constants so the averaged
    # result after allreduce is predictable.
    model.a.W.grad[:] = communicator.rank
    model.b.W.grad[:] = communicator.rank + 1
    model.c.b.grad[:] = communicator.rank + 2

    if isinstance(communicator, PureNcclCommunicator):
        communicator._init_comms()
        # Spy on the underlying NCCL communicator to capture the dtype
        # argument of the actual allReduce call.
        with mock.patch.object(communicator, 'nccl_comm',
                               wraps=communicator.nccl_comm) as mc:
            answer_dtype = _communication_utility._get_nccl_type_id(
                answer_dtype)

            communicator.allreduce_grad(model)

            # dtype that was used in the actual communication,
            # which is nccl_comm.allReduce
            call_args = mc.allReduce.call_args[0]
            actual_dtype = call_args[3]
            assert answer_dtype == actual_dtype
    else:
        # For other MPI-based communicators,
        # all communication should happen in FP32 as of now, so
        # here we just check the results are correct for
        # 16-32 mixed models.
        communicator.allreduce_grad(model)
        # Mean of (rank + offset) over ranks 0..size-1 is base + offset.
        base = (communicator.size - 1.0) / 2
        chainer.testing.assert_allclose(model.a.W.grad,
                                        (base + 0) * np.ones((3, 2)))
        chainer.testing.assert_allclose(model.b.W.grad,
                                        (base + 1) * np.ones((4, 3)))

    mpi_comm.barrier()
    destroy_communicator(communicator)