def allreduce_grad(self, model):
    """All-reduce the gradients of *model* across every worker.

    Hierarchical three-phase scheme on float32 GPU buffers:
      1. intra-node NCCL ``reduce`` onto intra-rank 0,
      2. inter-node allreduce between node leaders (intra_rank == 0),
      3. intra-node NCCL ``bcast`` of the result back to local GPUs.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params(model)
    itemsize = 4  # assumes float32 gradients — TODO confirm
    n_elems_total = sum(param.grad.size for param in params)
    # Round up so the buffer splits evenly across inter-node ranks.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    _memory_utility.pack_params(params, itemsize, 'grad',
                                self.gpu_buffer_a)

    # Intra-node reduce: sum buffer_a from all local ranks into buffer_b
    # on intra-rank 0 (root argument is 0).
    self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                self.gpu_buffer_b.ptr(), n_elems_total,
                                nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                stream.ptr)

    # Inter-node allreduce: only node leaders participate.
    # NOTE(review): the final result is presumably left in buffer_b (the
    # subsequent bcast reads buffer_b) and the division by self.size is
    # presumably done inside inter_allreduce_gpu (self.size is passed in)
    # — confirm against _communication_utility.
    if self.intra_rank == 0:
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size, self.gpu_buffer_a,
            self.gpu_buffer_b, n_bytes_buffer, n_elems_per_node,
            n_bytes_per_node, stream)

    # Intra-node bcast: distribute the reduced gradients from intra-rank 0
    # to every local GPU.
    self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                               nccl.NCCL_FLOAT, 0, stream.ptr)
    _memory_utility.unpack_params(params, itemsize, 'grad',
                                  self.gpu_buffer_b)
def _allreduce_grad_async(self, model, stream):
    """All-reduce *model*'s gradients with NCCL on *stream*, then average.

    Gradients may be converted to ``self.allreduce_grad_dtype`` for the
    communication (pack/unpack helpers handle the conversion).  The sum is
    divided by the communicator size with a cached elementwise kernel; the
    averaged result lands in ``gpu_allreduce_buffer_a``, which is then
    unpacked back into the parameters.
    """
    self._init_comms()
    params = _memory_utility.extract_params(model)
    grad_dtype = _get_param_grad_dtype(params[0])
    # Communication dtype: explicit override if configured, else the
    # gradients' own dtype.
    if self.allreduce_grad_dtype is None:
        allreduce_grad_dtype = grad_dtype
    else:
        allreduce_grad_dtype = self.allreduce_grad_dtype
    n_elems = sum(param.grad.size for param in params)
    needs_sync = self._assign(grad_dtype, allreduce_grad_dtype, n_elems)
    # If buffer (re)allocation happened on the null stream, wait for it
    # before using the buffers on a non-default stream.
    if stream != chainer.cuda.Stream.null and needs_sync:
        chainer.cuda.Stream.null.synchronize()

    self._pack_params_to_buffer(params, grad_dtype, allreduce_grad_dtype,
                                n_elems, stream)
    self.nccl_comm.allReduce(self.gpu_allreduce_buffer_a.ptr(),
                             self.gpu_allreduce_buffer_b.ptr(), n_elems,
                             _get_nccl_type_id(allreduce_grad_dtype),
                             nccl.NCCL_SUM, stream.ptr)
    if self.div_by_size is None:
        # NOTE(review): the kernel is compiled once and cached; it bakes in
        # the dtype and size of the FIRST call.  If allreduce_grad_dtype can
        # change between calls (grad dtype changes while the override is
        # None), the stale kernel would be reused — confirm this cannot
        # happen in practice.
        self.div_by_size = chainer.cuda.cupy.ElementwiseKernel(
            '{} x'.format(allreduce_grad_dtype.name),
            '{} y'.format(allreduce_grad_dtype.name),
            'y = x*(1.0/{})'.format(self.size),
            'div_by_size')
    # Average: read the summed values from buffer_b, write into buffer_a.
    self.div_by_size(
        self.gpu_allreduce_buffer_b.array(n_elems,
                                          dtype=allreduce_grad_dtype),
        self.gpu_allreduce_buffer_a.array(n_elems,
                                          dtype=allreduce_grad_dtype),
        stream=stream)
    self._unpack_params_from_buffer(params, grad_dtype,
                                    allreduce_grad_dtype, n_elems, stream)
def allreduce_grad(self, model, stream=None):
    """All-reduce *model*'s gradients with NCCL and average by comm size.

    Packs all gradients into one float32 GPU buffer, performs a single
    NCCL allReduce, scales the result by ``1 / self.size`` and unpacks
    it back into the parameters.

    Args:
        model: Chainer model whose ``param.grad`` arrays are reduced.
        stream: CUDA stream for the NCCL call; defaults to the null
            stream.  When a non-null stream is given, the null stream is
            synchronized before the collective and *stream* afterwards.
    """
    self._init_comms()
    if stream is None:
        stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params(model)
    itemsize = 4  # assumes float32 gradients — TODO confirm
    n_elems = sum(param.grad.size for param in params)
    n_bytes = itemsize * n_elems
    self.gpu_buffer_a.assign(n_bytes)
    self.gpu_buffer_b.assign(n_bytes)
    _memory_utility.pack_params(params, itemsize, 'grad',
                                self.gpu_buffer_a)
    # Make sure packing done on the null stream is visible to `stream`.
    if stream != chainer.cuda.Stream.null:
        chainer.cuda.Stream.null.synchronize()
    self.nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                             self.gpu_buffer_b.ptr(), n_elems,
                             nccl.NCCL_FLOAT, nccl.NCCL_SUM, stream.ptr)
    if stream != chainer.cuda.Stream.null:
        stream.synchronize()
    # Average in place.  The previous code built a temporary GPU array
    # (`buffer * (1/size)`) and copied it back with `from_device`; scaling
    # the buffer view directly avoids the extra allocation and copy, and
    # matches the in-place pattern used by the other communicators.
    arr = self.gpu_buffer_b.array(n_elems)
    arr *= 1.0 / self.size
    _memory_utility.unpack_params(params, itemsize, 'grad',
                                  self.gpu_buffer_b)
def allreduce_grad(self, model):
    """All-reduce gradients: intra-node NCCL plus inter-node MPI staged
    through host buffers.

    The inter-node phase implements allreduce as
    alltoall → local sum (on GPU) → allgather, with every MPI transfer
    staged through ``cpu_buffer_a``/``cpu_buffer_b``.  The division by
    ``self.size`` happens in the GPU reduction step.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params(model)
    itemsize = 4  # assumes float32 gradients — TODO confirm
    n_elems_total = sum(param.grad.size for param in params)
    # Round up so the buffer splits evenly across inter-node ranks.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_elems_buffer = n_elems_per_node * self.inter_size
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    _memory_utility.pack_params(params, itemsize, 'grad',
                                self.gpu_buffer_a)

    # Intra-node reduce: sum local grads into buffer_b on intra-rank 0.
    self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                self.gpu_buffer_b.ptr(), n_elems_total,
                                nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                stream.ptr)

    # Inter-node allreduce: only node leaders take part.
    if self.intra_rank == 0:
        self.cpu_buffer_a.assign(n_bytes_buffer)
        self.cpu_buffer_b.assign(n_bytes_buffer)
        # Stage the node-local sums to the host for MPI.
        arr_b = self.gpu_buffer_b.array(n_elems_buffer)
        arr_b.data.copy_to_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)
        # Alltoall: each leader ends up holding one slice from every node.
        self.inter_mpi_comm.Alltoall(
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT],
            [self.cpu_buffer_a.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])

        # Reduction in GPU: sum the inter_size slices and average.
        arr_a = self.gpu_buffer_a.array(n_elems_buffer)
        arr_a.data.copy_from_host(self.cpu_buffer_a.ptr(), n_bytes_buffer)
        arr_a = arr_a.reshape(self.inter_size, n_elems_per_node)
        arr_a = arr_a.sum(axis=0)
        arr_a *= 1.0 / self.size
        # Only this leader's reduced slice (n_bytes_per_node) goes back.
        arr_a.data.copy_to_host(self.cpu_buffer_a.ptr(), n_bytes_per_node)
        # Allgather reassembles the full averaged buffer on every leader.
        self.inter_mpi_comm.Allgather(
            [self.cpu_buffer_a.buffer(n_bytes_per_node), mpi4py.MPI.FLOAT],
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        arr_b.data.copy_from_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)

    # Intra-node bcast: distribute the averaged grads from intra-rank 0.
    self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                               nccl.NCCL_FLOAT, 0, stream.ptr)
    _memory_utility.unpack_params(params, itemsize, 'grad',
                                  self.gpu_buffer_b)
def allreduce_grad(self, model):
    """Round-trip the gradients through the GPU buffer without reducing.

    Packs every gradient into ``gpu_buffer_a`` and immediately unpacks it
    back; no communication happens, so the gradients are unchanged.
    """
    grad_params = _memory_utility.extract_params(model)
    elem_size = 4
    total_bytes = elem_size * sum(p.grad.size for p in grad_params)
    self.gpu_buffer_a.assign(total_bytes)
    _memory_utility.pack_params(grad_params, elem_size, 'grad',
                                self.gpu_buffer_a)
    _memory_utility.unpack_params(grad_params, elem_size, 'grad',
                                  self.gpu_buffer_a)
def allreduce_grad(self, model):
    """All-reduce *model*'s gradients with a single flat MPI Allreduce.

    All gradients are packed into one float32 GPU buffer, summed across
    every process via MPI, averaged by the communicator size, and unpacked
    back into the parameters.
    """
    self._init_comms()
    grad_params = _memory_utility.extract_params(model)
    elem_size = 4
    total_elems = sum(p.grad.size for p in grad_params)
    total_bytes = total_elems * elem_size
    for gpu_buf in (self.gpu_buffer_a, self.gpu_buffer_b):
        gpu_buf.assign(total_bytes)
    _memory_utility.pack_params(grad_params, elem_size, 'grad',
                                self.gpu_buffer_a)
    self.mpi_comm.Allreduce(
        [self.gpu_buffer_a.buffer(total_bytes), mpi4py.MPI.FLOAT],
        [self.gpu_buffer_b.buffer(total_bytes), mpi4py.MPI.FLOAT])
    # Allreduce produced the sum; scale in place to get the mean.
    summed = self.gpu_buffer_b.array(total_elems)
    summed *= (1.0 / self.size)
    _memory_utility.unpack_params(grad_params, elem_size, 'grad',
                                  self.gpu_buffer_b)
def allreduce_grad(self, model):
    """All-reduce gradients over a 2-D (intra-node × inter-node) grid.

    Phases: intra-node NCCL reduce-scatter, inter-node allreduce on each
    scattered chunk, intra-node NCCL allgather to reassemble the result.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params(model)
    itemsize = 4  # assumes float32 gradients — TODO confirm
    n_elems_total = sum(param.grad.size for param in params)
    # 2d chunk: one share per process; 1d chunk: one share per local GPU
    # (inter_size 2d-chunks stacked together).
    n_elems_per_node_2d = int(math.ceil(n_elems_total / self.size))
    n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
    n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
    n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
    n_bytes_buffer = n_bytes_per_node_2d * self.size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    _memory_utility.pack_params(params, itemsize, 'grad',
                                self.gpu_buffer_a)

    # Intra-node reduce-scatter (1st dimension): each local rank receives
    # the sum of its own 1d-chunk in buffer_b.
    self.intra_nccl_comm.reduceScatter(self.gpu_buffer_a.ptr(),
                                       self.gpu_buffer_b.ptr(),
                                       n_elems_per_node_1d,
                                       nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                                       stream.ptr)

    # Inter-node allreduce (2nd dimension) over the scattered chunk.
    # NOTE(review): division by self.size is presumably performed inside
    # inter_allreduce_gpu (self.size is passed in) — confirm against
    # _communication_utility.
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size, self.gpu_buffer_a,
        self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
        n_bytes_per_node_2d, stream)

    # Intra-node allgather (1st dimension): reassemble the full buffer
    # into buffer_a on every local GPU.
    self.intra_nccl_comm.allGather(self.gpu_buffer_b.ptr(),
                                   self.gpu_buffer_a.ptr(),
                                   n_elems_per_node_1d, nccl.NCCL_FLOAT,
                                   stream.ptr)
    _memory_utility.unpack_params(params, itemsize, 'grad',
                                  self.gpu_buffer_a)
def allreduce_grad(self, model):
    """All-reduce *model*'s gradients with one intra-node NCCL allReduce.

    Gradients are packed into a float32 GPU buffer, summed with NCCL on
    the null stream, averaged by the communicator size, and unpacked back
    into the parameters.
    """
    self._init_comms()
    null_stream = chainer.cuda.Stream.null
    grad_params = _memory_utility.extract_params(model)
    elem_size = 4
    total_elems = sum(p.grad.size for p in grad_params)
    total_bytes = total_elems * elem_size
    for gpu_buf in (self.gpu_buffer_a, self.gpu_buffer_b):
        gpu_buf.assign(total_bytes)
    _memory_utility.pack_params(grad_params, elem_size, 'grad',
                                self.gpu_buffer_a)
    self.intra_nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                                   self.gpu_buffer_b.ptr(), total_elems,
                                   nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                                   null_stream.ptr)
    # allReduce produced the sum; scale in place to get the mean.
    summed = self.gpu_buffer_b.array(total_elems)
    summed *= (1.0 / self.size)
    _memory_utility.unpack_params(grad_params, elem_size, 'grad',
                                  self.gpu_buffer_b)
def allreduce_grad(self, model):
    """Naive all-reduce: one in-place MPI Allreduce per gradient array.

    Each parameter's gradient is summed across all processes individually
    and then divided by the communicator size.
    """
    grad_params = _memory_utility.extract_params(model)
    for p in grad_params:
        grad_buf = _memory_utility.array_to_buffer_object(p.grad)
        self.mpi_comm.Allreduce(mpi4py.MPI.IN_PLACE, grad_buf)
        # Average the summed gradient.
        p.grad /= self.size