def allreduce_grad(self, model):
    """Allreduce the gradients of ``model`` over every process.

    Gradients are packed into one flat GPU buffer, summed inside each
    node with NCCL, exchanged between node-local roots over MPI, then
    broadcast back inside each node and unpacked into the parameters.
    """
    self._init_comms()
    null_stream = chainer.cuda.Stream.null

    target_params = _memory_utility.extract_params(model)
    elem_size = 4  # gradients travel as 32-bit floats (NCCL_FLOAT)
    total_elems = sum(p.grad.size for p in target_params)
    elems_per_node = int(math.ceil(total_elems / self.inter_size))
    bytes_per_node = elems_per_node * elem_size
    buffer_bytes = bytes_per_node * self.inter_size

    self.gpu_buffer_a.assign(buffer_bytes)
    self.gpu_buffer_b.assign(buffer_bytes)
    _memory_utility.pack_params(
        target_params, elem_size, 'grad', self.gpu_buffer_a)

    # Step 1: intra-node sum onto the node-local root (NCCL rank 0).
    self.intra_nccl_comm.reduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), total_elems,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0, null_stream.ptr)

    # Step 2: allreduce among node-local roots only.
    if self.intra_rank == 0:
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size,
            self.gpu_buffer_a, self.gpu_buffer_b,
            buffer_bytes, elems_per_node, bytes_per_node, null_stream)

    # Step 3: intra-node broadcast of the combined result.
    self.intra_nccl_comm.bcast(
        self.gpu_buffer_b.ptr(), total_elems, nccl.NCCL_FLOAT, 0,
        null_stream.ptr)

    _memory_utility.unpack_params(
        target_params, elem_size, 'grad', self.gpu_buffer_b)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Exchange the gradients of ``model`` across all processes.

    Two-dimensional scheme: an intra-node reduce-scatter (1st
    dimension), an inter-node allreduce on each shard (2nd dimension),
    and an intra-node allgather to redistribute the full result.

    Args:
        model: Link whose parameter gradients are exchanged.
        zero_fill (bool): Forwarded to the packing helpers; if ``True``,
            parameters lacking a gradient are treated as zero-filled.
    """
    self._init_comms()
    null_stream = chainer.cuda.Stream.null

    grad_params = _memory_utility.extract_params_set_grad(model, zero_fill)
    elem_size = 4  # float32
    total_elems = _memory_utility.count_grad_elements(grad_params, zero_fill)

    # Shard sizes: "_2d" is the per-process shard, "_1d" the per-node shard.
    shard_elems_2d = int(math.ceil(total_elems / self.size))
    shard_elems_1d = shard_elems_2d * self.inter_size
    shard_bytes_1d = shard_elems_1d * elem_size
    shard_bytes_2d = shard_elems_2d * elem_size
    buffer_bytes = shard_bytes_2d * self.size

    self.gpu_buffer_a.assign(buffer_bytes)
    self.gpu_buffer_b.assign(buffer_bytes)

    pack_dtype = np.float32
    _memory_utility.pack_params(
        grad_params, 'grad', self.gpu_buffer_a, pack_dtype, zero_fill)

    if chainer.is_debug():
        # Sanity-check buffer contents before any collective runs.
        null_stream.synchronize()
        self._check_ready_to_allreduce(
            self.gpu_buffer_a.array(total_elems),
            self.gpu_buffer_b.array(total_elems))

    # Intra-node reduce-scatter (1st dimension).
    self.intra_nccl_comm.reduceScatter(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), shard_elems_1d,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, null_stream.ptr)

    # Inter-node allreduce (2nd dimension).
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size,
        self.gpu_buffer_a, self.gpu_buffer_b,
        shard_bytes_1d, shard_elems_2d, shard_bytes_2d, null_stream)

    # Intra-node allgather (1st dimension).
    self.intra_nccl_comm.allGather(
        self.gpu_buffer_b.ptr(), self.gpu_buffer_a.ptr(), shard_elems_1d,
        nccl.NCCL_FLOAT, null_stream.ptr)

    if chainer.is_debug():
        null_stream.synchronize()
        self._ensure_all_finite(self.gpu_buffer_a.array(total_elems))

    _memory_utility.unpack_params(
        grad_params, 'grad', self.gpu_buffer_a, pack_dtype, zero_fill)
def allreduce_grad(self, model):
    """Allreduce the gradients of ``model`` across all processes.

    Two-dimensional scheme: intra-node reduce-scatter (1st dimension),
    inter-node allreduce per shard (2nd dimension), then an intra-node
    allgather to redistribute the full result.
    """
    self._init_comms()
    null_stream = chainer.cuda.Stream.null

    grad_params = _memory_utility.extract_params_set_grad(model)
    elem_size = 4  # float32
    total_elems = sum(p.grad.size for p in grad_params)

    # Shard sizes: "_2d" is the per-process shard, "_1d" the per-node shard.
    shard_elems_2d = int(math.ceil(total_elems / self.size))
    shard_elems_1d = shard_elems_2d * self.inter_size
    shard_bytes_1d = shard_elems_1d * elem_size
    shard_bytes_2d = shard_elems_2d * elem_size
    buffer_bytes = shard_bytes_2d * self.size

    self.gpu_buffer_a.assign(buffer_bytes)
    self.gpu_buffer_b.assign(buffer_bytes)

    pack_dtype = np.float32
    _memory_utility.pack_params(
        grad_params, 'grad', self.gpu_buffer_a, pack_dtype)

    if chainer.is_debug():
        # Sanity-check buffer contents before any collective runs.
        null_stream.synchronize()
        self.check_ready_to_allreduce(
            self.gpu_buffer_a.array(total_elems),
            self.gpu_buffer_b.array(total_elems))

    # Intra-node reduce-scatter (1st dimension).
    self.intra_nccl_comm.reduceScatter(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), shard_elems_1d,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, null_stream.ptr)

    # Inter-node allreduce (2nd dimension).
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size,
        self.gpu_buffer_a, self.gpu_buffer_b,
        shard_bytes_1d, shard_elems_2d, shard_bytes_2d, null_stream)

    # Intra-node allgather (1st dimension).
    self.intra_nccl_comm.allGather(
        self.gpu_buffer_b.ptr(), self.gpu_buffer_a.ptr(), shard_elems_1d,
        nccl.NCCL_FLOAT, null_stream.ptr)

    if chainer.is_debug():
        null_stream.synchronize()
        self.ensure_all_finite(self.gpu_buffer_a.array(total_elems))

    _memory_utility.unpack_params(
        grad_params, 'grad', self.gpu_buffer_a, pack_dtype)
def allreduce_grad(self, model, zero_fill=False):
    """Allreduce the gradients of ``model`` over every process.

    Hierarchical scheme: intra-node sum via NCCL, allreduce among
    node-local roots over MPI, then an intra-node broadcast.

    Args:
        model: Link whose parameter gradients are exchanged.
        zero_fill (bool): Forwarded to the packing helpers; if ``True``,
            parameters lacking a gradient are treated as zero-filled.
    """
    self._init_comms()
    null_stream = chainer.cuda.Stream.null

    grad_params = _memory_utility.extract_params_set_grad(model, zero_fill)
    elem_size = 4  # float32
    total_elems = _memory_utility.count_grad_elements(grad_params, zero_fill)
    elems_per_node = int(math.ceil(total_elems / self.inter_size))
    bytes_per_node = elems_per_node * elem_size
    buffer_bytes = bytes_per_node * self.inter_size

    self.gpu_buffer_a.assign(buffer_bytes)
    self.gpu_buffer_b.assign(buffer_bytes)

    pack_dtype = np.float32
    _memory_utility.pack_params(
        grad_params, 'grad', self.gpu_buffer_a, pack_dtype, zero_fill,
        null_stream)

    if chainer.is_debug():
        # Sanity-check buffer contents before any collective runs.
        null_stream.synchronize()
        self.check_ready_to_allreduce(
            self.gpu_buffer_a.array(total_elems),
            self.gpu_buffer_b.array(total_elems))

    # Step 1: intra-node sum onto the node-local root (NCCL rank 0).
    self.intra_nccl_comm.reduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), total_elems,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0, null_stream.ptr)

    # Step 2: allreduce among node-local roots only.
    if self.intra_rank == 0:
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size,
            self.gpu_buffer_a, self.gpu_buffer_b,
            buffer_bytes, elems_per_node, bytes_per_node, null_stream)

    # Step 3: intra-node broadcast of the combined result.
    self.intra_nccl_comm.bcast(
        self.gpu_buffer_b.ptr(), total_elems, nccl.NCCL_FLOAT, 0,
        null_stream.ptr)

    if chainer.is_debug():
        null_stream.synchronize()
        self.ensure_all_finite(self.gpu_buffer_b.array(total_elems))

    _memory_utility.unpack_params(
        grad_params, 'grad', self.gpu_buffer_b, pack_dtype, zero_fill,
        null_stream)
def allreduce_grad(self, model):
    """Allreduce the gradients of ``model`` across all processes.

    Two-dimensional scheme: intra-node reduce-scatter (1st dimension),
    inter-node allreduce per shard (2nd dimension), then an intra-node
    allgather to redistribute the full result.
    """
    self._init_comms()
    null_stream = chainer.cuda.Stream.null

    grad_params = _memory_utility.extract_params_set_grad(model)
    elem_size = 4  # float32
    total_elems = sum(p.grad.size for p in grad_params)

    # Shard sizes: "_2d" is the per-process shard, "_1d" the per-node shard.
    shard_elems_2d = int(math.ceil(total_elems / self.size))
    shard_elems_1d = shard_elems_2d * self.inter_size
    shard_bytes_1d = shard_elems_1d * elem_size
    shard_bytes_2d = shard_elems_2d * elem_size
    buffer_bytes = shard_bytes_2d * self.size

    self.gpu_buffer_a.assign(buffer_bytes)
    self.gpu_buffer_b.assign(buffer_bytes)

    pack_dtype = np.float32
    _memory_utility.pack_params(
        grad_params, 'grad', self.gpu_buffer_a, pack_dtype)

    # Intra-node reduce-scatter (1st dimension).
    self.intra_nccl_comm.reduceScatter(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), shard_elems_1d,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, null_stream.ptr)

    # Inter-node allreduce (2nd dimension).
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size,
        self.gpu_buffer_a, self.gpu_buffer_b,
        shard_bytes_1d, shard_elems_2d, shard_bytes_2d, null_stream)

    # Intra-node allgather (1st dimension).
    self.intra_nccl_comm.allGather(
        self.gpu_buffer_b.ptr(), self.gpu_buffer_a.ptr(), shard_elems_1d,
        nccl.NCCL_FLOAT, null_stream.ptr)

    _memory_utility.unpack_params(
        grad_params, 'grad', self.gpu_buffer_a, pack_dtype)
def allreduce_grad(self, model):
    """Allreduce the gradients of ``model`` across all processes.

    Two-dimensional scheme: intra-node reduce-scatter (1st dimension),
    inter-node allreduce per shard (2nd dimension), then an intra-node
    allgather to redistribute the full result.
    """
    self._init_comms()
    null_stream = chainer.cuda.Stream.null

    grad_params = _memory_utility.extract_params_set_grad(model)
    elem_size = 4  # gradients travel as 32-bit floats (NCCL_FLOAT)
    total_elems = sum(p.grad.size for p in grad_params)

    # Shard sizes: "_2d" is the per-process shard, "_1d" the per-node shard.
    shard_elems_2d = int(math.ceil(total_elems / self.size))
    shard_elems_1d = shard_elems_2d * self.inter_size
    shard_bytes_1d = shard_elems_1d * elem_size
    shard_bytes_2d = shard_elems_2d * elem_size
    buffer_bytes = shard_bytes_2d * self.size

    self.gpu_buffer_a.assign(buffer_bytes)
    self.gpu_buffer_b.assign(buffer_bytes)
    _memory_utility.pack_params(
        grad_params, elem_size, 'grad', self.gpu_buffer_a)

    # Intra-node reduce-scatter (1st dimension).
    self.intra_nccl_comm.reduceScatter(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), shard_elems_1d,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, null_stream.ptr)

    # Inter-node allreduce (2nd dimension).
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size,
        self.gpu_buffer_a, self.gpu_buffer_b,
        shard_bytes_1d, shard_elems_2d, shard_bytes_2d, null_stream)

    # Intra-node allgather (1st dimension).
    self.intra_nccl_comm.allGather(
        self.gpu_buffer_b.ptr(), self.gpu_buffer_a.ptr(), shard_elems_1d,
        nccl.NCCL_FLOAT, null_stream.ptr)

    _memory_utility.unpack_params(
        grad_params, elem_size, 'grad', self.gpu_buffer_a)
def allreduce_grad(self, model):
    """Allreduce the gradients of ``model`` over every process.

    Hierarchical scheme: intra-node sum via NCCL, allreduce among
    node-local roots over MPI, then an intra-node broadcast.
    """
    self._init_comms()
    null_stream = chainer.cuda.Stream.null

    grad_params = _memory_utility.extract_params_set_grad(model)
    elem_size = 4  # gradients travel as 32-bit floats (NCCL_FLOAT)
    total_elems = sum(p.grad.size for p in grad_params)
    elems_per_node = int(math.ceil(total_elems / self.inter_size))
    bytes_per_node = elems_per_node * elem_size
    buffer_bytes = bytes_per_node * self.inter_size

    self.gpu_buffer_a.assign(buffer_bytes)
    self.gpu_buffer_b.assign(buffer_bytes)
    _memory_utility.pack_params(
        grad_params, elem_size, 'grad', self.gpu_buffer_a)

    # Step 1: intra-node sum onto the node-local root (NCCL rank 0).
    self.intra_nccl_comm.reduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), total_elems,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0, null_stream.ptr)

    # Step 2: allreduce among node-local roots only.
    if self.intra_rank == 0:
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size,
            self.gpu_buffer_a, self.gpu_buffer_b,
            buffer_bytes, elems_per_node, bytes_per_node, null_stream)

    # Step 3: intra-node broadcast of the combined result.
    self.intra_nccl_comm.bcast(
        self.gpu_buffer_b.ptr(), total_elems, nccl.NCCL_FLOAT, 0,
        null_stream.ptr)

    _memory_utility.unpack_params(
        grad_params, elem_size, 'grad', self.gpu_buffer_b)