def bcast_data(self, model):
    """Broadcast every parameter's ``data`` array from rank 0 over NCCL.

    Parameters are packed into one flat GPU buffer (converted to the
    detected data dtype), broadcast in a single collective, and unpacked
    back into the model in place.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_data(model)
    dtype = _get_param_data_dtype(params[0])
    total_elems = sum(p.data.size for p in params)
    required_bytes = dtype.itemsize * total_elems
    # Grow/shrink the staging buffer only when the model size changed.
    if self.gpu_tmp_buffer.size != required_bytes:
        self.gpu_tmp_buffer.assign(required_bytes)
    _memory_utility.pack_params(
        params, dtype.itemsize, 'data', self.gpu_tmp_buffer, stream,
        transfer_dtype=dtype)
    # Rank 0 is the broadcast root.
    self.nccl_comm.bcast(
        self.gpu_tmp_buffer.ptr(), total_elems,
        _communication_utility._get_nccl_type_id(dtype), 0, stream.ptr)
    _memory_utility.unpack_params(
        params, dtype.itemsize, 'data', self.gpu_tmp_buffer, stream,
        transfer_dtype=dtype)
def allreduce_grad(self, model):
    """All-reduce gradients across the intra-node NCCL communicator and
    scale the sum by ``1 / self.size`` to obtain the mean.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    grad_dtype = params[0].grad.dtype
    total_elems = sum(p.grad.size for p in params)
    total_bytes = total_elems * grad_dtype.itemsize
    self.gpu_buffer_a.assign(total_bytes)
    self.gpu_buffer_b.assign(total_bytes)
    _memory_utility.pack_params(
        params, grad_dtype.itemsize, 'grad', self.gpu_buffer_a,
        transfer_dtype=grad_dtype)
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), total_elems,
        _communication_utility._get_nccl_type_id(grad_dtype),
        nccl.NCCL_SUM, stream.ptr)
    # Convert the summed gradients into a mean, in place on the buffer.
    summed = self.gpu_buffer_b.array(total_elems, dtype=grad_dtype)
    summed *= (1.0 / self.size)
    _memory_utility.unpack_params(
        params, grad_dtype.itemsize, 'grad', self.gpu_buffer_b,
        transfer_dtype=grad_dtype)
def allreduce_grad(self, model):
    """Hierarchical gradient all-reduce: intra-node NCCL reduce,
    inter-node MPI all-reduce (rank-0 of each node only), then
    intra-node NCCL broadcast.

    Gradients are packed as float32 (itemsize hard-coded to 4).
    The buffers are padded so each node handles an equal chunk.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params(model)
    itemsize = 4  # float32
    n_elems_total = sum(param.grad.size for param in params)
    # Pad so the element count divides evenly across inter-node ranks.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    _memory_utility.pack_params(params, itemsize, 'grad',
                                self.gpu_buffer_a)
    # Intra-node reduce
    self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                self.gpu_buffer_b.ptr(), n_elems_total,
                                nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                stream.ptr)
    # Inter-node allreduce
    if self.intra_rank == 0:
        # Only the node leader participates; the helper also divides by
        # self.size (mean) — TODO confirm against its implementation.
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size, self.gpu_buffer_a,
            self.gpu_buffer_b, n_bytes_buffer, n_elems_per_node,
            n_bytes_per_node, stream)
    # Intra-node bcast
    self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                               nccl.NCCL_FLOAT, 0, stream.ptr)
    _memory_utility.unpack_params(params, itemsize, 'grad',
                                  self.gpu_buffer_b)
def _unpack_params_from_buffer(self, params, grad_dtype,
                               allreduce_grad_dtype, n_elems, stream):
    """Copy reduced gradients out of ``gpu_buffer_a`` back into params.

    Uses the batched-copy fast path when enabled; otherwise converts the
    buffer from the allreduce dtype to the gradient dtype if they differ.
    """
    if self.batched_copy:
        # Reuse cached param metadata if a previous pack step left one.
        cached = self.params_data
        if cached is None:
            cached = _ParamsData(params, 'grad')
        else:
            self.params_data = None
        _batched_unpack_params(cached, self.gpu_buffer_a,
                               allreduce_grad_dtype)
        return
    if grad_dtype != allreduce_grad_dtype:
        # Lazily build and cache the dtype-conversion kernel.
        if self.allreduce_dtype_to_grad_dtype_kernel is None:
            self.allreduce_dtype_to_grad_dtype_kernel = \
                _get_converting_kernel(
                    allreduce_grad_dtype, grad_dtype,
                    'allreduce_dtype_to_grad_dtype_kernel')
        self.allreduce_dtype_to_grad_dtype_kernel(
            self.gpu_buffer_a.array(n_elems, dtype=allreduce_grad_dtype),
            self.gpu_tmp_buffer.array(n_elems, dtype=grad_dtype),
            stream=stream)
        _memory_utility.unpack_params(
            params, grad_dtype.itemsize, 'grad', self.gpu_tmp_buffer,
            stream=stream)
    else:
        _memory_utility.unpack_params(
            params, allreduce_grad_dtype.itemsize, 'grad',
            self.gpu_buffer_a, stream)
def bcast_data(self, model):
    """Broadcast parameter ``data`` from intra-node rank 0 via NCCL."""
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_data(model)
    data_dtype = params[0].data.dtype
    total_elems = sum(p.data.size for p in params)
    self.gpu_buffer_a.assign(total_elems * data_dtype.itemsize)
    _memory_utility.pack_params(
        params, data_dtype.itemsize, 'data', self.gpu_buffer_a,
        transfer_dtype=data_dtype)
    # Root rank 0 broadcasts the packed buffer to all intra-node ranks.
    self.intra_nccl_comm.bcast(
        self.gpu_buffer_a.ptr(), total_elems,
        _communication_utility._get_nccl_type_id(data_dtype), 0,
        stream.ptr)
    _memory_utility.unpack_params(
        params, data_dtype.itemsize, 'data', self.gpu_buffer_a,
        transfer_dtype=data_dtype)
def allreduce_grad(self, model):
    """Intra-node NCCL all-reduce of gradients, scaled to a mean.

    In debug mode, buffers are validated before the collective and the
    result is checked for NaN/Inf afterwards (requires stream syncs).
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    dtype = params[0].grad.dtype
    itemsize = dtype.itemsize
    n_elems_total = sum(param.grad.size for param in params)
    n_bytes_total = n_elems_total * itemsize
    self.gpu_buffer_a.assign(n_bytes_total)
    self.gpu_buffer_b.assign(n_bytes_total)
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a, dtype)
    if chainer.is_debug():
        # Synchronize so host-side checks see the finished pack.
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self.check_ready_to_allreduce(array_a, array_b)
    # Same as PureNcclCommunicator's multi_node_mean but leave as it is
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
        stream.ptr)
    # In-place scale: buffer.array() is a view onto gpu_buffer_b.
    arr = self.gpu_buffer_b.array(n_elems_total, dtype=dtype)
    arr *= (1.0 / self.size)
    if chainer.is_debug():
        stream.synchronize()
        self.ensure_all_finite(arr)
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                  dtype)
def allreduce_grad(self, model):
    """All-reduce gradients within the node and divide by communicator
    size so each worker ends up with the mean gradient.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    grad_dtype = params[0].grad.dtype
    total_elems = sum(p.grad.size for p in params)
    total_bytes = total_elems * grad_dtype.itemsize
    for buf in (self.gpu_buffer_a, self.gpu_buffer_b):
        buf.assign(total_bytes)
    _memory_utility.pack_params(
        params, grad_dtype.itemsize, 'grad', self.gpu_buffer_a,
        transfer_dtype=grad_dtype)
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), total_elems,
        _communication_utility._get_nccl_type_id(grad_dtype),
        nccl.NCCL_SUM, stream.ptr)
    # Scale sum -> mean directly on the receive buffer.
    mean_view = self.gpu_buffer_b.array(total_elems, dtype=grad_dtype)
    mean_view *= (1.0 / self.size)
    _memory_utility.unpack_params(
        params, grad_dtype.itemsize, 'grad', self.gpu_buffer_b,
        transfer_dtype=grad_dtype)
def _unpack_params_from_buffer(self, params, grad_dtype,
                               allreduce_grad_dtype, n_elems, stream):
    """Unpack reduced gradients from ``gpu_buffer_a`` into the params.

    Fast path: batched copy using (possibly cached) param metadata.
    Slow path: per-param unpack, with a GPU dtype-conversion kernel
    when the allreduce dtype differs from the gradient dtype.
    """
    if self.batched_copy:
        # Consume a cached _ParamsData from the pack step if present.
        if self.params_data is not None:
            params_data = self.params_data
            self.params_data = None
        else:
            params_data = _ParamsData(params, 'grad')
        _batched_unpack_params(params_data, self.gpu_buffer_a,
                               allreduce_grad_dtype)
        return
    if grad_dtype == allreduce_grad_dtype:
        _memory_utility.unpack_params(
            params, allreduce_grad_dtype.itemsize, 'grad',
            self.gpu_buffer_a, stream, transfer_dtype=grad_dtype)
    else:
        # Lazily create and cache the converting kernel.
        if self.allreduce_dtype_to_grad_dtype_kernel is None:
            self.allreduce_dtype_to_grad_dtype_kernel = \
                _get_converting_kernel(
                    allreduce_grad_dtype, grad_dtype,
                    'allreduce_dtype_to_grad_dtype_kernel')
        # Convert into the tmp buffer, then unpack from there.
        self.allreduce_dtype_to_grad_dtype_kernel(
            self.gpu_buffer_a.array(n_elems,
                                    dtype=allreduce_grad_dtype),
            self.gpu_tmp_buffer.array(n_elems, dtype=grad_dtype),
            stream=stream)
        _memory_utility.unpack_params(
            params, grad_dtype.itemsize, 'grad', self.gpu_tmp_buffer,
            stream=stream, transfer_dtype=grad_dtype)
def _unpack_params_from_buffer(self, params, grad_dtype,
                               allreduce_grad_dtype, n_elems, stream):
    """Unpack gradients from ``gpu_allreduce_buffer_a`` into params,
    converting dtype via a cached GPU kernel when necessary.
    """
    if grad_dtype == allreduce_grad_dtype:
        # Same dtype: unpack straight from the allreduce buffer.
        _memory_utility.unpack_params(params,
                                      allreduce_grad_dtype.itemsize,
                                      'grad',
                                      self.gpu_allreduce_buffer_a,
                                      stream)
    else:
        # Lazily build the converting kernel once and cache it.
        if self.allreduce_dtype_to_grad_dtype_kernel is None:
            self.allreduce_dtype_to_grad_dtype_kernel = \
                _get_converting_kernel(
                    allreduce_grad_dtype, grad_dtype,
                    'allreduce_dtype_to_grad_dtype_kernel')
        # Convert into gpu_tmp_buffer, then unpack from there.
        self.allreduce_dtype_to_grad_dtype_kernel(
            self.gpu_allreduce_buffer_a.array(
                n_elems, dtype=allreduce_grad_dtype),
            self.gpu_tmp_buffer.array(n_elems, dtype=grad_dtype),
            stream=stream)
        _memory_utility.unpack_params(params, grad_dtype.itemsize,
                                      'grad', self.gpu_tmp_buffer,
                                      stream=stream)
def allreduce_grad(self, model):
    """Intra-node NCCL all-reduce of gradients, scaled to a mean.

    Identical to the debug-checked variant above; in debug mode it
    validates buffers before the collective and checks the result for
    NaN/Inf afterwards.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    dtype = params[0].grad.dtype
    itemsize = dtype.itemsize
    n_elems_total = sum(param.grad.size for param in params)
    n_bytes_total = n_elems_total * itemsize
    self.gpu_buffer_a.assign(n_bytes_total)
    self.gpu_buffer_b.assign(n_bytes_total)
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a, dtype)
    if chainer.is_debug():
        # Sync so host-side validation sees the completed pack.
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self.check_ready_to_allreduce(array_a, array_b)
    # Same as PureNcclCommunicator's multi_node_mean but leave as it is
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
        stream.ptr)
    # In-place scale: buffer.array() is a view onto gpu_buffer_b.
    arr = self.gpu_buffer_b.array(n_elems_total, dtype=dtype)
    arr *= (1.0 / self.size)
    if chainer.is_debug():
        stream.synchronize()
        self.ensure_all_finite(arr)
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                  dtype)
def allreduce_grad(self, model, stream=None):
    """All-reduce gradients over the NCCL communicator and scale by
    ``1 / self.size`` to obtain the mean.

    Args:
        model: Chainer model whose parameter gradients are reduced.
        stream: Optional CUDA stream; defaults to the null stream.
    """
    self._init_comms()
    if stream is None:
        stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params(model)
    itemsize = 4  # gradients are transferred as float32
    n_elems = sum(param.grad.size for param in params)
    n_bytes = itemsize * n_elems
    self.gpu_buffer_a.assign(n_bytes)
    self.gpu_buffer_b.assign(n_bytes)
    _memory_utility.pack_params(params, itemsize, 'grad',
                                self.gpu_buffer_a)
    if stream != chainer.cuda.Stream.null:
        # Ensure the pack on the null stream finished before the
        # collective runs on a different stream.
        chainer.cuda.Stream.null.synchronize()
    self.nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                             self.gpu_buffer_b.ptr(), n_elems,
                             nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                             stream.ptr)
    if stream != chainer.cuda.Stream.null:
        stream.synchronize()
    # Scale in place: buffer.array() returns a view onto gpu_buffer_b,
    # so no temporary array or extra device copy is needed (the
    # previous temporary + from_device round-trip was redundant).
    arr = self.gpu_buffer_b.array(n_elems)
    arr *= (1.0 / self.size)
    _memory_utility.unpack_params(params, itemsize, 'grad',
                                  self.gpu_buffer_b)
def allreduce_grad(self, model):
    """Hierarchical gradient mean with CPU-staged inter-node exchange.

    Steps: intra-node NCCL reduce -> (node leaders only) MPI Alltoall /
    GPU partial reduction / MPI Allgather through pinned CPU buffers ->
    intra-node NCCL broadcast. Gradients travel as float32.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4  # float32
    n_elems_total = sum(param.grad.size for param in params)
    # Pad so each inter-node rank owns an equal chunk.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_elems_buffer = n_elems_per_node * self.inter_size
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(
        params, 'grad', self.gpu_buffer_a, allreduce_grad_dtype)
    # Intra-node reduce
    self.intra_nccl_comm.reduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0, stream.ptr)
    # Inter-node allreduce
    if self.intra_rank == 0:
        self.cpu_buffer_a.assign(n_bytes_buffer)
        self.cpu_buffer_b.assign(n_bytes_buffer)
        arr_b = self.gpu_buffer_b.array(n_elems_buffer)
        # Stage device data on the host for the MPI exchange.
        arr_b.data.copy_to_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)
        self.inter_mpi_comm.Alltoall(
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT],
            [self.cpu_buffer_a.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        # Reduction in GPU
        arr_a = self.gpu_buffer_a.array(n_elems_buffer)
        arr_a.data.copy_from_host(self.cpu_buffer_a.ptr(),
                                  n_bytes_buffer)
        # Each leader sums its chunk over all nodes, then divides by the
        # global size to produce the mean for its chunk.
        arr_a = arr_a.reshape(self.inter_size, n_elems_per_node)
        arr_a = arr_a.sum(axis=0)
        arr_a *= 1.0 / self.size
        arr_a.data.copy_to_host(self.cpu_buffer_a.ptr(),
                                n_bytes_per_node)
        self.inter_mpi_comm.Allgather(
            [self.cpu_buffer_a.buffer(n_bytes_per_node),
             mpi4py.MPI.FLOAT],
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        arr_b.data.copy_from_host(self.cpu_buffer_b.ptr(),
                                  n_bytes_buffer)
    # Intra-node bcast
    self.intra_nccl_comm.bcast(
        self.gpu_buffer_b.ptr(), n_elems_total, nccl.NCCL_FLOAT, 0,
        stream.ptr)
    _memory_utility.unpack_params(
        params, 'grad', self.gpu_buffer_b, allreduce_grad_dtype)
def allreduce_grad(self, model):
    """Hierarchical gradient mean with CPU-staged inter-node exchange
    (itemsize-based pack variant).

    Steps: intra-node NCCL reduce -> (node leaders only) MPI Alltoall /
    GPU partial reduction / MPI Allgather through CPU buffers ->
    intra-node NCCL broadcast. Gradients travel as float32.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4  # float32
    n_elems_total = sum(param.grad.size for param in params)
    # Pad so each inter-node rank owns an equal chunk.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_elems_buffer = n_elems_per_node * self.inter_size
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    _memory_utility.pack_params(
        params, itemsize, 'grad', self.gpu_buffer_a)
    # Intra-node reduce
    self.intra_nccl_comm.reduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0, stream.ptr)
    # Inter-node allreduce
    if self.intra_rank == 0:
        self.cpu_buffer_a.assign(n_bytes_buffer)
        self.cpu_buffer_b.assign(n_bytes_buffer)
        arr_b = self.gpu_buffer_b.array(n_elems_buffer)
        # Stage device data on the host for the MPI exchange.
        arr_b.data.copy_to_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)
        self.inter_mpi_comm.Alltoall(
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT],
            [self.cpu_buffer_a.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        # Reduction in GPU
        arr_a = self.gpu_buffer_a.array(n_elems_buffer)
        arr_a.data.copy_from_host(self.cpu_buffer_a.ptr(),
                                  n_bytes_buffer)
        # Sum this leader's chunk across nodes, then scale to a mean.
        arr_a = arr_a.reshape(self.inter_size, n_elems_per_node)
        arr_a = arr_a.sum(axis=0)
        arr_a *= 1.0 / self.size
        arr_a.data.copy_to_host(self.cpu_buffer_a.ptr(),
                                n_bytes_per_node)
        self.inter_mpi_comm.Allgather(
            [self.cpu_buffer_a.buffer(n_bytes_per_node),
             mpi4py.MPI.FLOAT],
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        arr_b.data.copy_from_host(self.cpu_buffer_b.ptr(),
                                  n_bytes_buffer)
    # Intra-node bcast
    self.intra_nccl_comm.bcast(
        self.gpu_buffer_b.ptr(), n_elems_total, nccl.NCCL_FLOAT, 0,
        stream.ptr)
    _memory_utility.unpack_params(
        params, itemsize, 'grad', self.gpu_buffer_b)
def allreduce_grad(self, model):
    """Pack and immediately unpack gradients without communicating.

    NOTE(review): performs no reduction — presumably a dummy/test
    communicator that only exercises the pack/unpack path.
    """
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4
    total_bytes = itemsize * sum(p.grad.size for p in params)
    self.gpu_buffer_a.assign(total_bytes)
    _memory_utility.pack_params(params, itemsize, 'grad',
                                self.gpu_buffer_a)
    _memory_utility.unpack_params(params, itemsize, 'grad',
                                  self.gpu_buffer_a)
def allreduce_grad(self, model):
    """Round-trip gradients through the GPU buffer without reducing.

    NOTE(review): no collective is issued — presumably a dummy/test
    communicator exercising only the pack/unpack path.
    """
    params = _memory_utility.extract_params_set_grad(model)
    elem_bytes = 4
    buffer_bytes = sum(p.grad.size for p in params) * elem_bytes
    self.gpu_buffer_a.assign(buffer_bytes)
    _memory_utility.pack_params(params, elem_bytes, 'grad',
                                self.gpu_buffer_a)
    _memory_utility.unpack_params(params, elem_bytes, 'grad',
                                  self.gpu_buffer_a)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Pack then unpack gradients with no communication.

    NOTE(review): no reduction happens — presumably a dummy/test
    communicator covering only the buffer round-trip.
    """
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    elem_bytes = 4
    total = _memory_utility.count_grad_elements(params, zero_fill)
    self.gpu_buffer_a.assign(total * elem_bytes)
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                zero_fill)
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                  zero_fill)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Two-dimensional gradient mean: intra-node reduce-scatter,
    inter-node all-reduce of each scattered chunk, intra-node allgather.

    Gradients travel as float32; buffers are padded so the element
    count divides evenly by the total communicator size.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4  # float32
    n_elems_total = _memory_utility.count_grad_elements(params,
                                                        zero_fill)
    # 2d chunk: per-process share; 1d chunk: per intra-node rank share.
    n_elems_per_node_2d = int(math.ceil(n_elems_total / self.size))
    n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
    n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
    n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
    n_bytes_buffer = n_bytes_per_node_2d * self.size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                allreduce_grad_dtype, zero_fill)
    if chainer.is_debug():
        # Sync so host-side validation sees the completed pack.
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self._check_ready_to_allreduce(array_a, array_b)
    # Intra-node reduce-scatter (1st dimension)
    self.intra_nccl_comm.reduceScatter(self.gpu_buffer_a.ptr(),
                                       self.gpu_buffer_b.ptr(),
                                       n_elems_per_node_1d,
                                       nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                                       stream.ptr)
    # Inter-node allreduce (2nd dimension)
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size, self.gpu_buffer_a,
        self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
        n_bytes_per_node_2d, stream)
    # Intra-node allgather (1st dimension)
    self.intra_nccl_comm.allGather(self.gpu_buffer_b.ptr(),
                                   self.gpu_buffer_a.ptr(),
                                   n_elems_per_node_1d,
                                   nccl.NCCL_FLOAT, stream.ptr)
    if chainer.is_debug():
        stream.synchronize()
        self._ensure_all_finite(self.gpu_buffer_a.array(n_elems_total))
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                  allreduce_grad_dtype, zero_fill)
def _unpack_params_from_buffer(self, params, allreduce_grad_dtype,
                               stream):
    """Unpack gradients from ``gpu_buffer_a`` back into the params,
    using the batched-copy fast path when it is enabled.
    """
    if not self.batched_copy:
        _memory_utility.unpack_params(params, 'grad',
                                      self.gpu_buffer_a,
                                      allreduce_grad_dtype, stream)
        return
    # Consume cached param metadata from a previous pack if available.
    cached = self.params_data
    if cached is None:
        cached = _ParamsData(params, 'grad')
    else:
        self.params_data = None
    _batched_unpack_params(cached, self.gpu_buffer_a,
                           allreduce_grad_dtype)
def allreduce_grad(self, model):
    """Pack and unpack gradients via the GPU buffer without reducing.

    NOTE(review): no collective call — presumably a dummy communicator
    used to test the buffer round-trip.
    """
    self._init_comms()
    params = [p for _, p in sorted(model.namedparams())]
    elem_bytes = 4
    buffer_bytes = sum(p.grad.size for p in params) * elem_bytes
    self.gpu_buffer_a.assign(buffer_bytes)
    _memory_utility.pack_params(params, elem_bytes, 'grad',
                                self.gpu_buffer_a)
    _memory_utility.unpack_params(params, elem_bytes, 'grad',
                                  self.gpu_buffer_a)
def allreduce_grad(self, model):
    """Two-dimensional gradient mean with debug checks: intra-node
    reduce-scatter, inter-node all-reduce, intra-node allgather.

    Gradients travel as float32; buffers are padded to a multiple of
    the total communicator size.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4  # float32
    n_elems_total = sum(param.grad.size for param in params)
    # 2d chunk: per-process share; 1d chunk: per intra-node rank share.
    n_elems_per_node_2d = int(math.ceil(n_elems_total / self.size))
    n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
    n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
    n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
    n_bytes_buffer = n_bytes_per_node_2d * self.size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(
        params, 'grad', self.gpu_buffer_a, allreduce_grad_dtype)
    if chainer.is_debug():
        # Sync so host-side validation sees the completed pack.
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self.check_ready_to_allreduce(array_a, array_b)
    # Intra-node reduce-scatter (1st dimension)
    self.intra_nccl_comm.reduceScatter(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(),
        n_elems_per_node_1d, nccl.NCCL_FLOAT, nccl.NCCL_SUM,
        stream.ptr)
    # Inter-node allreduce (2nd dimension)
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size, self.gpu_buffer_a,
        self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
        n_bytes_per_node_2d, stream)
    # Intra-node allgather (1st dimension)
    self.intra_nccl_comm.allGather(
        self.gpu_buffer_b.ptr(), self.gpu_buffer_a.ptr(),
        n_elems_per_node_1d, nccl.NCCL_FLOAT, stream.ptr)
    if chainer.is_debug():
        stream.synchronize()
        self.ensure_all_finite(self.gpu_buffer_a.array(n_elems_total))
    _memory_utility.unpack_params(
        params, 'grad', self.gpu_buffer_a, allreduce_grad_dtype)
def allreduce_grad(self, model, zero_fill=False):
    """Hierarchical gradient mean with debug checks: intra-node NCCL
    reduce, inter-node all-reduce on node leaders, intra-node bcast.

    Gradients travel as float32; buffers are padded so the element
    count divides evenly across inter-node ranks.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4  # float32
    n_elems_total = _memory_utility.count_grad_elements(params,
                                                        zero_fill)
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                allreduce_grad_dtype, zero_fill, stream)
    if chainer.is_debug():
        # Sync so host-side validation sees the completed pack.
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self.check_ready_to_allreduce(array_a, array_b)
    # Intra-node reduce
    self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                self.gpu_buffer_b.ptr(), n_elems_total,
                                nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                stream.ptr)
    # Inter-node allreduce
    if self.intra_rank == 0:
        # Only node leaders exchange across nodes.
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size, self.gpu_buffer_a,
            self.gpu_buffer_b, n_bytes_buffer, n_elems_per_node,
            n_bytes_per_node, stream)
    # Intra-node bcast
    self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                               nccl.NCCL_FLOAT, 0, stream.ptr)
    if chainer.is_debug():
        stream.synchronize()
        self.ensure_all_finite(self.gpu_buffer_b.array(n_elems_total))
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                  allreduce_grad_dtype, zero_fill,
                                  stream)
def allreduce_grad(self, model, zero_fill=False):
    """Compute the multi-node mean of all gradients in ``model``.

    Gradients are packed as float32 into buffer A, averaged across
    workers by ``self.multi_node_mean`` into buffer B, and unpacked.
    """
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4
    total_elems = _memory_utility.count_grad_elements(params, zero_fill)
    total_bytes = total_elems * itemsize
    for buf in (self.gpu_buffer_a, self.gpu_buffer_b):
        buf.assign(total_bytes)
    transfer_dtype = np.float32
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                transfer_dtype, zero_fill)
    self.multi_node_mean(self.gpu_buffer_a.array(total_elems),
                         self.gpu_buffer_b.array(total_elems))
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                  transfer_dtype, zero_fill)
def broadcast_data(self, model):
    """Broadcast every parameter's ``data`` array from intra-node
    rank 0 over NCCL.

    Args:
        model: Chainer model whose parameter data is broadcast in place.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = [param for _, param in sorted(model.namedparams())]
    itemsize = 4  # parameters are transferred as float32
    # BUG FIX: element count must come from param.data (the attribute
    # being packed/broadcast), not param.grad — grad can be None before
    # the first backward pass and is unrelated to a data broadcast.
    n_elems_total = sum(param.data.size for param in params)
    n_bytes_total = n_elems_total * itemsize
    self.gpu_buffer_a.assign(n_bytes_total)
    _memory_utility.pack_params(params, itemsize, 'data',
                                self.gpu_buffer_a)
    self.intra_nccl_comm.bcast(self.gpu_buffer_a.ptr(), n_elems_total,
                               nccl.NCCL_FLOAT, 0, stream.ptr)
    _memory_utility.unpack_params(params, itemsize, 'data',
                                  self.gpu_buffer_a)
def allreduce_grad(self, model):
    """Average gradients across workers via ``self.multi_node_mean``.

    Gradients are packed as float32 into buffer A; the averaged result
    lands in buffer B and is unpacked back into the model.
    """
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4
    total_elems = sum(p.grad.size for p in params)
    total_bytes = total_elems * itemsize
    for buf in (self.gpu_buffer_a, self.gpu_buffer_b):
        buf.assign(total_bytes)
    transfer_dtype = np.float32
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                transfer_dtype)
    self.multi_node_mean(self.gpu_buffer_a.array(total_elems),
                         self.gpu_buffer_b.array(total_elems))
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                  transfer_dtype)
def allreduce_grad(self, model):
    """Average gradients across workers using ``self.multi_node_mean``.

    Packs gradients as float32 into buffer A, reduces into buffer B,
    and unpacks the mean back into the model parameters.
    """
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4
    elem_count = sum(p.grad.size for p in params)
    byte_count = elem_count * itemsize
    self.gpu_buffer_a.assign(byte_count)
    self.gpu_buffer_b.assign(byte_count)
    reduce_dtype = np.float32
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                reduce_dtype)
    src = self.gpu_buffer_a.array(elem_count)
    dst = self.gpu_buffer_b.array(elem_count)
    self.multi_node_mean(src, dst)
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                  reduce_dtype)
def bcast_data(self, model):
    """Broadcast parameter ``data`` from rank 0 over NCCL.

    Resizes the shared staging buffer only when the packed size
    changed since the previous call.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_data(model)
    dtype = _get_param_data_dtype(params[0])
    total_elems = sum(p.data.size for p in params)
    required_bytes = dtype.itemsize * total_elems
    if self.gpu_tmp_buffer.size != required_bytes:
        self.gpu_tmp_buffer.assign(required_bytes)
    _memory_utility.pack_params(
        params, dtype.itemsize, 'data', self.gpu_tmp_buffer, stream)
    # Rank 0 is the broadcast root.
    self.nccl_comm.bcast(self.gpu_tmp_buffer.ptr(), total_elems,
                         _get_nccl_type_id(dtype), 0, stream.ptr)
    _memory_utility.unpack_params(
        params, dtype.itemsize, 'data', self.gpu_tmp_buffer, stream)
def _unpack_params_from_buffer(self, params, attr_name, buffer,
                               allreduce_grad_dtype, zero_fill,
                               stream=None):
    """Unpack ``attr_name`` arrays from ``buffer`` into the params,
    taking the batched-copy fast path when it is enabled.
    """
    if not self.batched_copy:
        _memory_utility.unpack_params(
            params, attr_name, buffer, allreduce_grad_dtype,
            zero_fill, stream)
        return
    # Consume cached param metadata from a previous pack if available.
    cached = self.params_data
    if cached is None:
        cached = _memory_utility.ParamsData(params, attr_name,
                                            zero_fill)
    else:
        self.params_data = None
    _memory_utility._batched_unpack_params(cached, buffer,
                                           allreduce_grad_dtype)
def bcast_data(self, model):
    """Broadcast parameter ``data`` (as float32) from intra-node
    rank 0 over NCCL.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_data(model)
    elem_bytes = 4
    total_elems = sum(p.data.size for p in params)
    self.gpu_buffer_a.assign(total_elems * elem_bytes)
    _memory_utility.pack_params(params, elem_bytes, 'data',
                                self.gpu_buffer_a)
    # Rank 0 is the broadcast root.
    self.intra_nccl_comm.bcast(self.gpu_buffer_a.ptr(), total_elems,
                               nccl.NCCL_FLOAT, 0, stream.ptr)
    _memory_utility.unpack_params(params, elem_bytes, 'data',
                                  self.gpu_buffer_a)
def allreduce_grad(self, model):
    """All-reduce gradients over MPI and scale the sum to a mean.

    Uses an MPI ``Allreduce`` on GPU buffers (float32), then divides
    by the communicator size in place before unpacking.
    """
    params = _memory_utility.extract_params_set_grad(model)
    elem_bytes = 4
    total_elems = sum(p.grad.size for p in params)
    total_bytes = total_elems * elem_bytes
    for buf in (self.gpu_buffer_a, self.gpu_buffer_b):
        buf.assign(total_bytes)
    _memory_utility.pack_params(params, elem_bytes, 'grad',
                                self.gpu_buffer_a)
    send = [self.gpu_buffer_a.buffer(total_bytes), mpi4py.MPI.FLOAT]
    recv = [self.gpu_buffer_b.buffer(total_bytes), mpi4py.MPI.FLOAT]
    self.mpi_comm.Allreduce(send, recv)
    # Scale sum -> mean directly on the receive buffer.
    mean_view = self.gpu_buffer_b.array(total_elems)
    mean_view *= (1.0 / self.size)
    _memory_utility.unpack_params(params, elem_bytes, 'grad',
                                  self.gpu_buffer_b)
def allreduce_grad(self, model):
    """Two-dimensional gradient mean: intra-node reduce-scatter,
    inter-node all-reduce of each chunk, intra-node allgather.

    Gradients travel as float32; buffers are padded to a multiple of
    the total communicator size.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4  # float32
    n_elems_total = sum(param.grad.size for param in params)
    # 2d chunk: per-process share; 1d chunk: per intra-node rank share.
    n_elems_per_node_2d = int(math.ceil(n_elems_total / self.size))
    n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
    n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
    n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
    n_bytes_buffer = n_bytes_per_node_2d * self.size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                allreduce_grad_dtype)
    # Intra-node reduce-scatter (1st dimension)
    self.intra_nccl_comm.reduceScatter(self.gpu_buffer_a.ptr(),
                                       self.gpu_buffer_b.ptr(),
                                       n_elems_per_node_1d,
                                       nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                                       stream.ptr)
    # Inter-node allreduce (2nd dimension)
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size, self.gpu_buffer_a,
        self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
        n_bytes_per_node_2d, stream)
    # Intra-node allgather (1st dimension)
    self.intra_nccl_comm.allGather(self.gpu_buffer_b.ptr(),
                                   self.gpu_buffer_a.ptr(),
                                   n_elems_per_node_1d,
                                   nccl.NCCL_FLOAT, stream.ptr)
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                  allreduce_grad_dtype)
def bcast_data(self, model):
    """Broadcast parameter ``data`` from intra-node rank 0 over NCCL,
    using the parameters' native dtype for the transfer.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_data(model)
    data_dtype = params[0].data.dtype
    total_elems = sum(p.data.size for p in params)
    self.gpu_buffer_a.assign(total_elems * data_dtype.itemsize)
    _memory_utility.pack_params(params, 'data', self.gpu_buffer_a,
                                data_dtype)
    nccl_type = _communication_utility._get_nccl_type_id(data_dtype)
    # Rank 0 is the broadcast root.
    self.intra_nccl_comm.bcast(self.gpu_buffer_a.ptr(), total_elems,
                               nccl_type, 0, stream.ptr)
    _memory_utility.unpack_params(params, 'data', self.gpu_buffer_a,
                                  data_dtype)
def allreduce_grad(self, model):
    """All-reduce gradients over MPI and scale the sum to a mean.

    Parameters are taken in sorted name order so all ranks pack
    identically; the reduce runs as an MPI ``Allreduce`` on float32
    GPU buffers.
    """
    self._init_comms()
    params = [p for _, p in sorted(model.namedparams())]
    elem_bytes = 4
    total_elems = sum(p.grad.size for p in params)
    total_bytes = total_elems * elem_bytes
    for buf in (self.gpu_buffer_a, self.gpu_buffer_b):
        buf.assign(total_bytes)
    _memory_utility.pack_params(params, elem_bytes, 'grad',
                                self.gpu_buffer_a)
    send = [self.gpu_buffer_a.buffer(total_bytes), mpi4py.MPI.FLOAT]
    recv = [self.gpu_buffer_b.buffer(total_bytes), mpi4py.MPI.FLOAT]
    self.mpi_comm.Allreduce(send, recv)
    # Scale sum -> mean directly on the receive buffer.
    mean_view = self.gpu_buffer_b.array(total_elems)
    mean_view *= (1.0 / self.size)
    _memory_utility.unpack_params(params, elem_bytes, 'grad',
                                  self.gpu_buffer_b)
def _unpack_params_from_buffer(self, params, grad_dtype,
                               allreduce_grad_dtype, n_elems, stream):
    """Unpack gradients from ``gpu_buffer_a`` into params, converting
    dtype via a cached GPU kernel when the allreduce dtype differs
    from the gradient dtype.
    """
    if grad_dtype == allreduce_grad_dtype:
        # Same dtype: unpack straight from the allreduce buffer.
        _memory_utility.unpack_params(
            params, allreduce_grad_dtype.itemsize, 'grad',
            self.gpu_buffer_a, stream)
    else:
        # Lazily build the converting kernel once and cache it.
        if self.allreduce_dtype_to_grad_dtype_kernel is None:
            self.allreduce_dtype_to_grad_dtype_kernel = \
                _get_converting_kernel(
                    allreduce_grad_dtype, grad_dtype,
                    'allreduce_dtype_to_grad_dtype_kernel')
        # Convert into gpu_tmp_buffer, then unpack from there.
        self.allreduce_dtype_to_grad_dtype_kernel(
            self.gpu_buffer_a.array(n_elems,
                                    dtype=allreduce_grad_dtype),
            self.gpu_tmp_buffer.array(n_elems, dtype=grad_dtype),
            stream=stream)
        _memory_utility.unpack_params(
            params, grad_dtype.itemsize, 'grad', self.gpu_tmp_buffer,
            stream=stream)
def allreduce_grad(self, model):
    """All-reduce gradients over NCCL and scale the sum to a mean.

    Args:
        model: Chainer model whose parameter gradients are averaged
            across workers in place.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = [param for _, param in sorted(model.namedparams())]
    itemsize = 4  # gradients are transferred as float32
    n_elems_total = sum(param.grad.size for param in params)
    # Pad the buffers to a multiple of the inter-node size.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    _memory_utility.pack_params(params, itemsize, 'grad',
                                self.gpu_buffer_a)
    self.nccl_comm.allreduce(self.gpu_buffer_a.ptr(),
                             self.gpu_buffer_b.ptr(), n_elems_total,
                             nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                             stream.ptr)
    # BUG FIX: the previous code built a temporary
    # (array(n_elems_total) * scale) and then copied n_bytes_buffer
    # bytes back via from_device — reading past the end of the
    # temporary whenever padding made n_bytes_buffer > the temporary's
    # size. Scaling in place on the buffer view (as the sibling
    # implementations do) avoids both the over-read and the extra copy.
    arr = self.gpu_buffer_b.array(n_elems_total)
    arr *= (1.0 / self.size)
    _memory_utility.unpack_params(params, itemsize, 'grad',
                                  self.gpu_buffer_b)
def allreduce_grad(self, model):
    """Two-dimensional gradient all-reduce (itemsize-based pack):
    intra-node reduce-scatter, inter-node all-reduce of each chunk,
    intra-node allgather. Gradients travel as float32.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4  # float32
    n_elems_total = sum(param.grad.size for param in params)
    # 2d chunk: per-process share; 1d chunk: per intra-node rank share.
    n_elems_per_node_2d = int(math.ceil(n_elems_total / self.size))
    n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
    n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
    n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
    n_bytes_buffer = n_bytes_per_node_2d * self.size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    _memory_utility.pack_params(
        params, itemsize, 'grad', self.gpu_buffer_a)
    # Intra-node reduce-scatter (1st dimension)
    self.intra_nccl_comm.reduceScatter(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(),
        n_elems_per_node_1d, nccl.NCCL_FLOAT, nccl.NCCL_SUM,
        stream.ptr)
    # Inter-node allreduce (2nd dimension)
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size, self.gpu_buffer_a,
        self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
        n_bytes_per_node_2d, stream)
    # Intra-node allgather (1st dimension)
    self.intra_nccl_comm.allGather(
        self.gpu_buffer_b.ptr(), self.gpu_buffer_a.ptr(),
        n_elems_per_node_1d, nccl.NCCL_FLOAT, stream.ptr)
    _memory_utility.unpack_params(
        params, itemsize, 'grad', self.gpu_buffer_a)
def allreduce_grad(self, model):
    """All-reduce gradients (float32) within the node via NCCL and
    divide by the communicator size so each worker holds the mean.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    elem_bytes = 4
    total_elems = sum(p.grad.size for p in params)
    total_bytes = total_elems * elem_bytes
    for buf in (self.gpu_buffer_a, self.gpu_buffer_b):
        buf.assign(total_bytes)
    _memory_utility.pack_params(params, elem_bytes, 'grad',
                                self.gpu_buffer_a)
    self.intra_nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                                   self.gpu_buffer_b.ptr(),
                                   total_elems, nccl.NCCL_FLOAT,
                                   nccl.NCCL_SUM, stream.ptr)
    # Scale sum -> mean directly on the receive buffer.
    mean_view = self.gpu_buffer_b.array(total_elems)
    mean_view *= (1.0 / self.size)
    _memory_utility.unpack_params(params, elem_bytes, 'grad',
                                  self.gpu_buffer_b)
def allreduce_grad(self, model):
    """Hierarchical gradient all-reduce (itemsize-based pack):
    intra-node NCCL reduce, inter-node all-reduce on node leaders,
    intra-node NCCL broadcast. Gradients travel as float32.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4  # float32
    n_elems_total = sum(param.grad.size for param in params)
    # Pad so the element count divides evenly across inter-node ranks.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    _memory_utility.pack_params(
        params, itemsize, 'grad', self.gpu_buffer_a)
    # Intra-node reduce
    self.intra_nccl_comm.reduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0, stream.ptr)
    # Inter-node allreduce
    if self.intra_rank == 0:
        # Only node leaders exchange across nodes.
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size, self.gpu_buffer_a,
            self.gpu_buffer_b, n_bytes_buffer, n_elems_per_node,
            n_bytes_per_node, stream)
    # Intra-node bcast
    self.intra_nccl_comm.bcast(
        self.gpu_buffer_b.ptr(), n_elems_total, nccl.NCCL_FLOAT, 0,
        stream.ptr)
    _memory_utility.unpack_params(
        params, itemsize, 'grad', self.gpu_buffer_b)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Hierarchical gradient mean with CPU-staged inter-node exchange
    and debug checks.

    Steps: intra-node NCCL reduce -> (node leaders only) MPI Alltoall /
    GPU partial reduction / MPI Allgather through CPU buffers ->
    intra-node NCCL broadcast. Gradients travel as float32.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4  # float32
    n_elems_total = _memory_utility.count_grad_elements(params,
                                                        zero_fill)
    # Pad so each inter-node rank owns an equal chunk.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_elems_buffer = n_elems_per_node * self.inter_size
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(
        params, 'grad', self.gpu_buffer_a, allreduce_grad_dtype,
        zero_fill)
    if chainer.is_debug():
        # Sync so host-side validation sees the completed pack.
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self._check_ready_to_allreduce(array_a, array_b)
    # Intra-node reduce
    self.intra_nccl_comm.reduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0, stream.ptr)
    # Inter-node allreduce
    if self.intra_rank == 0:
        self.cpu_buffer_a.assign(n_bytes_buffer)
        self.cpu_buffer_b.assign(n_bytes_buffer)
        arr_b = self.gpu_buffer_b.array(n_elems_buffer)
        # Stage device data on the host for the MPI exchange.
        arr_b.data.copy_to_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)
        self.inter_mpi_comm.Alltoall(
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT],
            [self.cpu_buffer_a.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        # Reduction in GPU
        arr_a = self.gpu_buffer_a.array(n_elems_buffer)
        arr_a.data.copy_from_host(self.cpu_buffer_a.ptr(),
                                  n_bytes_buffer)
        # Sum this leader's chunk across nodes, then scale to a mean.
        arr_a = arr_a.reshape(self.inter_size, n_elems_per_node)
        arr_a = arr_a.sum(axis=0)
        arr_a *= 1.0 / self.size
        arr_a.data.copy_to_host(self.cpu_buffer_a.ptr(),
                                n_bytes_per_node)
        self.inter_mpi_comm.Allgather(
            [self.cpu_buffer_a.buffer(n_bytes_per_node),
             mpi4py.MPI.FLOAT],
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        arr_b.data.copy_from_host(self.cpu_buffer_b.ptr(),
                                  n_bytes_buffer)
    # Intra-node bcast
    self.intra_nccl_comm.bcast(
        self.gpu_buffer_b.ptr(), n_elems_total, nccl.NCCL_FLOAT, 0,
        stream.ptr)
    if chainer.is_debug():
        stream.synchronize()
        self._ensure_all_finite(self.gpu_buffer_b.array(n_elems_total))
    _memory_utility.unpack_params(
        params, 'grad', self.gpu_buffer_b, allreduce_grad_dtype,
        zero_fill)