def sbmv(k, alpha, a, x, beta, y, lower=False):
    """Computes y = alpha * A @ x + beta * y
    """
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.ssbmv
    elif dtype == 'd':
        func = cublas.dsbmv
    else:
        raise TypeError('Complex dtypes not supported')
    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    m, n = a.shape
    assert x.shape[0] == n
    assert y.shape[0] == n
    if not a._f_contiguous:
        a = a.copy(order='F')
    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
        alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
        beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)
    if lower:
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:
        uplo = cublas.CUBLAS_FILL_MODE_UPPER
    try:
        func(handle, uplo, n, k, alpha_ptr, a.data.ptr, m,
             x.data.ptr, 1, beta_ptr, y.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, orig_mode)
    return y
def nrm2(x, out=None):
    """Computes the Euclidean norm of vector x."""
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))
    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.snrm2
    elif dtype == 'd':
        func = cublas.dnrm2
    elif dtype == 'F':
        func = cublas.scnrm2
    elif dtype == 'D':
        func = cublas.dznrm2
    else:
        raise TypeError('invalid dtype')
    handle = device.get_cublas_handle()
    result_dtype = dtype.lower()
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, result_ptr)
    finally:
        cublas.setPointerMode(handle, orig_mode)
    if out is None:
        out = result
    elif out.dtype != result_dtype:
        _core.elementwise_copy(result, out)
    return out
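# Usage sketch (an illustrative assumption, not part of the wrapper above):
# calling nrm2 with its module-level imports in scope and checking the
# result against cupy.linalg.norm.
import cupy

x = cupy.arange(5, dtype=cupy.float64)
r = nrm2(x)  # result of ||x||_2
assert cupy.allclose(r, cupy.linalg.norm(x))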
def dotc(x, y, out=None):
    """Computes the dot product of x.conj() and y."""
    dtype = x.dtype.char
    if dtype in 'fd':
        return dot(x, y, out=out)
    elif dtype == 'F':
        func = cublas.cdotc
    elif dtype == 'D':
        func = cublas.zdotc
    else:
        raise TypeError('invalid dtype')
    _check_two_vectors(x, y)
    handle = device.get_cublas_handle()
    result_dtype = dtype
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, y.data.ptr, 1, result_ptr)
    finally:
        cublas.setPointerMode(handle, orig_mode)
    if out is None:
        out = result
    elif out.dtype != result_dtype:
        _core.elementwise_copy(result, out)
    return out
def _iamaxmin(x, out, name):
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))
    dtype = x.dtype.char
    if dtype == 'f':
        t = 's'
    elif dtype == 'd':
        t = 'd'
    elif dtype == 'F':
        t = 'c'
    elif dtype == 'D':
        t = 'z'
    else:
        raise TypeError('invalid dtype')
    func = getattr(cublas, 'i' + t + name)
    handle = device.get_cublas_handle()
    result_dtype = 'i'
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, result_ptr)
    finally:
        cublas.setPointerMode(handle, orig_mode)
    if out is None:
        out = result
    elif out.dtype != result_dtype:
        _core.elementwise_copy(result, out)
    return out
def axpy(a, x, y):
    """Computes y += a * x.

    (*) y will be updated.
    """
    _check_two_vectors(x, y)
    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.saxpy
    elif dtype == 'd':
        func = cublas.daxpy
    elif dtype == 'F':
        func = cublas.caxpy
    elif dtype == 'D':
        func = cublas.zaxpy
    else:
        raise TypeError('invalid dtype')
    handle = device.get_cublas_handle()
    a, a_ptr, orig_mode = _setup_scalar_ptr(handle, a, dtype)
    try:
        func(handle, x.size, a_ptr, x.data.ptr, 1, y.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, orig_mode)
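# Usage sketch (illustrative only): axpy updates y in place, y += a * x.
import cupy

x = cupy.arange(4, dtype=cupy.float32)
y = cupy.ones(4, dtype=cupy.float32)
axpy(2.0, x, y)  # no return value; y now holds 2 * x + 1
assert cupy.allclose(y, 2.0 * x + 1.0)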
def dot(x, y, out=None):
    """Computes the dot product of x and y."""
    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.sdot
    elif dtype == 'd':
        func = cublas.ddot
    elif dtype in 'FD':
        raise TypeError('Use dotu() or dotc() for complex dtype')
    else:
        raise TypeError('invalid dtype')
    _check_two_vectors(x, y)
    handle = device.get_cublas_handle()
    result_dtype = dtype
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, y.data.ptr, 1, result_ptr)
    finally:
        cublas.setPointerMode(handle, orig_mode)
    if out is None:
        out = result
    elif out.dtype != result_dtype:
        out[...] = result
    return out
def _make_compute_hu(V):
    handle = device.get_cublas_handle()
    if V.dtype.char == 'f':
        gemv = _cublas.sgemv
    elif V.dtype.char == 'd':
        gemv = _cublas.dgemv
    elif V.dtype.char == 'F':
        gemv = _cublas.cgemv
    elif V.dtype.char == 'D':
        gemv = _cublas.zgemv
    n = V.shape[0]
    one = numpy.array(1.0, V.dtype)
    zero = numpy.array(0.0, V.dtype)
    mone = numpy.array(-1.0, V.dtype)

    def compute_hu(u, j):
        # h = V[:, :j+1].conj().T @ u
        # u -= V[:, :j+1] @ h
        h = cupy.empty((j + 1, ), dtype=V.dtype)
        gemv(handle, _cublas.CUBLAS_OP_C, n, j + 1, one.ctypes.data,
             V.data.ptr, n, u.data.ptr, 1, zero.ctypes.data, h.data.ptr, 1)
        gemv(handle, _cublas.CUBLAS_OP_N, n, j + 1, mone.ctypes.data,
             V.data.ptr, n, h.data.ptr, 1, one.ctypes.data, u.data.ptr, 1)
        return h, u

    return compute_hu
def asum(x, out=None):
    """Computes the sum of the absolute values of x."""
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))
    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.sasum
    elif dtype == 'd':
        func = cublas.dasum
    elif dtype == 'F':
        func = cublas.scasum
    elif dtype == 'D':
        func = cublas.dzasum
    else:
        raise TypeError('invalid dtype')
    handle = device.get_cublas_handle()
    result_dtype = dtype.lower()
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, result_ptr)
    finally:
        cublas.setPointerMode(handle, orig_mode)
    if out is None:
        out = result
    elif out.dtype != result_dtype:
        out[...] = result
    return out
def _batched_inv(a):
    assert a.ndim >= 3
    _util._assert_cupy_array(a)
    _util._assert_stacked_square(a)
    dtype, out_dtype = _util.linalg_common_type(a)

    if dtype == cupy.float32:
        getrf = cupy.cuda.cublas.sgetrfBatched
        getri = cupy.cuda.cublas.sgetriBatched
    elif dtype == cupy.float64:
        getrf = cupy.cuda.cublas.dgetrfBatched
        getri = cupy.cuda.cublas.dgetriBatched
    elif dtype == cupy.complex64:
        getrf = cupy.cuda.cublas.cgetrfBatched
        getri = cupy.cuda.cublas.cgetriBatched
    elif dtype == cupy.complex128:
        getrf = cupy.cuda.cublas.zgetrfBatched
        getri = cupy.cuda.cublas.zgetriBatched
    else:
        msg = ('dtype must be float32, float64, complex64 or complex128'
               ' (actual: {})'.format(a.dtype))
        raise ValueError(msg)

    if 0 in a.shape:
        return cupy.empty_like(a, dtype=out_dtype)
    a_shape = a.shape

    # copy is necessary to prevent `a` from being overwritten.
    a = a.astype(dtype, order='C').reshape(-1, a_shape[-2], a_shape[-1])

    handle = device.get_cublas_handle()
    batch_size = a.shape[0]
    n = a.shape[1]
    lda = n

    step = n * lda * a.itemsize
    start = a.data.ptr
    stop = start + step * batch_size
    a_array = cupy.arange(start, stop, step, dtype=cupy.uintp)

    pivot_array = cupy.empty((batch_size, n), dtype=cupy.int32)
    info_array = cupy.empty((batch_size, ), dtype=cupy.int32)

    getrf(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr,
          info_array.data.ptr, batch_size)
    cupy.linalg._util._check_cublas_info_array_if_synchronization_allowed(
        getrf, info_array)

    c = cupy.empty_like(a)
    ldc = lda

    step = n * ldc * c.itemsize
    start = c.data.ptr
    stop = start + step * batch_size
    c_array = cupy.arange(start, stop, step, dtype=cupy.uintp)

    getri(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr,
          c_array.data.ptr, ldc, info_array.data.ptr, batch_size)
    cupy.linalg._util._check_cublas_info_array_if_synchronization_allowed(
        getri, info_array)

    return c.reshape(a_shape).astype(out_dtype, copy=False)
def scal(a, x):
    """Computes x *= a.

    (*) x will be updated.
    """
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))
    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.sscal
    elif dtype == 'd':
        func = cublas.dscal
    elif dtype == 'F':
        func = cublas.cscal
    elif dtype == 'D':
        func = cublas.zscal
    else:
        raise TypeError('invalid dtype')
    handle = device.get_cublas_handle()
    a, a_ptr, orig_mode = _setup_scalar_ptr(handle, a, dtype)
    try:
        func(handle, x.size, a_ptr, x.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, orig_mode)
def gerc(alpha, x, y, a):
    """Computes a += alpha * x @ y.T.conj()

    Note: ''a'' will be updated.
    """
    dtype = a.dtype.char
    if dtype in 'fd':
        return ger(alpha, x, y, a)
    elif dtype == 'F':
        func = cublas.cgerc
    elif dtype == 'D':
        func = cublas.zgerc
    else:
        raise TypeError('invalid dtype')
    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    m, n = a.shape
    assert x.shape[0] == m
    assert y.shape[0] == n
    handle = device.get_cublas_handle()
    alpha, alpha_ptr, orig_mode = _setup_scalar_ptr(handle, alpha, dtype)
    x_ptr, y_ptr = x.data.ptr, y.data.ptr
    try:
        if a._f_contiguous:
            func(handle, m, n, alpha_ptr, x_ptr, 1, y_ptr, 1, a.data.ptr, m)
        else:
            aa = a.copy(order='F')
            func(handle, m, n, alpha_ptr, x_ptr, 1, y_ptr, 1, aa.data.ptr, m)
            _core.elementwise_copy(aa, a)
    finally:
        cublas.setPointerMode(handle, orig_mode)
def dgmm(side, a, x, out=None, incx=1):
    """Computes diag(x) @ a or a @ diag(x)

    Computes diag(x) @ a if side is 'L', a @ diag(x) if side is 'R'.
    """
    assert a.ndim == 2
    assert 0 <= x.ndim <= 2
    assert a.dtype == x.dtype
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.sdgmm
    elif dtype == 'd':
        func = cublas.ddgmm
    elif dtype == 'F':
        func = cublas.cdgmm
    elif dtype == 'D':
        func = cublas.zdgmm
    else:
        raise TypeError('invalid dtype')
    if side == 'L' or side == cublas.CUBLAS_SIDE_LEFT:
        side = cublas.CUBLAS_SIDE_LEFT
    elif side == 'R' or side == cublas.CUBLAS_SIDE_RIGHT:
        side = cublas.CUBLAS_SIDE_RIGHT
    else:
        raise ValueError('invalid side (actual: {})'.format(side))
    m, n = a.shape
    if side == cublas.CUBLAS_SIDE_LEFT:
        assert x.size >= (m - 1) * abs(incx) + 1
    else:
        assert x.size >= (n - 1) * abs(incx) + 1
    if out is None:
        if a._c_contiguous:
            order = 'C'
        else:
            order = 'F'
        out = cupy.empty((m, n), dtype=dtype, order=order)
    else:
        assert out.ndim == 2
        assert out.shape == a.shape
        assert out.dtype == a.dtype
    handle = device.get_cublas_handle()
    if out._c_contiguous:
        if not a._c_contiguous:
            a = a.copy(order='C')
        func(handle, 1 - side, n, m, a.data.ptr, n, x.data.ptr, incx,
             out.data.ptr, n)
    else:
        if not a._f_contiguous:
            a = a.copy(order='F')
        c = out
        if not out._f_contiguous:
            c = out.copy(order='F')
        func(handle, side, m, n, a.data.ptr, m, x.data.ptr, incx,
             c.data.ptr, m)
        if not out._f_contiguous:
            _core.elementwise_copy(c, out)
    return out
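# Usage sketch (illustrative only): with side='L', dgmm scales the rows of
# `a` by the entries of `x`, i.e. out = diag(x) @ a.
import cupy

a = cupy.arange(6, dtype=cupy.float64).reshape(2, 3)
x = cupy.array([10.0, 20.0])
out = dgmm('L', a, x)
assert cupy.allclose(out, cupy.diag(x) @ a)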
def solve(a, b): """Solves a linear matrix equation. It computes the exact solution of ``x`` in ``ax = b``, where ``a`` is a square and full rank matrix. Args: a (cupy.ndarray): The matrix with dimension ``(..., M, M)``. b (cupy.ndarray): The matrix with dimension ``(...,M)`` or ``(..., M, K)``. Returns: cupy.ndarray: The matrix with dimension ``(..., M)`` or ``(..., M, K)``. .. warning:: This function calls one or more cuSOLVER routine(s) which may yield invalid results if input conditions are not met. To detect these invalid results, you can set the `linalg` configuration to a value that is not `ignore` in :func:`cupyx.errstate` or :func:`cupyx.seterr`. .. seealso:: :func:`numpy.linalg.solve` """ # NOTE: Since cusolver in CUDA 8.0 does not support gesv, # we manually solve a linear system with QR decomposition. # For details, please see the following: # https://docs.nvidia.com/cuda/cusolver/index.html#qr_examples util._assert_cupy_array(a, b) util._assert_nd_squareness(a) if not ((a.ndim == b.ndim or a.ndim == b.ndim + 1) and a.shape[:-1] == b.shape[:a.ndim - 1]): raise ValueError( 'a must have (..., M, M) shape and b must have (..., M) ' 'or (..., M, K)') # Cast to float32 or float64 if a.dtype.char == 'f' or a.dtype.char == 'd': dtype = a.dtype else: dtype = numpy.find_common_type((a.dtype.char, 'f'), ()) cublas_handle = device.get_cublas_handle() cusolver_handle = device.get_cusolver_handle() a = a.astype(dtype) b = b.astype(dtype) if a.ndim == 2: return _solve(a, b, cublas_handle, cusolver_handle) x = cupy.empty_like(b) shape = a.shape[:-2] for i in six.moves.range(numpy.prod(shape)): index = numpy.unravel_index(i, shape) x[index] = _solve(a[index], b[index], cublas_handle, cusolver_handle) return x
def solve(a, b): """Solves a linear matrix equation. It computes the exact solution of ``x`` in ``ax = b``, where ``a`` is a square and full rank matrix. Args: a (cupy.ndarray): The matrix with dimension ``(..., M, M)``. b (cupy.ndarray): The matrix with dimension ``(...,M)`` or ``(..., M, K)``. Returns: cupy.ndarray: The matrix with dimension ``(..., M)`` or ``(..., M, K)``. .. seealso:: :func:`numpy.linalg.solve` """ # NOTE: Since cusolver in CUDA 8.0 does not support gesv, # we manually solve a linear system with QR decomposition. # For details, please see the following: # https://docs.nvidia.com/cuda/cusolver/index.html#qr_examples if not cuda.cusolver_enabled: raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0') util._assert_cupy_array(a, b) util._assert_nd_squareness(a) if not ((a.ndim == b.ndim or a.ndim == b.ndim + 1) and a.shape[:-1] == b.shape[:a.ndim - 1]): raise ValueError( 'a must have (..., M, M) shape and b must have (..., M) ' 'or (..., M, K)') # Cast to float32 or float64 if a.dtype.char == 'f' or a.dtype.char == 'd': dtype = a.dtype else: dtype = numpy.find_common_type((a.dtype.char, 'f'), ()) cublas_handle = device.get_cublas_handle() cusolver_handle = device.get_cusolver_handle() a = a.astype(dtype) b = b.astype(dtype) if a.ndim == 2: return _solve(a, b, cublas_handle, cusolver_handle) x = cupy.empty_like(b) shape = a.shape[:-2] for i in six.moves.range(numpy.prod(shape)): index = numpy.unravel_index(i, shape) x[index] = _solve(a[index], b[index], cublas_handle, cusolver_handle) return x
def cupy_trsm_wrapper(a, b):
    cublas_handle = device.get_cublas_handle()
    a = cp.array(a, dtype=np.float64, order='F')
    b = cp.array(b, dtype=np.float64, order='F')
    trsm = cublas.dtrsm
    uplo = cublas.CUBLAS_FILL_MODE_LOWER
    trans = cublas.CUBLAS_OP_T
    side = cublas.CUBLAS_SIDE_RIGHT
    diag = cublas.CUBLAS_DIAG_NON_UNIT
    m, n = (b.size, 1) if b.ndim == 1 else b.shape
    trsm(cublas_handle, side, uplo, trans, diag, m, n, 1.0,
         a.data.ptr, m, b.data.ptr, m)
    return b
def __init__(self, A, V, alpha, beta, update_impl='fast'):
    assert A.ndim == V.ndim == 2
    assert alpha.ndim == beta.ndim == 1
    assert A.dtype == V.dtype == alpha.dtype
    assert A.dtype.char.lower() == beta.dtype.char
    assert A.shape[0] == A.shape[1] == V.shape[1]
    assert V.shape[0] == alpha.shape[0] == beta.shape[0]

    self.A = A
    self.V = V
    self.alpha = alpha
    self.beta = beta
    self.n = V.shape[1]
    self.ncv = V.shape[0]
    self.update_impl = update_impl
    if self.update_impl != 'fast':
        return

    self.cublas_handle = device.get_cublas_handle()
    self.cublas_pointer_mode = _cublas.getPointerMode(self.cublas_handle)
    if A.dtype.char == 'f':
        self.dotc = _cublas.sdot
        self.nrm2 = _cublas.snrm2
        self.gemm = _cublas.sgemm
    elif A.dtype.char == 'd':
        self.dotc = _cublas.ddot
        self.nrm2 = _cublas.dnrm2
        self.gemm = _cublas.dgemm
    elif A.dtype.char == 'F':
        self.dotc = _cublas.cdotc
        self.nrm2 = _cublas.scnrm2
        self.gemm = _cublas.cgemm
    elif A.dtype.char == 'D':
        self.dotc = _cublas.zdotc
        self.nrm2 = _cublas.dznrm2
        self.gemm = _cublas.zgemm
    else:
        raise TypeError('invalid dtype ({})'.format(A.dtype))

    if csr.isspmatrix_csr(A) and cusparse.check_availability('spmv'):
        self.cusparse_handle = device.get_cusparse_handle()
        self.spmv_op_a = _cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        self.spmv_alpha = numpy.array(1.0, A.dtype)
        self.spmv_beta = numpy.array(0.0, A.dtype)
        self.spmv_cuda_dtype = cusparse._dtype_to_DataType(A.dtype)
        self.spmv_alg = _cusparse.CUSPARSE_MV_ALG_DEFAULT
    else:
        self.cusparse_handle = None

    self.v = cupy.empty((self.n, ), dtype=A.dtype)
    self.u = cupy.empty((self.n, ), dtype=A.dtype)
    self.uu = cupy.empty((self.ncv, ), dtype=A.dtype)
def _solve(a, b):
    a = cupy.asfortranarray(a)
    b = cupy.asfortranarray(b)
    dtype = a.dtype
    m, k = (b.size, 1) if b.ndim == 1 else b.shape
    cusolver_handle = device.get_cusolver_handle()
    cublas_handle = device.get_cublas_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    if dtype == 'f':
        geqrf = cusolver.sgeqrf
        geqrf_bufferSize = cusolver.sgeqrf_bufferSize
        ormqr = cusolver.sormqr
        trsm = cublas.strsm
    else:  # dtype == 'd'
        geqrf = cusolver.dgeqrf
        geqrf_bufferSize = cusolver.dgeqrf_bufferSize
        ormqr = cusolver.dormqr
        trsm = cublas.dtrsm

    # 1. QR decomposition (A = Q * R)
    buffersize = geqrf_bufferSize(cusolver_handle, m, m, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    tau = cupy.empty(m, dtype=dtype)
    geqrf(cusolver_handle, m, m, a.data.ptr, m, tau.data.ptr,
          workspace.data.ptr, buffersize, dev_info.data.ptr)
    _check_status(dev_info)

    # 2. ormqr (Q^T * B)
    ormqr(cusolver_handle, cublas.CUBLAS_SIDE_LEFT, cublas.CUBLAS_OP_T,
          m, k, m, a.data.ptr, m, tau.data.ptr, b.data.ptr, m,
          workspace.data.ptr, buffersize, dev_info.data.ptr)
    _check_status(dev_info)

    # 3. trsm (X = R^{-1} * (Q^T * B))
    trsm(cublas_handle, cublas.CUBLAS_SIDE_LEFT,
         cublas.CUBLAS_FILL_MODE_UPPER, cublas.CUBLAS_OP_N,
         cublas.CUBLAS_DIAG_NON_UNIT, m, k, 1, a.data.ptr, m, b.data.ptr, m)
    return b
def ger(alpha, x, y, a):
    """Computes a += alpha * x @ y.T

    Note: ''a'' will be updated.
    """
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.sger
    elif dtype == 'd':
        func = cublas.dger
    elif dtype in 'FD':
        raise TypeError('Use geru or gerc for complex dtypes')
    else:
        raise TypeError('invalid dtype')
    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    m, n = a.shape
    assert x.shape[0] == m
    assert y.shape[0] == n
    handle = device.get_cublas_handle()
    alpha, alpha_ptr, orig_mode = _setup_scalar_ptr(handle, alpha, dtype)
    x_ptr, y_ptr = x.data.ptr, y.data.ptr
    try:
        if a._f_contiguous:
            func(handle, m, n, alpha_ptr, x_ptr, 1, y_ptr, 1, a.data.ptr, m)
        elif a._c_contiguous:
            func(handle, n, m, alpha_ptr, y_ptr, 1, x_ptr, 1, a.data.ptr, n)
        else:
            aa = a.copy(order='F')
            func(handle, m, n, alpha_ptr, x_ptr, 1, y_ptr, 1, aa.data.ptr, m)
            a[...] = aa
    finally:
        cublas.setPointerMode(handle, orig_mode)
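# Usage sketch (illustrative only): ger performs the rank-1 update
# a += alpha * outer(x, y) in place.
import cupy

x = cupy.array([1.0, 2.0, 3.0])
y = cupy.array([4.0, 5.0])
a = cupy.zeros((3, 2))
ger(1.0, x, y, a)
assert cupy.allclose(a, cupy.outer(x, y))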
def _lu_factor(a_t, dtype):
    """Compute pivoted LU decomposition.

    Decompose a given batch of square matrices. Inputs and outputs are
    transposed.

    Args:
        a_t (cupy.ndarray): The input matrix with dimension ``(..., N, N)``.
            The dimension condition is not checked.
        dtype (numpy.dtype): float32, float64, complex64, or complex128.

    Returns:
        lu_t (cupy.ndarray): ``L`` without its unit diagonal and ``U`` with
            dimension ``(..., N, N)``.
        piv (cupy.ndarray): 1-origin pivot indices with dimension
            ``(..., N)``.
        dev_info (cupy.ndarray): ``getrf`` info with dimension ``(...)``.

    .. seealso:: :func:`scipy.linalg.lu_factor`
    """
    orig_shape = a_t.shape
    n = orig_shape[-2]

    # copy is necessary to prevent `a` from being overwritten.
    a_t = a_t.astype(dtype, order='C').reshape(-1, n, n)

    batch_size = a_t.shape[0]
    ipiv = cupy.empty((batch_size, n), dtype=numpy.int32)
    dev_info = cupy.empty((batch_size, ), dtype=numpy.int32)

    # Heuristic condition from some performance test.
    # TODO(kataoka): autotune
    use_batched = batch_size * 65536 >= n * n

    if use_batched:
        handle = device.get_cublas_handle()
        lda = n
        step = n * lda * a_t.itemsize
        start = a_t.data.ptr
        stop = start + step * batch_size
        a_array = cupy.arange(start, stop, step, dtype=cupy.uintp)
        if dtype == numpy.float32:
            getrfBatched = cupy.cuda.cublas.sgetrfBatched
        elif dtype == numpy.float64:
            getrfBatched = cupy.cuda.cublas.dgetrfBatched
        elif dtype == numpy.complex64:
            getrfBatched = cupy.cuda.cublas.cgetrfBatched
        elif dtype == numpy.complex128:
            getrfBatched = cupy.cuda.cublas.zgetrfBatched
        else:
            assert False
        getrfBatched(handle, n, a_array.data.ptr, lda, ipiv.data.ptr,
                     dev_info.data.ptr, batch_size)
    else:
        handle = device.get_cusolver_handle()
        if dtype == numpy.float32:
            getrf_bufferSize = cusolver.sgetrf_bufferSize
            getrf = cusolver.sgetrf
        elif dtype == numpy.float64:
            getrf_bufferSize = cusolver.dgetrf_bufferSize
            getrf = cusolver.dgetrf
        elif dtype == numpy.complex64:
            getrf_bufferSize = cusolver.cgetrf_bufferSize
            getrf = cusolver.cgetrf
        elif dtype == numpy.complex128:
            getrf_bufferSize = cusolver.zgetrf_bufferSize
            getrf = cusolver.zgetrf
        else:
            assert False
        for i in range(batch_size):
            a_ptr = a_t[i].data.ptr
            buffersize = getrf_bufferSize(handle, n, n, a_ptr, n)
            workspace = cupy.empty(buffersize, dtype=dtype)
            getrf(handle, n, n, a_ptr, n, workspace.data.ptr,
                  ipiv[i].data.ptr, dev_info[i].data.ptr)

    return (
        a_t.reshape(orig_shape),
        ipiv.reshape(orig_shape[:-1]),
        dev_info.reshape(orig_shape[:-2]),
    )
def solve_triangular(a, b, trans=0, lower=False, unit_diagonal=False,
                     overwrite_b=False, check_finite=False):
    """Solve the equation a x = b for x, assuming a is a triangular matrix.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(M, M)``.
        b (cupy.ndarray): The matrix with dimension ``(M,)`` or ``(M, N)``.
        lower (bool): Use only data contained in the lower triangle of ``a``.
            Default is to use upper triangle.
        trans (0, 1, 2, 'N', 'T' or 'C'): Type of system to solve:

            - *'0'* or *'N'* -- :math:`a x = b`
            - *'1'* or *'T'* -- :math:`a^T x = b`
            - *'2'* or *'C'* -- :math:`a^H x = b`

        unit_diagonal (bool): If ``True``, diagonal elements of ``a`` are
            assumed to be 1 and will not be referenced.
        overwrite_b (bool): Allow overwriting data in b (may enhance
            performance)
        check_finite (bool): Whether to check that the input matrices contain
            only finite numbers. Disabling may give a performance gain, but
            may result in problems (crashes, non-termination) if the inputs
            do contain infinities or NaNs.

    Returns:
        cupy.ndarray: The matrix with dimension ``(M,)`` or ``(M, N)``.

    .. seealso:: :func:`scipy.linalg.solve_triangular`
    """
    _util._assert_cupy_array(a, b)

    if len(a.shape) != 2 or a.shape[0] != a.shape[1]:
        raise ValueError('expected square matrix')
    if len(a) != len(b):
        raise ValueError('incompatible dimensions')

    # Cast to float32 or float64
    if a.dtype.char in 'fd':
        dtype = a.dtype
    else:
        dtype = numpy.promote_types(a.dtype.char, 'f')

    a = cupy.array(a, dtype=dtype, order='F', copy=False)
    b = cupy.array(b, dtype=dtype, order='F', copy=(not overwrite_b))

    if check_finite:
        if a.dtype.kind == 'f' and not cupy.isfinite(a).all():
            raise ValueError('array must not contain infs or NaNs')
        if b.dtype.kind == 'f' and not cupy.isfinite(b).all():
            raise ValueError('array must not contain infs or NaNs')

    m, n = (b.size, 1) if b.ndim == 1 else b.shape
    cublas_handle = device.get_cublas_handle()

    if dtype == 'f':
        trsm = cublas.strsm
    elif dtype == 'd':
        trsm = cublas.dtrsm
    elif dtype == 'F':
        trsm = cublas.ctrsm
    else:  # dtype == 'D'
        trsm = cublas.ztrsm
    one = numpy.array(1, dtype=dtype)

    if lower:
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    if trans == 'N':
        trans = cublas.CUBLAS_OP_N
    elif trans == 'T':
        trans = cublas.CUBLAS_OP_T
    elif trans == 'C':
        trans = cublas.CUBLAS_OP_C

    if unit_diagonal:
        diag = cublas.CUBLAS_DIAG_UNIT
    else:
        diag = cublas.CUBLAS_DIAG_NON_UNIT

    trsm(cublas_handle, cublas.CUBLAS_SIDE_LEFT, uplo, trans, diag, m, n,
         one.ctypes.data, a.data.ptr, m, b.data.ptr, m)
    return b
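# Usage sketch: this routine mirrors scipy.linalg.solve_triangular (it is
# publicly exposed as cupyx.scipy.linalg.solve_triangular; the surrounding
# package layout is assumed here).
import cupy

a = cupy.array([[3.0, 0.0, 0.0],
                [2.0, 1.0, 0.0],
                [1.0, 0.0, 1.0]])
b = cupy.array([6.0, 4.0, 2.0])
x = solve_triangular(a, b, lower=True)  # only the lower triangle is read
assert cupy.allclose(a @ x, b)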
def _lanczos_fast(A, n, ncv):
    cublas_handle = device.get_cublas_handle()
    cublas_pointer_mode = _cublas.getPointerMode(cublas_handle)
    if A.dtype.char == 'f':
        dotc = _cublas.sdot
        nrm2 = _cublas.snrm2
        gemm = _cublas.sgemm
    elif A.dtype.char == 'd':
        dotc = _cublas.ddot
        nrm2 = _cublas.dnrm2
        gemm = _cublas.dgemm
    elif A.dtype.char == 'F':
        dotc = _cublas.cdotc
        nrm2 = _cublas.scnrm2
        gemm = _cublas.cgemm
    elif A.dtype.char == 'D':
        dotc = _cublas.zdotc
        nrm2 = _cublas.dznrm2
        gemm = _cublas.zgemm
    else:
        raise TypeError('invalid dtype ({})'.format(A.dtype))

    cusparse_handle = None
    if csr.isspmatrix_csr(A) and cusparse.check_availability('spmv'):
        cusparse_handle = device.get_cusparse_handle()
        spmv_op_a = _cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        spmv_alpha = numpy.array(1.0, A.dtype)
        spmv_beta = numpy.array(0.0, A.dtype)
        spmv_cuda_dtype = _dtype.to_cuda_dtype(A.dtype)
        spmv_alg = _cusparse.CUSPARSE_MV_ALG_DEFAULT

    v = cupy.empty((n, ), dtype=A.dtype)
    uu = cupy.empty((ncv, ), dtype=A.dtype)
    one = numpy.array(1.0, dtype=A.dtype)
    zero = numpy.array(0.0, dtype=A.dtype)
    mone = numpy.array(-1.0, dtype=A.dtype)

    outer_A = A

    def aux(A, V, u, alpha, beta, i_start, i_end):
        assert A is outer_A

        beta_eps = inversion_eps(A.dtype)

        # Get ready for spmv if enabled
        if cusparse_handle is not None:
            # Note: I would like to reuse descriptors and working buffer
            # on the next update, but I gave it up because it sometimes
            # caused illegal memory access error.
            spmv_desc_A = cusparse.SpMatDescriptor.create(A)
            spmv_desc_v = cusparse.DnVecDescriptor.create(v)
            spmv_desc_u = cusparse.DnVecDescriptor.create(u)
            buff_size = _cusparse.spMV_bufferSize(
                cusparse_handle, spmv_op_a, spmv_alpha.ctypes.data,
                spmv_desc_A.desc, spmv_desc_v.desc, spmv_beta.ctypes.data,
                spmv_desc_u.desc, spmv_cuda_dtype, spmv_alg)
            spmv_buff = cupy.empty(buff_size, cupy.int8)

        v[...] = V[i_start]
        for i in range(i_start, i_end):
            # Matrix-vector multiplication
            if cusparse_handle is None:
                u[...] = A @ v
            else:
                _cusparse.spMV(
                    cusparse_handle, spmv_op_a, spmv_alpha.ctypes.data,
                    spmv_desc_A.desc, spmv_desc_v.desc,
                    spmv_beta.ctypes.data, spmv_desc_u.desc,
                    spmv_cuda_dtype, spmv_alg, spmv_buff.data.ptr)

            # Call dotc
            _cublas.setPointerMode(
                cublas_handle, _cublas.CUBLAS_POINTER_MODE_DEVICE)
            try:
                dotc(cublas_handle, n, v.data.ptr, 1, u.data.ptr, 1,
                     alpha.data.ptr + i * alpha.itemsize)
            finally:
                _cublas.setPointerMode(cublas_handle, cublas_pointer_mode)

            # Orthogonalize
            gemm(cublas_handle, _cublas.CUBLAS_OP_C, _cublas.CUBLAS_OP_N,
                 1, i + 1, n, one.ctypes.data, u.data.ptr, n,
                 V.data.ptr, n, zero.ctypes.data, uu.data.ptr, 1)
            gemm(cublas_handle, _cublas.CUBLAS_OP_N, _cublas.CUBLAS_OP_C,
                 n, 1, i + 1, mone.ctypes.data, V.data.ptr, n,
                 uu.data.ptr, 1, one.ctypes.data, u.data.ptr, n)

            # Call nrm2
            _cublas.setPointerMode(
                cublas_handle, _cublas.CUBLAS_POINTER_MODE_DEVICE)
            try:
                nrm2(cublas_handle, n, u.data.ptr, 1,
                     beta.data.ptr + i * beta.itemsize)
            finally:
                _cublas.setPointerMode(cublas_handle, cublas_pointer_mode)

            # Break here as the normalization below touches V[i+1]
            if i >= i_end - 1:
                break

            if beta[i] < beta_eps:
                V[i + 1:i_end, :] = 0
                u[...] = 0
                v[...] = 0
                break
            if i == i_start:
                beta_eps *= beta[i]  # scale eps to largest beta

            # Normalize
            _kernel_normalize(u, beta, i, n, v, V)

    return aux
def _batched_inv(a):
    assert a.ndim >= 3
    util._assert_cupy_array(a)
    util._assert_nd_squareness(a)

    if a.dtype == cupy.float32:
        getrf = cupy.cuda.cublas.sgetrfBatched
        getri = cupy.cuda.cublas.sgetriBatched
    elif a.dtype == cupy.float64:
        getrf = cupy.cuda.cublas.dgetrfBatched
        getri = cupy.cuda.cublas.dgetriBatched
    elif a.dtype == cupy.complex64:
        getrf = cupy.cuda.cublas.cgetrfBatched
        getri = cupy.cuda.cublas.cgetriBatched
    elif a.dtype == cupy.complex128:
        getrf = cupy.cuda.cublas.zgetrfBatched
        getri = cupy.cuda.cublas.zgetriBatched
    else:
        msg = ('dtype must be float32, float64, complex64 or complex128'
               ' (actual: {})'.format(a.dtype))
        raise ValueError(msg)

    if 0 in a.shape:
        return cupy.empty_like(a)
    a_shape = a.shape

    # copy is necessary to prevent `a` from being overwritten.
    a = a.copy().reshape(-1, a_shape[-2], a_shape[-1])

    handle = device.get_cublas_handle()
    batch_size = a.shape[0]
    n = a.shape[1]
    lda = n

    step = n * lda * a.itemsize
    start = a.data.ptr
    stop = start + step * batch_size
    a_array = cupy.arange(start, stop, step, dtype=cupy.uintp)

    pivot_array = cupy.empty((batch_size, n), dtype=cupy.int32)
    info_array = cupy.empty((batch_size, ), dtype=cupy.int32)

    getrf(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr,
          info_array.data.ptr, batch_size)

    err = False
    err_detail = ''
    for i in range(batch_size):
        info = info_array[i]
        if info < 0:
            err = True
            err_detail += ('\tmatrix[{}]: illegal value at {}-th parameter.'
                           '\n'.format(i, -info))
        if info > 0:
            err = True
            err_detail += '\tmatrix[{}]: matrix is singular.\n'.format(i)
    if err:
        raise RuntimeError('matrix inversion failed at getrf.\n' + err_detail)

    c = cupy.empty_like(a)
    ldc = lda

    step = n * ldc * c.itemsize
    start = c.data.ptr
    stop = start + step * batch_size
    c_array = cupy.arange(start, stop, step, dtype=cupy.uintp)

    getri(handle, n, a_array.data.ptr, lda, pivot_array.data.ptr,
          c_array.data.ptr, ldc, info_array.data.ptr, batch_size)

    for i in range(batch_size):
        info = info_array[i]
        if info > 0:
            err = True
            err_detail += '\tmatrix[{}]: matrix is singular.\n'.format(i)
    if err:
        raise RuntimeError('matrix inversion failed at getri.\n' + err_detail)

    return c.reshape(a_shape)
def solve(a, b):
    '''Solves a linear matrix equation.

    It computes the exact solution of ``x`` in ``ax = b``,
    where ``a`` is a square and full rank matrix.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(M, M)``
        b (cupy.ndarray): The vector with ``M`` elements, or
            the matrix with dimension ``(M, K)``

    Returns:
        cupy.ndarray:
            The vector with ``M`` elements, or the matrix with dimension
            ``(M, K)``.

    .. seealso:: :func:`numpy.linalg.solve`
    '''
    # NOTE: Since cusolver in CUDA 8.0 does not support gesv,
    #       we manually solve a linear system with QR decomposition.
    #       For details, please see the following:
    #       https://docs.nvidia.com/cuda/cusolver/index.html#qr_examples
    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    util._assert_cupy_array(a, b)
    util._assert_rank2(a)
    util._assert_nd_squareness(a)
    if 2 < b.ndim:
        raise linalg.LinAlgError('{}-dimensional array given. Array must be '
                                 'one or two-dimensional'.format(b.ndim))
    if len(a) != len(b):
        raise linalg.LinAlgError('The number of rows of array a must be '
                                 'the same as that of array b')

    # Cast to float32 or float64
    if a.dtype.char == 'f' or a.dtype.char == 'd':
        dtype = a.dtype.char
    else:
        dtype = numpy.find_common_type((a.dtype.char, 'f'), ()).char

    m, k = (b.size, 1) if b.ndim == 1 else b.shape
    a = a.transpose().astype(dtype, order='C', copy=True)
    b = b.transpose().astype(dtype, order='C', copy=True)
    cusolver_handle = device.get_cusolver_handle()
    cublas_handle = device.get_cublas_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    if dtype == 'f':
        geqrf = cusolver.sgeqrf
        geqrf_bufferSize = cusolver.sgeqrf_bufferSize
        ormqr = cusolver.sormqr
        trsm = cublas.strsm
    else:  # dtype == 'd'
        geqrf = cusolver.dgeqrf
        geqrf_bufferSize = cusolver.dgeqrf_bufferSize
        ormqr = cusolver.dormqr
        trsm = cublas.dtrsm

    # 1. QR decomposition (A = Q * R)
    buffersize = geqrf_bufferSize(cusolver_handle, m, m, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    tau = cupy.empty(m, dtype=dtype)
    geqrf(cusolver_handle, m, m, a.data.ptr, m, tau.data.ptr,
          workspace.data.ptr, buffersize, dev_info.data.ptr)
    _check_status(dev_info)

    # 2. ormqr (Q^T * B)
    ormqr(cusolver_handle, cublas.CUBLAS_SIDE_LEFT, cublas.CUBLAS_OP_T,
          m, k, m, a.data.ptr, m, tau.data.ptr, b.data.ptr, m,
          workspace.data.ptr, buffersize, dev_info.data.ptr)
    _check_status(dev_info)

    # 3. trsm (X = R^{-1} * (Q^T * B))
    trsm(cublas_handle, cublas.CUBLAS_SIDE_LEFT,
         cublas.CUBLAS_FILL_MODE_UPPER, cublas.CUBLAS_OP_N,
         cublas.CUBLAS_DIAG_NON_UNIT, m, k, 1, a.data.ptr, m, b.data.ptr, m)
    return b.transpose()
def gels(a, b):
    """Solves over/well/under-determined linear systems.

    Computes least-squares solution to equation ``ax = b`` by QR
    factorization using cusolverDn<t>geqrf().

    Args:
        a (cupy.ndarray): The matrix with dimension ``(M, N)``.
        b (cupy.ndarray): The matrix with dimension ``(M)`` or ``(M, K)``.

    Returns:
        cupy.ndarray: The matrix with dimension ``(N)`` or ``(N, K)``.
    """
    if a.ndim != 2:
        raise ValueError('a.ndim must be 2 (actual: {})'.format(a.ndim))
    if b.ndim == 1:
        nrhs = 1
    elif b.ndim == 2:
        nrhs = b.shape[1]
    else:
        raise ValueError('b.ndim must be 1 or 2 (actual: {})'.format(b.ndim))
    if a.shape[0] != b.shape[0]:
        raise ValueError('shape mismatch (a: {}, b: {}).'.format(
            a.shape, b.shape))
    if a.dtype != b.dtype:
        raise ValueError('dtype mismatch (a: {}, b: {}).'.format(
            a.dtype, b.dtype))

    dtype = a.dtype
    if dtype == 'f':
        t = 's'
    elif dtype == 'd':
        t = 'd'
    elif dtype == 'F':
        t = 'c'
    elif dtype == 'D':
        t = 'z'
    else:
        raise ValueError('unsupported dtype (actual: {})'.format(dtype))
    geqrf_helper = getattr(_cusolver, t + 'geqrf_bufferSize')
    geqrf = getattr(_cusolver, t + 'geqrf')
    trsm = getattr(_cublas, t + 'trsm')
    if t in 'sd':
        ormqr_helper = getattr(_cusolver, t + 'ormqr_bufferSize')
        ormqr = getattr(_cusolver, t + 'ormqr')
    else:
        ormqr_helper = getattr(_cusolver, t + 'unmqr_bufferSize')
        ormqr = getattr(_cusolver, t + 'unmqr')

    no_trans = _cublas.CUBLAS_OP_N
    if dtype.char in 'fd':
        trans = _cublas.CUBLAS_OP_T
    else:
        trans = _cublas.CUBLAS_OP_C

    m, n = a.shape
    mn_min = min(m, n)
    dev_info = _cupy.empty(1, dtype=_numpy.int32)
    tau = _cupy.empty(mn_min, dtype=dtype)
    cusolver_handle = _device.get_cusolver_handle()
    cublas_handle = _device.get_cublas_handle()
    one = _numpy.array(1.0, dtype=dtype)

    if m >= n:  # over/well-determined systems
        a = a.copy(order='F')
        b = b.copy(order='F')

        # geqrf (QR decomposition, A = Q * R)
        ws_size = geqrf_helper(cusolver_handle, m, n, a.data.ptr, m)
        workspace = _cupy.empty(ws_size, dtype=dtype)
        geqrf(cusolver_handle, m, n, a.data.ptr, m, tau.data.ptr,
              workspace.data.ptr, ws_size, dev_info.data.ptr)
        _cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
            geqrf, dev_info)

        # ormqr (Computes Q^T * B)
        ws_size = ormqr_helper(
            cusolver_handle, _cublas.CUBLAS_SIDE_LEFT, trans, m, nrhs,
            mn_min, a.data.ptr, m, tau.data.ptr, b.data.ptr, m)
        workspace = _cupy.empty(ws_size, dtype=dtype)
        ormqr(cusolver_handle, _cublas.CUBLAS_SIDE_LEFT, trans, m, nrhs,
              mn_min, a.data.ptr, m, tau.data.ptr, b.data.ptr, m,
              workspace.data.ptr, ws_size, dev_info.data.ptr)
        _cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
            ormqr, dev_info)

        # trsm (Solves R * X = (Q^T * B))
        trsm(cublas_handle, _cublas.CUBLAS_SIDE_LEFT,
             _cublas.CUBLAS_FILL_MODE_UPPER, no_trans,
             _cublas.CUBLAS_DIAG_NON_UNIT, mn_min, nrhs, one.ctypes.data,
             a.data.ptr, m, b.data.ptr, m)

        return b[:n]

    else:  # under-determined systems
        a = a.conj().T.copy(order='F')
        bb = b
        out_shape = (n, ) if b.ndim == 1 else (n, nrhs)
        b = _cupy.zeros(out_shape, dtype=dtype, order='F')
        b[:m] = bb

        # geqrf (QR decomposition, A^T = Q * R)
        ws_size = geqrf_helper(cusolver_handle, n, m, a.data.ptr, n)
        workspace = _cupy.empty(ws_size, dtype=dtype)
        geqrf(cusolver_handle, n, m, a.data.ptr, n, tau.data.ptr,
              workspace.data.ptr, ws_size, dev_info.data.ptr)
        _cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
            geqrf, dev_info)

        # trsm (Solves R^T * Z = B)
        trsm(cublas_handle, _cublas.CUBLAS_SIDE_LEFT,
             _cublas.CUBLAS_FILL_MODE_UPPER, trans,
             _cublas.CUBLAS_DIAG_NON_UNIT, m, nrhs, one.ctypes.data,
             a.data.ptr, n, b.data.ptr, n)

        # ormqr (Computes Q * Z)
        ws_size = ormqr_helper(
            cusolver_handle, _cublas.CUBLAS_SIDE_LEFT, no_trans, n, nrhs,
            mn_min, a.data.ptr, n, tau.data.ptr, b.data.ptr, n)
        workspace = _cupy.empty(ws_size, dtype=dtype)
        ormqr(cusolver_handle, _cublas.CUBLAS_SIDE_LEFT, no_trans, n, nrhs,
              mn_min, a.data.ptr, n, tau.data.ptr, b.data.ptr, n,
              workspace.data.ptr, ws_size, dev_info.data.ptr)
        _cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
            ormqr, dev_info)

        return b
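# Usage sketch (illustrative only): least-squares solve of an overdetermined
# system with gels; x minimizes ||a @ x - b||_2.
import cupy

a = cupy.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])
b = cupy.array([6.0, 0.0, 0.0])
x = gels(a, b)  # shape (2,)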
def syrk(trans, a, out=None, alpha=1.0, beta=0.0, lower=False):
    """Computes out := alpha*op1(a)*op2(a) + beta*out

    op1(a) = a if trans is 'N', op2(a) = a.T if trans is 'N'
    op1(a) = a.T if trans is 'T', op2(a) = a if trans is 'T'

    lower specifies whether the upper or lower triangular part of the
    array out is to be referenced.
    """
    assert a.ndim == 2
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.ssyrk
    elif dtype == 'd':
        func = cublas.dsyrk
    elif dtype == 'F':
        func = cublas.csyrk
    elif dtype == 'D':
        func = cublas.zsyrk
    else:
        raise TypeError('invalid dtype')

    trans = _trans_to_cublas_op(trans)
    if trans == cublas.CUBLAS_OP_N:
        n, k = a.shape
    else:
        k, n = a.shape
    if out is None:
        out = cupy.zeros((n, n), dtype=dtype, order='F')
        beta = 0.0
    else:
        assert out.ndim == 2
        assert out.shape == (n, n)
        assert out.dtype == dtype

    if lower:
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
        alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
        beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    lda, trans = _decide_ld_and_trans(a, trans)
    ldo, _ = _decide_ld_and_trans(out, trans)
    if out._c_contiguous:
        if not a._c_contiguous:
            a = a.copy(order='C')
            trans = 1 - trans
            lda = a.shape[1]
        try:
            func(handle, 1 - uplo, trans, n, k, alpha_ptr, a.data.ptr, lda,
                 beta_ptr, out.data.ptr, ldo)
        finally:
            cublas.setPointerMode(handle, orig_mode)
    else:
        if not a._f_contiguous:
            a = a.copy(order='F')
            lda = a.shape[0]
            trans = 1 - trans
        c = out
        if not out._f_contiguous:
            c = out.copy(order='F')
        try:
            func(handle, uplo, trans, n, k, alpha_ptr, a.data.ptr, lda,
                 beta_ptr, c.data.ptr, ldo)
        finally:
            cublas.setPointerMode(handle, orig_mode)
        if not out._f_contiguous:
            out[...] = c
    return out
def __init__(self, p, m, Nr, Nz, rmax, use_cuda=False):
    """
    Calculate the r (position) and nu (frequency) grid
    on which the transform will operate.

    Also store auxiliary data needed for the transform.

    Parameters:
    ------------
    p: int
        Order of the Hankel transform

    m: int
        The azimuthal mode for which the Hankel transform is calculated

    Nr, Nz: int
        Number of points in the r direction and z direction

    rmax: float
        Edge of the box in which the Hankel transform is taken
        (The function is assumed to be zero at that point.)

    use_cuda: bool, optional
        Whether to use the GPU for the Hankel transform
    """
    # Register whether to use the GPU.
    # If yes, initialize the corresponding cuda object
    self.use_cuda = use_cuda
    if (self.use_cuda == True) and (cuda_installed == False):
        self.use_cuda = False
        print('** Cuda not available for Hankel transform.')
        print('** Performing the Hankel transform on the CPU.')

    # Check that m has a valid value
    if (m in [p - 1, p, p + 1]) == False:
        raise ValueError('m must be either p-1, p or p+1')

    # Register values of the arguments
    self.p = p
    self.m = m
    self.Nr = Nr
    self.rmax = rmax
    self.Nz = Nz

    # Calculate the zeros of the Bessel function
    if m != 0:
        # In this case, 0 is a zero of the Bessel function of order m.
        # It turns out that it is needed to reconstruct the signal for p=0.
        alphas = np.hstack((np.array([0.]), jn_zeros(m, Nr - 1)))
    else:
        alphas = jn_zeros(m, Nr)

    # Calculate the spectral grid
    self.nu = 1. / (2 * np.pi * rmax) * alphas

    # Calculate the spatial grid (Uniform grid with a half-cell offset)
    self.r = (rmax * 1. / Nr) * (np.arange(Nr) + 0.5)

    # Calculate and store the inverse matrix invM
    # (imposed by the constraints on the DHT of Bessel modes)
    # NB: When compared with the FBPIC article, all the matrices here
    # are calculated in transposed form. This is done so as to use the
    # `dot` and `gemm` functions, in the `transform` method.
    self.invM = np.empty((Nr, Nr))
    if p == m:
        p_denom = p + 1
    else:
        p_denom = p
    denom = np.pi * rmax**2 * jn(p_denom, alphas)**2
    num = jn(p, 2 * np.pi * self.r[np.newaxis, :] * self.nu[:, np.newaxis])
    # Get the inverse matrix
    if m != 0:
        self.invM[1:, :] = num[1:, :] / denom[1:, np.newaxis]
        # In this case, the functions are represented by Bessel functions
        # *and* an additional mode (below) which satisfies the same
        # algebraic relations for curl/div/grad as the regular Bessel
        # modes, with the value kperp=0.
        # The normalization of this mode is arbitrary, and is chosen
        # so that the condition number of invM is close to 1
        if p == m - 1:
            self.invM[0, :] = self.r**(m - 1) * 1. / (np.pi * rmax**(m + 1))
        else:
            self.invM[0, :] = 0.
    else:
        self.invM[:, :] = num[:, :] / denom[:, np.newaxis]

    # Calculate the matrix M by inverting invM
    self.M = np.empty((Nr, Nr))
    if m != 0 and p != m - 1:
        self.M[:, 1:] = np.linalg.pinv(self.invM[1:, :])
        self.M[:, 0] = 0.
    else:
        self.M = np.linalg.inv(self.invM)

    # Copy the matrices to the GPU if needed
    if self.use_cuda:
        self.d_M = cupy.asarray(self.M)
        self.d_invM = cupy.asarray(self.invM)

    # Initialize buffer arrays to store the complex Nz x Nr grid
    # as a real 2Nz x Nr grid, before performing the matrix product
    # (This is because a matrix product of reals is faster than a matrix
    # product of complex numbers, and the real-complex conversion is
    # negligible.)
    if not self.use_cuda:
        # Initialize real buffer arrays on the CPU
        zero_array = np.zeros((2 * Nz, Nr), dtype=np.float64)
        self.array_in = zero_array.copy()
        self.array_out = zero_array.copy()
    else:
        # Initialize real buffer arrays on the GPU
        zero_array = np.zeros((2 * Nz, Nr), dtype=np.float64)
        self.d_in = cupy.asarray(zero_array)
        self.d_out = cupy.asarray(zero_array)
        # Initialize cuBLAS
        self.blas = device.get_cublas_handle()
        # Set optimal number of CUDA threads per block
        # for copy 2d real/complex (determined empirically)
        copy_tpb = (8, 32) if cuda_gpu_model == "V100" else (2, 16)
        # Initialize the threads per block and block per grid
        self.dim_grid, self.dim_block = cuda_tpb_bpg_2d(Nz, Nr, *copy_tpb)
def geam(transa, transb, alpha, a, beta, b, out=None):
    """Computes alpha * op(a) + beta * op(b)

    op(a) = a if transa is 'N', op(a) = a.T if transa is 'T',
    op(a) = a.T.conj() if transa is 'H'.
    op(b) = b if transb is 'N', op(b) = b.T if transb is 'T',
    op(b) = b.T.conj() if transb is 'H'.
    """
    assert a.ndim == b.ndim == 2
    assert a.dtype == b.dtype
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.sgeam
    elif dtype == 'd':
        func = cublas.dgeam
    elif dtype == 'F':
        func = cublas.cgeam
    elif dtype == 'D':
        func = cublas.zgeam
    else:
        raise TypeError('invalid dtype')

    transa = _trans_to_cublas_op(transa)
    transb = _trans_to_cublas_op(transb)
    if transa == cublas.CUBLAS_OP_N:
        m, n = a.shape
    else:
        n, m = a.shape
    if transb == cublas.CUBLAS_OP_N:
        assert b.shape == (m, n)
    else:
        assert b.shape == (n, m)
    if out is None:
        out = cupy.empty((m, n), dtype=dtype, order='F')
    else:
        assert out.ndim == 2
        assert out.shape == (m, n)
        assert out.dtype == dtype

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
        alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
        beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    lda, transa = _decide_ld_and_trans(a, transa)
    ldb, transb = _decide_ld_and_trans(b, transb)
    if not (lda is None or ldb is None):
        if out._f_contiguous:
            try:
                func(handle, transa, transb, m, n, alpha_ptr, a.data.ptr,
                     lda, beta_ptr, b.data.ptr, ldb, out.data.ptr, m)
            finally:
                cublas.setPointerMode(handle, orig_mode)
            return out
        elif out._c_contiguous:
            # Computes alpha * a.T + beta * b.T
            try:
                func(handle, 1 - transa, 1 - transb, n, m, alpha_ptr,
                     a.data.ptr, lda, beta_ptr, b.data.ptr, ldb,
                     out.data.ptr, n)
            finally:
                cublas.setPointerMode(handle, orig_mode)
            return out

    a, lda = _change_order_if_necessary(a, lda)
    b, ldb = _change_order_if_necessary(b, ldb)
    c = out
    if not out._f_contiguous:
        c = out.copy(order='F')
    try:
        func(handle, transa, transb, m, n, alpha_ptr, a.data.ptr, lda,
             beta_ptr, b.data.ptr, ldb, c.data.ptr, m)
    finally:
        cublas.setPointerMode(handle, orig_mode)
    if not out._f_contiguous:
        _core.elementwise_copy(c, out)
    return out
def batched_gesv(a, b):
    """Solves multiple linear matrix equations using
    cublas<t>getr[fs]Batched().

    Computes the solution to system of linear equation ``ax = b``.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(..., M, M)``.
        b (cupy.ndarray): The matrix with dimension ``(..., M)`` or
            ``(..., M, K)``.

    Returns:
        cupy.ndarray:
            The matrix with dimension ``(..., M)`` or ``(..., M, K)``.
    """
    _util._assert_cupy_array(a, b)
    _util._assert_stacked_2d(a)
    _util._assert_stacked_square(a)

    # TODO(kataoka): Support broadcast
    if not (
        (a.ndim == b.ndim or a.ndim == b.ndim + 1)
        and a.shape[:-1] == b.shape[:a.ndim - 1]
    ):
        raise ValueError(
            'a must have (..., M, M) shape and b must have (..., M) '
            'or (..., M, K)')

    dtype, out_dtype = _util.linalg_common_type(a, b)
    if b.size == 0:
        return cupy.empty(b.shape, out_dtype)

    if dtype == 'f':
        t = 's'
    elif dtype == 'd':
        t = 'd'
    elif dtype == 'F':
        t = 'c'
    elif dtype == 'D':
        t = 'z'
    else:
        raise TypeError('invalid dtype')
    getrf = getattr(cublas, t + 'getrfBatched')
    getrs = getattr(cublas, t + 'getrsBatched')

    bs = numpy.prod(a.shape[:-2]) if a.ndim > 2 else 1
    n = a.shape[-1]
    nrhs = b.shape[-1] if a.ndim == b.ndim else 1
    b_shape = b.shape
    a_data_ptr = a.data.ptr
    b_data_ptr = b.data.ptr
    a = cupy.ascontiguousarray(a.reshape(bs, n, n).transpose(0, 2, 1),
                               dtype=dtype)
    b = cupy.ascontiguousarray(b.reshape(bs, n, nrhs).transpose(0, 2, 1),
                               dtype=dtype)
    if a.data.ptr == a_data_ptr:
        a = a.copy()
    if b.data.ptr == b_data_ptr:
        b = b.copy()

    if n > get_batched_gesv_limit():
        warnings.warn('The matrix size ({}) exceeds the set limit ({})'.
                      format(n, get_batched_gesv_limit()))

    handle = device.get_cublas_handle()
    lda = n
    a_step = lda * n * a.itemsize
    a_array = cupy.arange(a.data.ptr, a.data.ptr + a_step * bs, a_step,
                          dtype=cupy.uintp)
    ldb = n
    b_step = ldb * nrhs * b.itemsize
    b_array = cupy.arange(b.data.ptr, b.data.ptr + b_step * bs, b_step,
                          dtype=cupy.uintp)
    pivot = cupy.empty((bs, n), dtype=numpy.int32)
    dinfo = cupy.empty((bs,), dtype=numpy.int32)
    info = numpy.empty((1,), dtype=numpy.int32)

    # LU factorization (A = L * U)
    getrf(handle, n, a_array.data.ptr, lda, pivot.data.ptr, dinfo.data.ptr,
          bs)
    _util._check_cublas_info_array_if_synchronization_allowed(getrf, dinfo)

    # Solves Ax = b
    getrs(handle, cublas.CUBLAS_OP_N, n, nrhs, a_array.data.ptr, lda,
          pivot.data.ptr, b_array.data.ptr, ldb, info.ctypes.data, bs)
    if info[0] != 0:
        msg = 'Error reported by {} in cuBLAS. '.format(getrs.__name__)
        if info[0] < 0:
            msg += 'The {}-th parameter had an illegal value.'.format(
                -info[0])
        raise linalg.LinAlgError(msg)

    return b.transpose(0, 2, 1).reshape(b_shape).astype(out_dtype,
                                                        copy=False)
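# Usage sketch (illustrative only): solving a batch of small systems in one
# call; a has shape (batch, M, M) and b has shape (batch, M).
import cupy

a = cupy.stack([cupy.eye(2) * (i + 1) for i in range(3)])
b = cupy.ones((3, 2))
x = batched_gesv(a, b)
assert cupy.allclose(cupy.matmul(a, x[..., None])[..., 0], b)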
def gemv(transa, alpha, a, x, beta, y):
    """Computes y = alpha * op(a) @ x + beta * y

    op(a) = a if transa is 'N', op(a) = a.T if transa is 'T',
    op(a) = a.T.conj() if transa is 'H'.

    Note: ''y'' will be updated.
    """
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.sgemv
    elif dtype == 'd':
        func = cublas.dgemv
    elif dtype == 'F':
        func = cublas.cgemv
    elif dtype == 'D':
        func = cublas.zgemv
    else:
        raise TypeError('invalid dtype')
    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype

    m, n = a.shape
    transa = _trans_to_cublas_op(transa)
    if transa == cublas.CUBLAS_OP_N:
        xlen, ylen = n, m
    else:
        xlen, ylen = m, n
    assert x.shape[0] == xlen
    assert y.shape[0] == ylen

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
        alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
        beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    try:
        if a._f_contiguous:
            func(handle, transa, m, n, alpha_ptr, a.data.ptr, m, x.data.ptr,
                 1, beta_ptr, y.data.ptr, 1)
        elif a._c_contiguous and transa != cublas.CUBLAS_OP_C:
            if transa == cublas.CUBLAS_OP_N:
                transa = cublas.CUBLAS_OP_T
            else:
                transa = cublas.CUBLAS_OP_N
            func(handle, transa, n, m, alpha_ptr, a.data.ptr, n, x.data.ptr,
                 1, beta_ptr, y.data.ptr, 1)
        else:
            a = a.copy(order='F')
            func(handle, transa, m, n, alpha_ptr, a.data.ptr, m, x.data.ptr,
                 1, beta_ptr, y.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, orig_mode)
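# Usage sketch (illustrative only): y = alpha * a @ x + beta * y, updated in
# place (here y becomes 2 * a @ x - y).
import cupy

a = cupy.arange(6, dtype=cupy.float64).reshape(2, 3)
x = cupy.ones(3, dtype=cupy.float64)
y = cupy.array([1.0, 2.0])
expected = 2.0 * (a @ x) - y
gemv('N', 2.0, a, x, -1.0, y)
assert cupy.allclose(y, expected)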