def cholesky(a):
    '''Cholesky decomposition.

    Decompose a given two-dimensional square matrix into ``L * L.T``,
    where ``L`` is a lower-triangular matrix and ``.T`` is a conjugate
    transpose operator. Note that in the current implementation ``a`` must
    be a real matrix, and only float32 and float64 are supported.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(N, N)``

    Returns:
        cupy.ndarray: The factor ``L``, computed on a copy of ``a``
        (the input is not modified).

    Raises:
        RuntimeError: If cuSOLVER support is not enabled.
        linalg.LinAlgError: If potrf reports a non-positive-definite
            leading minor (positive status) or a parameter error
            (negative status).

    .. seealso:: :func:`numpy.linalg.cholesky`
    '''
    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    _assert_cupy_array(a)
    _assert_rank2(a)
    _assert_nd_squareness(a)

    # Cast to float32 or float64
    if a.dtype.char == 'f' or a.dtype.char == 'd':
        dtype = a.dtype.char
    else:
        dtype = numpy.find_common_type((a.dtype.char, 'f'), ()).char

    # cuSOLVER reads the buffer as column-major, so a C-ordered copy is seen
    # as ``a.T``; factoring the UPPER triangle of that view fills what is the
    # lower triangle in row-major terms. The layout must therefore be forced
    # to 'C': without it an F-ordered input could keep its layout and the
    # factor would land in the triangle that is discarded below.
    x = a.astype(dtype, order='C', copy=True)
    n = len(a)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)
    if dtype == 'f':
        buffersize = cusolver.spotrf_bufferSize(
            handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n)
        workspace = cupy.empty(buffersize, dtype=numpy.float32)
        cusolver.spotrf(
            handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n,
            workspace.data.ptr, buffersize, dev_info.data.ptr)
    else:  # dtype == 'd'
        buffersize = cusolver.dpotrf_bufferSize(
            handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n)
        workspace = cupy.empty(buffersize, dtype=numpy.float64)
        cusolver.dpotrf(
            handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n,
            workspace.data.ptr, buffersize, dev_info.data.ptr)

    # Reading dev_info on the host synchronizes with the device.
    status = int(dev_info[0])
    if status > 0:
        raise linalg.LinAlgError(
            'The leading minor of order {} '
            'is not positive definite'.format(status))
    elif status < 0:
        raise linalg.LinAlgError(
            'Parameter error (maybe caused by a bug in cupy.linalg?)')

    # NOTE(review): _tril presumably clears the strictly-upper part of x in
    # place (its return value is ignored here) -- confirm against the helper.
    _tril(x, k=0)
    return x
def _lu_factor(a, overwrite_a=False, check_finite=True):
    """LU-factorize a 2-D matrix with cuSOLVER ``getrf``.

    Args:
        a (cupy.ndarray): Matrix with dimension ``(M, N)``; dtype must be
            float32, float64, complex64 or complex128.
        overwrite_a (bool): If true, ``a`` may be reused as the output
            buffer instead of being copied.
        check_finite (bool): If true, reject inputs containing infs or NaNs.

    Returns:
        tuple: ``(lu, piv)`` -- the Fortran-ordered factorized matrix and the
        0-origin pivot indices of length ``min(M, N)``.

    Raises:
        NotImplementedError: For unsupported dtypes.
        ValueError: If ``check_finite`` is set and the input is not finite,
            or if getrf reports an illegal argument.
    """
    a = cupy.asarray(a)
    _util._assert_rank2(a)

    dtype = a.dtype

    if dtype.char == 'f':
        getrf = cusolver.sgetrf
        getrf_bufferSize = cusolver.sgetrf_bufferSize
    elif dtype.char == 'd':
        getrf = cusolver.dgetrf
        getrf_bufferSize = cusolver.dgetrf_bufferSize
    elif dtype.char == 'F':
        getrf = cusolver.cgetrf
        getrf_bufferSize = cusolver.cgetrf_bufferSize
    elif dtype.char == 'D':
        getrf = cusolver.zgetrf
        getrf_bufferSize = cusolver.zgetrf_bufferSize
    else:
        msg = 'Only float32, float64, complex64 and complex128 are supported.'
        raise NotImplementedError(msg)

    # getrf works in place on column-major data.
    a = a.astype(dtype, order='F', copy=(not overwrite_a))

    if check_finite:
        # Complex inputs (kind 'c') are supported above, so they must be
        # checked too; testing only kind 'f' let complex infs/NaNs through.
        if a.dtype.kind in 'fc' and not cupy.isfinite(a).all():
            raise ValueError('array must not contain infs or NaNs')

    cusolver_handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    m, n = a.shape

    ipiv = cupy.empty((min(m, n), ), dtype=numpy.intc)

    buffersize = getrf_bufferSize(cusolver_handle, m, n, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)

    # LU factorization
    getrf(cusolver_handle, m, n, a.data.ptr, m, workspace.data.ptr,
          ipiv.data.ptr, dev_info.data.ptr)

    # Fetch the status once (a single device-to-host read).
    status = int(dev_info[0])
    if status < 0:
        raise ValueError('illegal value in %d-th argument of '
                         'internal getrf (lu_factor)' % -status)
    elif status > 0:
        warn('Diagonal number %d is exactly zero. Singular matrix.'
             % status, RuntimeWarning, stacklevel=2)

    # cuSolver uses 1-origin while SciPy uses 0-origin
    ipiv -= 1

    return (a, ipiv)
def cholesky(a):
    '''Cholesky decomposition.

    Factor a two-dimensional square matrix as ``L * L.T`` with ``L``
    lower-triangular (``.T`` being the conjugate transpose). Only real
    float32 / float64 computation is supported; other dtypes are promoted
    against float32 first.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(N, N)``

    Returns:
        cupy.ndarray: The factor, computed on a C-ordered copy of ``a``.

    Raises:
        RuntimeError: If cuSOLVER support is not enabled.
        linalg.LinAlgError: On a positive or negative potrf status.

    .. seealso:: :func:`numpy.linalg.cholesky`
    '''
    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    _assert_cupy_array(a)
    _assert_rank2(a)
    _assert_nd_squareness(a)

    # Pick the computation dtype (float32 / float64 pass through unchanged).
    char = a.dtype.char
    if char in ('f', 'd'):
        dtype = char
    else:
        dtype = numpy.find_common_type((char, 'f'), ()).char

    x = a.astype(dtype, order='C', copy=True)
    n = len(a)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)
    uplo = cublas.CUBLAS_FILL_MODE_UPPER

    # Single- vs double-precision routine pair and matching workspace type.
    if dtype == 'f':
        query_size = cusolver.spotrf_bufferSize
        factorize = cusolver.spotrf
        work_dtype = numpy.float32
    else:  # dtype == 'd'
        query_size = cusolver.dpotrf_bufferSize
        factorize = cusolver.dpotrf
        work_dtype = numpy.float64

    buffersize = query_size(handle, uplo, n, x.data.ptr, n)
    workspace = cupy.empty(buffersize, dtype=work_dtype)
    factorize(handle, uplo, n, x.data.ptr, n,
              workspace.data.ptr, buffersize, dev_info.data.ptr)

    status = int(dev_info[0])
    if status < 0:
        raise linalg.LinAlgError(
            'Parameter error (maybe caused by a bug in cupy.linalg?)')
    if status > 0:
        raise linalg.LinAlgError('The leading minor of order {} '
                                 'is not positive definite'.format(status))

    _tril(x, k=0)
    return x
def solve(a, b):
    """Solves a linear matrix equation.

    Finds ``x`` such that ``ax = b`` for a square, full-rank ``a``. Stacked
    inputs are handled by solving each trailing 2-D system in turn.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(..., M, M)``.
        b (cupy.ndarray): The matrix with dimension
            ``(...,M)`` or ``(..., M, K)``.

    Returns:
        cupy.ndarray:
            The matrix with dimension ``(..., M)`` or ``(..., M, K)``.

    .. seealso:: :func:`numpy.linalg.solve`
    """
    # NOTE: Since cusolver in CUDA 8.0 does not support gesv,
    #       we manually solve a linear system with QR decomposition.
    #       For details, please see the following:
    #       https://docs.nvidia.com/cuda/cusolver/index.html#qr_examples
    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    util._assert_cupy_array(a, b)
    util._assert_nd_squareness(a)

    # b must have the same rank as a, or one less, and share a's leading dims.
    rank_ok = a.ndim == b.ndim or a.ndim == b.ndim + 1
    if not rank_ok or a.shape[:-1] != b.shape[:a.ndim - 1]:
        raise ValueError(
            'a must have (..., M, M) shape and b must have (..., M) '
            'or (..., M, K)')

    # Computation dtype: keep float32/float64, otherwise promote vs float32.
    if a.dtype.char in ('f', 'd'):
        dtype = a.dtype
    else:
        dtype = numpy.find_common_type((a.dtype.char, 'f'), ())

    cublas_handle = device.get_cublas_handle()
    cusolver_handle = device.get_cusolver_handle()

    a = a.astype(dtype)
    b = b.astype(dtype)

    if a.ndim == 2:
        return _solve(a, b, cublas_handle, cusolver_handle)

    # Batched case: walk every index of the leading (batch) dimensions.
    batch_shape = a.shape[:-2]
    out = cupy.empty_like(b)
    for flat in six.moves.range(numpy.prod(batch_shape)):
        where = numpy.unravel_index(flat, batch_shape)
        out[where] = _solve(
            a[where], b[where], cublas_handle, cusolver_handle)
    return out
def _potrf_batched(a):
    """Batched Cholesky decomposition.

    Factor every matrix of a stacked input as ``L * L.T`` (``.T`` being the
    conjugate transpose) with ``L`` lower-triangular.

    Args:
        a (cupy.ndarray): The input array of matrices
            with dimension ``(..., N, N)``

    Returns:
        cupy.ndarray: The lower-triangular matrix.

    Raises:
        RuntimeError: If ``potrfBatched`` is reported as unavailable.
    """
    if not check_availability('potrfBatched'):
        raise RuntimeError('potrfBatched is not available')

    char = a.dtype.char
    if char in ('f', 'd'):
        dtype = char
    else:
        dtype = numpy.promote_types(char, 'f').char

    # Map the dtype char to the cuSOLVER routine prefix; anything that is not
    # 'f', 'd' or 'F' falls through to the double-complex routine.
    prefix = {'f': 's', 'd': 'd', 'F': 'c'}.get(dtype, 'z')
    potrfBatched = getattr(cusolver, prefix + 'potrfBatched')

    x = a.astype(dtype, order='C', copy=True)
    xp = cupy.core._mat_ptrs(x)
    n = x.shape[-1]
    # Leading dimension in elements, derived from the row stride.
    ldx = x.strides[-2] // x.dtype.itemsize
    handle = device.get_cusolver_handle()
    batch_size = internal.prod(x.shape[:-2])
    dev_info = cupy.empty(batch_size, dtype=numpy.int32)

    potrfBatched(
        handle, cublas.CUBLAS_FILL_MODE_UPPER, n, xp.data.ptr, ldx,
        dev_info.data.ptr, batch_size)
    cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
        potrfBatched, dev_info)

    return cupy.tril(x)
def _slogdet_one(a):
    """Compute ``(sign, logdet)`` of one square matrix via LU factorization.

    A positive getrf status (singular matrix) is mapped to ``sign == 0`` and
    ``logdet == -inf`` on the device, without a host-side branch.
    """
    util._assert_rank2(a)
    util._assert_nd_squareness(a)
    dtype = a.dtype

    handle = device.get_cusolver_handle()
    m = len(a)
    ipiv = cupy.empty(m, dtype=numpy.int32)
    dev_info = cupy.empty((), dtype=numpy.int32)

    # Need to make a copy because getrf works inplace
    a_copy = a.copy(order='F')

    single = dtype == 'f'
    getrf_bufferSize = (
        cusolver.sgetrf_bufferSize if single else cusolver.dgetrf_bufferSize)
    getrf = cusolver.sgetrf if single else cusolver.dgetrf

    buffersize = getrf_bufferSize(handle, m, m, a_copy.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    getrf(handle, m, m, a_copy.data.ptr, m, workspace.data.ptr,
          ipiv.data.ptr, dev_info.data.ptr)

    # dev_info < 0 means illegal value (in dimensions, strides, and etc.) that
    # should never happen even if the matrix contains nan or inf.
    # TODO(kataoka): assert dev_info >= 0 if synchronization is allowed for
    # debugging purposes.

    diag = cupy.diag(a_copy)
    # ipiv is 1-origin: an entry differing from its own position is a row swap.
    swaps = cupy.count_nonzero(ipiv != cupy.arange(1, m + 1))
    negatives = cupy.count_nonzero(diag < 0)
    non_zero = swaps + negatives
    # Note: sign == -1 ** (non_zero % 2)
    sign = (non_zero % 2) * -2 + 1
    logdet = cupy.log(abs(diag)).sum()

    singular = dev_info > 0
    sign_out = cupy.where(singular, dtype.type(0), sign)
    logdet_out = cupy.where(singular, dtype.type('-inf'), logdet)
    return (sign_out, logdet_out)
def _slogdet_one(a):
    """Compute ``(sign, logdet)`` of one square matrix via LU factorization.

    If a ``linalg.LinAlgError`` is raised while checking the getrf status or
    deriving the result, ``(0, -inf)`` is returned instead.
    """
    util._assert_rank2(a)
    util._assert_nd_squareness(a)
    dtype = a.dtype

    handle = device.get_cusolver_handle()
    m = len(a)
    ipiv = cupy.empty(m, dtype=numpy.int32)
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # Need to make a copy because getrf works inplace
    a_copy = a.copy(order='F')

    single = dtype == 'f'
    getrf_bufferSize = (
        cusolver.sgetrf_bufferSize if single else cusolver.dgetrf_bufferSize)
    getrf = cusolver.sgetrf if single else cusolver.dgetrf

    buffersize = getrf_bufferSize(handle, m, m, a_copy.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    getrf(handle, m, m, a_copy.data.ptr, m, workspace.data.ptr,
          ipiv.data.ptr, dev_info.data.ptr)

    try:
        cupy.linalg.util._check_cusolver_dev_info_if_synchronization_allowed(
            getrf, dev_info)

        diag = cupy.diag(a_copy)
        # ipiv is 1-origin
        swaps = cupy.count_nonzero(ipiv != cupy.arange(1, m + 1))
        negatives = cupy.count_nonzero(diag < 0)
        non_zero = swaps + negatives
        # Note: sign == -1 ** (non_zero % 2)
        sign = (non_zero % 2) * -2 + 1
        logdet = cupy.log(abs(diag)).sum()
    except linalg.LinAlgError:
        # Fall back to the degenerate result.
        sign = cupy.array(0.0, dtype=dtype)
        logdet = cupy.array(float('-inf'), dtype)

    return sign, logdet
def _slogdet_one(a):
    """Compute ``(sign, logdet)`` of one square matrix via LU factorization.

    The getrf status is read on the host; any non-zero status yields
    ``(0, -inf)``.
    """
    util._assert_rank2(a)
    util._assert_nd_squareness(a)
    dtype = a.dtype

    handle = device.get_cusolver_handle()
    m = len(a)
    ipiv = cupy.empty(m, 'i')
    info = cupy.empty((), 'i')

    # Need to make a copy because getrf works inplace
    a_copy = a.copy(order='F')

    single = dtype == 'f'
    getrf_bufferSize = (
        cusolver.sgetrf_bufferSize if single else cusolver.dgetrf_bufferSize)
    getrf = cusolver.sgetrf if single else cusolver.dgetrf

    buffersize = getrf_bufferSize(handle, m, m, a_copy.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    getrf(handle, m, m, a_copy.data.ptr, m, workspace.data.ptr,
          ipiv.data.ptr, info.data.ptr)

    # Guard clause: a non-zero status short-circuits to the degenerate result.
    if info[()] != 0:
        sign = cupy.array(0.0, dtype=dtype)
        logdet = cupy.array(float('-inf'), dtype)
        return sign, logdet

    diag = cupy.diag(a_copy)
    # ipiv is 1-origin
    swaps = cupy.count_nonzero(ipiv != cupy.arange(1, m + 1))
    negatives = cupy.count_nonzero(diag < 0)
    non_zero = swaps + negatives
    # Note: sign == -1 ** (non_zero % 2)
    sign = (non_zero % 2) * -2 + 1
    logdet = cupy.log(abs(diag)).sum()
    return sign, logdet
def _solve(a, b):
    """Solve ``a x = b`` for one 2-D system using QR (geqrf, ormqr, trsm).

    Both operands are converted with ``cupy.asfortranarray`` and the cuSOLVER
    / cuBLAS routines then operate in place on those arrays; the (overwritten)
    right-hand side is returned as the solution.
    """
    a = cupy.asfortranarray(a)
    b = cupy.asfortranarray(b)
    dtype = a.dtype

    if b.ndim == 1:
        m, k = b.size, 1
    else:
        m, k = b.shape

    cusolver_handle = device.get_cusolver_handle()
    cublas_handle = device.get_cublas_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # 's' routines for float32, 'd' routines for everything else.
    prefix = 's' if dtype == 'f' else 'd'
    geqrf = getattr(cusolver, prefix + 'geqrf')
    geqrf_bufferSize = getattr(cusolver, prefix + 'geqrf_bufferSize')
    ormqr = getattr(cusolver, prefix + 'ormqr')
    trsm = getattr(cublas, prefix + 'trsm')

    # 1. QR decomposition (A = Q * R)
    buffersize = geqrf_bufferSize(cusolver_handle, m, m, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    tau = cupy.empty(m, dtype=dtype)
    geqrf(
        cusolver_handle, m, m, a.data.ptr, m, tau.data.ptr,
        workspace.data.ptr, buffersize, dev_info.data.ptr)
    _check_status(dev_info)

    # 2. ormqr (Q^T * B)
    ormqr(
        cusolver_handle, cublas.CUBLAS_SIDE_LEFT, cublas.CUBLAS_OP_T,
        m, k, m, a.data.ptr, m, tau.data.ptr, b.data.ptr, m,
        workspace.data.ptr, buffersize, dev_info.data.ptr)
    _check_status(dev_info)

    # 3. trsm (X = R^{-1} * (Q^T * B))
    trsm(
        cublas_handle, cublas.CUBLAS_SIDE_LEFT,
        cublas.CUBLAS_FILL_MODE_UPPER, cublas.CUBLAS_OP_N,
        cublas.CUBLAS_DIAG_NON_UNIT,
        m, k, 1, a.data.ptr, m, b.data.ptr, m)
    return b
def invh(a):
    """Compute the inverse of a Hermitian matrix.

    This function computes the inverse of a real symmetric or complex
    Hermitian positive-definite matrix using Cholesky factorization. If
    matrix ``a`` is not positive definite, Cholesky factorization fails
    and it raises an error.

    Args:
        a (cupy.ndarray): Real symmetric or complex Hermitian matrix.

    Returns:
        cupy.ndarray: The inverse of matrix ``a``.

    Raises:
        ValueError: If the (promoted) dtype is not float32, float64,
            complex64 or complex128.
        RuntimeError: If potrf or potrs reports a failure.
        AssertionError: If potrs returns an unexpected positive status.
    """
    util._assert_cupy_array(a)
    util._assert_rank2(a)
    util._assert_nd_squareness(a)

    # support float32, float64, complex64, and complex128
    if a.dtype.char in 'fdFD':
        dtype = a.dtype.char
    else:
        dtype = numpy.promote_types(a.dtype.char, 'f').char

    cusolver_handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    if dtype == 'f':
        potrf = cusolver.spotrf
        potrf_bufferSize = cusolver.spotrf_bufferSize
        potrs = cusolver.spotrs
    elif dtype == 'd':
        potrf = cusolver.dpotrf
        potrf_bufferSize = cusolver.dpotrf_bufferSize
        potrs = cusolver.dpotrs
    elif dtype == 'F':
        potrf = cusolver.cpotrf
        potrf_bufferSize = cusolver.cpotrf_bufferSize
        potrs = cusolver.cpotrs
    elif dtype == 'D':
        potrf = cusolver.zpotrf
        potrf_bufferSize = cusolver.zpotrf_bufferSize
        potrs = cusolver.zpotrs
    else:
        msg = ('dtype must be float32, float64, complex64 or complex128'
               ' (actual: {})'.format(a.dtype))
        raise ValueError(msg)

    # Work on a private copy so `a` is not overwritten, and make that copy
    # actually carry the computation dtype: a plain ``a.copy()`` left e.g.
    # integer input as-is, so potrf read its bytes as floating point.
    a = a.astype(dtype, order='C', copy=True)

    m = a.shape[0]
    uplo = cublas.CUBLAS_FILL_MODE_LOWER

    worksize = potrf_bufferSize(cusolver_handle, uplo, m, a.data.ptr, m)
    workspace = cupy.empty(worksize, dtype=dtype)

    # Cholesky factorization
    potrf(cusolver_handle, uplo, m, a.data.ptr, m, workspace.data.ptr,
          worksize, dev_info.data.ptr)

    info = int(dev_info[0])
    if info != 0:
        if info < 0:
            msg = '\tThe {}-th parameter is wrong'.format(-info)
        else:
            msg = ('\tThe leading minor of order {} is not positive definite'
                   .format(info))
        raise RuntimeError('matrix inversion failed at potrf.\n' + msg)

    b = cupy.eye(m, dtype=dtype)

    # Solve: A * X = B
    potrs(cusolver_handle, uplo, m, m, a.data.ptr, m, b.data.ptr, m,
          dev_info.data.ptr)

    info = int(dev_info[0])
    if info > 0:
        # Raised explicitly (not via ``assert``) so the check survives -O.
        raise AssertionError(
            'Unexpected output returned by potrs (actual: {})'.format(info))
    elif info < 0:
        raise RuntimeError('matrix inversion failed at potrs.\n'
                           '\tThe {}-th parameter is wrong'.format(-info))

    return b
def lu_factor(a, overwrite_a=False, check_finite=True):
    """LU decomposition.

    Factor a two-dimensional matrix as ``P * L * U``, where ``P`` is a
    permutation matrix, ``L`` is lower-triangular with unit diagonal and
    ``U`` is upper-triangular. Only real :class:`numpy.float32` and
    :class:`numpy.float64` inputs are supported.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(M, N)``
        overwrite_a (bool): Allow overwriting data in ``a`` (may enhance
            performance)
        check_finite (bool): Whether to check that the input matrices contain
            only finite numbers. Disabling may give a performance gain, but may
            result in problems (crashes, non-termination) if the inputs do
            contain infinities or NaNs.

    Returns:
        tuple:
            ``(lu, piv)`` where ``lu`` is a :class:`cupy.ndarray` storing
            ``U`` in its upper triangle and ``L`` (without its unit diagonal)
            in its lower triangle, and ``piv`` is a :class:`cupy.ndarray` of
            pivot indices describing ``P``: for ``0 <= i < min(M,N)``, row
            ``i`` of the matrix was interchanged with row ``piv[i]``.

    .. seealso:: :func:`scipy.linalg.lu_factor`

    .. note::

        For a singular matrix the result differs from SciPy: SciPy returns an
        array containing ``0.`` whereas this implementation returns an array
        containing ``nan``.

        >>> import numpy as np
        >>> import scipy.linalg
        >>> scipy.linalg.lu_factor(
        ...     np.array([[0, 1], [0, 0]], dtype=np.float32))
        (array([[0., 1.],
               [0., 0.]], dtype=float32), array([0, 1], dtype=int32))

        >>> import cupy as cp
        >>> import cupyx.scipy.linalg
        >>> cupyx.scipy.linalg.lu_factor(
        ...     cp.array([[0, 1], [0, 0]], dtype=cp.float32))
        (array([[ 0.,  1.],
               [nan, nan]], dtype=float32), array([0, 1], dtype=int32))
    """
    a = cupy.asarray(a)
    util._assert_rank2(a)

    dtype = a.dtype
    code = dtype.char
    if code == 'f':
        getrf_bufferSize, getrf = cusolver.sgetrf_bufferSize, cusolver.sgetrf
    elif code == 'd':
        getrf_bufferSize, getrf = cusolver.dgetrf_bufferSize, cusolver.dgetrf
    else:
        raise NotImplementedError('Only float32 and float64 are supported.')

    # getrf factorizes in place on column-major data.
    a = a.astype(dtype, order='F', copy=(not overwrite_a))

    if check_finite and a.dtype.kind == 'f' and not cupy.isfinite(a).all():
        raise ValueError('array must not contain infs or NaNs')

    cusolver_handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    m, n = a.shape
    ipiv = cupy.empty((min(m, n), ), dtype=numpy.intc)

    buffersize = getrf_bufferSize(cusolver_handle, m, n, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)

    # LU factorization
    getrf(cusolver_handle, m, n, a.data.ptr, m, workspace.data.ptr,
          ipiv.data.ptr, dev_info.data.ptr)

    status = int(dev_info[0])
    if status < 0:
        raise ValueError('illegal value in %d-th argument of '
                         'internal getrf (lu_factor)' % -status)
    if status > 0:
        warn('Diagonal number %d is exactly zero. Singular matrix.'
             % status, RuntimeWarning, stacklevel=2)

    # cuSolver uses 1-origin while SciPy uses 0-origin
    ipiv -= 1

    return (a, ipiv)
def qr(a, mode='reduced'):
    '''QR decomposition.

    Decompose a given two-dimensional matrix into ``Q * R``, where ``Q``
    is an orthonormal and ``R`` is an upper-triangular matrix.

    Args:
        a (cupy.ndarray): The input matrix.
        mode (str): The mode of decomposition. Currently 'reduced',
            'complete', 'r', and 'raw' modes are supported. The default mode
            is 'reduced', and decompose a matrix ``A = (M, N)`` into ``Q``,
            ``R`` with dimensions ``(M, K)``, ``(K, N)``, where
            ``K = min(M, N)``.

    Returns:
        ``R`` alone for mode 'r'; the raw factor storage and ``tau`` for
        mode 'raw'; otherwise the pair ``(Q, R)``.

    Raises:
        RuntimeError: If cuSOLVER support is not enabled.
        ValueError: For a deprecated or unrecognized ``mode``.
        linalg.LinAlgError: If geqrf reports a parameter error.

    .. seealso:: :func:`numpy.linalg.qr`
    '''
    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    _assert_cupy_array(a)
    _assert_rank2(a)

    if mode not in ('reduced', 'complete', 'r', 'raw'):
        if mode in ('f', 'full', 'e', 'economic'):
            msg = 'The deprecated mode \'{}\' is not supported'.format(mode)
            raise ValueError(msg)
        else:
            raise ValueError('Unrecognized mode \'{}\''.format(mode))

    # Cast to float32 or float64
    if a.dtype.char == 'f' or a.dtype.char == 'd':
        dtype = a.dtype.char
    else:
        dtype = numpy.find_common_type((a.dtype.char, 'f'), ()).char

    m, n = a.shape
    # A C-ordered copy of ``a.T`` (shape (n, m)) is exactly ``a`` stored
    # column-major with leading dimension m, which is what cuSOLVER expects.
    # order='C' must be explicit: transposing a C-contiguous array gives an
    # F-contiguous view whose layout astype could otherwise preserve.
    x = a.transpose().astype(dtype, order='C', copy=True)
    mn = min(m, n)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # compute working space of geqrf and ormqr, and solve R
    if dtype == 'f':
        geqrf_bufferSize = cusolver.sgeqrf_bufferSize
        geqrf = cusolver.sgeqrf
        work_dtype = numpy.float32
    else:  # dtype == 'd'
        geqrf_bufferSize = cusolver.dgeqrf_bufferSize
        geqrf = cusolver.dgeqrf
        work_dtype = numpy.float64

    # Query the workspace with the same (m, n, lda=m) that geqrf is called
    # with; the two precision branches previously disagreed on these values.
    buffersize = geqrf_bufferSize(handle, m, n, x.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=work_dtype)
    tau = cupy.empty(mn, dtype=work_dtype)
    geqrf(handle, m, n, x.data.ptr, m,
          tau.data.ptr, workspace.data.ptr, buffersize, dev_info.data.ptr)

    status = int(dev_info[0])
    if status < 0:
        raise linalg.LinAlgError(
            'Parameter error (maybe caused by a bug in cupy.linalg?)')

    if mode == 'r':
        r = x[:, :mn].transpose()
        return _triu(r)

    if mode == 'raw':
        if a.dtype.char == 'f':
            # The original numpy.linalg.qr returns float64 in raw mode,
            # whereas the cusolver returns float32. We agree that the
            # following code would be inappropriate, however, in this time
            # we explicitly convert them to float64 for compatibility.
            return x.astype(numpy.float64), tau.astype(numpy.float64)
        return x, tau

    if mode == 'complete' and m > n:
        mc = m
        q = cupy.empty((m, m), dtype)
    else:
        mc = mn
        q = cupy.empty((n, m), dtype)
    # Seed q with the Householder data produced by geqrf.
    q[:n] = x

    # solve Q
    if dtype == 'f':
        orgqr_bufferSize = cusolver.sorgqr_bufferSize
        orgqr = cusolver.sorgqr
    else:
        orgqr_bufferSize = cusolver.dorgqr_bufferSize
        orgqr = cusolver.dorgqr

    buffersize = orgqr_bufferSize(
        handle, m, mc, mn, q.data.ptr, m, tau.data.ptr)
    workspace = cupy.empty(buffersize, dtype=work_dtype)
    orgqr(
        handle, m, mc, mn, q.data.ptr, m, tau.data.ptr,
        workspace.data.ptr, buffersize, dev_info.data.ptr)

    q = q[:mc].transpose()
    r = x[:, :mc].transpose()
    return q, _triu(r)
def qr(a, mode='reduced'):
    '''QR decomposition.

    Decompose a given two-dimensional matrix into ``Q * R``, where ``Q``
    is an orthonormal and ``R`` is an upper-triangular matrix.

    Args:
        a (cupy.ndarray): The input matrix.
        mode (str): The mode of decomposition. Currently 'reduced',
            'complete', 'r', and 'raw' modes are supported. The default mode
            is 'reduced', and decompose a matrix ``A = (M, N)`` into ``Q``,
            ``R`` with dimensions ``(M, K)``, ``(K, N)``, where
            ``K = min(M, N)``.

    Returns:
        ``R`` alone for mode 'r'; the raw factor storage and ``tau`` for
        mode 'raw'; otherwise the pair ``(Q, R)``.

    Raises:
        RuntimeError: If cuSOLVER support is not enabled.
        ValueError: For a deprecated or unrecognized ``mode``.
        linalg.LinAlgError: If geqrf reports a parameter error.

    .. seealso:: :func:`numpy.linalg.qr`
    '''
    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    _assert_cupy_array(a)
    _assert_rank2(a)

    if mode not in ('reduced', 'complete', 'r', 'raw'):
        if mode in ('f', 'full', 'e', 'economic'):
            msg = 'The deprecated mode \'{}\' is not supported'.format(mode)
            raise ValueError(msg)
        else:
            raise ValueError('Unrecognized mode \'{}\''.format(mode))

    # Cast to float32 or float64
    if a.dtype.char == 'f' or a.dtype.char == 'd':
        dtype = a.dtype.char
    else:
        dtype = numpy.find_common_type((a.dtype.char, 'f'), ()).char

    m, n = a.shape
    # The C-ordered (n, m) copy of ``a.T`` is ``a`` in column-major storage
    # with leading dimension m. Forcing order='C' matters because the
    # transpose of a C-contiguous array is F-contiguous and astype could
    # otherwise keep that layout.
    x = a.transpose().astype(dtype, order='C', copy=True)
    mn = min(m, n)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # compute working space of geqrf and ormqr, and solve R
    if dtype == 'f':
        geqrf_bufferSize = cusolver.sgeqrf_bufferSize
        geqrf = cusolver.sgeqrf
        work_dtype = numpy.float32
    else:  # dtype == 'd'
        geqrf_bufferSize = cusolver.dgeqrf_bufferSize
        geqrf = cusolver.dgeqrf
        work_dtype = numpy.float64

    # Size the workspace for the exact problem geqrf is given (m, n, lda=m);
    # the float and double paths used to pass different, mismatched values.
    buffersize = geqrf_bufferSize(handle, m, n, x.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=work_dtype)
    tau = cupy.empty(mn, dtype=work_dtype)
    geqrf(handle, m, n, x.data.ptr, m,
          tau.data.ptr, workspace.data.ptr, buffersize, dev_info.data.ptr)

    status = int(dev_info[0])
    if status < 0:
        raise linalg.LinAlgError(
            'Parameter error (maybe caused by a bug in cupy.linalg?)')

    if mode == 'r':
        r = x[:, :mn].transpose()
        return _triu(r)

    if mode == 'raw':
        if a.dtype.char == 'f':
            # The original numpy.linalg.qr returns float64 in raw mode,
            # whereas the cusolver returns float32. We agree that the
            # following code would be inappropriate, however, in this time
            # we explicitly convert them to float64 for compatibility.
            return x.astype(numpy.float64), tau.astype(numpy.float64)
        return x, tau

    if mode == 'complete' and m > n:
        mc = m
        q = cupy.empty((m, m), dtype)
    else:
        mc = mn
        q = cupy.empty((n, m), dtype)
    # Copy the geqrf output into q before expanding it into Q.
    q[:n] = x

    # solve Q
    if dtype == 'f':
        orgqr_bufferSize = cusolver.sorgqr_bufferSize
        orgqr = cusolver.sorgqr
    else:
        orgqr_bufferSize = cusolver.dorgqr_bufferSize
        orgqr = cusolver.dorgqr

    buffersize = orgqr_bufferSize(
        handle, m, mc, mn, q.data.ptr, m, tau.data.ptr)
    workspace = cupy.empty(buffersize, dtype=work_dtype)
    orgqr(
        handle, m, mc, mn, q.data.ptr, m, tau.data.ptr,
        workspace.data.ptr, buffersize, dev_info.data.ptr)

    q = q[:mc].transpose()
    r = x[:, :mc].transpose()
    return q, _triu(r)
def gesv(a, b):
    """Solve a linear matrix equation using cusolverDn<t>getr[fs]().

    Computes the solution to a system of linear equation ``ax = b``.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(M, M)``. Must be
            F-contiguous or C-contiguous.
        b (cupy.ndarray): The matrix with dimension ``(M)`` or ``(M, K)``.
            Must be F-contiguous.

    Returns:
        cupy.ndarray:
            The matrix with dimension ``(M)`` or ``(M, K)``. This is ``b``
            itself, overwritten with the solution.

    Raises:
        ValueError: On invalid dimensions, shape mismatch or unsupported
            memory layout.
        TypeError: On dtype mismatch or unsupported dtype.

    Note: ``a`` and ``b`` will be overwritten.
    """
    if a.ndim != 2:
        raise ValueError('a.ndim must be 2 (actual: {})'.format(a.ndim))
    if b.ndim not in (1, 2):
        raise ValueError('b.ndim must be 1 or 2 (actual: {})'.format(b.ndim))
    if a.shape[0] != a.shape[1]:
        raise ValueError('a must be a square matrix.')
    if a.shape[0] != b.shape[0]:
        raise ValueError('shape mismatch (a: {}, b: {}).'.format(
            a.shape, b.shape))
    if a.dtype != b.dtype:
        raise TypeError('dtype mismatch (a: {}, b: {})'.format(
            a.dtype, b.dtype))

    dtype = a.dtype
    if dtype == 'f':
        t = 's'
    elif dtype == 'd':
        t = 'd'
    elif dtype == 'F':
        t = 'c'
    elif dtype == 'D':
        t = 'z'
    else:
        raise TypeError('unsupported dtype (actual:{})'.format(a.dtype))
    helper = getattr(_cusolver, t + 'getrf_bufferSize')
    getrf = getattr(_cusolver, t + 'getrf')
    getrs = getattr(_cusolver, t + 'getrs')

    n = b.shape[0]
    nrhs = b.shape[1] if b.ndim == 2 else 1
    # A C-contiguous `a` looks like its transpose to the column-major solver,
    # so the transposed system is requested from getrs in that case.
    if a._f_contiguous:
        trans = _cublas.CUBLAS_OP_N
    elif a._c_contiguous:
        trans = _cublas.CUBLAS_OP_T
    else:
        raise ValueError('a must be F-contiguous or C-contiguous.')
    if not b._f_contiguous:
        raise ValueError('b must be F-contiguous.')

    handle = _device.get_cusolver_handle()
    dipiv = _cupy.empty(n, dtype=_numpy.int32)
    dinfo = _cupy.empty(1, dtype=_numpy.int32)
    lwork = helper(handle, n, n, a.data.ptr, n)
    dwork = _cupy.empty(lwork, dtype=a.dtype)
    # LU factorization (A = L * U)
    getrf(handle, n, n, a.data.ptr, n, dwork.data.ptr, dipiv.data.ptr,
          dinfo.data.ptr)
    _cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
        getrf, dinfo)
    # Solves Ax = b
    getrs(handle, trans, n, nrhs, a.data.ptr, n,
          dipiv.data.ptr, b.data.ptr, n, dinfo.data.ptr)
    _cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
        getrs, dinfo)
    # The documented return value was never actually returned; hand back the
    # in-place solution so the function matches its docstring.
    return b
def inv(a):
    """Computes the inverse of a matrix.

    This function computes matrix ``a_inv`` from n-dimensional regular matrix
    ``a`` such that ``dot(a, a_inv) == eye(n)``.

    Args:
        a (cupy.ndarray): The regular matrix

    Returns:
        cupy.ndarray: The inverse of a matrix.

    Raises:
        RuntimeError: If cuSOLVER support is not enabled.
        ValueError: If the (promoted) dtype is not float32, float64,
            complex64 or complex128.

    .. seealso:: :func:`numpy.linalg.inv`
    """
    # Inputs with three or more dimensions are delegated to the batched path.
    if a.ndim >= 3:
        return _batched_inv(a)

    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    util._assert_cupy_array(a)
    util._assert_rank2(a)
    util._assert_nd_squareness(a)

    # support float32, float64, complex64, and complex128
    if a.dtype.char in 'fdFD':
        dtype = a.dtype.char
    else:
        dtype = numpy.find_common_type((a.dtype.char, 'f'), ()).char

    cusolver_handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    ipiv = cupy.empty((a.shape[0], 1), dtype=numpy.intc)

    if dtype == 'f':
        getrf = cusolver.sgetrf
        getrf_bufferSize = cusolver.sgetrf_bufferSize
        getrs = cusolver.sgetrs
    elif dtype == 'd':
        getrf = cusolver.dgetrf
        getrf_bufferSize = cusolver.dgetrf_bufferSize
        getrs = cusolver.dgetrs
    elif dtype == 'F':
        getrf = cusolver.cgetrf
        getrf_bufferSize = cusolver.cgetrf_bufferSize
        getrs = cusolver.cgetrs
    elif dtype == 'D':
        getrf = cusolver.zgetrf
        getrf_bufferSize = cusolver.zgetrf_bufferSize
        getrs = cusolver.zgetrs
    else:
        msg = ('dtype must be float32, float64, complex64 or complex128'
               ' (actual: {})'.format(a.dtype))
        raise ValueError(msg)

    # Work on a private copy so `a` is not overwritten, and convert that copy
    # to the computation dtype: a plain ``a.copy()`` kept e.g. integer data
    # unchanged, so getrf interpreted its bytes as floating point.
    a = a.astype(dtype, order='C', copy=True)

    m = a.shape[0]

    buffersize = getrf_bufferSize(cusolver_handle, m, m, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)

    # LU factorization
    getrf(cusolver_handle, m, m, a.data.ptr, m, workspace.data.ptr,
          ipiv.data.ptr, dev_info.data.ptr)

    b = cupy.eye(m, dtype=dtype)

    # solve for the inverse
    # NOTE(review): the literal 0 is the transpose-operation argument,
    # presumably CUBLAS_OP_N -- confirm against the cuBLAS constants.
    getrs(cusolver_handle, 0, m, m, a.data.ptr, m, ipiv.data.ptr, b.data.ptr,
          m, dev_info.data.ptr)

    return b
def gesv(a, b):
    """Solve a linear matrix equation using cusolverDn<t1><t2>gesv().

    Computes the solution to a system of linear equation ``ax = b``. The
    solver name is built from the input dtype (``t1``) and the configured
    compute type (``t2``).

    Args:
        a (cupy.ndarray): The matrix with dimension ``(M, M)``.
        b (cupy.ndarray): The matrix with dimension ``(M)`` or ``(M, K)``.

    Returns:
        cupy.ndarray:
            The matrix with dimension ``(M)`` or ``(M, K)``.

    Raises:
        RuntimeError: If gesv is unavailable or the solver returns a
            negative value.
        ValueError: On invalid dimensions, shape/dtype mismatch or an
            unsupported dtype.
    """
    if not check_availability('gesv'):
        raise RuntimeError('gesv is not available.')

    if a.ndim != 2:
        raise ValueError('a.ndim must be 2 (actual:{})'.format(a.ndim))
    if b.ndim not in (1, 2):
        raise ValueError('b.ndim must be 1 or 2 (actual:{})'.format(b.ndim))
    if a.shape[0] != a.shape[1]:
        raise ValueError('a must be a square matrix.')
    if a.shape[0] != b.shape[0]:
        raise ValueError(
            'shape mismatch (a:{}, b:{}).'.format(a.shape, b.shape))
    if a.dtype != b.dtype:
        raise ValueError(
            'dtype mismatch (a:{}, b:{}).'.format(a.dtype, b.dtype))

    n = b.shape[0]
    nrhs = b.shape[1] if b.ndim == 2 else 1

    compute_type = _linalg.get_compute_type(a.dtype)
    char = a.dtype.char
    # t1 encodes the data type; the three codes are the t2 letters used for
    # the FP16 / TF32 / FP32 compute types in the real and complex families.
    if char in 'fd':
        t1 = 's' if char == 'f' else 'd'
        fp16_code, tf32_code, fp32_code = 'h', 'x', 's'
    elif char in 'FD':
        t1 = 'c' if char == 'F' else 'z'
        fp16_code, tf32_code, fp32_code = 'k', 'y', 'c'
    else:
        raise ValueError('unsupported dtype (actual:{})'.format(a.dtype))

    if compute_type == _linalg.COMPUTE_TYPE_FP16:
        t2 = fp16_code
    elif compute_type == _linalg.COMPUTE_TYPE_TF32:
        t2 = tf32_code
    elif compute_type == _linalg.COMPUTE_TYPE_FP32:
        t2 = fp32_code
    else:
        # No reduced-precision compute type: same letter as the data type.
        t2 = t1

    solver_name = t1 + t2 + 'gesv'
    solver = getattr(_cusolver, solver_name)
    helper = getattr(_cusolver, solver_name + '_bufferSize')

    # The solver gets Fortran-ordered private copies of both operands.
    a = a.copy(order='F')
    b = b.copy(order='F')
    x = _cupy.empty_like(b)
    dipiv = _cupy.empty(n, dtype=_numpy.int32)
    dinfo = _cupy.empty(1, dtype=_numpy.int32)
    handle = _device.get_cusolver_handle()

    lwork = helper(
        handle, n, nrhs, a.data.ptr, n, dipiv.data.ptr,
        b.data.ptr, n, x.data.ptr, n, 0)
    # The workspace is allocated as lwork int8 elements.
    dwork = _cupy.empty(lwork, dtype=_numpy.int8)
    niters = solver(
        handle, n, nrhs, a.data.ptr, n, dipiv.data.ptr,
        b.data.ptr, n, x.data.ptr, n, dwork.data.ptr, lwork,
        dinfo.data.ptr)
    if niters < 0:
        raise RuntimeError('gesv has failed ({}).'.format(niters))
    return x
def solve(a, b):
    '''Solves a linear matrix equation.

    It computes the exact solution of ``x`` in ``ax = b``,
    where ``a`` is a square and full rank matrix.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(M, M)``
        b (cupy.ndarray): The vector with ``M`` elements, or
            the matrix with dimension ``(M, K)``

    Returns:
        cupy.ndarray:
            The vector with ``M`` elements, or the matrix with dimension
            ``(M, K)``.

    Raises:
        RuntimeError: If cusolver is not enabled.
        numpy.linalg.LinAlgError: If ``b`` has more than two dimensions or
            the numbers of rows of ``a`` and ``b`` differ.

    .. seealso:: :func:`numpy.linalg.solve`
    '''
    # NOTE: Since cusolver in CUDA 8.0 does not support gesv,
    #       we manually solve a linear system with QR decomposition.
    #       For details, please see the following:
    #       https://docs.nvidia.com/cuda/cusolver/index.html#qr_examples
    if not cuda.cusolver_enabled:
        raise RuntimeError('Current cupy only supports cusolver in CUDA 8.0')

    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    util._assert_cupy_array(a, b)
    util._assert_rank2(a)
    util._assert_nd_squareness(a)
    if 2 < b.ndim:
        raise linalg.LinAlgError('{}-dimensional array given. Array must be '
                                 'one or two-dimensional'.format(b.ndim))
    if len(a) != len(b):
        raise linalg.LinAlgError('The number of rows of array a must be '
                                 'the same as that of array b')

    # Cast to float32 or float64.
    # ``numpy.promote_types`` replaces the deprecated
    # ``numpy.find_common_type`` (removed in NumPy 2.0).
    if a.dtype.char == 'f' or a.dtype.char == 'd':
        dtype = a.dtype.char
    else:
        dtype = numpy.promote_types(a.dtype.char, 'f').char
    # NOTE(review): a complex input promotes to a complex ``dtype`` here but
    # still takes the double-precision branch below -- confirm whether
    # complex input should be rejected explicitly.

    m, k = (b.size, 1) if b.ndim == 1 else b.shape
    # A C-ordered copy of the transpose has the same memory layout as a
    # column-major copy of the original, which is what is handed to
    # cuSOLVER/cuBLAS below (leading dimension ``m``).
    a = a.transpose().astype(dtype, order='C', copy=True)
    b = b.transpose().astype(dtype, order='C', copy=True)
    cusolver_handle = device.get_cusolver_handle()
    cublas_handle = device.get_cublas_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    if dtype == 'f':
        geqrf = cusolver.sgeqrf
        geqrf_bufferSize = cusolver.sgeqrf_bufferSize
        ormqr = cusolver.sormqr
        trsm = cublas.strsm
    else:  # dtype == 'd'
        geqrf = cusolver.dgeqrf
        geqrf_bufferSize = cusolver.dgeqrf_bufferSize
        ormqr = cusolver.dormqr
        trsm = cublas.dtrsm

    # 1. QR decomposition (A = Q * R)
    buffersize = geqrf_bufferSize(cusolver_handle, m, m, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    tau = cupy.empty(m, dtype=dtype)
    geqrf(
        cusolver_handle, m, m, a.data.ptr, m,
        tau.data.ptr, workspace.data.ptr, buffersize, dev_info.data.ptr)
    _check_status(dev_info)

    # 2. ormqr (Q^T * B); the geqrf workspace is reused here.
    ormqr(
        cusolver_handle, cublas.CUBLAS_SIDE_LEFT, cublas.CUBLAS_OP_T,
        m, k, m, a.data.ptr, m, tau.data.ptr, b.data.ptr, m,
        workspace.data.ptr, buffersize, dev_info.data.ptr)
    _check_status(dev_info)

    # 3. trsm (X = R^{-1} * (Q^T * B)); the result overwrites ``b``.
    trsm(
        cublas_handle, cublas.CUBLAS_SIDE_LEFT, cublas.CUBLAS_FILL_MODE_UPPER,
        cublas.CUBLAS_OP_N, cublas.CUBLAS_DIAG_NON_UNIT,
        m, k, 1, a.data.ptr, m, b.data.ptr, m)
    return b.transpose()
def cholesky(a):
    """Cholesky decomposition.

    Decompose a given two-dimensional square matrix into ``L * L.T``,
    where ``L`` is a lower-triangular matrix and ``.T`` is a conjugate
    transpose operator.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(N, N)``

    Returns:
        cupy.ndarray: The lower-triangular matrix.

    Raises:
        numpy.linalg.LinAlgError: If potrf reports that a leading minor is
            not positive definite, or reports a parameter error.

    .. seealso:: :func:`numpy.linalg.cholesky`
    """
    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    util._assert_cupy_array(a)
    util._assert_rank2(a)
    util._assert_nd_squareness(a)

    # ``numpy.promote_types`` replaces the deprecated
    # ``numpy.find_common_type`` (removed in NumPy 2.0).
    if a.dtype.char == 'f' or a.dtype.char == 'd':
        dtype = a.dtype.char
    else:
        dtype = numpy.promote_types(a.dtype.char, 'f').char

    # potrf factorizes in place, so operate on a copy.
    x = a.astype(dtype, order='C', copy=True)
    n = len(a)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # Pick the routine pair and workspace dtype once instead of repeating
    # the same call sequence per dtype.
    if dtype == 'f':
        potrf = cusolver.spotrf
        potrf_bufferSize = cusolver.spotrf_bufferSize
        work_dtype = numpy.float32
    elif dtype == 'd':
        potrf = cusolver.dpotrf
        potrf_bufferSize = cusolver.dpotrf_bufferSize
        work_dtype = numpy.float64
    elif dtype == 'F':
        potrf = cusolver.cpotrf
        potrf_bufferSize = cusolver.cpotrf_bufferSize
        work_dtype = numpy.complex64
    else:  # dtype == 'D':
        potrf = cusolver.zpotrf
        potrf_bufferSize = cusolver.zpotrf_bufferSize
        work_dtype = numpy.complex128

    buffersize = potrf_bufferSize(
        handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n)
    workspace = cupy.empty(buffersize, dtype=work_dtype)
    potrf(
        handle, cublas.CUBLAS_FILL_MODE_UPPER, n, x.data.ptr, n,
        workspace.data.ptr, buffersize, dev_info.data.ptr)

    status = int(dev_info[0])
    if status > 0:
        raise linalg.LinAlgError(
            'The leading minor of order {} '
            'is not positive definite'.format(status))
    elif status < 0:
        raise linalg.LinAlgError(
            'Parameter error (maybe caused by a bug in cupy.linalg?)')

    # Keep only the lower triangle (including the diagonal) of the result.
    util._tril(x, k=0)
    return x
def qr(a, mode='reduced'):
    """QR decomposition.

    Decompose a given two-dimensional matrix into ``Q * R``, where ``Q``
    is an orthonormal and ``R`` is an upper-triangular matrix.

    Args:
        a (cupy.ndarray): The input matrix.
        mode (str): The mode of decomposition. Currently 'reduced',
            'complete', 'r', and 'raw' modes are supported. The default mode
            is 'reduced', in which matrix ``A = (M, N)`` is decomposed into
            ``Q``, ``R`` with dimensions ``(M, K)``, ``(K, N)``, where
            ``K = min(M, N)``.

    Returns:
        cupy.ndarray, or tuple of ndarray:
            Although the type of returned object depends on the mode,
            it returns a tuple of ``(Q, R)`` by default.
            For details, please see the document of :func:`numpy.linalg.qr`.

    Raises:
        ValueError: If ``mode`` is deprecated or unrecognized, or if the
            promoted dtype is not one of the four supported ones.
        numpy.linalg.LinAlgError: If geqrf reports a parameter error.

    .. seealso:: :func:`numpy.linalg.qr`
    """
    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    util._assert_cupy_array(a)
    util._assert_rank2(a)

    if mode not in ('reduced', 'complete', 'r', 'raw'):
        if mode in ('f', 'full', 'e', 'economic'):
            msg = 'The deprecated mode \'{}\' is not supported'.format(mode)
            raise ValueError(msg)
        else:
            raise ValueError('Unrecognized mode \'{}\''.format(mode))

    # support float32, float64, complex64, and complex128
    # ``numpy.promote_types`` replaces the deprecated
    # ``numpy.find_common_type`` (removed in NumPy 2.0).
    if a.dtype.char in 'fdFD':
        dtype = a.dtype.char
    else:
        dtype = numpy.promote_types(a.dtype.char, 'f').char

    m, n = a.shape
    # ``x`` has shape (n, m) in C order, i.e. the same memory layout as a
    # column-major (m, n) matrix with leading dimension ``m``.
    x = a.transpose().astype(dtype, order='C', copy=True)
    mn = min(m, n)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # compute working space of geqrf and orgqr, and solve R
    if dtype == 'f':
        geqrf_bufferSize = cusolver.sgeqrf_bufferSize
        geqrf = cusolver.sgeqrf
    elif dtype == 'd':
        geqrf_bufferSize = cusolver.dgeqrf_bufferSize
        geqrf = cusolver.dgeqrf
    elif dtype == 'F':
        geqrf_bufferSize = cusolver.cgeqrf_bufferSize
        geqrf = cusolver.cgeqrf
    elif dtype == 'D':
        geqrf_bufferSize = cusolver.zgeqrf_bufferSize
        geqrf = cusolver.zgeqrf
    else:
        msg = ('dtype must be float32, float64, complex64 or complex128'
               ' (actual: {})'.format(a.dtype))
        raise ValueError(msg)

    # The leading dimension is ``m`` (see above); the size query now uses
    # the same value as the geqrf call itself.
    buffersize = geqrf_bufferSize(handle, m, n, x.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    tau = cupy.empty(mn, dtype=dtype)
    geqrf(handle, m, n, x.data.ptr, m,
          tau.data.ptr, workspace.data.ptr, buffersize, dev_info.data.ptr)

    status = int(dev_info[0])
    if status < 0:
        raise linalg.LinAlgError(
            'Parameter error (maybe caused by a bug in cupy.linalg?)')

    if mode == 'r':
        r = x[:, :mn].transpose()
        return util._triu(r)

    if mode == 'raw':
        if a.dtype.char == 'f':
            # The original numpy.linalg.qr returns float64 in raw mode,
            # whereas the cusolver returns float32. We agree that the
            # following code would be inappropriate, however, in this time
            # we explicitly convert them to float64 for compatibility.
            return x.astype(numpy.float64), tau.astype(numpy.float64)
        elif a.dtype.char == 'F':
            # The same applies to complex64
            return x.astype(numpy.complex128), tau.astype(numpy.complex128)
        return x, tau

    # ``mc`` is the number of columns of Q to generate.
    if mode == 'complete' and m > n:
        mc = m
        q = cupy.empty((m, m), dtype)
    else:
        mc = mn
        q = cupy.empty((n, m), dtype)
    q[:n] = x

    # solve Q
    if dtype == 'f':
        orgqr_bufferSize = cusolver.sorgqr_bufferSize
        orgqr = cusolver.sorgqr
    elif dtype == 'd':
        orgqr_bufferSize = cusolver.dorgqr_bufferSize
        orgqr = cusolver.dorgqr
    elif dtype == 'F':
        orgqr_bufferSize = cusolver.cungqr_bufferSize
        orgqr = cusolver.cungqr
    elif dtype == 'D':
        orgqr_bufferSize = cusolver.zungqr_bufferSize
        orgqr = cusolver.zungqr

    buffersize = orgqr_bufferSize(handle, m, mc, mn, q.data.ptr, m,
                                  tau.data.ptr)
    workspace = cupy.empty(buffersize, dtype=dtype)
    orgqr(handle, m, mc, mn, q.data.ptr, m, tau.data.ptr,
          workspace.data.ptr, buffersize, dev_info.data.ptr)

    q = q[:mc].transpose()
    r = x[:, :mc].transpose()
    return q, util._triu(r)
def qr(a, mode='reduced'):
    """QR decomposition.

    Decompose a given two-dimensional matrix into ``Q * R``, where ``Q``
    is an orthonormal and ``R`` is an upper-triangular matrix.

    Args:
        a (cupy.ndarray): The input matrix.
        mode (str): The mode of decomposition. Currently 'reduced',
            'complete', 'r', and 'raw' modes are supported. The default mode
            is 'reduced', in which matrix ``A = (M, N)`` is decomposed into
            ``Q``, ``R`` with dimensions ``(M, K)``, ``(K, N)``, where
            ``K = min(M, N)``.

    Returns:
        cupy.ndarray, or tuple of ndarray:
            Although the type of returned object depends on the mode,
            it returns a tuple of ``(Q, R)`` by default.
            For details, please see the document of :func:`numpy.linalg.qr`.

    Raises:
        ValueError: If ``mode`` is deprecated or unrecognized, or if the
            promoted dtype is not one of the four supported ones.

    .. warning::
        This function calls one or more cuSOLVER routine(s) which may yield
        invalid results if input conditions are not met.
        To detect these invalid results, you can set the `linalg`
        configuration to a value that is not `ignore` in
        :func:`cupyx.errstate` or :func:`cupyx.seterr`.

    .. seealso:: :func:`numpy.linalg.qr`
    """
    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    _util._assert_cupy_array(a)
    _util._assert_rank2(a)

    if mode not in ('reduced', 'complete', 'r', 'raw'):
        if mode in ('f', 'full', 'e', 'economic'):
            msg = 'The deprecated mode \'{}\' is not supported'.format(mode)
            raise ValueError(msg)
        else:
            raise ValueError('Unrecognized mode \'{}\''.format(mode))

    # support float32, float64, complex64, and complex128
    if a.dtype.char in 'fdFD':
        dtype = a.dtype.char
    else:
        dtype = numpy.promote_types(a.dtype.char, 'f').char

    m, n = a.shape
    mn = min(m, n)
    # Empty input: return correctly shaped results without calling cuSOLVER.
    if mn == 0:
        if mode == 'reduced':
            return cupy.empty((m, 0), dtype), cupy.empty((0, n), dtype)
        elif mode == 'complete':
            return cupy.identity(m, dtype), cupy.empty((m, n), dtype)
        elif mode == 'r':
            return cupy.empty((0, n), dtype)
        else:  # mode == 'raw'
            # compatibility with numpy.linalg.qr
            dtype = numpy.promote_types(dtype, 'd')
            return cupy.empty((n, m), dtype), cupy.empty((0, ), dtype)

    # ``x`` has shape (n, m) in C order, i.e. the same memory layout as a
    # column-major (m, n) matrix with leading dimension ``m``.
    x = a.transpose().astype(dtype, order='C', copy=True)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    if dtype == 'f':
        geqrf_bufferSize = cusolver.sgeqrf_bufferSize
        geqrf = cusolver.sgeqrf
    elif dtype == 'd':
        geqrf_bufferSize = cusolver.dgeqrf_bufferSize
        geqrf = cusolver.dgeqrf
    elif dtype == 'F':
        geqrf_bufferSize = cusolver.cgeqrf_bufferSize
        geqrf = cusolver.cgeqrf
    elif dtype == 'D':
        geqrf_bufferSize = cusolver.zgeqrf_bufferSize
        geqrf = cusolver.zgeqrf
    else:
        msg = ('dtype must be float32, float64, complex64 or complex128'
               ' (actual: {})'.format(a.dtype))
        raise ValueError(msg)

    # compute working space of geqrf and solve R
    # The leading dimension is ``m`` (see above); the size query now uses
    # the same value as the geqrf call itself.
    buffersize = geqrf_bufferSize(handle, m, n, x.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    tau = cupy.empty(mn, dtype=dtype)
    geqrf(handle, m, n, x.data.ptr, m,
          tau.data.ptr, workspace.data.ptr, buffersize, dev_info.data.ptr)
    cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
        geqrf, dev_info)

    if mode == 'r':
        r = x[:, :mn].transpose()
        return _util._triu(r)

    if mode == 'raw':
        if a.dtype.char == 'f':
            # The original numpy.linalg.qr returns float64 in raw mode,
            # whereas the cusolver returns float32. We agree that the
            # following code would be inappropriate, however, in this time
            # we explicitly convert them to float64 for compatibility.
            return x.astype(numpy.float64), tau.astype(numpy.float64)
        elif a.dtype.char == 'F':
            # The same applies to complex64
            return x.astype(numpy.complex128), tau.astype(numpy.complex128)
        return x, tau

    # ``mc`` is the number of columns of Q to generate.
    if mode == 'complete' and m > n:
        mc = m
        q = cupy.empty((m, m), dtype)
    else:
        mc = mn
        q = cupy.empty((n, m), dtype)
    q[:n] = x

    # compute working space of orgqr and solve Q
    if dtype == 'f':
        orgqr_bufferSize = cusolver.sorgqr_bufferSize
        orgqr = cusolver.sorgqr
    elif dtype == 'd':
        orgqr_bufferSize = cusolver.dorgqr_bufferSize
        orgqr = cusolver.dorgqr
    elif dtype == 'F':
        orgqr_bufferSize = cusolver.cungqr_bufferSize
        orgqr = cusolver.cungqr
    elif dtype == 'D':
        orgqr_bufferSize = cusolver.zungqr_bufferSize
        orgqr = cusolver.zungqr

    buffersize = orgqr_bufferSize(handle, m, mc, mn, q.data.ptr, m,
                                  tau.data.ptr)
    workspace = cupy.empty(buffersize, dtype=dtype)
    orgqr(handle, m, mc, mn, q.data.ptr, m, tau.data.ptr,
          workspace.data.ptr, buffersize, dev_info.data.ptr)
    cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
        orgqr, dev_info)

    q = q[:mc].transpose()
    r = x[:, :mc].transpose()
    return q, _util._triu(r)
def _lu_factor(a_t, dtype):
    """Compute pivoted LU decomposition.

    Decompose a given batch of square matrices. Inputs and outputs are
    transposed.

    Args:
        a_t (cupy.ndarray): The input matrix with dimension ``(..., N, N)``.
            The dimension condition is not checked.
        dtype (numpy.dtype): float32, float64, complex64, or complex128.

    Returns:
        lu_t (cupy.ndarray): ``L`` without its unit diagonal and ``U`` with
            dimension ``(..., N, N)``.
        piv (cupy.ndarray): 1-origin pivot indices with dimension
            ``(..., N)``.
        dev_info (cupy.ndarray): ``getrf`` info with dimension ``(...)``.

    .. seealso:: :func:`scipy.linalg.lu_factor`
    """
    full_shape = a_t.shape
    n = full_shape[-2]

    # The copy made by ``astype`` keeps the caller's array from being
    # overwritten; the batch dimensions are flattened into one.
    a_t = a_t.astype(dtype, order='C').reshape(-1, n, n)
    batch_size = a_t.shape[0]
    ipiv = cupy.empty((batch_size, n), dtype=numpy.int32)
    dev_info = cupy.empty((batch_size, ), dtype=numpy.int32)

    # Heuristic condition from some performance test.
    # TODO(kataoka): autotune
    if batch_size * 65536 >= n * n:
        # Many small matrices: a single cuBLAS batched call.
        handle = device.get_cublas_handle()
        lda = n
        # Device addresses of each matrix in the batch, one per entry.
        matrix_bytes = n * lda * a_t.itemsize
        first = a_t.data.ptr
        last = first + matrix_bytes * batch_size
        a_array = cupy.arange(first, last, matrix_bytes, dtype=cupy.uintp)

        if dtype == numpy.float32:
            getrfBatched = cupy.cuda.cublas.sgetrfBatched
        elif dtype == numpy.float64:
            getrfBatched = cupy.cuda.cublas.dgetrfBatched
        elif dtype == numpy.complex64:
            getrfBatched = cupy.cuda.cublas.cgetrfBatched
        elif dtype == numpy.complex128:
            getrfBatched = cupy.cuda.cublas.zgetrfBatched
        else:
            assert False

        getrfBatched(
            handle, n, a_array.data.ptr, lda,
            ipiv.data.ptr, dev_info.data.ptr, batch_size)
    else:
        # Few large matrices: one cuSOLVER getrf call per matrix.
        handle = device.get_cusolver_handle()

        if dtype == numpy.float32:
            getrf_bufferSize = cusolver.sgetrf_bufferSize
            getrf = cusolver.sgetrf
        elif dtype == numpy.float64:
            getrf_bufferSize = cusolver.dgetrf_bufferSize
            getrf = cusolver.dgetrf
        elif dtype == numpy.complex64:
            getrf_bufferSize = cusolver.cgetrf_bufferSize
            getrf = cusolver.cgetrf
        elif dtype == numpy.complex128:
            getrf_bufferSize = cusolver.zgetrf_bufferSize
            getrf = cusolver.zgetrf
        else:
            assert False

        for index in range(batch_size):
            matrix_ptr = a_t[index].data.ptr
            work_size = getrf_bufferSize(handle, n, n, matrix_ptr, n)
            work = cupy.empty(work_size, dtype=dtype)
            getrf(
                handle, n, n, matrix_ptr, n, work.data.ptr,
                ipiv[index].data.ptr, dev_info[index].data.ptr)

    # Restore the original batch dimensions on all three outputs.
    lu_t = a_t.reshape(full_shape)
    piv = ipiv.reshape(full_shape[:-1])
    info = dev_info.reshape(full_shape[:-2])
    return (lu_t, piv, info)
def _batched_invh(a):
    """Compute the inverse of an array of Hermitian matrices.

    This function computes an inverse of a real symmetric or complex hermitian
    positive-definite matrix using Cholesky factorization. If matrix ``a[i]``
    is not positive definite, Cholesky factorization fails and it raises an
    error.

    Args:
        a (cupy.ndarray): Array of real symmetric or complex hermitian
            matrices with dimension (..., N, N).

    Returns:
        cupy.ndarray: The array of inverses of matrices ``a[i]``.

    Raises:
        RuntimeError: If ``potrsBatched`` is not available.
        ValueError: If the promoted dtype is not float32, float64,
            complex64 or complex128.
    """
    if not check_availability('potrsBatched'):
        raise RuntimeError('potrsBatched is not available')

    if a.dtype.char == 'f' or a.dtype.char == 'd':
        dtype = a.dtype.char
    else:
        dtype = numpy.promote_types(a.dtype.char, 'f').char

    if dtype == 'f':
        potrfBatched = cusolver.spotrfBatched
        potrsBatched = cusolver.spotrsBatched
    elif dtype == 'd':
        potrfBatched = cusolver.dpotrfBatched
        potrsBatched = cusolver.dpotrsBatched
    elif dtype == 'F':
        potrfBatched = cusolver.cpotrfBatched
        potrsBatched = cusolver.cpotrsBatched
    elif dtype == 'D':
        potrfBatched = cusolver.zpotrfBatched
        potrsBatched = cusolver.zpotrsBatched
    else:
        msg = ('dtype must be float32, float64, complex64 or complex128'
               ' (actual: {})'.format(a.dtype))
        raise ValueError(msg)

    # The factorization is done in place, so work on a copy.
    a = a.astype(dtype, order='C', copy=True)
    ap = cupy.core.core._mat_ptrs(a)
    n = a.shape[-1]
    # Leading dimension in elements (strides are in bytes).
    lda = a.strides[-2] // a.dtype.itemsize
    handle = device.get_cusolver_handle()
    uplo = cublas.CUBLAS_FILL_MODE_LOWER
    batch_size = int(numpy.prod(a.shape[:-2]))
    dev_info = cupy.empty(batch_size, dtype=numpy.int32)

    # Cholesky factorization
    potrfBatched(handle, uplo, n, ap.data.ptr, lda, dev_info.data.ptr,
                 batch_size)
    cupy.linalg.util._check_cusolver_dev_info_if_synchronization_allowed(
        potrfBatched, dev_info)

    # Right-hand side: one identity matrix per batch entry, so that the
    # solve below yields the inverses.
    identity_matrix = cupy.eye(n, dtype=dtype)
    b = cupy.empty(a.shape, dtype)
    b[...] = identity_matrix
    nrhs = b.shape[-1]
    ldb = b.strides[-2] // a.dtype.itemsize
    bp = cupy.core.core._mat_ptrs(b)
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # NOTE: potrsBatched does not currently support nrhs > 1 (CUDA v10.2)
    # NOTE(review): ``nrhs`` above equals ``n``, which seems to conflict
    # with the note -- confirm against the CUDA version in use.
    # Solve: A[i] * X[i] = B[i]
    potrsBatched(handle, uplo, n, nrhs, ap.data.ptr, lda, bp.data.ptr, ldb,
                 dev_info.data.ptr, batch_size)
    # Report a failure against the routine that actually produced this
    # ``dev_info`` (potrsBatched, not potrfBatched).
    cupy.linalg.util._check_cusolver_dev_info_if_synchronization_allowed(
        potrsBatched, dev_info)

    return b
def qr(a, mode='reduced'):
    """QR decomposition.

    Decompose a given two-dimensional matrix into ``Q * R``, where ``Q``
    is an orthonormal and ``R`` is an upper-triangular matrix.

    Args:
        a (cupy.ndarray): The input matrix.
        mode (str): The mode of decomposition. Currently 'reduced',
            'complete', 'r', and 'raw' modes are supported. The default mode
            is 'reduced', in which matrix ``A = (..., M, N)`` is decomposed
            into ``Q``, ``R`` with dimensions ``(..., M, K)``, ``(..., K, N)``,
            where ``K = min(M, N)``.

    Returns:
        cupy.ndarray, or tuple of ndarray:
            Although the type of returned object depends on the mode,
            it returns a tuple of ``(Q, R)`` by default.
            For details, please see the document of :func:`numpy.linalg.qr`.

    Raises:
        ValueError: If ``mode`` is deprecated or unrecognized, or if the
            compute dtype is not one of the four supported ones.

    .. warning::
        This function calls one or more cuSOLVER routine(s) which may yield
        invalid results if input conditions are not met.
        To detect these invalid results, you can set the `linalg`
        configuration to a value that is not `ignore` in
        :func:`cupyx.errstate` or :func:`cupyx.seterr`.

    .. seealso:: :func:`numpy.linalg.qr`
    """
    _util._assert_cupy_array(a)

    if mode not in ('reduced', 'complete', 'r', 'raw'):
        if mode in ('f', 'full', 'e', 'economic'):
            msg = 'The deprecated mode \'{}\' is not supported'.format(mode)
        else:
            msg = 'Unrecognized mode \'{}\''.format(mode)
        raise ValueError(msg)
    if a.ndim > 2:
        # Stacked matrices are handled by the batched implementation.
        return _qr_batched(a, mode)

    # support float32, float64, complex64, and complex128
    # ``dtype`` is used for the computation, ``out_dtype`` for the results.
    dtype, out_dtype = _util.linalg_common_type(a)

    m, n = a.shape
    k = min(m, n)
    # Empty input: return correctly shaped results without calling cuSOLVER.
    if k == 0:
        if mode == 'reduced':
            return cupy.empty((m, 0), out_dtype), cupy.empty((0, n), out_dtype)
        elif mode == 'complete':
            return cupy.identity(m, out_dtype), cupy.empty((m, n), out_dtype)
        elif mode == 'r':
            return cupy.empty((0, n), out_dtype)
        else:  # mode == 'raw'
            return cupy.empty((n, m), out_dtype), cupy.empty((0,), out_dtype)

    # ``x`` has shape (n, m) in C order, i.e. the same memory layout as a
    # column-major (m, n) matrix with leading dimension ``m``.
    x = a.transpose().astype(dtype, order='C', copy=True)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    if dtype == 'f':
        geqrf_bufferSize = cusolver.sgeqrf_bufferSize
        geqrf = cusolver.sgeqrf
    elif dtype == 'd':
        geqrf_bufferSize = cusolver.dgeqrf_bufferSize
        geqrf = cusolver.dgeqrf
    elif dtype == 'F':
        geqrf_bufferSize = cusolver.cgeqrf_bufferSize
        geqrf = cusolver.cgeqrf
    elif dtype == 'D':
        geqrf_bufferSize = cusolver.zgeqrf_bufferSize
        geqrf = cusolver.zgeqrf
    else:
        msg = ('dtype must be float32, float64, complex64 or complex128'
               ' (actual: {})'.format(a.dtype))
        raise ValueError(msg)

    # compute working space of geqrf and solve R
    # The leading dimension is ``m`` (see above); the size query now uses
    # the same value as the geqrf call itself.
    buffersize = geqrf_bufferSize(handle, m, n, x.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)
    tau = cupy.empty(k, dtype=dtype)
    geqrf(handle, m, n, x.data.ptr, m,
          tau.data.ptr, workspace.data.ptr, buffersize, dev_info.data.ptr)
    cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
        geqrf, dev_info)

    if mode == 'r':
        r = x[:, :k].transpose()
        return _util._triu(r).astype(out_dtype, copy=False)

    if mode == 'raw':
        return (
            x.astype(out_dtype, copy=False),
            tau.astype(out_dtype, copy=False))

    # ``mc`` is the number of columns of Q to generate.
    if mode == 'complete' and m > n:
        mc = m
        q = cupy.empty((m, m), dtype)
    else:
        mc = k
        q = cupy.empty((n, m), dtype)
    q[:n] = x

    # compute working space of orgqr and solve Q
    if dtype == 'f':
        orgqr_bufferSize = cusolver.sorgqr_bufferSize
        orgqr = cusolver.sorgqr
    elif dtype == 'd':
        orgqr_bufferSize = cusolver.dorgqr_bufferSize
        orgqr = cusolver.dorgqr
    elif dtype == 'F':
        orgqr_bufferSize = cusolver.cungqr_bufferSize
        orgqr = cusolver.cungqr
    elif dtype == 'D':
        orgqr_bufferSize = cusolver.zungqr_bufferSize
        orgqr = cusolver.zungqr

    buffersize = orgqr_bufferSize(
        handle, m, mc, k, q.data.ptr, m, tau.data.ptr)
    workspace = cupy.empty(buffersize, dtype=dtype)
    orgqr(
        handle, m, mc, k, q.data.ptr, m, tau.data.ptr, workspace.data.ptr,
        buffersize, dev_info.data.ptr)
    cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
        orgqr, dev_info)

    q = q[:mc].transpose()
    r = x[:, :mc].transpose()
    return (
        q.astype(out_dtype, copy=False),
        _util._triu(r).astype(out_dtype, copy=False))
def gels(a, b):
    """Compute least square solution using cusolverDn<t1><t2>gels().

    Computes the least square solution to a system of ``ax = b``.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(M, N)``.
        b (cupy.ndarray): The matrix with dimension ``(M)`` or ``(M, K)``.

    Returns:
        cupy.ndarray: The matrix with dimension ``(N)`` or ``(N, K)``.

    Raises:
        RuntimeError: If gels is not available, or if the solver returns a
            negative iteration count greater than -50.
        ValueError: If the dimensions, shapes or dtypes of ``a`` and ``b``
            are not acceptable, or if ``M < N``.
    """
    if not check_availability('gels'):
        raise RuntimeError('gels is not available.')

    # Argument validation (order of the checks matters for the error that
    # is reported first).
    if a.ndim != 2:
        raise ValueError('a.ndim must be 2 (actual:{})'.format(a.ndim))
    if b.ndim not in (1, 2):
        raise ValueError('b.ndim must be 1 or 2 (actual: {})'.format(b.ndim))
    nrhs = 1 if b.ndim == 1 else b.shape[1]
    if a.shape[0] != b.shape[0]:
        raise ValueError('shape mismatch (a:{}, b:{}).'.
                         format(a.shape, b.shape))
    if a.dtype != b.dtype:
        raise ValueError('dtype mismatch (a:{}, b:{}).'.
                         format(a.dtype, b.dtype))

    m, n = a.shape
    if m < n:
        raise ValueError('m must be equal to or greater than n.')
    max_mn = max(m, n)
    b_ndim = b.ndim

    # The routine name is ``<t1><t2>gels``: ``t1`` follows the data type
    # and ``t2`` the compute type. ``t2`` defaults to ``t1`` unless the
    # compute type selects one of the letters below.
    compute_type = _linalg.get_compute_type(a.dtype)
    char = a.dtype.char
    if char in 'fd':
        t1 = 's' if char == 'f' else 'd'
        overrides = (
            (_linalg.COMPUTE_TYPE_FP16, 'h'),
            (_linalg.COMPUTE_TYPE_TF32, 'x'),
            (_linalg.COMPUTE_TYPE_FP32, 's'),
        )
    elif char in 'FD':
        t1 = 'c' if char == 'F' else 'z'
        overrides = (
            (_linalg.COMPUTE_TYPE_FP16, 'k'),
            (_linalg.COMPUTE_TYPE_TF32, 'y'),
            (_linalg.COMPUTE_TYPE_FP32, 'c'),
        )
    else:
        raise ValueError('unsupported dtype (actual:{})'.format(a.dtype))

    t2 = t1
    for candidate, letter in overrides:
        if compute_type == candidate:
            t2 = letter
            break

    routine = t1 + t2 + 'gels'
    solver = getattr(_cusolver, routine)
    helper = getattr(_cusolver, routine + '_bufferSize')

    a = a.copy(order='F')
    org_nrhs = nrhs
    if m > n and nrhs == 1:
        # Note: this is workaround as there is bug in cusolverDn<T1><T2>gels()
        # of CUDA 11.0/11.1 and it returns CUSOLVER_STATUS_IRS_NOT_SUPPORTED
        # when m > n and nrhs == 1.
        # The single right-hand side is duplicated into two columns; the
        # extra column is dropped again before returning.
        nrhs = 2
        column = b.reshape(m, 1)
        b = _cupy.empty((max_mn, nrhs), dtype=a.dtype, order='F')
        b[:m, :] = column
    else:
        b = b.copy(order='F')

    x = _cupy.empty((max_mn, nrhs), dtype=a.dtype, order='F')
    dinfo = _cupy.empty(1, dtype=_numpy.int32)
    handle = _device.get_cusolver_handle()

    # Query the workspace size (in bytes: the buffer is allocated as int8).
    lwork = helper(handle, m, n, nrhs, a.data.ptr, m, b.data.ptr, m,
                   x.data.ptr, max_mn, 0)
    dwork = _cupy.empty(lwork, dtype=_numpy.int8)
    niters = solver(handle, m, n, nrhs, a.data.ptr, m, b.data.ptr, m,
                    x.data.ptr, max_mn, dwork.data.ptr, lwork,
                    dinfo.data.ptr)

    # Negative counts down to -49 are failures; -50 and below only warn.
    if -50 < niters < 0:
        raise RuntimeError('gels has failed ({}).'.format(niters))
    if niters <= -50:
        _warnings.warn('gels reached maximum allowed iterations.')

    # Trim the solution back to the shape the caller expects.
    x = x[:n]
    if org_nrhs != nrhs:
        x = x[:, :org_nrhs]
    if b_ndim == 1:
        x = x.reshape(n)
    return x
def cholesky(a):
    """Cholesky decomposition.

    Decompose a given two-dimensional square matrix into ``L * L.T``,
    where ``L`` is a lower-triangular matrix and ``.T`` is a conjugate
    transpose operator.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(N, N)``

    Returns:
        cupy.ndarray: The lower-triangular matrix.

    .. warning::
        This function calls one or more cuSOLVER routine(s) which may yield
        invalid results if input conditions are not met.
        To detect these invalid results, you can set the `linalg`
        configuration to a value that is not `ignore` in
        :func:`cupyx.errstate` or :func:`cupyx.seterr`.

    .. seealso:: :func:`numpy.linalg.cholesky`
    """
    _util._assert_cupy_array(a)
    _util._assert_nd_squareness(a)

    # Stacked matrices are handled by the batched implementation.
    if a.ndim > 2:
        return _potrf_batched(a)

    # float32/float64 are used as is; anything else is promoted with
    # float32 first.
    char = a.dtype.char
    if char in ('f', 'd'):
        dtype = char
    else:
        dtype = numpy.promote_types(char, 'f').char

    # potrf factorizes in place, so operate on a copy.
    x = a.astype(dtype, order='C', copy=True)
    n = len(a)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # Routine prefix per dtype; anything unlisted uses the 'z' routines.
    prefix = {'f': 's', 'd': 'd', 'F': 'c'}.get(dtype, 'z')
    potrf = getattr(cusolver, prefix + 'potrf')
    potrf_bufferSize = getattr(cusolver, prefix + 'potrf_bufferSize')

    fill_mode = cublas.CUBLAS_FILL_MODE_UPPER
    buffersize = potrf_bufferSize(handle, fill_mode, n, x.data.ptr, n)
    workspace = cupy.empty(buffersize, dtype=dtype)
    potrf(
        handle, fill_mode, n, x.data.ptr, n,
        workspace.data.ptr, buffersize, dev_info.data.ptr)
    cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
        potrf, dev_info)

    # Keep only the lower triangle (including the diagonal) of the result.
    _util._tril(x, k=0)
    return x
def lu_solve(lu_and_piv, b, trans=0, overwrite_b=False, check_finite=True):
    """Solve an equation system, ``a * x = b``, given the LU factorization of ``a``

    Args:
        lu_and_piv (tuple): LU factorization of matrix ``a`` (``(M, M)``)
            together with pivot indices.
        b (cupy.ndarray): The matrix with dimension ``(M,)`` or
            ``(M, N)``.
        trans ({0, 1, 2}): Type of system to solve:

            ========  =========
            trans     system
            ========  =========
            0         a x  = b
            1         a^T x = b
            2         a^H x = b
            ========  =========
        overwrite_b (bool): Allow overwriting data in b
            (may enhance performance)
        check_finite (bool): Whether to check that the input matrices contain
            only finite numbers. Disabling may give a performance gain, but may
            result in problems (crashes, non-termination) if the inputs do
            contain infinities or NaNs.

    Returns:
        cupy.ndarray:
            The matrix with dimension ``(M,)`` or ``(M, N)``.

    Raises:
        ValueError: If the dimensions are incompatible, ``trans`` is not
            0, 1 or 2, a non-finite value is found while ``check_finite``
            is set, or getrs reports an illegal argument.
        NotImplementedError: If the dtype of ``lu`` is neither float32 nor
            float64.

    .. seealso:: :func:`scipy.linalg.lu_solve`
    """
    lu, ipiv = lu_and_piv

    util._assert_cupy_array(lu)
    util._assert_rank2(lu)
    util._assert_nd_squareness(lu)

    m = lu.shape[0]
    if m != b.shape[0]:
        raise ValueError('incompatible dimensions.')

    dtype = lu.dtype
    if dtype.char not in ('f', 'd'):
        raise NotImplementedError('Only float32 and float64 are supported.')
    getrs = cusolver.sgetrs if dtype.char == 'f' else cusolver.dgetrs

    # Translate the SciPy-style flag into the cuBLAS operation constant.
    if trans == 0:
        operation = cublas.CUBLAS_OP_N
    elif trans == 1:
        operation = cublas.CUBLAS_OP_T
    elif trans == 2:
        operation = cublas.CUBLAS_OP_C
    else:
        raise ValueError('unknown trans')

    lu = lu.astype(dtype, order='F', copy=False)
    # ``ipiv`` is always copied so the caller's pivots are not modified by
    # the in-place increment below.
    ipiv = ipiv.astype(ipiv.dtype, order='F', copy=True)
    # cuSolver uses 1-origin while SciPy uses 0-origin
    ipiv += 1
    b = b.astype(dtype, order='F', copy=(not overwrite_b))

    if check_finite:
        lu_is_float = lu.dtype.kind == 'f'
        if lu_is_float and not cupy.isfinite(lu).all():
            raise ValueError(
                'array must not contain infs or NaNs.\n'
                'Note that when a singular matrix is given, unlike '
                'scipy.linalg.lu_factor, cupyx.scipy.linalg.lu_factor '
                'returns an array containing NaN.')
        b_is_float = b.dtype.kind == 'f'
        if b_is_float and not cupy.isfinite(b).all():
            raise ValueError('array must not contain infs or NaNs')

    # Number of right-hand sides.
    n = b.shape[1] if b.ndim != 1 else 1
    cusolver_handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # solve for the inverse (the solution overwrites ``b``)
    getrs(cusolver_handle,
          operation,
          m, n, lu.data.ptr, m, ipiv.data.ptr, b.data.ptr,
          m, dev_info.data.ptr)

    if dev_info[0] < 0:
        raise ValueError('illegal value in %d-th argument of '
                         'internal getrs (lu_solve)' % -dev_info[0])

    return b
def svd(a, full_matrices=True, compute_uv=True):
    """Singular Value Decomposition.

    Factorizes the matrix ``a`` as ``u * np.diag(s) * v``, where ``u`` and
    ``v`` are unitary and ``s`` is an one-dimensional array of ``a``'s
    singular values.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(..., M, N)``.
        full_matrices (bool): If True, it returns u and v with dimensions
            ``(..., M, M)`` and ``(..., N, N)``. Otherwise, the dimensions
            of u and v are ``(..., M, K)`` and ``(..., K, N)``, respectively,
            where ``K = min(M, N)``.
        compute_uv (bool): If ``False``, it only returns singular values.

    Returns:
        tuple of :class:`cupy.ndarray`:
            A tuple of ``(u, s, v)`` such that ``a = u * np.diag(s) * v``.

    .. warning::
        This function calls one or more cuSOLVER routine(s) which may yield
        invalid results if input conditions are not met.
        To detect these invalid results, you can set the `linalg`
        configuration to a value that is not `ignore` in
        :func:`cupyx.errstate` or :func:`cupyx.seterr`.

    .. note::
        On CUDA, when ``a.ndim > 2`` and the matrix dimensions <= 32, a fast
        code path based on Jacobian method (``gesvdj``) is taken. Otherwise,
        a QR method (``gesvd``) is used.

        On ROCm, there is no such a fast code path that switches the
        underlying algorithm.

    .. seealso:: :func:`numpy.linalg.svd`
    """
    _util._assert_cupy_array(a)

    # Promote the input with float32. The singular values always use the
    # real type of matching precision; any dtype outside the table is
    # treated as complex128.
    a_dtype = numpy.promote_types(a.dtype.char, 'f').char
    singular_dtypes = {'f': 'f', 'd': 'd', 'F': 'f', 'D': 'd'}
    if a_dtype not in singular_dtypes:
        a_dtype = 'D'
    s_dtype = singular_dtypes[a_dtype]

    if a.ndim > 2:
        return _svd_batched(a, a_dtype, full_matrices, compute_uv)

    # Remark 1: gesvd only supports m >= n (WHAT?)
    # Remark 2: gesvd returns matrix U and V^H
    n, m = a.shape

    # Empty input: build the results directly, without calling gesvd.
    if m == 0 or n == 0:
        s = cupy.empty((0, ), s_dtype)
        if not compute_uv:
            return s
        if full_matrices:
            u = cupy.eye(n, dtype=a_dtype)
            vt = cupy.eye(m, dtype=a_dtype)
        else:
            u = cupy.empty((n, 0), dtype=a_dtype)
            vt = cupy.empty((0, m), dtype=a_dtype)
        return u, s, vt

    # `a` must be copied because xgesvd destroys the matrix
    trans_flag = m < n
    if trans_flag:
        m, n = a.shape
        x = a.transpose().astype(a_dtype, order='C', copy=True)
    else:
        x = a.astype(a_dtype, order='C', copy=True)

    k = n  # = min(m, n) where m >= n is ensured above

    # Select the job codes; job 'O' makes gesvd write that factor into the
    # buffer of ``x``.
    if not compute_uv:
        u_ptr, vt_ptr = 0, 0  # Use nullptr
        job_u = ord('N')
        job_vt = ord('N')
    elif full_matrices:
        u = cupy.empty((m, m), dtype=a_dtype)
        vt = x[:, :n]
        job_u = ord('A')
        job_vt = ord('O')
        u_ptr, vt_ptr = u.data.ptr, vt.data.ptr
    else:
        u = x
        vt = cupy.empty((k, n), dtype=a_dtype)
        job_u = ord('O')
        job_vt = ord('S')
        u_ptr, vt_ptr = u.data.ptr, vt.data.ptr

    s = cupy.empty(k, dtype=s_dtype)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    prefix = {'f': 's', 'd': 'd', 'F': 'c', 'D': 'z'}[a_dtype]
    gesvd = getattr(cusolver, prefix + 'gesvd')
    gesvd_bufferSize = getattr(cusolver, prefix + 'gesvd_bufferSize')

    buffersize = gesvd_bufferSize(handle, m, n)
    workspace = cupy.empty(buffersize, dtype=a_dtype)
    if runtime.is_hip:
        rwork = cupy.empty(min(m, n) - 1, dtype=s_dtype)
        rwork_ptr = rwork.data.ptr
    else:
        # rwork can be NULL if the information from superdiagonal isn't
        # needed
        # https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverDN-lt-t-gt-gesvd  # noqa
        rwork_ptr = 0

    gesvd(
        handle, job_u, job_vt, m, n, x.data.ptr, m, s.data.ptr, u_ptr, m,
        vt_ptr, n, workspace.data.ptr, buffersize, rwork_ptr,
        dev_info.data.ptr)
    cupy.linalg._util._check_cusolver_dev_info_if_synchronization_allowed(
        gesvd, dev_info)

    if not compute_uv:
        return s

    # Note that the returned array may need to be transposed
    # depending on the structure of an input
    if trans_flag:
        return u.transpose(), s, vt.transpose()
    return vt, s, u
def svd(a, full_matrices=True, compute_uv=True):
    """Singular Value Decomposition.

    Factorizes the matrix ``a`` as ``u * np.diag(s) * v``, where ``u`` and
    ``v`` are unitary and ``s`` is an one-dimensional array of ``a``'s
    singular values.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(M, N)``.
        full_matrices (bool): If True, it returns u and v with dimensions
            ``(M, M)`` and ``(N, N)``. Otherwise, the dimensions of u and v
            are respectively ``(M, K)`` and ``(K, N)``, where
            ``K = min(M, N)``.
        compute_uv (bool): If ``False``, it only returns singular values.

    Returns:
        tuple of :class:`cupy.ndarray`:
            A tuple of ``(u, s, v)`` such that ``a = u * np.diag(s) * v``.
            When ``compute_uv`` is ``False`` only ``s`` is returned.

    .. warning::
        This function calls one or more cuSOLVER routine(s) which may yield
        invalid results if input conditions are not met.
        To detect these invalid results, you can set the `linalg`
        configuration to a value that is not `ignore` in
        :func:`cupyx.errstate` or :func:`cupyx.seterr`.

    .. seealso:: :func:`numpy.linalg.svd`
    """
    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    util._assert_cupy_array(a)
    util._assert_rank2(a)

    # Promote to float32, float64, complex64 or complex128. The singular
    # values use the real dtype of matching precision.
    a_dtype = numpy.promote_types(a.dtype.char, 'f').char
    if a_dtype == 'f':
        s_dtype = 'f'
    elif a_dtype == 'd':
        s_dtype = 'd'
    elif a_dtype == 'F':
        s_dtype = 'f'
    else:  # a_dtype == 'D':
        a_dtype = 'D'
        s_dtype = 'd'

    # Remark 1: gesvd only supports m >= n (WHAT?)
    # Remark 2: gesvd only supports jobu = 'A' and jobvt = 'A'
    # Remark 3: gesvd returns matrix U and V^H
    # Remark 4: Remark 2 is removed since cuda 8.0 (new!)
    n, m = a.shape

    # Zero-sized input: do not call gesvd at all. There are no singular
    # values, and the factors are identities (full) or empty (reduced)
    # arrays of the documented shapes.
    if m == 0 or n == 0:
        s = cupy.empty((0,), s_dtype)
        if not compute_uv:
            return s
        if full_matrices:
            u = cupy.eye(n, dtype=a_dtype)
            vt = cupy.eye(m, dtype=a_dtype)
        else:
            u = cupy.empty((n, 0), dtype=a_dtype)
            vt = cupy.empty((0, m), dtype=a_dtype)
        return u, s, vt

    # `a` must be copied because xgesvd destroys the matrix
    if m >= n:
        x = a.astype(a_dtype, order='C', copy=True)
        trans_flag = False
    else:
        # Swap the roles of m and n so that m >= n holds for gesvd, and
        # remember that the outputs have to be transposed back.
        m, n = a.shape
        x = a.transpose().astype(a_dtype, order='C', copy=True)
        trans_flag = True
    mn = min(m, n)

    if compute_uv:
        if full_matrices:
            u = cupy.empty((m, m), dtype=a_dtype)
            vt = cupy.empty((n, n), dtype=a_dtype)
        else:
            u = cupy.empty((mn, m), dtype=a_dtype)
            vt = cupy.empty((mn, n), dtype=a_dtype)
        u_ptr, vt_ptr = u.data.ptr, vt.data.ptr
    else:
        u_ptr, vt_ptr = 0, 0  # Use nullptr
    s = cupy.empty(mn, dtype=s_dtype)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # The same job code is used for both jobu and jobvt.
    if compute_uv:
        job = ord('A') if full_matrices else ord('S')
    else:
        job = ord('N')

    if a_dtype == 'f':
        gesvd = cusolver.sgesvd
        gesvd_bufferSize = cusolver.sgesvd_bufferSize
    elif a_dtype == 'd':
        gesvd = cusolver.dgesvd
        gesvd_bufferSize = cusolver.dgesvd_bufferSize
    elif a_dtype == 'F':
        gesvd = cusolver.cgesvd
        gesvd_bufferSize = cusolver.cgesvd_bufferSize
    else:  # a_dtype == 'D':
        gesvd = cusolver.zgesvd
        gesvd_bufferSize = cusolver.zgesvd_bufferSize

    buffersize = gesvd_bufferSize(handle, m, n)
    workspace = cupy.empty(buffersize, dtype=a_dtype)
    gesvd(
        handle, job, job, m, n, x.data.ptr, m, s.data.ptr, u_ptr, m,
        vt_ptr, n, workspace.data.ptr, buffersize, 0, dev_info.data.ptr)
    cupy.linalg.util._check_cusolver_dev_info_if_synchronization_allowed(
        gesvd, dev_info)

    # Note that the returned array may need to be transposed
    # depending on the structure of an input
    if compute_uv:
        if trans_flag:
            return u.transpose(), s, vt.transpose()
        else:
            return vt, s, u
    else:
        return s
def svd(a, full_matrices=True, compute_uv=True):
    """Singular Value Decomposition.

    Factorizes the matrix ``a`` as ``u * np.diag(s) * v``, where ``u`` and
    ``v`` are unitary and ``s`` is an one-dimensional array of ``a``'s
    singular values.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(M, N)``.
        full_matrices (bool): If True, it returns u and v with dimensions
            ``(M, M)`` and ``(N, N)``. Otherwise, the dimensions of u and v
            are respectively ``(M, K)`` and ``(K, N)``, where
            ``K = min(M, N)``.
        compute_uv (bool): If ``False``, it only returns singular values.

    Returns:
        tuple of :class:`cupy.ndarray`:
            A tuple of ``(u, s, v)`` such that ``a = u * np.diag(s) * v``.
            When ``compute_uv`` is ``False`` only ``s`` is returned.

    Raises:
        numpy.linalg.LinAlgError: If the status written by gesvd is
            non-zero (positive: no convergence, negative: bad parameter).

    .. seealso:: :func:`numpy.linalg.svd`
    """
    # TODO(Saito): Current implementation only accepts two-dimensional arrays
    util._assert_cupy_array(a)
    util._assert_rank2(a)

    # Promote to float32, float64, complex64 or complex128. The singular
    # values use the real dtype of matching precision.
    # (numpy.find_common_type was removed in NumPy 2.0; promote_types gives
    # the same result for a pair of dtypes.)
    a_dtype = numpy.promote_types(a.dtype.char, 'f').char
    if a_dtype == 'f':
        s_dtype = 'f'
    elif a_dtype == 'd':
        s_dtype = 'd'
    elif a_dtype == 'F':
        s_dtype = 'f'
    else:  # a_dtype == 'D':
        a_dtype = 'D'
        s_dtype = 'd'

    # Remark 1: gesvd only supports m >= n (WHAT?)
    # Remark 2: gesvd only supports jobu = 'A' and jobvt = 'A'
    # Remark 3: gesvd returns matrix U and V^H
    # Remark 4: Remark 2 is removed since cuda 8.0 (new!)
    n, m = a.shape

    # Zero-sized input: do not call gesvd at all. There are no singular
    # values, and the factors are identities (full) or empty (reduced)
    # arrays of the documented shapes.
    if m == 0 or n == 0:
        s = cupy.empty((0,), s_dtype)
        if not compute_uv:
            return s
        if full_matrices:
            u = cupy.eye(n, dtype=a_dtype)
            vt = cupy.eye(m, dtype=a_dtype)
        else:
            u = cupy.empty((n, 0), dtype=a_dtype)
            vt = cupy.empty((0, m), dtype=a_dtype)
        return u, s, vt

    # `a` must be copied because xgesvd destroys the matrix
    if m >= n:
        x = a.astype(a_dtype, order='C', copy=True)
        trans_flag = False
    else:
        # Swap the roles of m and n so that m >= n holds for gesvd, and
        # remember that the outputs have to be transposed back.
        m, n = a.shape
        x = a.transpose().astype(a_dtype, order='C', copy=True)
        trans_flag = True
    mn = min(m, n)

    if compute_uv:
        if full_matrices:
            u = cupy.empty((m, m), dtype=a_dtype)
            vt = cupy.empty((n, n), dtype=a_dtype)
        else:
            u = cupy.empty((mn, m), dtype=a_dtype)
            vt = cupy.empty((mn, n), dtype=a_dtype)
        u_ptr, vt_ptr = u.data.ptr, vt.data.ptr
    else:
        u_ptr, vt_ptr = 0, 0  # Use nullptr
    s = cupy.empty(mn, dtype=s_dtype)
    handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)

    # The same job code is used for both jobu and jobvt.
    if compute_uv:
        job = ord('A') if full_matrices else ord('S')
    else:
        job = ord('N')

    # Pick the routine pair for the working dtype once, then issue a single
    # call; the argument list is identical for all four precisions.
    if a_dtype == 'f':
        gesvd = cusolver.sgesvd
        gesvd_bufferSize = cusolver.sgesvd_bufferSize
    elif a_dtype == 'd':
        gesvd = cusolver.dgesvd
        gesvd_bufferSize = cusolver.dgesvd_bufferSize
    elif a_dtype == 'F':
        gesvd = cusolver.cgesvd
        gesvd_bufferSize = cusolver.cgesvd_bufferSize
    else:  # a_dtype == 'D':
        gesvd = cusolver.zgesvd
        gesvd_bufferSize = cusolver.zgesvd_bufferSize

    buffersize = gesvd_bufferSize(handle, m, n)
    workspace = cupy.empty(buffersize, dtype=a_dtype)
    gesvd(
        handle, job, job, m, n, x.data.ptr, m, s.data.ptr, u_ptr, m,
        vt_ptr, n, workspace.data.ptr, buffersize, 0, dev_info.data.ptr)

    # Reading dev_info on the host waits for the device result.
    status = int(dev_info[0])
    if status > 0:
        raise linalg.LinAlgError(
            'SVD computation does not converge')
    elif status < 0:
        raise linalg.LinAlgError(
            'Parameter error (maybe caused by a bug in cupy.linalg?)')

    # Note that the returned array may need to be transposed
    # depending on the structure of an input
    if compute_uv:
        if trans_flag:
            return u.transpose(), s, vt.transpose()
        else:
            return vt, s, u
    else:
        return s
def gesvda(a, compute_uv=True):
    """Singular value decomposition using cusolverDn<t>gesvdaStridedBatched().

    Factorizes the matrix ``a`` into two unitary matrices ``u`` and ``v`` and
    a singular values vector ``s`` such that ``a == u @ diag(s) @ v*``.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(.., M, N)``.
            ``M >= N`` is required (checked with an assertion).
        compute_uv (bool): If ``False``, it only returns singular values.

    Returns:
        tuple of :class:`cupy.ndarray`:
            A tuple of ``(u, s, v)`` with shapes ``(.., M, K)``, ``(.., K)``
            and ``(.., N, K)`` where ``K = min(M, N)``. Only ``s`` is
            returned when ``compute_uv`` is ``False``.
    """
    if not check_availability('gesvda'):
        raise RuntimeError('gesvda is not available.')

    assert a.ndim >= 2
    orig_ndim = a.ndim
    orig_shape = a.shape
    m, n = orig_shape[-2:]
    assert m >= n

    # (input dtype, cuSOLVER routine prefix, dtype of the singular values)
    dispatch = (
        ('f', 's', 'f'),
        ('d', 'd', 'd'),
        ('F', 'c', 'f'),
        ('D', 'z', 'd'),
    )
    for code, prefix, real_code in dispatch:
        if a.dtype == code:
            helper = getattr(
                cusolver, prefix + 'gesvdaStridedBatched_bufferSize')
            solver = getattr(cusolver, prefix + 'gesvdaStridedBatched')
            s_dtype = real_code
            break
    else:
        raise TypeError

    handle = device.get_cusolver_handle()
    jobz = (cusolver.CUSOLVER_EIG_MODE_VECTOR if compute_uv
            else cusolver.CUSOLVER_EIG_MODE_NOVECTOR)
    rank = min(m, n)

    # Flatten all leading dimensions into a single batch axis.
    if orig_ndim == 2:
        batch_size = 1
    else:
        batch_size = numpy.array(orig_shape[:-2]).prod().item()
    a = a.reshape((batch_size, m, n))
    # Swap the last two axes and make the result contiguous before handing
    # the buffer to the solver with leading dimension m.
    a = cupy.ascontiguousarray(a.transpose(0, 2, 1))

    lda = m
    ldu = m
    ldv = n
    stride_a = lda * n
    stride_s = rank
    stride_u = rank * ldu
    stride_v = rank * ldv

    s = cupy.empty((batch_size, rank), dtype=s_dtype)
    u = cupy.empty((batch_size, rank, ldu), dtype=a.dtype, order='C')
    v = cupy.empty((batch_size, rank, ldv), dtype=a.dtype, order='C')

    lwork = helper(
        handle, jobz, rank, m, n, a.data.ptr, lda, stride_a,
        s.data.ptr, stride_s, u.data.ptr, ldu, stride_u,
        v.data.ptr, ldv, stride_v, batch_size)
    work = cupy.empty((lwork, ), dtype=a.dtype)
    info = cupy.empty((batch_size, ), dtype=numpy.int32)
    # The residual norms are written to a host (NumPy) buffer.
    r_norm = numpy.empty((batch_size, ), dtype=numpy.float64)
    solver(
        handle, jobz, rank, m, n, a.data.ptr, lda, stride_a,
        s.data.ptr, stride_s, u.data.ptr, ldu, stride_u,
        v.data.ptr, ldv, stride_v, work.data.ptr, lwork,
        info.data.ptr, r_norm.ctypes.data, batch_size)

    # Restore the caller's leading (batch) dimensions.
    batch_dims = orig_shape[:-2]
    s = s.reshape(batch_dims + (s.shape[-1], ))
    if not compute_uv:
        return s

    u = u.transpose(0, 2, 1)
    u = u.reshape(batch_dims + (u.shape[-2:]))
    v = v.transpose(0, 2, 1)
    v = v.reshape(batch_dims + (v.shape[-2:]))
    return u, s, v
def inv(a):
    """Computes the inverse of a matrix.

    This function computes matrix ``a_inv`` from n-dimensional regular matrix
    ``a`` such that ``dot(a, a_inv) == eye(n)``.

    Args:
        a (cupy.ndarray): The regular matrix. Arrays with three or more
            dimensions are forwarded to the batched implementation.

    Returns:
        cupy.ndarray: The inverse of a matrix.

    Raises:
        ValueError: If the working dtype is not float32, float64, complex64
            or complex128.

    .. warning::
        This function calls one or more cuSOLVER routine(s) which may yield
        invalid results if input conditions are not met.
        To detect these invalid results, you can set the `linalg`
        configuration to a value that is not `ignore` in
        :func:`cupyx.errstate` or :func:`cupyx.seterr`.

    .. seealso:: :func:`numpy.linalg.inv`
    """
    if a.ndim >= 3:
        return _batched_inv(a)

    # Validate before doing any device work.
    util._assert_cupy_array(a)
    util._assert_rank2(a)
    util._assert_nd_squareness(a)

    # support float32, float64, complex64, and complex128; these four are
    # left unchanged by the promotion, everything else is promoted.
    # (numpy.find_common_type was removed in NumPy 2.0.)
    dtype = numpy.promote_types(a.dtype.char, 'f').char

    if dtype == 'f':
        getrf = cusolver.sgetrf
        getrf_bufferSize = cusolver.sgetrf_bufferSize
        getrs = cusolver.sgetrs
    elif dtype == 'd':
        getrf = cusolver.dgetrf
        getrf_bufferSize = cusolver.dgetrf_bufferSize
        getrs = cusolver.dgetrs
    elif dtype == 'F':
        getrf = cusolver.cgetrf
        getrf_bufferSize = cusolver.cgetrf_bufferSize
        getrs = cusolver.cgetrs
    elif dtype == 'D':
        getrf = cusolver.zgetrf
        getrf_bufferSize = cusolver.zgetrf_bufferSize
        getrs = cusolver.zgetrs
    else:
        msg = ('dtype must be float32, float64, complex64 or complex128'
               ' (actual: {})'.format(a.dtype))
        raise ValueError(msg)

    # Copy to prevent `a` from being overwritten by getrf, and cast to the
    # working dtype so the buffer really holds the element type the selected
    # routine expects (integer or float16 input was previously passed
    # through uncast). C order matches what the plain copy produced.
    a = a.astype(dtype, order='C', copy=True)

    cusolver_handle = device.get_cusolver_handle()
    dev_info = cupy.empty(1, dtype=numpy.int32)
    ipiv = cupy.empty((a.shape[0], 1), dtype=numpy.intc)

    m = a.shape[0]

    buffersize = getrf_bufferSize(cusolver_handle, m, m, a.data.ptr, m)
    workspace = cupy.empty(buffersize, dtype=dtype)

    # LU factorization
    getrf(
        cusolver_handle, m, m, a.data.ptr, m, workspace.data.ptr,
        ipiv.data.ptr, dev_info.data.ptr)
    cupy.linalg.util._check_cusolver_dev_info_if_synchronization_allowed(
        getrf, dev_info)

    # The identity is the right-hand side and is overwritten in place with
    # the solution.
    b = cupy.eye(m, dtype=dtype)

    # solve for the inverse (0 is the `trans` argument: no transpose)
    getrs(
        cusolver_handle, 0, m, m, a.data.ptr, m, ipiv.data.ptr,
        b.data.ptr, m, dev_info.data.ptr)
    cupy.linalg.util._check_cusolver_dev_info_if_synchronization_allowed(
        getrs, dev_info)

    return b
def gesvdj(a, full_matrices=True, compute_uv=True, overwrite_a=False):
    """Singular value decomposition using cusolverDn<t>gesvdj().

    Factorizes the matrix ``a`` into two unitary matrices ``u`` and ``v`` and
    a singular values vector ``s`` such that ``a == u @ diag(s) @ v*``.

    Args:
        a (cupy.ndarray): The input matrix with dimension ``(M, N)``.
            Three-dimensional input is forwarded to the batched
            implementation.
        full_matrices (bool): If True, it returns u and v with dimensions
            ``(M, M)`` and ``(N, N)``. Otherwise, the dimensions of u and v
            are respectively ``(M, K)`` and ``(N, K)``, where
            ``K = min(M, N)``. Ignored when ``compute_uv`` is ``False``.
        compute_uv (bool): If ``False``, it only returns singular values.
        overwrite_a (bool): If ``True``, matrix ``a`` might be overwritten.

    Returns:
        tuple of :class:`cupy.ndarray`:
            A tuple of ``(u, s, v)``, or only ``s`` when ``compute_uv`` is
            ``False``.

    Raises:
        RuntimeError: If gesvdj is not available.
        TypeError: If the dtype of ``a`` is not float32, float64, complex64
            or complex128.
    """
    if not check_availability('gesvdj'):
        raise RuntimeError('gesvdj is not available.')

    if a.ndim == 3:
        return _gesvdj_batched(a, full_matrices, compute_uv, overwrite_a)

    assert a.ndim == 2

    if a.dtype == 'f':
        helper = cusolver.sgesvdj_bufferSize
        solver = cusolver.sgesvdj
        s_dtype = 'f'
    elif a.dtype == 'd':
        helper = cusolver.dgesvdj_bufferSize
        solver = cusolver.dgesvdj
        s_dtype = 'd'
    elif a.dtype == 'F':
        helper = cusolver.cgesvdj_bufferSize
        solver = cusolver.cgesvdj
        s_dtype = 'f'
    elif a.dtype == 'D':
        helper = cusolver.zgesvdj_bufferSize
        solver = cusolver.zgesvdj
        s_dtype = 'd'
    else:
        raise TypeError

    handle = device.get_cusolver_handle()
    m, n = a.shape
    # Column-major working array; copied unless the caller allows
    # overwriting the input.
    a = cupy.array(a, order='F', copy=not overwrite_a)
    lda = m
    mn = min(m, n)
    s = cupy.empty(mn, dtype=s_dtype)
    ldu = m
    ldv = n
    if compute_uv:
        jobz = cusolver.CUSOLVER_EIG_MODE_VECTOR
    else:
        jobz = cusolver.CUSOLVER_EIG_MODE_NOVECTOR
        # No vectors are returned, so allocate the smaller (economy)
        # buffers.
        full_matrices = False
    if full_matrices:
        econ = 0
        u = cupy.empty((ldu, m), dtype=a.dtype, order='F')
        v = cupy.empty((ldv, n), dtype=a.dtype, order='F')
    else:
        econ = 1
        u = cupy.empty((ldu, mn), dtype=a.dtype, order='F')
        v = cupy.empty((ldv, mn), dtype=a.dtype, order='F')

    params = cusolver.createGesvdjInfo()
    try:
        lwork = helper(
            handle, jobz, econ, m, n, a.data.ptr, lda, s.data.ptr,
            u.data.ptr, ldu, v.data.ptr, ldv, params)
        work = cupy.empty(lwork, dtype=a.dtype)
        info = cupy.empty(1, dtype=numpy.int32)
        solver(
            handle, jobz, econ, m, n, a.data.ptr, lda, s.data.ptr,
            u.data.ptr, ldu, v.data.ptr, ldv, work.data.ptr, lwork,
            info.data.ptr, params)
        cupy.linalg.util._check_cusolver_dev_info_if_synchronization_allowed(
            gesvdj, info)
    finally:
        # Always release the gesvdj parameter handle, including when the
        # buffer-size query, the solver or the status check raises.
        cusolver.destroyGesvdjInfo(params)

    if compute_uv:
        return u, s, v
    else:
        return s