def pculaConfigInit(config):
    """
    Initialize pCULA configuration structure to sensible defaults.
    """

    status = _libpcula.pculaConfigInit(ctypes.byref(config))
    culaCheckStatus(status)

def pculaZpotrf(config, uplo, n, a, lda):
    """
    Cholesky decomposition.
    """

    status = _libpcula.pculaZpotrf(ctypes.byref(config), uplo, n,
                                   int(a), lda)
    culaCheckStatus(status)

def pculaZpotrs(config, uplo, n, nrhs, a, lda, b, ldb):
    """
    Cholesky solve.
    """

    status = _libpcula.pculaZpotrs(ctypes.byref(config), uplo, n, nrhs,
                                   int(a), lda, int(b), ldb)
    culaCheckStatus(status)

def pculaZgetrf(config, m, n, a, lda, ipiv):
    """
    LU decomposition.
    """

    status = _libpcula.pculaZgetrf(ctypes.byref(config), m, n,
                                   int(a), lda, int(ipiv))
    culaCheckStatus(status)

def pculaZgesv(config, n, nrhs, a, lda, ipiv, b, ldb):
    """
    General system solve using LU decomposition.
    """

    status = _libpcula.pculaZgesv(ctypes.byref(config), n, nrhs,
                                  int(a), lda, int(ipiv), int(b), ldb)
    culaCheckStatus(status)

def pculaZtrsm(config, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb):
    """
    Triangular system solve.
    """

    status = _libpcula.pculaZtrsm(ctypes.byref(config), side, uplo, transa,
                                  diag, m, n, alpha, int(a), lda, int(b), ldb)
    culaCheckStatus(status)

def pculaZposv(config, uplo, n, nrhs, a, lda, b, ldb):
    """
    Positive definite system solve using Cholesky factorization.
    """

    status = _libpcula.pculaZposv(ctypes.byref(config), uplo, n, nrhs,
                                  int(a), lda, int(b), ldb)
    culaCheckStatus(status)

def pculaZgetrs(config, trans, n, nrhs, a, lda, ipiv, b, ldb):
    """
    LU solve.
    """

    status = _libpcula.pculaZgetrs(ctypes.byref(config), trans, n, nrhs,
                                   int(a), lda, int(ipiv), int(b), ldb)
    culaCheckStatus(status)

def pculaDgemm(config, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc):
    """
    Matrix-matrix product for general matrices.
    """

    status = _libpcula.pculaDgemm(ctypes.byref(config), transa, transb,
                                  m, n, k, alpha, int(A), lda, int(B), ldb,
                                  beta, int(C), ldc)
    culaCheckStatus(status)

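# Illustrative sketch (not part of the pCULA API): how the wrappers above fit
# together for a double-precision matrix product. It assumes the `pculaConfig`
# ctypes structure is defined elsewhere in this module and that the pCULA
# library was successfully loaded into `_libpcula`.
def _pcula_gemm_example():
    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    config = pculaConfig()
    pculaConfigInit(config)

    n = 4
    # pCULA, like LAPACK/BLAS, expects column-major (Fortran-order) storage:
    a_gpu = gpuarray.to_gpu(np.asfortranarray(np.random.rand(n, n)))
    b_gpu = gpuarray.to_gpu(np.asfortranarray(np.random.rand(n, n)))
    c_gpu = gpuarray.zeros((n, n), np.float64, order='F')

    # C = 1.0*A*B + 0.0*C; 'n' requests no transposition:
    pculaDgemm(config, 'n', 'n', n, n, n, 1.0, a_gpu.gpudata, n,
               b_gpu.gpudata, n, 0.0, c_gpu.gpudata, n)
    assert np.allclose(c_gpu.get(), np.dot(a_gpu.get(), b_gpu.get()))
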
def svd(a_gpu, jobu='A', jobvt='A'):
    """
    Singular Value Decomposition.

    Factors the matrix `a` into two unitary matrices, `u` and `vh`,
    and a 1-dimensional array of real, non-negative singular values,
    `s`, such that `a == dot(u, dot(diag(s), vh))`.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)` to decompose.
    jobu : {'A', 'S', 'O', 'N'}
        If 'A', return the full `u` matrix with shape `(m, m)`.
        If 'S', return the `u` matrix with shape `(m, k)`.
        If 'O', return the `u` matrix with shape `(m, k)` without
        allocating a new matrix.
        If 'N', don't return `u`.
    jobvt : {'A', 'S', 'O', 'N'}
        If 'A', return the full `vh` matrix with shape `(n, n)`.
        If 'S', return the `vh` matrix with shape `(k, n)`.
        If 'O', return the `vh` matrix with shape `(k, n)` without
        allocating a new matrix.
        If 'N', don't return `vh`.

    Returns
    -------
    u : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(m, m)` or `(m, k)` depending on
        value of `jobu`.
    s : pycuda.gpuarray.GPUArray
        Array containing the singular values, sorted such that
        `s[i] >= s[i+1]`. `s` is of length `min(m, n)`.
    vh : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(n, n)` or `(k, n)`, depending
        on `jobvt`.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix regardless
    of the values of `jobu` and `jobvt`.

    Only one of `jobu` or `jobvt` may be set to `O`, and then only for
    a square matrix.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.random.randn(9, 6) + 1j*np.random.randn(9, 6)
    >>> a = np.asarray(a, np.complex64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 'S', 'S')
    >>> np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), 1e-4)
    True
    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    # The free version of CULA only supports single precision floating
    # point numbers:
    data_type = a_gpu.dtype.type
    real_type = np.float32
    if data_type == np.complex64:
        cula_func = cula._libcula.culaDeviceCgesvd
    elif data_type == np.float32:
        cula_func = cula._libcula.culaDeviceSgesvd
    else:
        if cula._libcula_toolkit == 'standard':
            if data_type == np.complex128:
                cula_func = cula._libcula.culaDeviceZgesvd
            elif data_type == np.float64:
                cula_func = cula._libcula.culaDeviceDgesvd
            else:
                raise ValueError('unsupported type')
            real_type = np.float64
        else:
            raise ValueError('double precision not supported')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed:
    n, m = a_gpu.shape
    square = (n == m)

    # Since the input matrix is transposed, jobu and jobvt must also
    # be switched because the computed matrices will be returned in
    # reversed order:
    jobvt, jobu = jobu, jobvt

    # Set the leading dimension of the input matrix:
    lda = max(1, m)

    # Allocate the array of singular values:
    s_gpu = gpuarray.empty(min(m, n), real_type)

    # Set the leading dimension and allocate u:
    jobu = jobu.upper()
    jobvt = jobvt.upper()
    ldu = m
    if jobu == 'A':
        u_gpu = gpuarray.empty((ldu, m), data_type)
    elif jobu == 'S':
        u_gpu = gpuarray.empty((min(m, n), ldu), data_type)
    elif jobu == 'O':
        if not square:
            raise ValueError('in-place computation of singular vectors '
                             'of non-square matrix not allowed')
        ldu = 1
        u_gpu = a_gpu
    else:
        ldu = 1
        u_gpu = gpuarray.empty((), data_type)

    # Set the leading dimension and allocate vh:
    if jobvt == 'A':
        ldvt = n
        vh_gpu = gpuarray.empty((n, n), data_type)
    elif jobvt == 'S':
        ldvt = min(m, n)
        vh_gpu = gpuarray.empty((n, ldvt), data_type)
    elif jobvt == 'O':
        if jobu == 'O':
            raise ValueError('jobu and jobvt cannot both be O')
        if not square:
            raise ValueError('in-place computation of singular vectors '
                             'of non-square matrix not allowed')
        ldvt = 1
        vh_gpu = a_gpu
    else:
        ldvt = 1
        vh_gpu = gpuarray.empty((), data_type)

    # Compute SVD and check error status:
    status = cula_func(jobu, jobvt, m, n, int(a_gpu.gpudata),
                       lda, int(s_gpu.gpudata), int(u_gpu.gpudata),
                       ldu, int(vh_gpu.gpudata), ldvt)
    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    # Since the input is assumed to be transposed, it is necessary to
    # return the computed matrices in reverse order:
    if jobu in ['A', 'S', 'O'] and jobvt in ['A', 'S', 'O']:
        return vh_gpu, s_gpu, u_gpu
    elif jobu == 'N' and jobvt != 'N':
        return vh_gpu, s_gpu
    elif jobu != 'N' and jobvt == 'N':
        return s_gpu, u_gpu
    else:
        return s_gpu

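# Sketch of the in-place mode described in the Notes above (illustrative only;
# assumes the jobu/jobvt-style `svd` defined directly above is in scope). For
# a square matrix, jobvt='O' makes the returned `vh` reuse the input array's
# memory instead of allocating a new matrix.
def _svd_inplace_example():
    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    a = np.asarray(np.random.randn(5, 5), np.complex64)
    a_gpu = gpuarray.to_gpu(a)
    u_gpu, s_gpu, vh_gpu = svd(a_gpu, 'A', 'O')

    # The implementation above returns the (overwritten) input array itself:
    assert vh_gpu is a_gpu
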
def cho_factor(a_gpu, uplo='L'):
    """
    Cholesky factorisation.

    Performs an in-place Cholesky factorisation of the matrix `a` such
    that `a = x*x.T` or `a = x.T*x`, depending on whether the lower
    ('L') or upper ('U') triangle of `a` is used.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, m)` to decompose.
    uplo : {'L', 'U'}, optional
        Use the lower ('L', default) or upper ('U') triangle of `a`.

    Returns
    -------
    None
        The chosen triangle of `a` is overwritten in place with the
        Cholesky factor.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[3.0, 0.0], [0.0, 7.0]])
    >>> a = np.asarray(a, np.float64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> cho_factor(a_gpu)
    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    data_type = a_gpu.dtype.type
    real_type = np.float32
    if cula._libcula_toolkit == 'standard':
        if data_type == np.complex64:
            cula_func = cula._libcula.culaDeviceCpotrf
        elif data_type == np.float32:
            cula_func = cula._libcula.culaDeviceSpotrf
        elif data_type == np.complex128:
            cula_func = cula._libcula.culaDeviceZpotrf
            real_type = np.float64
        elif data_type == np.float64:
            cula_func = cula._libcula.culaDeviceDpotrf
            real_type = np.float64
        else:
            raise ValueError('unsupported type')
    else:
        raise ValueError('Cholesky factorisation not included in '
                         'CULA Dense Free version')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed:
    n, m = a_gpu.shape
    if n != m:
        raise ValueError('Matrix must be square '
                         '(and symmetric positive definite)')

    # Set the leading dimension of the input matrix:
    lda = max(1, m)

    status = cula_func(uplo, n, int(a_gpu.gpudata), lda)
    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

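# Round-trip check of cho_factor against numpy (illustrative only; the helper
# name is not part of the library). Because the C-ordered input is interpreted
# by CULA as its transpose, the factor computed with uplo='L' lands in the
# upper triangle of the C-ordered view.
def _cho_factor_example():
    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    a = np.asarray([[4., 2.], [2., 3.]], np.float64)
    a_gpu = gpuarray.to_gpu(a.copy())
    cho_factor(a_gpu, 'L')

    # Only the factorised triangle of the result is meaningful:
    u = np.triu(a_gpu.get())
    assert np.allclose(np.dot(u.T, u), a)
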
def cho_solve(a_gpu, b_gpu, uplo='L'):
    """
    Cholesky solver.

    Solves the system of equations `a*x = b` via Cholesky
    factorisation. Overwrites `b` with the solution `inv(a)*b` and
    overwrites the chosen triangle of `a` with its Cholesky factor.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Symmetric positive-definite matrix of shape `(m, m)`.
    b : pycuda.gpuarray.GPUArray
        Right-hand side vector of length `m`.
    uplo : {'L', 'U'}, optional
        Use the lower ('L', default) or upper ('U') triangle of `a`.

    Returns
    -------
    None
        `b` is overwritten in place with the solution.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[3.0, 0.0], [0.0, 7.0]])
    >>> a = np.asarray(a, np.float64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b = np.array([11., 19.])
    >>> b = np.asarray(b, np.float64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> cho_solve(a_gpu, b_gpu)
    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    data_type = a_gpu.dtype.type
    real_type = np.float32
    if cula._libcula_toolkit == 'standard':
        # The posv routines factorise and solve in a single call:
        if data_type == np.complex64:
            cula_func = cula._libcula.culaDeviceCposv
        elif data_type == np.float32:
            cula_func = cula._libcula.culaDeviceSposv
        elif data_type == np.complex128:
            cula_func = cula._libcula.culaDeviceZposv
            real_type = np.float64
        elif data_type == np.float64:
            cula_func = cula._libcula.culaDeviceDposv
            real_type = np.float64
        else:
            raise ValueError('unsupported type')
    else:
        raise ValueError('Cholesky factorisation not included in '
                         'CULA Dense Free version')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed:
    na, ma = a_gpu.shape
    if na != ma:
        raise ValueError('Matrix must be square '
                         '(and symmetric positive definite)')

    # Set the leading dimensions of the input matrices:
    lda = max(1, ma)
    ldb = lda

    # Only solving for a single right-hand side vector, so nrhs = 1:
    status = cula_func(uplo, na, 1, int(a_gpu.gpudata), lda,
                       int(b_gpu.gpudata), ldb)
    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

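# Round-trip check of cho_solve against a CPU reference (illustrative only;
# the helper name is not part of the library). Mirrors the docstring example
# above and verifies the overwritten right-hand side.
def _cho_solve_example():
    import numpy as np
    import scipy.linalg
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    a = np.asarray([[3., 0.], [0., 7.]], np.float64)
    b = np.asarray([11., 19.], np.float64)
    x_ref = scipy.linalg.cho_solve(scipy.linalg.cho_factor(a), b)

    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    cho_solve(a_gpu, b_gpu)

    # cho_solve overwrites b_gpu with the solution of a*x = b:
    assert np.allclose(b_gpu.get(), x_ref)
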
def svd(a_gpu, full_matrices=1, compute_uv=1):
    """
    Singular Value Decomposition.

    Factors the matrix `a` into two unitary matrices, `u` and `vh`,
    and a 1-dimensional array of real, non-negative singular values,
    `s`, such that `a == dot(u, dot(diag(s), vh))`.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)` to decompose.
    full_matrices : bool, optional
        If True (default), `u` and `vh` have the shapes `(m, m)` and
        `(n, n)`, respectively. Otherwise, the shapes are `(m, k)` and
        `(k, n)`, resp., where `k = min(m, n)`.
    compute_uv : bool, optional
        If True (default), compute `u` and `vh` in addition to `s`.

    Returns
    -------
    u : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(m, m)` or `(m, k)` depending on
        value of `full_matrices`.
    s : pycuda.gpuarray.GPUArray
        Array containing the singular values, sorted such that
        `s[i] >= s[i+1]`. `s` is of length `min(m, n)`.
    vh : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(n, n)` or `(k, n)`, depending on
        `full_matrices`.

    Notes
    -----
    This function destroys the contents of the input matrix.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.random.randn(9, 6) + 1j*np.random.randn(9, 6)
    >>> a = np.asarray(a, np.complex64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 0)
    >>> np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), 1e-4)
    True
    """

    # The free version of CULA only supports single precision floating
    # point numbers:
    real_dtype = np.dtype(np.float32)
    if a_gpu.dtype == np.complex64:
        cula_func = cula._libcula.culaDeviceCgesvd
    elif a_gpu.dtype == np.float32:
        cula_func = cula._libcula.culaDeviceSgesvd
    else:
        raise ValueError('unsupported type')

    # Transpose shape because CUDA assumes arrays are stored in
    # column-major format:
    (m, n) = a_gpu.shape[::-1]

    # Set LDA:
    lda = max(1, m)

    # Set S:
    s_gpu = gpuarray.empty(min(m, n), real_dtype)

    # Set JOBU and JOBVT:
    if compute_uv:
        if full_matrices:
            jobu = 'A'
            jobvt = 'A'
        else:
            jobu = 'S'
            jobvt = 'S'
    else:
        jobu = 'N'
        jobvt = 'N'

    # Set LDU and transpose of U:
    ldu = m
    if jobu == 'A':
        u_gpu = gpuarray.empty((ldu, m), a_gpu.dtype)
    elif jobu == 'S':
        u_gpu = gpuarray.empty((min(m, n), ldu), a_gpu.dtype)
    else:
        ldu = 1
        u_gpu = gpuarray.empty((1, 1), a_gpu.dtype)

    # Set LDVT and transpose of VT:
    if jobvt == 'A':
        ldvt = n
        vt_gpu = gpuarray.empty((n, n), a_gpu.dtype)
    elif jobvt == 'S':
        ldvt = min(m, n)
        vt_gpu = gpuarray.empty((n, ldvt), a_gpu.dtype)
    else:
        ldvt = 1
        vt_gpu = gpuarray.empty((1, 1), a_gpu.dtype)

    # Compute SVD and check error status:
    status = cula_func(jobu, jobvt, m, n, int(a_gpu.gpudata),
                       lda, int(s_gpu.gpudata), int(u_gpu.gpudata),
                       ldu, int(vt_gpu.gpudata), ldvt)
    cula.culaCheckStatus(status)

    if compute_uv:
        return vt_gpu, s_gpu, u_gpu
    else:
        return s_gpu

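# Sketch of the reduced factorization produced by full_matrices=0 (illustrative
# only; assumes the full_matrices-style `svd` defined directly above is in
# scope, along with a single-precision CULA build).
def _svd_shapes_example():
    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    a = np.asarray(np.random.randn(9, 6), np.float32)
    a_gpu = gpuarray.to_gpu(a)
    u_gpu, s_gpu, vh_gpu = svd(a_gpu, full_matrices=0)

    # With k = min(9, 6) = 6, the factors have the reduced shapes:
    assert u_gpu.shape == (9, 6)
    assert s_gpu.shape == (6,)
    assert vh_gpu.shape == (6, 6)
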