def svd(a_gpu, jobu='A', jobvt='A'):
    """
    Singular Value Decomposition.

    Factors the matrix `a` into two unitary matrices, `u` and `vh`,
    and a 1-dimensional array of real, non-negative singular values,
    `s`, such that `a == dot(u, dot(diag(s), vh))`.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)` to decompose.
    jobu : {'A', 'S', 'O', 'N'}
        If 'A', return the full `u` matrix with shape `(m, m)`.
        If 'S', return the `u` matrix with shape `(m, k)`.
        If 'O', return the `u` matrix with shape `(m, k)` without
        allocating a new matrix.
        If 'N', don't return `u`.
    jobvt : {'A', 'S', 'O', 'N'}
        If 'A', return the full `vh` matrix with shape `(n, n)`.
        If 'S', return the `vh` matrix with shape `(k, n)`.
        If 'O', return the `vh` matrix with shape `(k, n)` without
        allocating a new matrix.
        If 'N', don't return `vh`.

    Returns
    -------
    u : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(m, m)` or `(m, k)`, depending on
        the value of `jobu`.
    s : pycuda.gpuarray.GPUArray
        Array containing the singular values, sorted such that
        `s[i] >= s[i+1]`. `s` is of length `min(m, n)`.
    vh : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(n, n)` or `(k, n)`, depending on
        the value of `jobvt`.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix regardless
    of the values of `jobu` and `jobvt`.

    Only one of `jobu` or `jobvt` may be set to 'O', and then only for
    a square matrix.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.random.randn(9, 6) + 1j*np.random.randn(9, 6)
    >>> a = np.asarray(a, np.complex64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 'S', 'S')
    >>> np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), 1e-4)
    True

    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    # The free version of CULA only supports single precision floating
    # point numbers:
    data_type = a_gpu.dtype.type
    real_type = np.float32
    if data_type == np.complex64:
        cula_func = cula._libcula.culaDeviceCgesvd
    elif data_type == np.float32:
        cula_func = cula._libcula.culaDeviceSgesvd
    else:
        if cula._libcula_toolkit == 'standard':
            if data_type == np.complex128:
                cula_func = cula._libcula.culaDeviceZgesvd
            elif data_type == np.float64:
                cula_func = cula._libcula.culaDeviceDgesvd
            else:
                raise ValueError('unsupported type')
            real_type = np.float64
        else:
            raise ValueError('double precision not supported')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed:
    n, m = a_gpu.shape
    square = (n == m)

    # Since the input matrix is transposed, jobu and jobvt must also
    # be switched because the computed matrices will be returned in
    # reversed order:
    jobvt, jobu = jobu, jobvt

    # Set the leading dimension of the input matrix:
    lda = max(1, m)

    # Allocate the array of singular values:
    s_gpu = gpuarray.empty(min(m, n), real_type)

    # Set the leading dimension and allocate u:
    jobu = jobu.upper()
    jobvt = jobvt.upper()
    ldu = m
    if jobu == 'A':
        u_gpu = gpuarray.empty((ldu, m), data_type)
    elif jobu == 'S':
        u_gpu = gpuarray.empty((min(m, n), ldu), data_type)
    elif jobu == 'O':
        if not square:
            raise ValueError('in-place computation of singular vectors '
                             'of non-square matrix not allowed')
        ldu = 1
        u_gpu = a_gpu
    else:
        ldu = 1
        u_gpu = gpuarray.empty((), data_type)

    # Set the leading dimension and allocate vh:
    if jobvt == 'A':
        ldvt = n
        vh_gpu = gpuarray.empty((n, n), data_type)
    elif jobvt == 'S':
        ldvt = min(m, n)
        vh_gpu = gpuarray.empty((n, ldvt), data_type)
    elif jobvt == 'O':
        if jobu == 'O':
            raise ValueError('jobu and jobvt cannot both be O')
        if not square:
            raise ValueError('in-place computation of singular vectors '
                             'of non-square matrix not allowed')
        ldvt = 1
        vh_gpu = a_gpu
    else:
        ldvt = 1
        vh_gpu = gpuarray.empty((), data_type)

    # Compute SVD and check error status:
    status = cula_func(jobu, jobvt, m, n, int(a_gpu.gpudata),
                       lda, int(s_gpu.gpudata), int(u_gpu.gpudata),
                       ldu, int(vh_gpu.gpudata), ldvt)
    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    # Since the input is assumed to be transposed, it is necessary to
    # return the computed matrices in reverse order:
    if jobu in ['A', 'S', 'O'] and jobvt in ['A', 'S', 'O']:
        return vh_gpu, s_gpu, u_gpu
    elif jobu == 'N' and jobvt != 'N':
        return vh_gpu, s_gpu
    elif jobu != 'N' and jobvt == 'N':
        return s_gpu, u_gpu
    else:
        return s_gpu
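
# Illustrative aside (not part of the CULA wrapper): a minimal CPU-side
# NumPy sketch of the transposition trick used by svd() above. The
# helper name `_svd_transpose_demo` is hypothetical and exists only for
# this demonstration. Because a row-major (m, n) array handed to a
# column-major routine is read as its (n, m) transpose, the wrapper
# swaps jobu/jobvt and returns the computed factors in reverse order;
# the sketch below shows why that recovers the SVD of the original
# matrix.
def _svd_transpose_demo():
    import numpy as np
    a = np.random.randn(9, 6).astype(np.float32)

    # SVD of the transpose, as the column-major library effectively
    # computes: a.T == ut * diag(s) * vht
    ut, s, vht = np.linalg.svd(a.T, full_matrices=False)

    # Transposing back and swapping the roles of the factors recovers
    # a == u * diag(s) * vh:
    u, vh = vht.T, ut.T
    return np.allclose(a, np.dot(u * s, vh), atol=1e-4)  # True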
def cho_factor(a_gpu, uplo='L'):
    """
    Cholesky factorisation.

    Performs an in-place Cholesky factorisation on the matrix `a`
    such that `a = x*x.T` or `x.T*x`, if the lower ('L') or upper
    ('U') triangle of `a` is used, respectively.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, m)` to decompose.
    uplo : {'L', 'U'}
        Use the upper ('U') or lower ('L', default) triangle of `a`.

    Returns
    -------
    None. The factorisation is computed in place: the chosen triangle
    of `a` is overwritten with the Cholesky factor.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[3.0, 0.0], [0.0, 7.0]])
    >>> a = np.asarray(a, np.float64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> cho_factor(a_gpu)

    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    data_type = a_gpu.dtype.type
    real_type = np.float32
    if cula._libcula_toolkit == 'standard':
        if data_type == np.complex64:
            cula_func = cula._libcula.culaDeviceCpotrf
        elif data_type == np.float32:
            cula_func = cula._libcula.culaDeviceSpotrf
        elif data_type == np.complex128:
            cula_func = cula._libcula.culaDeviceZpotrf
            real_type = np.float64
        elif data_type == np.float64:
            cula_func = cula._libcula.culaDeviceDpotrf
            real_type = np.float64
        else:
            raise ValueError('unsupported type')
    else:
        raise ValueError('Cholesky factorisation not included in CULA '
                         'Dense Free version')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed:
    n, m = a_gpu.shape
    if n != m:
        raise ValueError('Matrix must be square '
                         '(and symmetric positive-definite)')

    # Set the leading dimension of the input matrix:
    lda = max(1, m)

    status = cula_func(uplo, n, int(a_gpu.gpudata), lda)
    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()
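
# Illustrative aside (not part of the CULA wrapper): a minimal CPU-side
# NumPy sketch of what cho_factor() computes in place. The helper name
# `_cho_factor_demo` is hypothetical. After factorisation the chosen
# triangle of `a` holds the Cholesky factor, so a == L * L.T for the
# lower-triangular factor L. Note that because the GPU routine sees the
# row-major array as its transpose, the triangle selected by `uplo`
# refers to CULA's column-major view of the matrix.
def _cho_factor_demo():
    import numpy as np
    a = np.array([[4.0, 2.0],
                  [2.0, 3.0]], np.float64)
    L = np.linalg.cholesky(a)              # lower-triangular factor
    return np.allclose(a, np.dot(L, L.T))  # True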
def cho_solve(a_gpu, b_gpu, uplo='L'):
    """
    Cholesky solver.

    Solve a system of equations `a*x = b` via Cholesky factorisation.
    Overwrites `b` to give `inv(a)*b`, and overwrites the chosen
    triangle of `a` with the factorised triangle.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, m)` to decompose.
    b : pycuda.gpuarray.GPUArray
        Right-hand side vector of length `m`.
    uplo : {'L', 'U'}
        Use the upper ('U') or lower ('L', default) triangle of `a`.

    Returns
    -------
    None. The solve is performed in place: `b` is overwritten with the
    solution `inv(a)*b`, and `a` with its factorised triangle.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[3.0, 0.0], [0.0, 7.0]])
    >>> a = np.asarray(a, np.float64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b = np.array([11., 19.])
    >>> b = np.asarray(b, np.float64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> cho_solve(a_gpu, b_gpu)

    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    data_type = a_gpu.dtype.type
    real_type = np.float32
    if cula._libcula_toolkit == 'standard':
        # Use the combined factor-and-solve (posv) routines, whose
        # signature matches the call below:
        if data_type == np.complex64:
            cula_func = cula._libcula.culaDeviceCposv
        elif data_type == np.float32:
            cula_func = cula._libcula.culaDeviceSposv
        elif data_type == np.complex128:
            cula_func = cula._libcula.culaDeviceZposv
            real_type = np.float64
        elif data_type == np.float64:
            cula_func = cula._libcula.culaDeviceDposv
            real_type = np.float64
        else:
            raise ValueError('unsupported type')
    else:
        raise ValueError('Cholesky solver not included in CULA '
                         'Dense Free version')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed:
    na, ma = a_gpu.shape
    if na != ma:
        raise ValueError('Matrix must be square '
                         '(and symmetric positive-definite)')

    # Set the leading dimensions of the input matrices:
    lda = max(1, ma)
    ldb = lda

    # Assuming we are only solving for a vector, nrhs = 1:
    status = cula_func(uplo, na, 1, int(a_gpu.gpudata), lda,
                       int(b_gpu.gpudata), ldb)
    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()
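
# Illustrative aside (not part of the CULA wrapper): a minimal CPU-side
# SciPy sketch of what cho_solve() computes, using the numbers from the
# docstring example. The helper name `_cho_solve_demo` is hypothetical.
# The GPU routine overwrites `b` with x = inv(a)*b; SciPy returns x
# instead of overwriting.
def _cho_solve_demo():
    import numpy as np
    import scipy.linalg
    a = np.array([[3.0, 0.0],
                  [0.0, 7.0]], np.float64)
    b = np.array([11.0, 19.0], np.float64)

    # Factor a, then solve a*x = b using the factorisation:
    c, low = scipy.linalg.cho_factor(a)
    x = scipy.linalg.cho_solve((c, low), b)
    return np.allclose(x, [11.0 / 3.0, 19.0 / 7.0])  # True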