import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context on import
import pycuda.gpuarray as gpuarray
import scikits.cuda.cula as cula  # CULA wrappers (skcuda.cula in newer releases)


def cuda_qr():
    """QR factorization of a small float32 matrix via CULA's Sgeqrf."""
    cula.culaInitialize()

    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
    n = a.shape[0]
    m = a.shape[1]
    lda = a.shape[1]

    # tau receives the scalar factors of the elementary reflectors; it
    # must match the matrix dtype (float32), not an integer type.
    tau = np.empty(min(m, n), dtype=np.float32)
    tau_gpu = gpuarray.to_gpu(tau)
    a_gpu = gpuarray.to_gpu(a)

    # Single-precision QR; use culaDeviceDgeqrf for float64 input.
    cula.culaDeviceSgeqrf(m, n, a_gpu.gpudata, lda, tau_gpu.gpudata)

    print(a_gpu.get())  # R and the Householder reflectors in geqrf's packed layout
    print(tau_gpu.get())

    cula.culaShutdown()
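# A minimal CPU cross-check for cuda_qr(), assuming the same 3x3 input.
# Because the C-ordered buffer is handed to a column-major routine with
# m = n = lda = 3, geqrf actually factors a.T, and R comes back in the
# upper triangle of the column-major result, i.e. the lower triangle of
# the printed array, transposed.
def check_cuda_qr_against_numpy():
    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
    q_ref, r_ref = np.linalg.qr(a.T)
    # Compare np.abs(r_ref) with np.abs(np.tril(out).T), where `out`
    # (hypothetical name) is the first array printed by cuda_qr();
    # absolute values are used because reflector signs may differ.
    print(r_ref)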
def make_thunk(self, node, storage_map, _, no_recycling=[]):
    # Initialize CULA the first time it is needed.
    global cula_initialized

    if not cula_available:
        raise RuntimeError('CULA is not available and the '
                           'GpuSolve Op cannot be constructed.')

    if not cula_initialized:
        cula.culaInitialize()
        cula_initialized = True

    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    def thunk():
        # Output storage for the solution
        z = outputs[0]

        # Matrix
        A = inputs[0][0]

        # Solution vectors
        b = inputs[1][0]

        # A is not explicitly converted between C and F order; instead we
        # switch the "transpose" flag.
        if self.trans in ('T', 'C'):
            trans = 'N'
        else:
            trans = 'T'

        # Convert b to F-order from C-order.
        b_cpy = dimshuffle(b, (1, 0)).reshape((b.shape[0], b.shape[1]))

        # This copy forces allocation of a new C-contiguous buffer
        # and returns it.
        A_cpy = A.copy()
        b_cpy = b_cpy.copy()

        def cula_gpu_solve(A_, b_, trans='T'):
            A_shape = A_.shape
            b_shape = b_.shape

            assert len(A_shape) == 2
            assert len(b_shape) == 2

            if trans in ['T', 'C']:
                l, n = A_shape
                k, m = b_shape
                if n != k:
                    raise ValueError('A and b must be aligned.')
            elif trans in ['N']:
                n, l = A_shape
                k, m = b_shape
                if l != m:
                    raise ValueError('A and b must be aligned.')
            else:
                raise ValueError('Invalid value for trans')

            lda = max(1, n)
            ldb = max(1, n, l)

            # Construct the device pointers needed for culaDeviceSgels:
            # CULA requires a pointer for both A and b.
            A_ptr = A_.gpudata
            b_ptr = b_.gpudata

            cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb)
            return A_, b_

        A_pycuda, b_pycuda = cula_gpu_solve(A_cpy, b_cpy, trans)

        # Convert b back to C-order from F-order and assign it to the output:
        b_cpy = b_cpy.reshape(b.shape[::-1])
        b_cpy = dimshuffle(b_cpy, (1, 0))
        z[0] = b_cpy

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False

    return thunk
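# The thunk above avoids an explicit C-to-F-order copy of A by flipping
# the `trans` flag: a row-major buffer reinterpreted as column-major is
# exactly A.T, so requesting the transposed solve from gels recovers the
# original system. A small numpy illustration of that identity (a sketch,
# not part of the Op):
import numpy as np

A = np.arange(1, 7, dtype=np.float32).reshape(3, 2)
b = np.array([[1.0], [2.0], [3.0]], dtype=np.float32)

M = A.T  # what a column-major routine sees in A's C-ordered buffer
x_direct = np.linalg.lstsq(A, b, rcond=None)[0]           # solve A x = b directly
x_via_trans = np.linalg.lstsq(M.T, b, rcond=None)[0]      # the "trans='T'" solve of M
assert np.allclose(x_direct, x_via_trans)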
def eig_sym(G, compute_z=True, uplo='U'):
    """
    Compute the eigenvalue decomposition of a symmetric or Hermitian
    matrix G:

    G = V D V^{*}

    Parameters
    ----------
    G : PitchArray, GPUArray or numpy.ndarray
        if G is GPUArray or PitchArray, its gpudata will be destroyed
        after calling the function
    compute_z : bool
        whether to return the eigenvectors
    uplo : str
        'U' or 'u': the entries of G are stored in its upper triangular
        part; the strictly lower triangular part is not referenced.
        'L' or 'l': the entries of G are stored in its lower triangular
        part; the strictly upper triangular part is not referenced.

    Returns
    -------
    D : PitchArray
        a row vector containing all eigenvalues in ascending order
    V : PitchArray
        if compute_z, the jth column of V contains the orthonormal
        eigenvector associated with the jth eigenvalue

    Examples
    --------
    D = eig_sym(G, compute_z=False)
    D, V = eig_sym(G, compute_z=True)
    """
    if cula._libcula_toolkit != 'premium':
        raise ValueError("eigenvalue decomposition is only supported "
                         "in the premium version of CULA")

    if G.__class__ is not parray.PitchArray:
        if G.__class__ is garray.GPUArray:
            h_G = G.get()
            del G.gpudata
            A = parray.to_gpu(h_G)
        elif G.__class__ is np.ndarray:
            A = parray.to_gpu(G)
        else:
            raise TypeError("G must be either parray, GPUArray or ndarray")
    else:
        A = G

    if len(A.shape) != 2:
        raise TypeError("eig only works on a 2D matrix")

    if A.shape[0] != A.shape[1]:
        raise ValueError("G must be a square matrix")

    # PitchArray is row-major while CULA expects column-major storage,
    # so the routine effectively operates on A.T; swapping 'U' and 'L'
    # compensates for the implicit transpose.
    if uplo in ['u', 'U']:
        uplo = 'L'
    elif uplo in ['l', 'L']:
        uplo = 'U'
    else:
        raise ValueError("uplo must be 'U' or 'L'")

    real_dtype = np.dtype(np.float32)
    if A.dtype == np.complex64:
        eig_func = cula.culaDeviceCheev
    elif A.dtype == np.float32:
        eig_func = cula.culaDeviceSsyev
    else:
        if A.dtype == np.complex128:
            eig_func = cula.culaDeviceZheev
        elif A.dtype == np.float64:
            eig_func = cula.culaDeviceDsyev
        else:
            raise ValueError('unsupported type')
        real_dtype = np.dtype(np.float64)

    D = parray.empty(A.shape[0], real_dtype)
    cula.culaInitialize()
    handle = cublashandle()

    jobz = 'V' if compute_z else 'N'

    eig_func(handle.handle, jobz, uplo, A.shape[0],
             A.gpudata, A.ld, D.gpudata)

    # Note: culaShutdown() is deliberately not called here.
    if compute_z:
        return D, A.conj().T()
    else:
        return D
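# A quick sanity check of eig_sym() against numpy.linalg.eigh; a sketch
# assuming a premium CULA install and that PitchArray exposes .get()
# like GPUArray (the names below are illustrative only).
def check_eig_sym_against_numpy():
    X = np.random.rand(4, 4).astype(np.float32)
    G = X + X.T  # symmetric test matrix

    D, V = eig_sym(G.copy(), compute_z=True, uplo='U')  # G's buffer is consumed
    d_ref = np.linalg.eigh(G)[0]  # CPU reference, also in ascending order

    print(np.allclose(D.get().ravel(), d_ref, atol=1e-4))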
def svd(G, compute_u=True, compute_v=True, econ=False):
    """
    Compute the Singular Value Decomposition of G:

    G = U * diag(S) * V

    Parameters
    ----------
    G : PitchArray, GPUArray or numpy.ndarray of shape (m, n)
        if G is GPUArray or PitchArray, its gpudata will be destroyed
        after calling the function
    compute_u : bool
        whether to return the U matrix
    compute_v : bool
        whether to return the V matrix
    econ : bool
        whether to return the economy-size factors

    Returns
    -------
    U : parray.PitchArray
        matrix U in G = U*diag(S)*V;
        if econ, the first min(m, n) columns of U
    S : parray.PitchArray
        a row vector containing all singular values in descending order
    V : parray.PitchArray
        matrix V in G = U*diag(S)*V;
        if econ, the first min(m, n) rows of V

    The outputs always obey the order U, S, V, e.g.

    S = svd(G, compute_u=False, compute_v=False)
    U, S = svd(G, compute_u=True, compute_v=False)
    S, V = svd(G, compute_u=False, compute_v=True)
    U, S, V = svd(G, compute_u=True, compute_v=True)
    """
    if G.__class__ is not parray.PitchArray:
        if G.__class__ is garray.GPUArray:
            h_G = G.get()
            del G.gpudata
            A = parray.to_gpu(h_G)
        elif G.__class__ is np.ndarray:
            A = parray.to_gpu(G)
        else:
            raise TypeError("G must be either parray, GPUArray or ndarray")
    else:
        A = G

    real_dtype = np.dtype(np.float32)
    if A.dtype == np.complex64:
        svd_func = cula.culaDeviceCgesvd
    elif A.dtype == np.float32:
        svd_func = cula.culaDeviceSgesvd
    else:
        if cula._libcula_toolkit == 'standard':
            if A.dtype == np.complex128:
                svd_func = cula.culaDeviceZgesvd
            elif A.dtype == np.float64:
                svd_func = cula.culaDeviceDgesvd
            else:
                raise ValueError('unsupported type')
            real_dtype = np.dtype(np.float64)
        else:
            raise TypeError('double precision SVD is not supported '
                            'in the premium version of CULA')

    if len(A.shape) != 2:
        raise TypeError("svd only works on a 2D matrix")

    S = parray.empty(min(A.shape), real_dtype)
    cula.culaInitialize()
    # Note: culaShutdown() is deliberately not called here.

    # PitchArray is row-major while gesvd expects column-major storage,
    # so the routine operates on the implicit transpose A.T; this is why
    # the m and n arguments are swapped and the roles of U and V are
    # exchanged in the calls below.
    if compute_u:
        if compute_v:
            if econ:
                if A.shape[1] <= A.shape[0]:
                    jobu = 'A'
                    jobvt = 'O'
                    V = parray.empty((A.shape[1], A.shape[1]), A.dtype)
                    svd_func(jobu, jobvt, A.shape[1], A.shape[0],
                             A.gpudata, A.ld, S.gpudata,
                             V.gpudata, V.ld, 1, 1)
                    return A, S, V
                else:
                    jobu = 'O'
                    jobvt = 'A'
                    U = parray.empty((A.shape[0], A.shape[0]), A.dtype)
                    svd_func(jobu, jobvt, A.shape[1], A.shape[0],
                             A.gpudata, A.ld, S.gpudata,
                             1, 1, U.gpudata, U.ld)
                    return U, S, A
            else:
                if A.shape[1] <= A.shape[0]:
                    jobu = 'O'
                    jobvt = 'A'
                    U = parray.empty((A.shape[0], A.shape[0]), A.dtype)
                    svd_func(jobu, jobvt, A.shape[1], A.shape[0],
                             A.gpudata, A.ld, S.gpudata,
                             1, 1, U.gpudata, U.ld)
                    A.shape = (A.shape[1], A.shape[1])
                    return U, S, A
                else:
                    jobu = 'A'
                    jobvt = 'O'
                    V = parray.empty((A.shape[1], A.shape[1]), A.dtype)
                    svd_func(jobu, jobvt, A.shape[1], A.shape[0],
                             A.gpudata, A.ld, S.gpudata,
                             V.gpudata, V.ld, 1, 1)
                    A.shape = (A.shape[0], A.shape[0])
                    return A, S, V
        else:
            if econ or (A.shape[1] >= A.shape[0]):
                jobu = 'N'
                jobvt = 'O'
                svd_func(jobu, jobvt, A.shape[1], A.shape[0],
                         A.gpudata, A.ld, S.gpudata, 1, 1, 1, 1)
                if A.shape[1] > A.shape[0]:
                    A.shape = (A.shape[0], A.shape[0])
                return A, S
            else:
                jobu = 'N'
                jobvt = 'A'
                U = parray.empty((A.shape[0], A.shape[0]), A.dtype)
                svd_func(jobu, jobvt, A.shape[1], A.shape[0],
                         A.gpudata, A.ld, S.gpudata,
                         1, 1, U.gpudata, U.ld)
                return U, S
    else:
        if compute_v:
            if econ or (A.shape[1] <= A.shape[0]):
                jobu = 'O'
                jobvt = 'N'
                svd_func(jobu, jobvt, A.shape[1], A.shape[0],
                         A.gpudata, A.ld, S.gpudata, 1, 1, 1, 1)
                if A.shape[1] < A.shape[0]:
                    A.shape = (A.shape[1], A.shape[1])
                return S, A
            else:
                jobu = 'A'
                jobvt = 'N'
                V = parray.empty((A.shape[1], A.shape[1]), A.dtype)
                svd_func(jobu, jobvt, A.shape[1], A.shape[0],
                         A.gpudata, A.ld, S.gpudata,
                         V.gpudata, V.ld, 1, 1)
                return S, V
        else:
            jobu = 'N'
            jobvt = 'N'
            svd_func(jobu, jobvt, A.shape[1], A.shape[0],
                     A.gpudata, A.ld, S.gpudata, 1, 1, 1, 1)
            return S
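# A singular-value sanity check for svd() against numpy; a sketch
# assuming svd() is importable from this module and that PitchArray
# exposes .get() like GPUArray.
def check_svd_against_numpy():
    G = np.random.rand(5, 3).astype(np.float32)

    s_ref = np.linalg.svd(G, compute_uv=False)  # descending order
    S = svd(G.copy(), compute_u=False, compute_v=False)

    print(np.allclose(S.get().ravel(), s_ref, atol=1e-4))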