def culaDeviceCgemv(trans, m, n, alpha, A, lda, x, incx, beta, y, incy):
    """
    Matrix-vector product for complex general matrix.

    """

    status = _libcula.culaDeviceCgemv(trans, m, n,
                                      cuda.cuFloatComplex(alpha.real,
                                                          alpha.imag),
                                      int(A), lda, int(x), incx,
                                      cuda.cuFloatComplex(beta.real,
                                                          beta.imag),
                                      int(y), incy)
    culaCheckStatus(status)
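# Hedged usage sketch, not part of the original module: the helper name
# _example_cgemv, the pycuda imports, and a prior culaInitialize() call are
# assumptions.  CULA expects column-major storage, so the row-major numpy
# buffer is passed with trans='T' and swapped dimensions, which makes the
# routine compute y = a*x.
def _example_cgemv():
    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    m, n = 4, 3
    a = np.asarray(np.random.rand(m, n) + 1j*np.random.rand(m, n), np.complex64)
    x = np.asarray(np.random.rand(n) + 1j*np.random.rand(n), np.complex64)
    y = np.zeros(m, np.complex64)

    a_gpu = gpuarray.to_gpu(a)
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)

    # a's row-major buffer, read column-major with lda=n, is a.T; asking for
    # its transpose ('T') therefore yields y = 1*a*x + 0*y.
    culaDeviceCgemv('T', n, m, 1+0j,
                    int(a_gpu.gpudata), n,
                    int(x_gpu.gpudata), 1, 0+0j,
                    int(y_gpu.gpudata), 1)
    return np.allclose(a.dot(x), y_gpu.get())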
def culaDeviceCgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc):
    """
    Matrix-matrix product for complex general matrix.

    """

    status = _libcula.culaDeviceCgemm(transa, transb, m, n, k,
                                      cuda.cuFloatComplex(alpha.real,
                                                          alpha.imag),
                                      int(A), lda, int(B), ldb,
                                      cuda.cuFloatComplex(beta.real,
                                                          beta.imag),
                                      int(C), ldc)
    culaCheckStatus(status)
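# Hedged usage sketch along the same lines (not part of the original module;
# the helper name _example_cgemm, the pycuda imports, and prior CULA
# initialization are assumptions).  It shows the usual trick for driving a
# column-major GEMM from row-major numpy arrays: pass the operands in reverse
# order so that B^T*A^T = (A*B)^T lands in memory as the row-major product.
def _example_cgemm():
    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    m, k, n = 4, 3, 2
    a = np.asarray(np.random.rand(m, k) + 1j*np.random.rand(m, k), np.complex64)
    b = np.asarray(np.random.rand(k, n) + 1j*np.random.rand(k, n), np.complex64)
    c = np.zeros((m, n), np.complex64)

    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.to_gpu(c)

    # Operands are swapped: CULA computes b.T*a.T = (a*b).T in column-major
    # memory, which is a*b when read back row-major.
    culaDeviceCgemm('N', 'N', n, m, k, 1+0j,
                    int(b_gpu.gpudata), n,
                    int(a_gpu.gpudata), k, 0+0j,
                    int(c_gpu.gpudata), n)
    return np.allclose(a.dot(b), c_gpu.get())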
def magma_caxpy(n, alpha, dx, incx, dy, incy):
    """
    Vector addition.

    """

    _libmagma.magma_caxpy(n, ctypes.byref(cuda.cuFloatComplex(alpha.real,
                                                              alpha.imag)),
                          int(dx), incx, int(dy), incy)
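# Hedged usage sketch for the wrapper above (not part of the original module;
# the helper name _example_caxpy, the pycuda imports, and a prior magma_init()
# call are assumptions).  It computes y <- alpha*x + y on the GPU and checks
# the result against numpy.
def _example_caxpy():
    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    x = np.asarray(np.random.rand(5) + 1j*np.random.rand(5), np.complex64)
    y = np.asarray(np.random.rand(5) + 1j*np.random.rand(5), np.complex64)
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)

    alpha = 2+1j
    magma_caxpy(len(x), alpha, int(x_gpu.gpudata), 1, int(y_gpu.gpudata), 1)
    return np.allclose(alpha*x + y, y_gpu.get())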
def dot(x_gpu, y_gpu, transa='N', transb='N'):
    """
    Dot product of two arrays.

    For 1D arrays, this function computes the inner product. For 2D
    arrays of shapes `(m, k)` and `(k, n)`, it computes the matrix
    product; the result has shape `(m, n)`.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.
    y_gpu : pycuda.gpuarray.GPUArray
        Input array.
    transa : char
        If 'T', compute the product of the transpose of `x_gpu`.
        If 'C', compute the product of the Hermitian of `x_gpu`.
    transb : char
        If 'T', compute the product of the transpose of `y_gpu`.
        If 'C', compute the product of the Hermitian of `y_gpu`.

    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray, float{32,64}, or complex{64,128}
        Dot product of `x_gpu` and `y_gpu`. When the inputs are 1D
        arrays, the result will be returned as a scalar.

    Notes
    -----
    The input matrices must all contain elements of the same data type.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> import misc
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(4, 2), np.float32)
    >>> b = np.asarray(np.random.rand(2, 2), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> c_gpu = linalg.dot(a_gpu, b_gpu)
    >>> np.allclose(np.dot(a, b), c_gpu.get())
    True
    >>> d = np.asarray(np.random.rand(5), np.float32)
    >>> e = np.asarray(np.random.rand(5), np.float32)
    >>> d_gpu = gpuarray.to_gpu(d)
    >>> e_gpu = gpuarray.to_gpu(e)
    >>> f = linalg.dot(d_gpu, e_gpu)
    >>> np.allclose(np.dot(d, e), f)
    True

    """

    if len(x_gpu.shape) == 1 and len(y_gpu.shape) == 1:
        if x_gpu.size != y_gpu.size:
            raise ValueError('arrays must be of same length')

        # Compute inner product for 1D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCdotu
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSdot
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZdotu
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDdot
        else:
            raise ValueError('unsupported combination of input types')

        result = cublas_func(x_gpu.size, int(x_gpu.gpudata), 1,
                             int(y_gpu.gpudata), 1)

        if x_gpu.dtype == np.complex64:
            return np.float32(result.x) + 1j*np.float32(result.y)
        elif x_gpu.dtype == np.complex128:
            return np.float64(result.x) + 1j*np.float64(result.y)
        elif x_gpu.dtype == np.float32:
            return np.float32(result)
        else:
            return np.float64(result)
    else:
        # Perform matrix multiplication for 2D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCgemm
            alpha = cuda.cuFloatComplex(1, 0)
            beta = cuda.cuFloatComplex(0, 0)
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSgemm
            alpha = np.float32(1.0)
            beta = np.float32(0.0)
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZgemm
            alpha = cuda.cuDoubleComplex(1, 0)
            beta = cuda.cuDoubleComplex(0, 0)
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDgemm
            alpha = np.float64(1.0)
            beta = np.float64(0.0)
        else:
            raise ValueError('unsupported combination of input types')

        transa = transa.lower()
        transb = transb.lower()

        if transb in ['t', 'c']:
            m, k = y_gpu.shape
        elif transb in ['n']:
            k, m = y_gpu.shape
        else:
            raise ValueError('invalid value for transb')

        if transa in ['t', 'c']:
            l, n = x_gpu.shape
        elif transa in ['n']:
            n, l = x_gpu.shape
        else:
            raise ValueError('invalid value for transa')

        if l != k:
            raise ValueError('objects are not aligned')

        if transb == 'n':
            lda = max(1, m)
        else:
            lda = max(1, k)

        if transa == 'n':
            ldb = max(1, k)
        else:
            ldb = max(1, n)

        ldc = max(1, m)

        # Note that the desired shape of the output matrix is the transpose
        # of what CUBLAS assumes:
        c_gpu = gpuarray.empty((n, ldc), x_gpu.dtype)
        cublas_func(transb, transa, m, n, k, alpha, int(y_gpu.gpudata),
                    lda, int(x_gpu.gpudata), ldb, beta, int(c_gpu.gpudata),
                    ldc)

        status = cublas.cublasGetError()
        cublas.cublasCheckStatus(status)

        return c_gpu
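# A host-side sketch (hypothetical helper, numpy only, not part of the original
# module) of the layout identity behind the operand swap in dot() above: a
# C-ordered buffer read column-major is the transpose of the array, and
# (A*B)^T = B^T*A^T, so handing column-major CUBLAS the operands in reverse
# order leaves the row-major product in the result buffer.
def _why_operands_are_swapped():
    import numpy as np
    a = np.arange(6, dtype=np.float32).reshape(2, 3)
    b = np.arange(12, dtype=np.float32).reshape(3, 4)

    # 1) The row-major bytes of `a`, reinterpreted column-major, give a.T:
    assert np.allclose(a.ravel(order='C').reshape(a.T.shape, order='F'), a.T)

    # 2) Computing b.T*a.T (what column-major CUBLAS sees) equals (a*b)^T,
    #    whose column-major storage reads back row-major as the matrix a*b:
    assert np.allclose(np.dot(b.T, a.T), np.dot(a, b).T)
    return True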
def dot(a_gpu, b_gpu):
    """
    Matrix product of two arrays.

    For 1D arrays, this function computes the inner product. For 2D
    arrays of shapes `(m, k)` and `(k, n)`, it computes the matrix
    product; the result has shape `(m, n)`.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input array.
    b_gpu : pycuda.gpuarray.GPUArray
        Input array.

    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray
        Dot product of `a_gpu` and `b_gpu`.

    Notes
    -----
    The input matrices must all contain elements of the same data type.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(4, 2), np.float32)
    >>> b = np.asarray(np.random.rand(2, 2), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> c_gpu = linalg.dot(a_gpu, b_gpu)
    >>> np.allclose(np.dot(a, b), c_gpu.get())
    True
    >>> d = np.asarray(np.random.rand(5), np.float32)
    >>> e = np.asarray(np.random.rand(5), np.float32)
    >>> d_gpu = gpuarray.to_gpu(d)
    >>> e_gpu = gpuarray.to_gpu(e)
    >>> f = linalg.dot(d_gpu, e_gpu)
    >>> np.allclose(np.dot(d, e), f)
    True
    >>> p = np.asarray(np.random.rand(4, 2), np.complex64)
    >>> q = np.asarray(np.random.rand(2, 2), np.complex64)
    >>> p_gpu = gpuarray.to_gpu(p)
    >>> q_gpu = gpuarray.to_gpu(q)
    >>> r_gpu = linalg.dot(p_gpu, q_gpu)
    >>> np.allclose(np.dot(p, q), r_gpu.get())
    True
    >>> s = np.asarray(np.random.rand(5), np.complex128)
    >>> t = np.asarray(np.random.rand(5), np.complex128)
    >>> s_gpu = gpuarray.to_gpu(s)
    >>> t_gpu = gpuarray.to_gpu(t)
    >>> u = linalg.dot(s_gpu, t_gpu)
    >>> np.allclose(np.dot(s, t), u)
    True

    """

    if len(a_gpu.shape) == 1 and len(b_gpu.shape) == 1:

        # Compute inner product for 1D arrays:
        if (a_gpu.dtype == np.complex64 and b_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCdotu
        elif (a_gpu.dtype == np.float32 and b_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSdot
        elif (a_gpu.dtype == np.complex128 and b_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZdotu
        elif (a_gpu.dtype == np.float64 and b_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDdot
        else:
            raise ValueError('unsupported combination of input types')

        result = cublas_func(a_gpu.size, int(a_gpu.gpudata), 1,
                             int(b_gpu.gpudata), 1)

        if a_gpu.dtype == np.complex64:
            return np.float32(result.x) + 1j*np.float32(result.y)
        elif a_gpu.dtype == np.complex128:
            return np.float64(result.x) + 1j*np.float64(result.y)
        elif a_gpu.dtype == np.float32:
            return np.float32(result)
        else:
            return np.float64(result)
    else:
        # Perform matrix multiplication for 2D arrays:
        if (a_gpu.dtype == np.complex64 and b_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCgemm
            alpha = cuda.cuFloatComplex(1, 0)
            beta = cuda.cuFloatComplex(0, 0)
        elif (a_gpu.dtype == np.float32 and b_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSgemm
            alpha = np.float32(1.0)
            beta = np.float32(0.0)
        elif (a_gpu.dtype == np.complex128 and b_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZgemm
            alpha = cuda.cuDoubleComplex(1, 0)
            beta = cuda.cuDoubleComplex(0, 0)
        elif (a_gpu.dtype == np.float64 and b_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDgemm
            alpha = np.float64(1.0)
            beta = np.float64(0.0)
        else:
            raise ValueError('unsupported combination of input types')

        transa = 'N'
        transb = 'N'
        m = b_gpu.shape[1]
        n = a_gpu.shape[0]
        k = b_gpu.shape[0]
        lda = m
        ldb = k
        ldc = max(1, m)

        # Swap the operands so that column-major CUBLAS returns the row-major
        # product expected by the caller:
        c_gpu = gpuarray.empty((a_gpu.shape[0], b_gpu.shape[1]), a_gpu.dtype)
        cublas_func(transb, transa, m, n, k, alpha, int(b_gpu.gpudata),
                    lda, int(a_gpu.gpudata), ldb, beta, int(c_gpu.gpudata),
                    ldc)

        status = cublas.cublasGetError()
        cublas.cublasCheckStatus(status)

        return c_gpu