Example #1
def culaDeviceCgemv(trans, m, n, alpha, A, lda, x, incx, beta, y, incy):
    """
    Matrix-vector product for complex general matrix.

    """

    status = _libcula.culaDeviceCgemv(
        trans, m, n, cuda.cuFloatComplex(alpha.real, alpha.imag), int(A), lda,
        int(x), incx, cuda.cuFloatComplex(beta.real, beta.imag), int(y), incy)
    culaCheckStatus(status)
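A minimal usage sketch for the wrapper above (an illustration, not part of the original listing): it assumes a working CULA installation, pycuda, and that the wrapper is importable; CULA expects column-major (Fortran-ordered) data and typically has to be initialized first (e.g. via culaInitialize()).

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

m, n = 4, 3
a = np.asarray(np.random.rand(m, n) + 1j*np.random.rand(m, n), np.complex64)
x = np.asarray(np.random.rand(n) + 1j*np.random.rand(n), np.complex64)

# Copy the operands to the device; A must be column-major for CULA:
a_gpu = gpuarray.to_gpu(np.asfortranarray(a))
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.zeros(m, np.complex64)

# y = 1*A*x + 0*y; 'N' means no transpose (the exact flag representation
# depends on the ctypes binding), and lda = m is the leading dimension of A.
culaDeviceCgemv('N', m, n, 1+0j, a_gpu.gpudata, m,
                x_gpu.gpudata, 1, 0+0j, y_gpu.gpudata, 1)
assert np.allclose(np.dot(a, x), y_gpu.get(), atol=1e-5)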
Example #2
def culaDeviceCgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C,
                    ldc):
    """
    Matrix-matrix product for complex general matrix.

    """

    status = _libcula.culaDeviceCgemm(
        transa, transb, m, n, k, cuda.cuFloatComplex(alpha.real, alpha.imag),
        int(A), lda, int(B), ldb, cuda.cuFloatComplex(beta.real, beta.imag),
        int(C), ldc)
    culaCheckStatus(status)
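A similar usage sketch for culaDeviceCgemm (an illustration, not from the original module), under the same assumptions: an initialized CULA context and column-major device arrays.

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

m, k, n = 4, 3, 2
a = np.asarray(np.random.rand(m, k) + 1j*np.random.rand(m, k), np.complex64)
b = np.asarray(np.random.rand(k, n) + 1j*np.random.rand(k, n), np.complex64)

a_gpu = gpuarray.to_gpu(np.asfortranarray(a))
b_gpu = gpuarray.to_gpu(np.asfortranarray(b))
c_gpu = gpuarray.zeros((m, n), np.complex64, order='F')

# C = 1*A*B + 0*C; the leading dimensions are the row counts of the
# column-major operands: lda = m, ldb = k, ldc = m.
culaDeviceCgemm('N', 'N', m, n, k, 1+0j, a_gpu.gpudata, m,
                b_gpu.gpudata, k, 0+0j, c_gpu.gpudata, m)
assert np.allclose(np.dot(a, b), c_gpu.get(), atol=1e-5)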
Example #3
def culaDeviceCgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc):
    """
    Matrix-matrix product for complex general matrix.

    """
    
    status = _libcula.culaDeviceCgemm(transa, transb, m, n, k,
                                      cuda.cuFloatComplex(alpha.real,
                                                          alpha.imag),
                                      int(A), lda, int(B), ldb,
                                      cuda.cuFloatComplex(beta.real,
                                                          beta.imag),
                                      int(C), ldc)
    culaCheckStatus(status)
Example #4
def culaDeviceCgemv(trans, m, n, alpha, A, lda, x, incx, beta, y, incy):
    """
    Matrix-vector product for complex general matrix.

    """
    
    status = _libcula.culaDeviceCgemv(trans, m, n,
                           cuda.cuFloatComplex(alpha.real,
                                               alpha.imag),
                           int(A), lda, int(x), incx,
                           cuda.cuFloatComplex(beta.real,
                                               beta.imag),
                           int(y), incy)
    culaCheckStatus(status)
Example #5
def magma_caxpy(n, alpha, dx, incx, dy, incy):
    """
    Vector addition.
    """

    _libmagma.magma_caxpy(n, ctypes.byref(cuda.cuFloatComplex(alpha.real,
                                                              alpha.imag)), 
                          int(dx), incx, int(dy), incy)
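A minimal usage sketch (an assumption, not from the original listing): it requires a MAGMA build and pycuda, and MAGMA generally has to be initialized beforehand (e.g. via magma_init()). It computes dy = alpha*dx + dy for complex64 vectors already resident on the GPU.

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

n = 5
x = np.asarray(np.random.rand(n) + 1j*np.random.rand(n), np.complex64)
y = np.asarray(np.random.rand(n) + 1j*np.random.rand(n), np.complex64)
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)

alpha = 2 - 1j
magma_caxpy(n, alpha, x_gpu.gpudata, 1, y_gpu.gpudata, 1)
assert np.allclose(alpha*x + y, y_gpu.get(), atol=1e-5)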
Example #6
def dot(x_gpu, y_gpu, transa='N', transb='N'):
    """
    Dot product of two arrays.

    For 1D arrays, this function computes the inner product. For 2D
    arrays of shapes `(m, k)` and `(k, n)`, it computes the matrix
    product; the result has shape `(m, n)`.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.
    y_gpu : pycuda.gpuarray.GPUArray
        Input array.
    transa : char
        If 'T', compute the product of the transpose of `x_gpu`.
        If 'C', compute the product of the Hermitian of `x_gpu`.
    transb : char
        If 'T', compute the product of the transpose of `y_gpu`.
        If 'C', compute the product of the Hermitian of `y_gpu`.

    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray, float{32,64}, or complex{64,128}
        Dot product of `x_gpu` and `y_gpu`. For 2D inputs this is the matrix
        product as a GPUArray; for 1D inputs the inner product is returned
        as a scalar.
    
    Notes
    -----
    The input matrices must all contain elements of the same data type.
    
    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> import misc
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(4, 2), np.float32)
    >>> b = np.asarray(np.random.rand(2, 2), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> c_gpu = linalg.dot(a_gpu, b_gpu)
    >>> np.allclose(np.dot(a, b), c_gpu.get())
    True
    >>> d = np.asarray(np.random.rand(5), np.float32)
    >>> e = np.asarray(np.random.rand(5), np.float32)
    >>> d_gpu = gpuarray.to_gpu(d)
    >>> e_gpu = gpuarray.to_gpu(e)
    >>> f = linalg.dot(d_gpu, e_gpu)
    >>> np.allclose(np.dot(d, e), f)
    True
    
    """

    if len(x_gpu.shape) == 1 and len(y_gpu.shape) == 1:

        if x_gpu.size != y_gpu.size:
            raise ValueError('arrays must be of same length')

        # Compute inner product for 1D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCdotu
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSdot
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZdotu
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDdot
        else:
            raise ValueError('unsupported combination of input types')

        result = cublas_func(x_gpu.size, int(x_gpu.gpudata), 1,
                             int(y_gpu.gpudata), 1)

        if x_gpu.dtype == np.complex64:
            return np.float32(result.x) + 1j * np.float32(result.y)
        elif x_gpu.dtype == np.complex128:
            return np.float64(result.x) + 1j * np.float64(result.y)
        elif x_gpu.dtype == np.float32:
            return np.float32(result)
        else:
            return np.float64(result)
    else:

        # Perform matrix multiplication for 2D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCgemm
            alpha = cuda.cuFloatComplex(1, 0)
            beta = cuda.cuFloatComplex(0, 0)
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSgemm
            alpha = np.float32(1.0)
            beta = np.float32(0.0)
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZgemm
            alpha = cuda.cuDoubleComplex(1, 0)
            beta = cuda.cuDoubleComplex(0, 0)
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDgemm
            alpha = np.float64(1.0)
            beta = np.float64(0.0)
        else:
            raise ValueError('unsupported combination of input types')

        transa = transa.lower()
        transb = transb.lower()

        if transb in ['t', 'c']:
            m, k = y_gpu.shape
        elif transb in ['n']:
            k, m = y_gpu.shape
        else:
            raise ValueError('invalid value for transb')

        if transa in ['t', 'c']:
            l, n = x_gpu.shape
        elif transa in ['n']:
            n, l = x_gpu.shape
        else:
            raise ValueError('invalid value for transa')

        if l != k:
            raise ValueError('objects are not aligned')

        if transb == 'n':
            lda = max(1, m)
        else:
            lda = max(1, k)

        if transa == 'n':
            ldb = max(1, k)
        else:
            ldb = max(1, n)

        ldc = max(1, m)

        # Note that the desired shape of the output matrix is the transpose
        # of what CUBLAS assumes:
        c_gpu = gpuarray.empty((n, ldc), x_gpu.dtype)
        cublas_func(transb, transa, m, n, k, alpha, int(y_gpu.gpudata), lda,
                    int(x_gpu.gpudata), ldb, beta, int(c_gpu.gpudata), ldc)

        status = cublas.cublasGetError()
        cublas.cublasCheckStatus(status)

        return c_gpu
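The operand swap in the call above works because CUBLAS assumes column-major storage while GPUArrays created from numpy default to row-major: reinterpreting row-major data as column-major transposes it, so asking CUBLAS for B^T * A^T yields (A*B)^T in column-major form, which reads back in row-major order as the desired A*B. A small numpy check of the identity being exploited:

import numpy as np

a = np.random.rand(4, 2)
b = np.random.rand(2, 3)
assert np.allclose(np.dot(b.T, a.T).T, np.dot(a, b))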
Example #7
def dot(x_gpu, y_gpu, transa='N', transb='N'):
    """
    Dot product of two arrays.

    For 1D arrays, this function computes the inner product. For 2D
    arrays of shapes `(m, k)` and `(k, n)`, it computes the matrix
    product; the result has shape `(m, n)`.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.
    y_gpu : pycuda.gpuarray.GPUArray
        Input array.
    transa : char
        If 'T', compute the product of the transpose of `x_gpu`.
        If 'C', compute the product of the Hermitian of `x_gpu`.
    transb : char
        If 'T', compute the product of the transpose of `y_gpu`.
        If 'C', compute the product of the Hermitian of `y_gpu`.

    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray, float{32,64}, or complex{64,128}
        Dot product of `x_gpu` and `y_gpu`. For 2D inputs this is the matrix
        product as a GPUArray; for 1D inputs the inner product is returned
        as a scalar.
    
    Notes
    -----
    The input matrices must all contain elements of the same data type.
    
    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> import misc
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(4, 2), np.float32)
    >>> b = np.asarray(np.random.rand(2, 2), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> c_gpu = linalg.dot(a_gpu, b_gpu)
    >>> np.allclose(np.dot(a, b), c_gpu.get())
    True
    >>> d = np.asarray(np.random.rand(5), np.float32)
    >>> e = np.asarray(np.random.rand(5), np.float32)
    >>> d_gpu = gpuarray.to_gpu(d)
    >>> e_gpu = gpuarray.to_gpu(e)
    >>> f = linalg.dot(d_gpu, e_gpu)
    >>> np.allclose(np.dot(d, e), f)
    True
    
    """

    if len(x_gpu.shape) == 1 and len(y_gpu.shape) == 1:

        if x_gpu.size != y_gpu.size:
            raise ValueError('arrays must be of same length')
        
        # Compute inner product for 1D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCdotu
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSdot
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZdotu
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDdot
        else:
            raise ValueError('unsupported combination of input types')

        result = cublas_func(x_gpu.size, int(x_gpu.gpudata), 1,
                             int(y_gpu.gpudata), 1)

        if x_gpu.dtype == np.complex64:
            return np.float32(result.x)+1j*np.float32(result.y)
        elif x_gpu.dtype == np.complex128:
            return np.float64(result.x)+1j*np.float64(result.y)
        elif x_gpu.dtype == np.float32:
            return np.float32(result)
        else:
            return np.float64(result)
    else:

        # Perform matrix multiplication for 2D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCgemm        
            alpha = cuda.cuFloatComplex(1, 0)
            beta = cuda.cuFloatComplex(0, 0)
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSgemm
            alpha = np.float32(1.0)
            beta = np.float32(0.0)
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZgemm        
            alpha = cuda.cuDoubleComplex(1, 0)
            beta = cuda.cuDoubleComplex(0, 0)
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDgemm
            alpha = np.float64(1.0)
            beta = np.float64(0.0)
        else:
            raise ValueError('unsupported combination of input types')

        transa = transa.lower()
        transb = transb.lower()

        if transb in ['t', 'c']:
            m, k = y_gpu.shape
        elif transb in ['n']:
            k, m = y_gpu.shape
        else:
            raise ValueError('invalid value for transb')

        if transa in ['t', 'c']:
            l, n = x_gpu.shape
        elif transa in ['n']:
            n, l = x_gpu.shape
        else:
            raise ValueError('invalid value for transa')

        if l != k:
            raise ValueError('objects are not aligned')
        
        if transb == 'n':
            lda = max(1, m)
        else:
            lda = max(1, k)
            
        if transa == 'n':
            ldb = max(1, k)
        else:
            ldb = max(1, n)

        ldc = max(1, m)

        # Note that the desired shape of the output matrix is the transpose
        # of what CUBLAS assumes:
        c_gpu = gpuarray.empty((n, ldc), x_gpu.dtype)
        cublas_func(transb, transa, m, n, k, alpha, int(y_gpu.gpudata),
                    lda, int(x_gpu.gpudata), ldb, beta, int(c_gpu.gpudata), ldc)

        status = cublas.cublasGetError()
        cublas.cublasCheckStatus(status)

        return c_gpu
Example #8
def dot(a_gpu, b_gpu):
    """
    Matrix product of two arrays.

    For 1D arrays, this function computes the inner product. For 2D
    arrays of shapes `(m, k)` and `(k, n)`, it computes the matrix
    product; the result has shape `(m, n)`.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input array.
    b_gpu : pycuda.gpuarray.GPUArray
        Input array.
        
    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray
        Dot product of `a_gpu` and `b_gpu`.
    
    Notes
    -----
    The input matrices must all contain elements of the same data type.
    
    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(4, 2), np.float32)
    >>> b = np.asarray(np.random.rand(2, 2), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> c_gpu = linalg.dot(a_gpu, b_gpu)
    >>> np.allclose(np.dot(a, b), c_gpu.get())
    True
    >>> d = np.asarray(np.random.rand(5), np.float32)
    >>> e = np.asarray(np.random.rand(5), np.float32)
    >>> d_gpu = gpuarray.to_gpu(d)
    >>> e_gpu = gpuarray.to_gpu(e)
    >>> f = linalg.dot(d_gpu, e_gpu)
    >>> np.allclose(np.dot(d, e), f)
    True
    >>> p = np.asarray(np.random.rand(4, 2), np.complex64)
    >>> q = np.asarray(np.random.rand(2, 2), np.complex64)
    >>> p_gpu = gpuarray.to_gpu(p)
    >>> q_gpu = gpuarray.to_gpu(q)
    >>> r_gpu = linalg.dot(p_gpu, q_gpu)
    >>> np.allclose(np.dot(p, q), r_gpu.get())
    True
    >>> s = np.asarray(np.random.rand(5), np.complex128)
    >>> t = np.asarray(np.random.rand(5), np.complex128)
    >>> s_gpu = gpuarray.to_gpu(s)
    >>> t_gpu = gpuarray.to_gpu(t)
    >>> u = linalg.dot(s_gpu, t_gpu)
    >>> np.allclose(np.dot(s, t), u)
    True
    
    """

    if len(a_gpu.shape) == 1 and len(b_gpu.shape) == 1:

        # Compute inner product for 1D arrays:
        if (a_gpu.dtype == np.complex64 and b_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCdotu
        elif (a_gpu.dtype == np.float32 and b_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSdot
        elif (a_gpu.dtype == np.complex128 and b_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZdotu
        elif (a_gpu.dtype == np.float64 and b_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDdot
        else:
            raise ValueError('unsupported combination of input types')

        result = cublas_func(a_gpu.size, int(a_gpu.gpudata), 1,
                             int(b_gpu.gpudata), 1)

        if a_gpu.dtype == np.complex64:
            return np.float32(result.x)+1j*np.float32(result.y)
        elif a_gpu.dtype == np.complex128:
            return np.float64(result.x)+1j*np.float64(result.y)
        elif a_gpu.dtype == np.float32:
            return np.float32(result)
        else:
            return np.float64(result)
    else:

        # Perform matrix multiplication for 2D arrays:
        if (a_gpu.dtype == np.complex64 and b_gpu.dtype == np.complex64):
            cublas_func = cublas._libcublas.cublasCgemm        
            alpha = cuda.cuFloatComplex(1, 0)
            beta = cuda.cuFloatComplex(0, 0)
        elif (a_gpu.dtype == np.float32 and b_gpu.dtype == np.float32):
            cublas_func = cublas._libcublas.cublasSgemm
            alpha = np.float32(1.0)
            beta = np.float32(0.0)
        elif (a_gpu.dtype == np.complex128 and b_gpu.dtype == np.complex128):
            cublas_func = cublas._libcublas.cublasZgemm        
            alpha = cuda.cuDoubleComplex(1, 0)
            beta = cuda.cuDoubleComplex(0, 0)
        elif (a_gpu.dtype == np.float64 and b_gpu.dtype == np.float64):
            cublas_func = cublas._libcublas.cublasDgemm
            alpha = np.float64(1.0)
            beta = np.float64(0.0)
        else:
            raise ValueError('unsupported combination of input types')

        transa = 'N'
        transb = 'N'
        m = b_gpu.shape[1]
        n = a_gpu.shape[0]
        k = b_gpu.shape[0]
        lda = m
        ldb = k
        ldc = max(1, m)

        c_gpu = gpuarray.empty((a_gpu.shape[0], b_gpu.shape[1]), a_gpu.dtype)
        cublas_func(transb, transa, m, n, k, alpha, int(b_gpu.gpudata),
                    lda, int(a_gpu.gpudata), ldb, beta, int(c_gpu.gpudata), ldc)

        status = cublas.cublasGetError()
        cublas.cublasCheckStatus(status)

        return c_gpu