Пример #1
0
def dotc(x, y, out=None):
    """Computes the dot product of x.conj() and y."""
    dtype = x.dtype.char
    if dtype in 'fd':
        return dot(x, y, out=out)
    elif dtype == 'F':
        func = cublas.cdotc
    elif dtype == 'D':
        func = cublas.zdotc
    else:
        raise TypeError('invalid dtype')
    _check_two_vectors(x, y)

    handle = device.get_cublas_handle()
    result_dtype = dtype
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, y.data.ptr, 1, result_ptr)
    finally:
        cublas.setPointerMode(handle, orig_mode)

    if out is None:
        out = result
    elif out.dtype != result_dtype:
        _core.elementwise_copy(result, out)
    return out
Пример #2
0
def nrm2(x, out=None):
    """Computes the Euclidean norm of vector x."""
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))

    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.snrm2
    elif dtype == 'd':
        func = cublas.dnrm2
    elif dtype == 'F':
        func = cublas.scnrm2
    elif dtype == 'D':
        func = cublas.dznrm2
    else:
        raise TypeError('invalid dtype')

    handle = device.get_cublas_handle()
    result_dtype = dtype.lower()
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, result_ptr)
    finally:
        cublas.setPointerMode(handle, orig_mode)

    if out is None:
        out = result
    elif out.dtype != result_dtype:
        _core.elementwise_copy(result, out)
    return out
Пример #3
0
def _iamaxmin(x, out, name):
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))

    dtype = x.dtype.char
    if dtype == 'f':
        t = 's'
    elif dtype == 'd':
        t = 'd'
    elif dtype == 'F':
        t = 'c'
    elif dtype == 'D':
        t = 'z'
    else:
        raise TypeError('invalid dtype')
    func = getattr(cublas, 'i' + t + name)

    handle = device.get_cublas_handle()
    result_dtype = 'i'
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, result_ptr)
    finally:
        cublas.setPointerMode(handle, orig_mode)

    if out is None:
        out = result
    elif out.dtype != result_dtype:
        _core.elementwise_copy(result, out)
    return out
Пример #4
0
def axpy(a, x, y):
    """Computes y += a * x.

    (*) y will be updated.
    """
    _check_two_vectors(x, y)

    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.saxpy
    elif dtype == 'd':
        func = cublas.daxpy
    elif dtype == 'F':
        func = cublas.caxpy
    elif dtype == 'D':
        func = cublas.zaxpy
    else:
        raise TypeError('invalid dtype')

    handle = device.get_cublas_handle()
    a, a_ptr, orig_mode = _setup_scalar_ptr(handle, a, dtype)
    try:
        func(handle, x.size, a_ptr, x.data.ptr, 1, y.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, orig_mode)
Пример #5
0
def gerc(alpha, x, y, a):
    """Computes a += alpha * x @ y.T.conj()

    Note: ''a'' will be updated.
    """
    dtype = a.dtype.char
    if dtype in 'fd':
        return ger(alpha, x, y, a)
    elif dtype == 'F':
        func = cublas.cgerc
    elif dtype == 'D':
        func = cublas.zgerc
    else:
        raise TypeError('invalid dtype')
    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    m, n = a.shape
    assert x.shape[0] == m
    assert y.shape[0] == n

    handle = device.get_cublas_handle()
    alpha, alpha_ptr, orig_mode = _setup_scalar_ptr(handle, alpha, dtype)
    x_ptr, y_ptr = x.data.ptr, y.data.ptr
    try:
        if a._f_contiguous:
            func(handle, m, n, alpha_ptr, x_ptr, 1, y_ptr, 1, a.data.ptr, m)
        else:
            aa = a.copy(order='F')
            func(handle, m, n, alpha_ptr, x_ptr, 1, y_ptr, 1, aa.data.ptr, m)
            _core.elementwise_copy(aa, a)
    finally:
        cublas.setPointerMode(handle, orig_mode)
Пример #6
0
def dot(x, y, out=None):
    """Computes the dot product of x and y."""
    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.sdot
    elif dtype == 'd':
        func = cublas.ddot
    elif dtype in 'FD':
        raise TypeError('Use dotu() or dotc() for complex dtype')
    else:
        raise TypeError('invalid dtype')
    _check_two_vectors(x, y)

    handle = device.get_cublas_handle()
    result_dtype = dtype
    result_ptr, result, orig_mode = _setup_result_ptr(handle, out,
                                                      result_dtype)
    func(handle, x.size, x.data.ptr, 1, y.data.ptr, 1, result_ptr)
    cublas.setPointerMode(handle, orig_mode)

    if out is None:
        out = result
    elif out.dtype != result_dtype:
        out[...] = result
    return out
Пример #7
0
def asum(x, out=None):
    """Computes the sum of the absolute of x."""
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))

    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.sasum
    elif dtype == 'd':
        func = cublas.dasum
    elif dtype == 'F':
        func = cublas.scasum
    elif dtype == 'D':
        func = cublas.dzasum
    else:
        raise TypeError('invalid dtype')

    handle = device.get_cublas_handle()
    result_dtype = dtype.lower()
    result_ptr, result, orig_mode = _setup_result_ptr(handle, out,
                                                      result_dtype)
    func(handle, x.size, x.data.ptr, 1, result_ptr)
    cublas.setPointerMode(handle, orig_mode)

    if out is None:
        out = result
    elif out.dtype != result_dtype:
        out[...] = result
    return out
Пример #8
0
def scal(a, x):
    """Computes x *= a.

    (*) x will be updated.
    """
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))

    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.sscal
    elif dtype == 'd':
        func = cublas.dscal
    elif dtype == 'F':
        func = cublas.cscal
    elif dtype == 'D':
        func = cublas.zscal
    else:
        raise TypeError('invalid dtype')

    handle = device.get_cublas_handle()
    a, a_ptr, orig_mode = _setup_scalar_ptr(handle, a, dtype)
    try:
        func(handle, x.size, a_ptr, x.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, orig_mode)
Пример #9
0
def _setup_scalar_ptr(handle, a, dtype):
    a, a_ptr = _get_scalar_ptr(a, dtype)
    mode = cublas.getPointerMode(handle)
    if isinstance(a, cupy.ndarray):
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)
    return a, a_ptr, mode
Пример #10
0
 def _norm(self, i):
     _cublas.setPointerMode(self.cublas_handle,
                            _cublas.CUBLAS_POINTER_MODE_DEVICE)
     try:
         self.nrm2(self.cublas_handle, self.n, self.u.data.ptr, 1,
                   self.beta.data.ptr + i * self.beta.itemsize)
     finally:
         _cublas.setPointerMode(self.cublas_handle,
                                self.cublas_pointer_mode)
Пример #11
0
def _setup_scalar_ptr(handle, a, dtype):
    mode = cublas.getPointerMode(handle)
    if isinstance(a, cupy.ndarray):
        if a.dtype != dtype:
            a = cupy.array(a, dtype=dtype)
        a_ptr = a.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        if not (isinstance(a, numpy.ndarray) and a.dtype == dtype):
            a = numpy.array(a, dtype=dtype)
        a_ptr = a.ctypes.data
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)
    return a_ptr, mode
Пример #12
0
def _setup_result_ptr(handle, out, dtype):
    mode = cublas.getPointerMode(handle)
    if out is None or isinstance(out, cupy.ndarray):
        if out is None or out.dtype != dtype:
            result = cupy.empty([], dtype=dtype)
        else:
            result = out
        result_ptr = result.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    elif isinstance(out, numpy.ndarray):
        if out.dtype != dtype:
            result = numpy.empty([], dtype=dtype)
        else:
            result = out
        result_ptr = result.ctypes.data
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)
    else:
        raise TypeError('out must be either cupy or numpy ndarray')
    return result_ptr, result, mode
Пример #13
0
def ger(alpha, x, y, a):
    """Computes a += alpha * x @ y.T

    Note: ''a'' will be updated.
    """
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.sger
    elif dtype == 'd':
        func = cublas.dger
    elif dtype in 'FD':
        raise TypeError('Use geru or gerc for complex dtypes')
    else:
        raise TypeError('invalid dtype')

    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    m, n = a.shape
    assert x.shape[0] == m
    assert y.shape[0] == n

    handle = device.get_cublas_handle()
    alpha, alpha_ptr, orig_mode = _setup_scalar_ptr(handle, alpha, dtype)
    x_ptr, y_ptr = x.data.ptr, y.data.ptr
    try:
        if a._f_contiguous:
            func(handle, m, n, alpha_ptr, x_ptr, 1, y_ptr, 1, a.data.ptr, m)
        elif a._c_contiguous:
            func(handle, n, m, alpha_ptr, y_ptr, 1, x_ptr, 1, a.data.ptr, n)
        else:
            aa = a.copy(order='F')
            func(handle, m, n, alpha_ptr, x_ptr, 1, y_ptr, 1, aa.data.ptr, m)
            a[...] = aa
    finally:
        cublas.setPointerMode(handle, orig_mode)
Пример #14
0
def sbmv(k, alpha, a, x, beta, y, lower=False):
    """Computes y = alpha*A @ x + beta * y

    """
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.ssbmv
    elif dtype == 'd':
        func = cublas.dsbmv
    else:
        raise TypeError('Complex dtypes not supported')

    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    m, n = a.shape
    assert x.shape[0] == n
    assert y.shape[0] == n

    if not a._f_contiguous:
        a = a.copy(order='F')

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    if lower:
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    handle = device.get_cublas_handle()
    try:
        func(handle, uplo, n, k,
             alpha_ptr, a.data.ptr, m, x.data.ptr, 1,
             beta_ptr, y.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, orig_mode)

    return y
Пример #15
0
    def aux(A, V, u, alpha, beta, i_start, i_end):
        assert A is outer_A
        beta_eps = inversion_eps(A.dtype)

        # Get ready for spmv if enabled
        if cusparse_handle is not None:
            # Note: I would like to reuse descriptors and working buffer
            # on the next update, but I gave it up because it sometimes
            # caused illegal memory access error.
            spmv_desc_A = cusparse.SpMatDescriptor.create(A)
            spmv_desc_v = cusparse.DnVecDescriptor.create(v)
            spmv_desc_u = cusparse.DnVecDescriptor.create(u)
            buff_size = _cusparse.spMV_bufferSize(
                cusparse_handle, spmv_op_a, spmv_alpha.ctypes.data,
                spmv_desc_A.desc, spmv_desc_v.desc, spmv_beta.ctypes.data,
                spmv_desc_u.desc, spmv_cuda_dtype, spmv_alg)
            spmv_buff = cupy.empty(buff_size, cupy.int8)

        v[...] = V[i_start]
        for i in range(i_start, i_end):
            # Matrix-vector multiplication
            if cusparse_handle is None:
                u[...] = A @ v
            else:
                _cusparse.spMV(cusparse_handle, spmv_op_a,
                               spmv_alpha.ctypes.data, spmv_desc_A.desc,
                               spmv_desc_v.desc, spmv_beta.ctypes.data,
                               spmv_desc_u.desc, spmv_cuda_dtype, spmv_alg,
                               spmv_buff.data.ptr)

            # Call dotc
            _cublas.setPointerMode(cublas_handle,
                                   _cublas.CUBLAS_POINTER_MODE_DEVICE)
            try:
                dotc(cublas_handle, n, v.data.ptr, 1, u.data.ptr, 1,
                     alpha.data.ptr + i * alpha.itemsize)
            finally:
                _cublas.setPointerMode(cublas_handle, cublas_pointer_mode)

            # Orthogonalize
            gemm(cublas_handle, _cublas.CUBLAS_OP_C, _cublas.CUBLAS_OP_N, 1,
                 i + 1, n, one.ctypes.data, u.data.ptr, n, V.data.ptr, n,
                 zero.ctypes.data, uu.data.ptr, 1)
            gemm(cublas_handle, _cublas.CUBLAS_OP_N, _cublas.CUBLAS_OP_C, n, 1,
                 i + 1, mone.ctypes.data, V.data.ptr, n, uu.data.ptr, 1,
                 one.ctypes.data, u.data.ptr, n)

            # Call nrm2
            _cublas.setPointerMode(cublas_handle,
                                   _cublas.CUBLAS_POINTER_MODE_DEVICE)
            try:
                nrm2(cublas_handle, n, u.data.ptr, 1,
                     beta.data.ptr + i * beta.itemsize)
            finally:
                _cublas.setPointerMode(cublas_handle, cublas_pointer_mode)

            # Break here as the normalization below touches V[i+1]
            if i >= i_end - 1:
                break

            if beta[i] < beta_eps:
                V[i + 1:i_end, :] = 0
                u[...] = 0
                v[...] = 0
                break
            if i == i_start:
                beta_eps *= beta[i]  # scale eps to largest beta

            # Normalize
            _kernel_normalize(u, beta, i, n, v, V)
Пример #16
0
def syrk(trans, a, out=None, alpha=1.0, beta=0.0, lower=False):
    """Computes out := alpha*op1(a)*op2(a) + beta*out

    op1(a) = a if trans is 'N', op2(a) = a.T if transa is 'N'
    op1(a) = a.T if trans is 'T', op2(a) = a if transa is 'T'
    lower specifies  whether  the  upper  or  lower triangular
    part  of the  array  out  is to be  referenced
    """
    assert a.ndim == 2
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.ssyrk
    elif dtype == 'd':
        func = cublas.dsyrk
    elif dtype == 'F':
        func = cublas.csyrk
    elif dtype == 'D':
        func = cublas.zsyrk
    else:
        raise TypeError('invalid dtype')

    trans = _trans_to_cublas_op(trans)
    if trans == cublas.CUBLAS_OP_N:
        n, k = a.shape
    else:
        k, n = a.shape
    if out is None:
        out = cupy.zeros((n, n), dtype=dtype, order='F')
        beta = 0.0
    else:
        assert out.ndim == 2
        assert out.shape == (n, n)
        assert out.dtype == dtype

    if lower:
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    lda, trans = _decide_ld_and_trans(a, trans)
    ldo, _ = _decide_ld_and_trans(out, trans)
    if out._c_contiguous:
        if not a._c_contiguous:
            a = a.copy(order='C')
            trans = 1 - trans
            lda = a.shape[1]
        try:
            func(handle, 1 - uplo, trans, n, k,
                 alpha_ptr, a.data.ptr, lda,
                 beta_ptr, out.data.ptr, ldo)
        finally:
            cublas.setPointerMode(handle, orig_mode)

    else:
        if not a._f_contiguous:
            a = a.copy(order='F')
            lda = a.shape[0]
            trans = 1 - trans
        c = out
        if not out._f_contiguous:
            c = out.copy(order='F')
        try:
            func(handle, uplo, trans, n, k,
                 alpha_ptr, a.data.ptr, lda,
                 beta_ptr, out.data.ptr, ldo)
        finally:
            cublas.setPointerMode(handle, orig_mode)
        if not out._f_contiguous:
            out[...] = c
    return out
Пример #17
0
def geam(transa, transb, alpha, a, beta, b, out=None):
    """Computes alpha * op(a) + beta * op(b)

    op(a) = a if transa is 'N', op(a) = a.T if transa is 'T',
    op(a) = a.T.conj() if transa is 'H'.
    op(b) = b if transb is 'N', op(b) = b.T if transb is 'T',
    op(b) = b.T.conj() if transb is 'H'.
    """
    assert a.ndim == b.ndim == 2
    assert a.dtype == b.dtype
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.sgeam
    elif dtype == 'd':
        func = cublas.dgeam
    elif dtype == 'F':
        func = cublas.cgeam
    elif dtype == 'D':
        func = cublas.zgeam
    else:
        raise TypeError('invalid dtype')

    transa = _trans_to_cublas_op(transa)
    transb = _trans_to_cublas_op(transb)
    if transa == cublas.CUBLAS_OP_N:
        m, n = a.shape
    else:
        n, m = a.shape
    if transb == cublas.CUBLAS_OP_N:
        assert b.shape == (m, n)
    else:
        assert b.shape == (n, m)
    if out is None:
        out = cupy.empty((m, n), dtype=dtype, order='F')
    else:
        assert out.ndim == 2
        assert out.shape == (m, n)
        assert out.dtype == dtype

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    lda, transa = _decide_ld_and_trans(a, transa)
    ldb, transb = _decide_ld_and_trans(b, transb)
    if not (lda is None or ldb is None):
        if out._f_contiguous:
            try:
                func(handle, transa, transb, m, n, alpha_ptr, a.data.ptr,
                     lda, beta_ptr, b.data.ptr, ldb, out.data.ptr, m)
            finally:
                cublas.setPointerMode(handle, orig_mode)
            return out
        elif out._c_contiguous:
            # Computes alpha * a.T + beta * b.T
            try:
                func(handle, 1-transa, 1-transb, n, m, alpha_ptr, a.data.ptr,
                     lda, beta_ptr, b.data.ptr, ldb, out.data.ptr, n)
            finally:
                cublas.setPointerMode(handle, orig_mode)
            return out

    a, lda = _change_order_if_necessary(a, lda)
    b, ldb = _change_order_if_necessary(b, ldb)
    c = out
    if not out._f_contiguous:
        c = out.copy(order='F')
    try:
        func(handle, transa, transb, m, n, alpha_ptr, a.data.ptr, lda,
             beta_ptr, b.data.ptr, ldb, c.data.ptr, m)
    finally:
        cublas.setPointerMode(handle, orig_mode)
    if not out._f_contiguous:
        _core.elementwise_copy(c, out)
    return out
Пример #18
0
def gemv(transa, alpha, a, x, beta, y):
    """Computes y = alpha * op(a) @ x + beta * y

    op(a) = a if transa is 'N', op(a) = a.T if transa is 'T',
    op(a) = a.T.conj() if transa is 'H'.

    Note: ''y'' will be updated.
    """
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.sgemv
    elif dtype == 'd':
        func = cublas.dgemv
    elif dtype == 'F':
        func = cublas.cgemv
    elif dtype == 'D':
        func = cublas.zgemv
    else:
        raise TypeError('invalid dtype')
    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    m, n = a.shape
    transa = _trans_to_cublas_op(transa)
    if transa == cublas.CUBLAS_OP_N:
        xlen, ylen = n, m
    else:
        xlen, ylen = m, n
    assert x.shape[0] == xlen
    assert y.shape[0] == ylen

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    try:
        if a._f_contiguous:
            func(handle, transa, m, n, alpha_ptr, a.data.ptr, m, x.data.ptr, 1,
                 beta_ptr, y.data.ptr, 1)
        elif a._c_contiguous and transa != cublas.CUBLAS_OP_C:
            if transa == cublas.CUBLAS_OP_N:
                transa = cublas.CUBLAS_OP_T
            else:
                transa = cublas.CUBLAS_OP_N
            func(handle, transa, n, m, alpha_ptr, a.data.ptr, n, x.data.ptr, 1,
                 beta_ptr, y.data.ptr, 1)
        else:
            a = a.copy(order='F')
            func(handle, transa, m, n, alpha_ptr, a.data.ptr, m, x.data.ptr, 1,
                 beta_ptr, y.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, orig_mode)