示例#1
0
def GPUarray_order(garray, order="F"):
    """
    Set the memory order of ``garray`` in place.

    Only metadata is rewritten here: ``garray.strides`` is reassigned to the
    strides computed for ``garray.shape`` in the requested order, and the
    two contiguity flags are flipped to match. This function itself does
    not copy or reorder any element data.

    Parameters
    ----------
    garray : GPU array exposing ``flags``, ``strides``, ``shape`` and
        ``dtype.itemsize``. It is modified in place.
    order : ``"F"`` (column-major, the default) or ``"C"`` (row-major).
        Any other value leaves ``garray`` untouched.

    Returns
    -------
    None
    """
    if order == "F":
        if garray.flags.f_contiguous:
            # Already laid out as requested. The original spelled this
            # early-out as a bare ``exit`` expression, which is a no-op
            # and is not even defined when the ``site`` module is skipped.
            return
        garray.strides = gpuarray._f_contiguous_strides(
            garray.dtype.itemsize, garray.shape)
        garray.flags.f_contiguous = True
        garray.flags.c_contiguous = False
    elif order == "C":
        if garray.flags.c_contiguous:
            return
        garray.strides = gpuarray._c_contiguous_strides(
            garray.dtype.itemsize, garray.shape)
        garray.flags.c_contiguous = True
        garray.flags.f_contiguous = False
示例#2
0
文件: util.py 项目: xiangze/gpustats
def GPUarray_order(garray, order="F"):
    """
    Switch ``garray`` to Fortran ("F") or C ("C") order in place.

    The array's ``strides`` attribute is replaced by the strides computed
    for its current ``shape`` and item size in the requested order, and the
    ``f_contiguous`` / ``c_contiguous`` flags are updated to agree. Nothing
    else on ``garray`` is touched by this function.

    Parameters
    ----------
    garray : GPU array exposing ``flags``, ``strides``, ``shape`` and
        ``dtype.itemsize``. It is modified in place.
    order : ``"F"`` (default) for column-major or ``"C"`` for row-major.
        Unrecognised values are ignored and ``garray`` is left as is.

    Returns
    -------
    None
    """
    if order == "F":
        if garray.flags.f_contiguous:
            # Nothing to do. A real ``return`` replaces the former bare
            # ``exit`` expression, which did nothing and depended on the
            # ``site``-provided builtin merely existing.
            return
        garray.strides = gpuarray._f_contiguous_strides(
            garray.dtype.itemsize, garray.shape)
        garray.flags.f_contiguous = True
        garray.flags.c_contiguous = False
    elif order == "C":
        if garray.flags.c_contiguous:
            return
        garray.strides = gpuarray._c_contiguous_strides(
            garray.dtype.itemsize, garray.shape)
        garray.flags.c_contiguous = True
        garray.flags.f_contiguous = False
示例#3
0
def csrmmB(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    ''' Calculates C = alpha * op(A)*op(B) + beta*C, written into C_gpu.
        Where B is sparse and both A and B can be transposed.

        Note: cuSPARSE only allows for sparse A, so we need some tricks:
            Essentially, we will compute C^T = B^T * A^T
            By enforcing C to be row-major, can drop its transpose
            since cuSPARSE assumes column-major. Thus, we only need to
            compute
            C = op(B)^T * op(A)^T

        Args:
            A_gpu: dense operand; its shape, dtype and gpudata are used.
                Presumably a row-major float32 GPU array, since the
                single-precision (S-prefixed) routines are called and the
                leading dimension is taken from shape[1] -- TODO confirm.
            B_gpu: sparse operand; its shape, nnz, descr, data, indptr and
                indices are handed to the CSR routine.
            C_gpu: output array, must be C-contiguous (asserted).
            transA: if true, A is used transposed.
            transB: if true, B is used transposed.
            alpha: scale applied to the product.
            beta: scale applied to the previous contents of C_gpu.

        Returns:
            C_gpu (the same object that was passed in).
    '''
    assert C_gpu.flags.c_contiguous
    m, k = B_gpu.shape
    # The sparse operand plays the role of cuSPARSE's "A". Because we compute
    # the transposed product, its transpose flag is the inverse of transB.
    ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE if not transB else cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE

    if transA:
        # Compare against the enum explicitly instead of relying on the
        # NON_TRANSPOSE constant being falsy.
        if ta == cusparse.CUSPARSE_OPERATION_TRANSPOSE:
            # we can't have ta and tb true at the same time according to
            # cuSPARSE docs, so transpose the dense operand explicitly into
            # a temporary and pass it on as non-transposed.
            out = __cuda_get_temp_matrix(A_gpu.shape, A_gpu.dtype)
            cublas.cublasSgeam(cublas_handle, 1, 1, A_gpu.shape[0], A_gpu.shape[1], 1.0, A_gpu.gpudata, A_gpu.shape[1],
                               0.0, A_gpu.gpudata, A_gpu.shape[1], out.gpudata, A_gpu.shape[0])
            # Re-describe the temporary with the swapped shape and matching
            # row-major strides.
            out.shape = A_gpu.shape[1], A_gpu.shape[0]
            out.strides = gpuarray._c_contiguous_strides(out.dtype.itemsize, out.shape)
            A_gpu = out
            tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
            n = A_gpu.shape[0]
        else:
            tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE
            n = A_gpu.shape[1]
    else:
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]

    # Leading dimensions are read after A_gpu may have been replaced by the
    # transposed temporary above.
    ldb = A_gpu.shape[1]
    ldc = C_gpu.shape[1]

    cusparse.cusparseScsrmm2(cusparse_handle, ta, tb,
        m, n, k, B_gpu.nnz, alpha,
        B_gpu.descr, B_gpu.data.gpudata, B_gpu.indptr.gpudata, B_gpu.indices.gpudata,
        A_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)
    return C_gpu
示例#4
0
def csrmmB(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    """ Calculates C = alpha * op(A)*op(B) + beta*C, written into C_gpu.
        Where B is sparse and both A and B can be transposed.

        Note: cuSPARSE only allows for sparse A, so we need some tricks:
            Essentially, we will compute C^T = B^T * A^T
            By enforcing C to be row-major, can drop its transpose
            since cuSPARSE assumes column-major. Thus, we only need to
            compute
            C = op(B)^T * op(A)^T

        Args:
            A_gpu: dense operand; its shape, dtype and gpudata are used.
                Presumably a row-major float32 GPU array, since the
                single-precision (S-prefixed) routines are called and the
                leading dimension is taken from shape[1] -- TODO confirm.
            B_gpu: sparse operand; its shape, nnz, descr, data, indptr and
                indices are handed to the CSR routine.
            C_gpu: output array, must be C-contiguous (asserted).
            transA: if true, A is used transposed.
            transB: if true, B is used transposed.
            alpha: scale applied to the product.
            beta: scale applied to the previous contents of C_gpu.

        Returns:
            C_gpu (the same object that was passed in).
    """
    assert C_gpu.flags.c_contiguous
    m, k = B_gpu.shape
    # The sparse operand plays the role of cuSPARSE's "A". Because we compute
    # the transposed product, its transpose flag is the inverse of transB.
    ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE if not transB else cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE

    if transA:
        # Compare against the enum explicitly instead of relying on the
        # NON_TRANSPOSE constant being falsy.
        if ta == cusparse.CUSPARSE_OPERATION_TRANSPOSE:
            # we can't have ta and tb true at the same time according to
            # cuSPARSE docs, so transpose the dense operand explicitly into
            # a temporary and pass it on as non-transposed.
            out = __cuda_get_temp_matrix(A_gpu.shape, A_gpu.dtype)
            cublas.cublasSgeam(
                cublas_handle,
                1,
                1,
                A_gpu.shape[0],
                A_gpu.shape[1],
                1.0,
                A_gpu.gpudata,
                A_gpu.shape[1],
                0.0,
                A_gpu.gpudata,
                A_gpu.shape[1],
                out.gpudata,
                A_gpu.shape[0],
            )
            # Re-describe the temporary with the swapped shape and matching
            # row-major strides.
            out.shape = A_gpu.shape[1], A_gpu.shape[0]
            out.strides = gpuarray._c_contiguous_strides(out.dtype.itemsize, out.shape)
            A_gpu = out
            tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
            n = A_gpu.shape[0]
        else:
            tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE
            n = A_gpu.shape[1]
    else:
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]

    # Leading dimensions are read after A_gpu may have been replaced by the
    # transposed temporary above.
    ldb = A_gpu.shape[1]
    ldc = C_gpu.shape[1]

    cusparse.cusparseScsrmm2(
        cusparse_handle,
        ta,
        tb,
        m,
        n,
        k,
        B_gpu.nnz,
        alpha,
        B_gpu.descr,
        B_gpu.data.gpudata,
        B_gpu.indptr.gpudata,
        B_gpu.indices.gpudata,
        A_gpu.gpudata,
        ldb,
        beta,
        C_gpu.gpudata,
        ldc,
    )
    return C_gpu