def GPUarray_order(garray, order="F"):
    """Set the memory layout (strides + contiguity flags) of *garray* in place.

    Parameters
    ----------
    garray : pycuda GPUArray
        Array whose ``strides`` and ``flags`` are rewritten in place.
    order : {"F", "C"}
        Target memory order. Any other value is a silent no-op.

    Returns
    -------
    None
    """
    if order == "F":
        # Already Fortran-contiguous: nothing to do.
        # (FIX: the original used a bare `exit`, which is a no-op expression
        # referencing the builtin, not an early return.)
        if garray.flags.f_contiguous:
            return
        garray.strides = gpuarray._f_contiguous_strides(
            garray.dtype.itemsize, garray.shape)
        garray.flags.f_contiguous = True
        garray.flags.c_contiguous = False
    elif order == "C":
        # Already C-contiguous: nothing to do.
        if garray.flags.c_contiguous:
            return
        garray.strides = gpuarray._c_contiguous_strides(
            garray.dtype.itemsize, garray.shape)
        garray.flags.c_contiguous = True
        garray.flags.f_contiguous = False
def GPUarray_order(garray, order="F"):
    """Set the memory layout (strides + contiguity flags) of *garray* in place.

    Parameters
    ----------
    garray : pycuda GPUArray
        Array whose ``strides`` and ``flags`` are rewritten in place.
    order : {"F", "C"}
        Target memory order. Any other value is a silent no-op.

    Returns
    -------
    None
    """
    # FIX: the original used a bare `exit` as an "early return" — that is a
    # no-op expression referencing the builtin. Invert the guard instead so
    # the rewrite only happens when the array is not already in the
    # requested order.
    if order == "F":
        if not garray.flags.f_contiguous:
            garray.strides = gpuarray._f_contiguous_strides(
                garray.dtype.itemsize, garray.shape)
            garray.flags.f_contiguous = True
            garray.flags.c_contiguous = False
    elif order == "C":
        if not garray.flags.c_contiguous:
            garray.strides = gpuarray._c_contiguous_strides(
                garray.dtype.itemsize, garray.shape)
            garray.flags.c_contiguous = True
            garray.flags.f_contiguous = False
def csrmmB(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    """Compute ``C = alpha * op(A) @ op(B) + beta * C`` where B is sparse (CSR).

    cuSPARSE's csrmm only supports a sparse *left* operand, so we compute the
    transposed product ``C^T = op(B)^T @ op(A)^T`` instead. Because C is
    required to be row-major (C-contiguous) while cuSPARSE assumes
    column-major, C and C^T share the same memory layout, so the outer
    transpose can be dropped.

    Parameters
    ----------
    A_gpu : dense GPUArray, row-major.
    B_gpu : cuSPARSE CSR matrix (provides .shape, .nnz, .descr, .data,
        .indptr, .indices).
    C_gpu : dense GPUArray, must be C-contiguous; overwritten in place.
    transA, transB : bool
        Whether op() transposes A / B in the logical product.
    alpha, beta : float
        Scaling factors.

    Returns
    -------
    C_gpu (same object, updated in place).
    """
    assert C_gpu.flags.c_contiguous
    m, k = B_gpu.shape
    # Inverted on purpose: we pass op(B)^T to cuSPARSE, so transB=False means
    # the sparse operand must be transposed by cuSPARSE.
    ta = (cusparse.CUSPARSE_OPERATION_TRANSPOSE if not transB
          else cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE)
    if transA:
        # FIX: compare against the enum explicitly instead of `if ta:`, which
        # relied on CUSPARSE_OPERATION_NON_TRANSPOSE happening to equal 0.
        if ta == cusparse.CUSPARSE_OPERATION_TRANSPOSE:
            # csrmm2 forbids transposing both operands, so materialize A^T via
            # a geam out-of-place transpose and use NON_TRANSPOSE for it.
            out = __cuda_get_temp_matrix(A_gpu.shape, A_gpu.dtype)
            # NOTE(review): the literal 1, 1 look like CUBLAS_OP_T magic
            # numbers — confirm against the cublas wrapper in use.
            cublas.cublasSgeam(
                cublas_handle, 1, 1, A_gpu.shape[0], A_gpu.shape[1],
                1.0, A_gpu.gpudata, A_gpu.shape[1],
                0.0, A_gpu.gpudata, A_gpu.shape[1],
                out.gpudata, A_gpu.shape[0],
            )
            out.shape = A_gpu.shape[1], A_gpu.shape[0]
            out.strides = gpuarray._c_contiguous_strides(
                out.dtype.itemsize, out.shape)
            A_gpu = out
            tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
            n = A_gpu.shape[0]
        else:
            tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE
            n = A_gpu.shape[1]
    else:
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]
    ldb = A_gpu.shape[1]
    ldc = C_gpu.shape[1]
    cusparse.cusparseScsrmm2(
        cusparse_handle, ta, tb, m, n, k, B_gpu.nnz, alpha,
        B_gpu.descr, B_gpu.data.gpudata, B_gpu.indptr.gpudata,
        B_gpu.indices.gpudata, A_gpu.gpudata, ldb, beta,
        C_gpu.gpudata, ldc,
    )
    return C_gpu
def csrmmB(A_gpu, B_gpu, C_gpu, transA=False, transB=False, alpha=1.0, beta=0.0):
    """Compute ``C = alpha * op(A) @ op(B) + beta * C`` where B is sparse (CSR).

    cuSPARSE only allows the sparse matrix on the left, so the transposed
    product ``C^T = op(B)^T @ op(A)^T`` is computed instead; since C is forced
    to be row-major and cuSPARSE assumes column-major, C and C^T have the same
    layout and the outer transpose is free.

    Parameters
    ----------
    A_gpu : dense GPUArray, row-major.
    B_gpu : cuSPARSE CSR matrix (provides .shape, .nnz, .descr, .data,
        .indptr, .indices).
    C_gpu : dense GPUArray, must be C-contiguous; overwritten in place.
    transA, transB : bool
        Whether op() transposes A / B in the logical product.
    alpha, beta : float
        Scaling factors.

    Returns
    -------
    C_gpu (same object, updated in place).
    """
    assert C_gpu.flags.c_contiguous
    m, k = B_gpu.shape
    # transB=False -> cuSPARSE transposes the sparse operand (we feed op(B)^T).
    if transB:
        ta = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
    else:
        ta = cusparse.CUSPARSE_OPERATION_TRANSPOSE
    if transA:
        # FIX: explicit enum comparison; the original `if ta:` depended on
        # CUSPARSE_OPERATION_NON_TRANSPOSE being numerically 0.
        if ta == cusparse.CUSPARSE_OPERATION_TRANSPOSE:
            # Both operands cannot be transposed in csrmm2: build A^T
            # explicitly with geam, then mark the dense side NON_TRANSPOSE.
            out = __cuda_get_temp_matrix(A_gpu.shape, A_gpu.dtype)
            # NOTE(review): the literal 1, 1 look like CUBLAS_OP_T magic
            # numbers — confirm against the cublas wrapper in use.
            cublas.cublasSgeam(
                cublas_handle, 1, 1, A_gpu.shape[0], A_gpu.shape[1],
                1.0, A_gpu.gpudata, A_gpu.shape[1],
                0.0, A_gpu.gpudata, A_gpu.shape[1],
                out.gpudata, A_gpu.shape[0],
            )
            out.shape = A_gpu.shape[1], A_gpu.shape[0]
            out.strides = gpuarray._c_contiguous_strides(
                out.dtype.itemsize, out.shape)
            A_gpu = out
            tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
            n = A_gpu.shape[0]
        else:
            tb = cusparse.CUSPARSE_OPERATION_TRANSPOSE
            n = A_gpu.shape[1]
    else:
        tb = cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
        n = A_gpu.shape[0]
    ldb = A_gpu.shape[1]
    ldc = C_gpu.shape[1]
    cusparse.cusparseScsrmm2(
        cusparse_handle, ta, tb, m, n, k, B_gpu.nnz, alpha,
        B_gpu.descr, B_gpu.data.gpudata, B_gpu.indptr.gpudata,
        B_gpu.indices.gpudata, A_gpu.gpudata, ldb, beta,
        C_gpu.gpudata, ldc,
    )
    return C_gpu