def test_cublas_bug():
    '''
    Regression test for an SGEMM call that takes a sliced GPU array.

    Issuing this SGEMM used to make every CUDA call after it fail. The
    original suspicion was memory corruption caused by swaprows.

    NOTE: NVIDIA confirmed this to be a bug within CUDA itself, which
    should be fixed in CUDA 6.5.
    '''
    from pycuda.driver import Stream
    from skcuda.cublas import cublasSgemm
    from skcuda.misc import _global_cublas_handle as handle

    n_rows = 131
    tail = slice(128, n_rows)

    # Allocation order is kept exactly as in the original reproducer.
    X = gpuarray.to_gpu(np.random.randn(n_rows, 2483).astype(np.float32))
    a = gpuarray.empty((X.shape[1], 3), dtype=np.float32)
    c = gpuarray.empty((a.shape[0], X.shape[1]), dtype=np.float32)
    b = gpuarray.empty_like(X)

    # Only the last rows of ``b`` take part in the product.
    b_tail = b[tail]
    m, k = a.shape
    n = b_tail.shape[1]
    ld_first, ld_second, ld_out = m, k, m

    cublasSgemm(handle, 'n', 'n', m, n, k,
                1.0, b_tail.gpudata, ld_first,
                a.gpudata, ld_second,
                0.0, c.gpudata, ld_out)

    # Per the report above, CUDA calls issued after the GEMM were the ones
    # that failed, so creating and synchronizing a fresh stream is the check.
    Stream().synchronize()
def test_todense_stream():
    ''' Test GPUCSRArray.todense() when an explicit stream is passed.

    A random float32 matrix is thresholded to make it sparse, uploaded as a
    GPUCSRArray, converted back to dense on a dedicated stream and compared
    with the dense form of the host-side CSR matrix.
    '''
    dense = np.random.laplace(size=(600, 300)).astype(np.float32)
    dense[dense < 0.1] = 0  # zero everything below 0.1 to get a sparse matrix
    X = csr_matrix(dense, dtype=np.float32)
    Xd = GPUCSRArray(X)

    stream = Stream()
    Yd = Xd.todense(stream=stream)
    # Make sure the work queued on ``stream`` is done before copying back.
    stream.synchronize()

    # ``toarray()`` replaces the deprecated ``.A`` shorthand, which newer
    # SciPy releases no longer provide.
    assert_allclose(Yd.get(), X.toarray(), rtol=1e-3, err_msg="todense")
def test_cusparseSetStream():
    ''' Test a csr2dense conversion issued after cusparseSetStream().

    A small random CSR matrix is uploaded, a cuSPARSE handle is bound to a
    fresh stream, the matrix is converted to a dense column-major array and
    the result is compared with the dense form of the host matrix.
    '''
    dense = np.random.laplace(size=(3, 5)).astype(np.float32)
    dense[dense < 0.1] = 0  # zero everything below 0.1 to get a sparse matrix
    A = sparse.csr_matrix(dense, dtype=np.float32)
    A.sort_indices()

    a_data = gpu.to_gpu(A.data)
    a_indptr = gpu.to_gpu(A.indptr)
    a_indices = gpu.to_gpu(A.indices)
    # Output is allocated in Fortran order; its leading dimension is the
    # number of rows, passed as the last argument below.
    out = gpu.empty((A.shape[0], A.shape[1]), dtype=A.dtype, order="F")

    h = cusparse.cusparseCreate()
    descrA = cusparse.cusparseCreateMatDescr()
    stream = Stream()

    cusparse.cusparseSetStream(h, stream.handle)
    cusparse.cusparseScsr2dense(h, A.shape[0], A.shape[1],
                                descrA, a_data.gpudata, a_indptr.gpudata,
                                a_indices.gpudata, out.gpudata, out.shape[0])
    # Point the handle back at stream 0 before waiting on our stream.
    cusparse.cusparseSetStream(h, 0)
    stream.synchronize()

    # ``toarray()`` replaces the deprecated ``.A`` shorthand, which newer
    # SciPy releases no longer provide.
    assert_allclose(out.get(), A.toarray(), rtol=1e-4)