def test_cublas_bug():
    '''
    The SGEMM call would cause all calls after it to fail for some unknown
    reason. Likely this is caused by swaprows causing memory corruption.

    NOTE: this was confirmed by NVIDIA to be a bug within CUDA, and should be
    fixed in CUDA 6.5.
    '''
    from pycuda.driver import Stream
    from skcuda.cublas import cublasSgemm
    from skcuda.misc import _global_cublas_handle as handle

    n = 131
    s = slice(128, n)
    X = gpuarray.to_gpu(np.random.randn(n, 2483).astype(np.float32))
    a = gpuarray.empty((X.shape[1], 3), dtype=np.float32)
    c = gpuarray.empty((a.shape[0], X.shape[1]), dtype=np.float32)
    b = gpuarray.empty_like(X)

    m, n = a.shape[0], b[s].shape[1]
    k = a.shape[1]
    lda = m
    ldb = k
    ldc = m
    #cublasSgemm(handle, 0, 0, m, n, k, 0.0, b.gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
    cublasSgemm(handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda,
                a.gpudata, ldb, 0.0, c.gpudata, ldc)
    #print handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc
    #gpuarray.dot(d, Xoutd[s])
    #op.sgemm(a, b[s], c)

    stream = Stream()
    stream.synchronize()
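# A minimal sketch (not part of the original test) of how the repro above
# could be skipped on toolkits that still ship the bug, since NVIDIA reports
# it fixed in CUDA 6.5. The helper name `cuda_at_least` is hypothetical;
# pycuda.driver.get_version() reports the CUDA version PyCUDA was compiled
# against, which may differ from the installed driver/toolkit.
import pycuda.driver as drv

def cuda_at_least(major, minor):
    """Return True if PyCUDA was built against CUDA >= major.minor."""
    return drv.get_version()[:2] >= (major, minor)

# e.g. at the top of test_cublas_bug():
#     if not cuda_at_least(6, 5):
#         return  # known CUDA bug; only expected to pass on CUDA >= 6.5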
def test_todense_stream():
    ''' Test GPUCSRArray.todense() on an explicit CUDA stream. '''
    X = np.random.laplace(size=(600, 300)).astype(np.float32)
    X[X < 0.1] = 0
    X = csr_matrix(X, dtype=np.float32)
    Xd = GPUCSRArray(X)

    stream = Stream()
    Yd = Xd.todense(stream=stream)
    stream.synchronize()
    assert_allclose(Yd.get(), X.A, rtol=1e-3, err_msg="todense")
def test_cusparseSetStream():
    ''' Test running cusparseScsr2dense on a user-supplied CUDA stream. '''
    A = np.random.laplace(size=(3, 5)).astype(np.float32)
    A[A < 0.1] = 0
    A = sparse.csr_matrix(A, dtype=np.float32)
    A.sort_indices()

    a_data = gpu.to_gpu(A.data)
    a_indptr = gpu.to_gpu(A.indptr)
    a_indices = gpu.to_gpu(A.indices)
    out = gpu.empty((A.shape[0], A.shape[1]), dtype=A.dtype, order="F")

    h = cusparse.cusparseCreate()
    descrA = cusparse.cusparseCreateMatDescr()

    stream = Stream()
    cusparse.cusparseSetStream(h, stream.handle)
    cusparse.cusparseScsr2dense(h, A.shape[0], A.shape[1], descrA,
                                a_data.gpudata, a_indptr.gpudata,
                                a_indices.gpudata, out.gpudata, out.shape[0])
    cusparse.cusparseSetStream(h, 0)
    stream.synchronize()

    assert_allclose(out.get(), A.A, rtol=1e-4)
def test_unary_func_kwargs(self):
    """Test that the 'out' and 'stream' kwargs to the unary cumath functions work."""
    from pycuda.driver import Stream

    name, a, b, threshold = ("exp", -3, 3, 1e-5)
    gpu_func = getattr(cumath, name)
    cpu_func = getattr(np, numpy_func_names.get(name, name))

    for s in sizes:
        for dtype in dtypes:
            np.random.seed(1)
            A = (np.random.random(s) * (b - a) + a).astype(dtype)
            if np.issubdtype(dtype, np.complexfloating):
                A = A + (np.random.random(s) * (b - a) + a) * 1j

            np.random.seed(1)
            A = (np.random.random(s) * (b - a) + a).astype(dtype)
            args = gpuarray.to_gpu(A)

            # 'out' keyword argument
            gpu_results = gpuarray.empty_like(args)
            gpu_results = gpu_func(args, out=gpu_results).get()
            cpu_results = cpu_func(A)
            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= threshold).all(), (max_err, name, dtype)

            # 'out' as positional argument
            gpu_results = gpuarray.empty_like(args)
            gpu_results = gpu_func(args, gpu_results).get()
            cpu_results = cpu_func(A)
            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= threshold).all(), (max_err, name, dtype)

            # 'stream' keyword argument
            mystream = Stream()
            np.random.seed(1)
            A = (np.random.random(s) * (b - a) + a).astype(dtype)
            args = gpuarray.to_gpu(A)
            gpu_results = gpuarray.empty_like(args)
            gpu_results = gpu_func(args, stream=mystream).get()
            cpu_results = cpu_func(A)
            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= threshold).all(), (max_err, name, dtype)

            # 'stream' as positional argument
            mystream = Stream()
            np.random.seed(1)
            A = (np.random.random(s) * (b - a) + a).astype(dtype)
            args = gpuarray.to_gpu(A)
            gpu_results = gpuarray.empty_like(args)
            gpu_results = gpu_func(args, mystream).get()
            cpu_results = cpu_func(A)
            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= threshold).all(), (max_err, name, dtype)

            # 'out' and 'stream' keyword arguments together
            mystream = Stream()
            np.random.seed(1)
            A = (np.random.random(s) * (b - a) + a).astype(dtype)
            args = gpuarray.to_gpu(A)
            gpu_results = gpuarray.empty_like(args)
            gpu_results = gpu_func(args, stream=mystream, out=gpu_results).get()
            cpu_results = cpu_func(A)
            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= threshold).all(), (max_err, name, dtype)
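# The test above relies on module-level fixtures (`sizes`, `dtypes`,
# `numpy_func_names`) defined elsewhere in the suite. A minimal sketch of what
# they might look like, with hypothetical values; the real module may use
# different sizes, dtypes and name mappings:
sizes = [(50,), (4, 30), (256, 256)]
dtypes = [np.float32, np.float64]
# maps cumath names to their numpy equivalents where the two differ
numpy_func_names = {"asin": "arcsin", "acos": "arccos", "atan": "arctan"}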