def test_matmul_fast(self):
    # Runs a tiled matrix-multiply kernel that stages blocks of A and B in
    # hsa.shared arrays, launches it twice (printing each wall-clock time),
    # then compares the result against np.dot.
    # (Documented with comments rather than a docstring so the description
    # unittest reports for this test stays unchanged.)
    blocksize = 20
    gridsize = 20

    @hsa.jit
    def matmulfast(A, B, C):
        # Global ids pick the C element this work-item writes; local ids
        # pick its slot inside the two staging tiles.
        row = hsa.get_global_id(0)
        col = hsa.get_global_id(1)
        local_row = hsa.get_local_id(0)
        local_col = hsa.get_local_id(1)

        tile_a = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32)
        tile_b = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32)

        # NOTE(review): this early exit sits before the barriers below; with
        # the launch dimensions used in this test it presumably never
        # fires -- confirm before reusing the kernel with other shapes.
        if row >= C.shape[0] or col >= C.shape[1]:
            return

        acc = 0
        for step in range(gridsize):
            offset = step * blocksize
            # Preload one tile of A and one tile of B.
            tile_a[local_row, local_col] = A[row, local_col + offset]
            tile_b[local_row, local_col] = B[local_row + offset, col]
            # Wait for the preload to end.
            # NOTE(review): the flag is CLK_GLOBAL_MEM_FENCE although the
            # staged data lives in hsa.shared arrays -- confirm intended.
            hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
            # Accumulate this tile's partial dot product.
            for k in range(blocksize):
                acc += tile_a[local_row, k] * tile_b[k, local_col]
            # Wait for the compute to end before the tiles are overwritten.
            hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)

        C[row, col] = acc

    size = gridsize * blocksize
    A = np.random.random((size, size)).astype(np.float32)
    B = np.random.random((size, size)).astype(np.float32)
    C = np.zeros_like(A)

    grid = (gridsize, gridsize)
    block = (blocksize, blocksize)

    # Two identical timed launches, each under its own registration.
    for label in ("1st GPU time:", "2nd GPU time:"):
        with hsa.register(A, B, C):
            started = timer()
            matmulfast[grid, block](A, B, C)
            finished = timer()
            print(label, finished - started)

    # CPU reference result and timing.
    started = timer()
    expected = np.dot(A, B)
    finished = timer()
    print("CPU time:", finished - started)

    np.testing.assert_allclose(expected, C, rtol=1e-5)
def test_matmul_fast(self):
    # Runs a tiled matrix-multiply kernel that stages blocks of A and B in
    # hsa.shared arrays, launches it twice (printing each wall-clock time),
    # then compares the result against np.dot.
    # (Documented with comments rather than a docstring so the description
    # unittest reports for this test stays unchanged.)
    blocksize = 20
    gridsize = 20

    @hsa.jit
    def matmulfast(A, B, C):
        # Global ids pick the C element this work-item writes; local ids
        # pick its slot inside the two staging tiles.
        out_row = hsa.get_global_id(0)
        out_col = hsa.get_global_id(1)
        tile_row = hsa.get_local_id(0)
        tile_col = hsa.get_local_id(1)

        staged_a = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32)
        staged_b = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32)

        # NOTE(review): this early exit sits before the barriers below; with
        # the launch dimensions used in this test it presumably never
        # fires -- confirm before reusing the kernel with other shapes.
        if out_row >= C.shape[0] or out_col >= C.shape[1]:
            return

        total = 0
        for tile in range(gridsize):
            base = tile * blocksize
            # Preload one tile of A and one tile of B.
            staged_a[tile_row, tile_col] = A[out_row, tile_col + base]
            staged_b[tile_row, tile_col] = B[tile_row + base, out_col]
            # Wait for the preload to end.
            # NOTE(review): the barrier flag is the bare literal 1;
            # presumably a memory-fence constant -- confirm which one.
            hsa.barrier(1)
            # Accumulate this tile's partial dot product.
            for inner in range(blocksize):
                total += staged_a[tile_row, inner] * staged_b[inner, tile_col]
            # Wait for the compute to end before the tiles are overwritten.
            hsa.barrier(1)

        C[out_row, out_col] = total

    size = gridsize * blocksize
    A = np.random.random((size, size)).astype(np.float32)
    B = np.random.random((size, size)).astype(np.float32)
    C = np.zeros_like(A)

    grid = (gridsize, gridsize)
    block = (blocksize, blocksize)

    # Two identical timed launches, each under its own registration.
    for label in ("1st GPU time:", "2nd GPU time:"):
        with hsa.register(A, B, C):
            t_begin = timer()
            matmulfast[grid, block](A, B, C)
            t_end = timer()
            print(label, t_end - t_begin)

    # CPU reference result and timing.
    t_begin = timer()
    reference = np.dot(A, B)
    t_end = timer()
    print("CPU time:", t_end - t_begin)

    np.testing.assert_allclose(reference, C, rtol=1e-5)
def launcher(support, samples, bandwidth, pdf):
    """Validate the inputs and launch ``hsa_uni_kde`` on them.

    ``pdf``, ``support`` and ``samples`` must each report ``ndim == 1`` and
    ``pdf`` must have the same ``size`` as ``support``; any violation raises
    ``AssertionError``.  ``bandwidth`` is passed to the kernel unchecked.

    ``support``, ``samples`` and ``pdf`` are registered via ``hsa.register``
    for the duration of the launch.  The launch configuration is
    ``[support.size, WAVESIZE * 8]``.
    """
    # Same check order as before: pdf, then support, then samples.
    for array in (pdf, support, samples):
        assert array.ndim == 1
    assert pdf.size == support.size

    with hsa.register(support, samples, pdf):
        group_count = support.size
        group_width = WAVESIZE * 8
        hsa_uni_kde[group_count, group_width](support, samples, bandwidth, pdf)
def launcher(support, samples, bandwidths, pdf):
    """Validate the inputs and launch ``hsa_multi_kde`` on them.

    Requirements, each enforced with ``assert`` (``AssertionError`` on
    violation):

    * ``support.shape[0]`` equals ``pdf.size``;
    * ``support.shape[1]`` equals ``samples.shape[1]``;
    * ``bandwidths.size`` equals ``support.shape[1]``.

    The launch uses ``WAVESIZE * 4`` as the second launch dimension and, as
    the first, ``support.shape[0]`` divided by that value, rounded up.  All
    four arrays are registered via ``hsa.register`` for the launch.
    """
    row_count = support.shape[0]
    assert row_count == pdf.size
    column_count = support.shape[1]
    assert column_count == samples.shape[1]
    assert bandwidths.size == column_count

    group_width = WAVESIZE * 4
    # Ceiling division: enough groups to cover every row of ``support``.
    group_count = (row_count + group_width - 1) // group_width

    with hsa.register(support, samples, bandwidths, pdf):
        hsa_multi_kde[group_count, group_width](support, samples, bandwidths, pdf)
def test_matmul_naive(self):
    # Runs a straightforward one-work-item-per-output matrix-multiply
    # kernel twice (printing each wall-clock time) and compares the result
    # against np.dot.
    # (Documented with comments rather than a docstring so the description
    # unittest reports for this test stays unchanged.)

    @hsa.jit
    def matmul(A, B, C):
        row = hsa.get_global_id(0)
        col = hsa.get_global_id(1)

        # Work-items whose ids fall outside C do nothing.
        if row >= C.shape[0] or col >= C.shape[1]:
            return

        acc = 0
        for inner in range(A.shape[1]):
            acc += A[row, inner] * B[inner, col]

        C[row, col] = acc

    N = 256
    A = np.random.random((N, N)).astype(np.float32)
    B = np.random.random((N, N)).astype(np.float32)
    C = np.zeros_like(A)

    # Launch shape: 16x16 for the second dimension, N // 16 per axis for
    # the first.
    edge = 16
    grid = (N // edge, N // edge)
    block = (edge, edge)

    # Two identical timed launches, each under its own registration.
    for label in ("1st GPU time:", "2nd GPU time:"):
        with hsa.register(A, B, C):
            started = timer()
            matmul[grid, block](A, B, C)
            finished = timer()
            print(label, finished - started)

    # CPU reference result and timing.
    started = timer()
    expected = np.dot(A, B)
    finished = timer()
    print("CPU time:", finished - started)

    np.testing.assert_allclose(expected, C, rtol=1e-5)