def tune_energy(kernel_string, height, width, crosscorr, loc): """step 4 compute energy""" tune_params = OrderedDict() tune_params["block_size_x"] = [2**i for i in range(5,11)] tune_params["num_blocks"] = [2**i for i in range(5,11)] max_blocks = max(tune_params["num_blocks"]) params = {"block_size_x": 512, "num_blocks": 64} num_blocks = np.int32(params["num_blocks"]) problem_size = ("num_blocks", 1) energy_part = np.zeros((max_blocks), dtype=np.float64) args = [height, width, energy_part, loc, crosscorr] output3 = run_kernel("computeEnergy", kernel_string, problem_size, args, params, grid_div_x=[]) tune_kernel("computeEnergy", kernel_string, problem_size, args, tune_params, grid_div_x=[]) energy_part = output3[2] energy = np.zeros((1), dtype=np.float64) args = [energy, energy_part, num_blocks] output4 = run_kernel("sumDoubles", kernel_string, (1,1), args, params, grid_div_x=[]) energy = output4[0] return energy
def tune_find_peak(kernel_string, height, width, crosscorr): """step 3 find peak""" tune_params = OrderedDict() tune_params["block_size_x"] = [2**i for i in range(5,11)] tune_params["num_blocks"] = [2**i for i in range(5,11)] max_blocks = max(tune_params["num_blocks"]) params = {"block_size_x": 512, "num_blocks": 64} num_blocks = np.int32(params["num_blocks"]) problem_size = ("num_blocks", 1) peakval = np.zeros((1), dtype=np.float32) peakvals = np.zeros((max_blocks), dtype=np.float32) peakindx = np.zeros((max_blocks), dtype=np.int32) loc = np.zeros((1), dtype=np.int32) val = np.zeros((1), dtype=np.float32) args = [height, width, peakval, peakvals, peakindx, crosscorr] output1 = run_kernel("findPeak", kernel_string, problem_size, args, params, grid_div_x=[]) tune_kernel("findPeak", kernel_string, problem_size, args, tune_params, grid_div_x=[]) peakvals = output1[2] peakindx = output1[3] args = [loc, val, peakindx, peakvals, num_blocks] output2 = run_kernel("maxlocFloats", kernel_string, (1,1), args, params, grid_div_x=[]) loc = output2[0] val = output2[1] return loc, val
def test_wiener(): with open(get_kernel_path() + 'wienerfilter.cu', 'r') as f: kernel_string = f.read() image = imread(get_testdata_path() + "test.jpg", mode="F") height = np.int32(image.shape[0]) width = np.int32(image.shape[1]) problem_size = (width, height) output = np.zeros(problem_size, dtype=np.float32) args = [height, width, output, image] params = OrderedDict() params["block_size_x"] = 32 params["block_size_y"] = 8 params["reuse_computation"] = 1 answer = run_kernel("computeVarianceEstimates", kernel_string, problem_size, args, params, grid_div_y=["block_size_y"]) reference = run_kernel("computeVarianceEstimates_naive", kernel_string, problem_size, args, params, grid_div_y=["block_size_y"]) assert np.allclose(answer[2], reference[2], atol=1e-6)
def test_prefix_sum_kernel(): skip_if_no_cuda_device() with open(get_kernel_path()+'prefixsum.cu', 'r') as f: kernel_string = f.read() size = 256 problem_size = (size, 1) params = {"block_size_x": 128} max_blocks = size//params["block_size_x"] x = np.ones(size).astype(np.int32) #compute reference answer reference = np.cumsum(x) #setup kernel inputs prefix_sums = np.zeros(size).astype(np.int32) block_carry = np.zeros(max_blocks).astype(np.int32) n = np.int32(size) args = [prefix_sums, block_carry, x, n] #call the first kernel that computes the incomplete prefix sums #and outputs the block carry values result = run_kernel("prefix_sum_block", kernel_string, problem_size, args, params) prefix_sums = result[0] block_filler = np.zeros(max_blocks).astype(np.int32) block_out = np.zeros(max_blocks).astype(np.int32) args = [block_out, block_filler, result[1], np.int32(max_blocks)] #call the kernel again, but this time on the block carry values #one thread block should be sufficient if max_blocks > params["block_size_x"]: print("warning: block size too small") result = run_kernel("prefix_sum_block", kernel_string, (1, 1), args, params, grid_div_x=[]) block_carry = result[0] args = [prefix_sums, block_carry, n] #call a simple kernel to propagate the block carry values to all #elements answer = run_kernel("propagate_block_carry", kernel_string, problem_size, args, params) #verify test_result = np.sum(answer[0] - reference) == 0 print("answer") print(answer[0]) print("reference") print(reference) assert test_result
def test_find_peak(): with open(get_kernel_path() + 'peaktocorrelationenergy.cu', 'r') as f: kernel_string = f.read() image = imread(get_testdata_path() + "test_small.jpg", mode="F") height = np.int32(image.shape[0]) width = np.int32(image.shape[1]) problem_size = (width, height) #generate some bogus crosscorr data crosscorr = np.random.randn(height, width, 2).astype(np.float32) #compute reference in Python peak_index = np.argmax(np.absolute(crosscorr[:, :, 0])) peak_value = np.absolute(crosscorr[:, :, 0].flatten()[peak_index]) params = {"block_size_x": 512, "num_blocks": 64} problem_size = ("num_blocks", 1) num_blocks = np.int32(params["num_blocks"]) peakval = np.zeros((1), dtype=np.float32) peakvals = np.zeros((num_blocks), dtype=np.float32) peakindx = np.zeros((num_blocks), dtype=np.int32) loc = np.zeros((1), dtype=np.int32) val = np.zeros((1), dtype=np.float32) args = [height, width, peakval, peakvals, peakindx, crosscorr] output1 = run_kernel("findPeak", kernel_string, problem_size, args, params, grid_div_x=[]) peakvals = output1[3] peakindx = output1[4] args = [loc, val, peakindx, peakvals, num_blocks] output2 = run_kernel("maxlocFloats", kernel_string, (1, 1), args, params, grid_div_x=[]) loc = output2[0][0] val = output2[1][0] print("answer") print("loc=", loc, "val=", val) print("reference") print("loc=", peak_index, "val=", peak_value) assert loc == peak_index assert np.isclose(val, peak_value, atol=1e-6)
def test_against_reference(size, ndim, A, B, scale, grad, cost): numpy.set_printoptions(edgeitems=50) #call the reference function ref_cost, ref_gradient = call_reference_function(size, ndim, A, B, scale, grad, cost) #call the GPU function with open(get_kernel_path() + 'kernels.cu', 'r') as f: kernel_string = f.read() scale_sq = (scale * scale).astype(numpy.float64) m = numpy.int32(size) n = numpy.int32(B.shape[0]) arguments = [A, B, m, n, scale_sq, grad, cost] params = {"block_size_x": 256} answer = run_kernel("GaussTransform", kernel_string, size, arguments, params, compiler_options=compiler_options, grid_div_x=[]) #collect the results from the first kernel grad_i = answer[5] gradient = grad_i cross_term = answer[6] #call the second kernel to reduce the per thread block cross terms to a single value out = numpy.zeros(1).astype(numpy.float64) arguments = [out, cross_term, m, n, m] answer = run_kernel("reduce_cross_term", kernel_string, 1, arguments, params, compiler_options=compiler_options, grid_div_x=[]) #final cross term cost = answer[0] print("answer") print(cost) print(gradient) assert numpy.isclose(ref_cost, cost, atol=1e-8) assert numpy.allclose(ref_gradient, gradient, atol=1e-8) return cost, gradient
def test_vector_add(): #Check pycuda is installed and if a CUDA capable device is present, if not skip the test try: import pycuda.driver as drv drv.init() except (ImportError, Exception): pytest.skip("PyCuda not installed or no CUDA device detected") kernel_string = """ __global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * block_size_x + threadIdx.x; if (i<n) { c[i] = a[i] + b[i]; } } """ size = 10000000 problem_size = (size, 1) a = numpy.random.randn(size).astype(numpy.float32) b = numpy.random.randn(size).astype(numpy.float32) c = numpy.zeros_like(b) n = numpy.int32(size) args = [c, a, b, n] params = {"block_size_x": 512} answer = run_kernel("vector_add", kernel_string, problem_size, args, params) assert numpy.allclose(answer[0], a+b, atol=1e-8)
def tune_degrees_dense(): with open(get_kernel_path()+'degrees.cu', 'r') as f: kernel_string = f.read() N = np.int32(4.5e6) sliding_window_width = np.int32(1500) problem_size = (N, 1) #generate input data with an expected density of correlated hits x,y,z,ct = generate_input_data(N) problem_size = (N,1) correlations = np.zeros((sliding_window_width, N), 'uint8') sums = np.zeros(N).astype(np.int32) args = [correlations, sums, N, sliding_window_width, x, y, z, ct] with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f: qd_string = f.read() data = run_kernel("quadratic_difference_linear", qd_string, problem_size, args, {"block_size_x": 512, "write_sums": 1}) correlations = data[0] sums = data[1] #partial sum of the # of correlated hits to hits later in time #setup tuning parameters tune_params = OrderedDict() tune_params["block_size_x"] = [2**i for i in range(5,11)] tune_params["window_width"] = [sliding_window_width] args = [sums, correlations, N] return tune_kernel("degrees_dense", kernel_string, problem_size, args, tune_params, verbose=True)
def test_radix(radix, c_type): # this test runs 256 instances of the radix n function # it does not use twiddle factors, so as a test # it's not to be relied upon fully n = numpy.int32(256) m = {'float2': 1, 'float4': 2, 'float8': 4}[c_type] x = numpy.random.normal(size=(n, radix, m, 2)).astype(numpy.float32) y = numpy.zeros_like(x) y_ref = numpy.fft.fft(x[..., 0] + 1j * x[..., 1], axis=1) parity_splitting = parity.ParitySplitting(radix * n, radix) codelets = "{}\n{}".format( generator.generate_preprocessor(parity_splitting, False, c_type=c_type), generator.generate_fma_codelets(parity_splitting, False, c_type=c_type)) args = [x, y, n] answer = run_kernel(f"test_radix_{radix}", codelets, 1, args, {}, compiler_options=["-DTESTING_RADIX"]) y = answer[1] y = y[..., 0] + 1j * y[..., 1] numpy.testing.assert_almost_equal(y, y_ref, decimal=5)
def test_hostfunction(): #setup test input size, ndim, A, B, scale, grad, cost = get_real_data() #size, ndim, A, B, scale, grad, cost = generate_inputs() #call the reference function ref_cost, ref_gradient = call_reference_function(size, ndim, A, B, scale, grad, cost) #call the host function m = numpy.int32(size) n = numpy.int32(B.shape[0]) arguments = [cost, A, B, m, n, ndim, scale, grad] with open(get_kernel_path() + 'gausstransform.cu', 'r') as f: kernel_string = f.read() answer = run_kernel("test_GaussTransformHost", kernel_string, size, arguments, {}, lang="C", compiler_options=compiler_options + ['-arch=sm_30']) cost = answer[0][0] print("reference") print(ref_cost) gradient = answer[7] print(ref_gradient) print("answer") print(cost) print(gradient) assert numpy.isclose(ref_cost, cost, atol=1e-8) assert numpy.allclose(ref_gradient, gradient, atol=1e-8)
def test_vector_add(): #Check pycuda is installed and if a CUDA capable device is present, if not skip the test try: import pycuda.driver as drv drv.init() except (ImportError, Exception): raise SkipTest("PyCuda not installed or no CUDA device detected") kernel_string = """ __global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * block_size_x + threadIdx.x; if (i<n) { c[i] = a[i] + b[i]; } } """ size = 10000000 problem_size = (size, 1) a = numpy.random.randn(size).astype(numpy.float32) b = numpy.random.randn(size).astype(numpy.float32) c = numpy.zeros_like(b) n = numpy.int32(size) args = [c, a, b, n] params = {"block_size_x": 512} answer = run_kernel("vector_add", kernel_string, problem_size, args, params) assert numpy.allclose(answer[0], a + b, atol=1e-8)
def test_2n(): N = 1024 signal1, signal2 = np.random.normal(size=(2, N)).astype(np.float32) x = signal1 + 1j * signal2 y = np.fft.fft(x).astype(np.complex64) Xa, Xb = np.zeros((2, N), dtype=np.float32) _, Xa, Xb = run_kernel("test_fix_2n", "fft1024_mc_fma.cl", 1, [y, Xa, Xb], { "TESTING": 1, "block_size_x": 1 }) signal1_ref = np.fft.rfft(signal1)[:-1] signal2_ref = np.fft.rfft(signal2)[:-1] print(Xa) print(signal1_ref) assert abs(Xa.view(np.complex64) - signal1_ref).max() < 1e-3 print(Xb) print(signal2_ref) assert abs(Xb.view(np.complex64) - signal2_ref).max() < 1e-3
def test_2N_r2cfft(): N = 1024 signal1, signal2 = np.random.normal(size=(2, N)).astype(np.float32) x = np.c_[signal1, signal2] y = np.zeros_like(x) _, y = run_kernel("fft_1024", "fft1024_mc_fma.cl", 1, [x, y], { "TESTING": 1, "block_size_x": 1024 }) c = y[..., 0] + 1j * y[..., 1] Xa_r, Xa_i, Xb_r, Xb_i = fix_2n(c, N) signal1_ans = Xa_r + 1j * Xa_i signal2_ans = Xb_r + 1j * Xb_i signal1_ref = np.fft.rfft(signal1)[:-1] signal2_ref = np.fft.rfft(signal2)[:-1] print(signal1_ans) print(signal1_ref) assert abs(signal1_ans - signal1_ref).max() < 1e-3 print(signal2_ans) print(signal2_ref) assert abs(signal2_ans - signal2_ref).max() < 1e-3
def test(): with open('vector_add.F90', 'r') as f: kernel_string = f.read() size = 10000000 a = np.random.randn(size).astype(np.float32) b = np.random.randn(size).astype(np.float32) c = np.zeros_like(b) n = np.int32(size) args = [c, a, b, n] tune_params = dict() tune_params["N"] = size tune_params["NTHREADS"] = 4 answer = run_kernel("vector_add", kernel_string, size, args, tune_params, lang="C", compiler="pgfortran") assert np.allclose(answer[0], a + b, atol=1e-8)
def test_quadratic_difference_kernel(): skip_if_no_cuda_device() with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f: kernel_string = f.read() N = np.int32(300) sliding_window_width = np.int32(150) problem_size = (N, 1) #generate input data with an expected density of correlated hits x,y,z,ct = generate_input_data(N) correlations_ref = np.zeros((sliding_window_width, N), 'uint8') correlations = np.zeros((sliding_window_width, N), 'uint8') sums = np.zeros(N).astype(np.int32) args = [correlations, sums, N, sliding_window_width, x, y, z, ct] #call the CUDA kernel params = { "block_size_x": 256, "write_sums": 1, 'window_width': sliding_window_width } answer = run_kernel("quadratic_difference_linear", kernel_string, problem_size, args, params) #compute reference answer correlations_ref = correlations_cpu(correlations_ref, x, y, z, ct) test_result = np.sum(answer[0] - correlations_ref) == 0 if not test_result == True: print("test quadratic_difference_linear FAILED, attempting to create a plot for visual comparison") create_plot(correlations_ref, answer[0]) assert test_result
def test(): cp = [ "-I/home/bwn200/eigen-git-mirror/", "-I/home/bwn200/cxxopts/include/", "-lcublas", "-lcurand" ] size = np.int32(1024) problem_size = (size, size) #C program assumes data is stored column-major A = np.random.randn(*problem_size).astype(np.float32, order='F') B = np.random.randn(*problem_size).astype(np.float32, order='F') C = np.zeros_like(A) args = [C, A, B, size] answer = run_kernel("call_cublas_gemm_basic_version", "gemm_cublas.cpp", 1, args, params={}, compiler_options=cp, compiler="nvcc", lang="C", log=logging.DEBUG) #numpy insists on returning the result in row-major, regardless of input #using a transpose as a quick fix, there should be a better solution expected = np.dot(A, B).T assert np.allclose(expected, answer[0], atol=1e-3)
def test_expdist_ref(): size = numpy.int32(100) ndim = numpy.int32(2) cost, A, B, scale_A, scale_B = generate_inputs(size, ndim, 1) arguments = [cost, A, B, size, size, ndim, scale_A, scale_B] with open(get_kernel_path() + 'expdist_c.cpp', 'r') as f: kernel_string = f.read() answer = run_kernel("call_expdist", kernel_string, size, arguments, {}, lang="C", compiler_options=['-I' + get_kernel_path()]) cost = call_reference_function(size, ndim, A, B, scale_A, scale_B, cost) print("cost") print(cost) print("A") print(A) print("B") print(B) print("scale_A") print(scale_A) print("scale_B") print(scale_B) assert 100.0 < cost and cost < 200.0
def test_dot_product(): function_name = "dot_product" a = np.random.randn(3).astype(np.float64) b = np.random.randn(3).astype(np.float64) c = np.zeros((1), dtype=np.float64) args = [c, a, b] convert = [True, True, True] kernel_string = generate_wrapper(function_name, filename, args, convert_to_array=convert) answer = run_kernel("call_function", kernel_string, 1, args, {}, lang="C", compiler_options=cp, compiler="nvcc") expected = a.dot(b) print("answer") print(answer[0]) print("expected") print(expected) assert np.allclose(answer[0], expected, atol=1e-6)
def test_multiply_matrix(): function_name = "multiply_matrix" a = np.random.randn(9).astype(np.float64) b = np.random.randn(9).astype(np.float64) c = np.zeros_like(a) #args = [c, a, b, np.int32(3)] args = [c, a, b] convert = [True for _ in args] #convert[-1] = False template_parameters = "double, 9, 3" kernel_string = generate_wrapper(function_name, filename, args, convert_to_array=convert, template_parameters=template_parameters) answer = run_kernel("call_function", kernel_string, 1, args, {}, lang="C", compiler_options=cp, compiler="nvcc") expected = a.reshape(3,3).dot(b.reshape(3,3)) print("answer") print(answer[0].reshape(3,3)) print("expected") print(expected) assert np.allclose(answer[0].reshape(3,3), expected, atol=1e-6)
def test_multiply_matrix(): function_name = "multiply_matrix" with open('matrix_multiply.cpp', 'r') as f: kernel_string = f.read() a = np.random.randn(9).astype(np.float64) b = np.random.randn(9).astype(np.float64) c = np.zeros_like(a) args = [c, a, b, np.int32(3)] convert = [True for _ in args] convert[-1] = False #generate a wrapper function with "extern C" binding that can be called from Python kernel_string = wrappers.cpp(function_name, kernel_string, args, convert_to_array=convert) answer = run_kernel(function_name + "_wrapper", kernel_string, 1, args, {}, lang="C") #compute expected answer of matrix multiplication with Numpy expected = a.reshape(3, 3).dot(b.reshape(3, 3)) assert np.allclose(answer[0].reshape(3, 3), expected)
def test_add_matrix(): function_name = "add_matrix" a = np.random.randn(9).astype(np.float64) b = np.random.randn(9).astype(np.float64) c = np.zeros_like(a) args = [c, a, b] convert = [True for _ in args] kernel_string = generate_wrapper(function_name, filename, args, convert_to_array=convert) answer = run_kernel("call_function", kernel_string, 1, args, {}, lang="C", compiler_options=cp, compiler="nvcc") expected = a + b print("answer") print(answer[0]) print("expected") print(expected) assert np.allclose(answer[0], expected, atol=1e-6)
def test_prefix_sum_single_block(): skip_if_no_cuda_device() with open(get_kernel_path()+'prefixsum.cu', 'r') as f: kernel_string = f.read() size = 487 problem_size = (size, 1) params = {"block_size_x": 128} max_blocks = size//params["block_size_x"] x = np.ones(size).astype(np.int32) #compute reference answer reference = np.cumsum(x) #setup kernel inputs prefix_sums = np.zeros(size).astype(np.int32) block_carry = np.zeros(max_blocks).astype(np.int32) n = np.int32(size) args = [prefix_sums, block_carry, x, n] #call the first kernel that computes the incomplete prefix sums #and outputs the block carry values answer = run_kernel("prefix_sum_single_block", kernel_string, (1,1), args, params) #verify test_result = np.sum(answer[0] - reference) == 0 print("answer") print(answer[0]) print("reference") print(reference) assert test_result
def test_propagate_block_carry(): skip_if_no_cuda_device() with open(get_kernel_path()+'prefixsum.cu', 'r') as f: kernel_string = f.read() size = 1000 n = np.int32(size) params = {"block_size_x": 256} inputs = (np.random.rand(size)*100.0).astype(np.int32) block_carry = (np.random.rand(size//params["block_size_x"]+1)*100.0).astype(np.int32) args = [inputs, block_carry, n] answer = run_kernel("propagate_block_carry", kernel_string, (size,1), args, params) reference = inputs.copy() bs = params["block_size_x"] reference[:bs] = inputs[:bs] reference[bs:] = [inputs[i] + block_carry[i//bs-1] for i in range(bs,size)] print(block_carry) print(answer[0]) print(reference) assert all(answer[0] == reference)
def test_gausstransform_ref(): size = numpy.int32(2000) ndim = numpy.int32(2) A = numpy.random.randn(size * ndim).astype(numpy.float64) B = numpy.random.randn(size * ndim).astype(numpy.float64) scale = numpy.float64(10.0) grad = numpy.zeros(size * ndim).astype(numpy.float64) cost = numpy.zeros((1)).astype(numpy.float64) arguments = [cost, A, B, size, size, ndim, scale, grad] with open(get_kernel_path('gausstransform') + 'gausstransform_c.cpp', 'r') as f: kernel_string = f.read() answer = run_kernel( "call_GaussTransform", kernel_string, size, arguments, {}, lang="C", compiler_options=['-I' + get_kernel_path('gausstransform')]) cost = answer[0] print(cost) assert 1.0 > cost and cost > 0.0 gradient = answer[7] print(gradient)
def tune_crosscorr(kernel_string, height, width, image_freq, image2_freq): """step 2 Fourier transforms and cross correlation""" problem_size = (width, height) tune_params = OrderedDict() tune_params["block_size_x"] = [32*i for i in range(1,33)] tune_params["block_size_y"] = [2**i for i in range(6)] image_freq = image_freq.reshape(height,width,2) image_freq = image_freq[:,:,0] + 1j * image_freq[:,:,1] image_freq = fft2(image_freq).astype(np.complex64) image2_freq = image2_freq.reshape(height,width,2) image2_freq = image2_freq[:,:,0] + 1j * image2_freq[:,:,1] image2_freq = fft2(image2_freq).astype(np.complex64) crosscorr = np.zeros((height,width,2), dtype=np.float32) args = [height, width, crosscorr, image_freq, image2_freq] params = {"block_size_x": 32, "block_size_y": 16} output = run_kernel("computeCrossCorr", kernel_string, problem_size, args, params, grid_div_y=["block_size_y"]) tune_kernel("computeCrossCorr", kernel_string, problem_size, args, tune_params, grid_div_y=["block_size_y"]) crosscorr = output[2].reshape(height,width,2) crosscorr_invert = crosscorr[:,:,0] + 1j * crosscorr[:,:,1] crosscorr_invert = ifft2(crosscorr_invert) crosscorr[:,:,0] = crosscorr_invert.real crosscorr[:,:,1] = crosscorr_invert.imag return crosscorr
def call_reference_kernel(N, B, T, K, F, args): problem_size = B params = {'block_size_x': 32, "use_kernel": 1} answer = run_kernel("kernel_coherencies", get_kernel_path()+"predict_model.cu", problem_size, args, params, compiler_options=cp) return answer
def test_dense2sparse_kernel(): skip_if_no_cuda_device() with open(get_kernel_path()+'dense2sparse.cu', 'r') as f: kernel_string = f.read() N = np.int32(300) sliding_window_width = np.int32(150) problem_size = (N, 1) #generate input data with an expected density of correlated hits correlations = generate_correlations_table(N, sliding_window_width, cutoff=2.87) #obtain full correlation matrix for reference dense_matrix = get_full_matrix(correlations) #setup all kernel inputs node_degrees = dense_matrix.sum(axis=0) prefix_sums = np.cumsum(node_degrees).astype(np.int32) total_correlated_hits = np.sum(node_degrees.sum()) row_idx = np.zeros(total_correlated_hits).astype(np.int32) col_idx = np.zeros(total_correlated_hits).astype(np.int32) #call the CUDA kernel args = [row_idx, col_idx, prefix_sums, correlations, N] params = { "block_size_x": 256, 'window_width': sliding_window_width, "use_shared": 1 } answer = run_kernel("dense2sparse_kernel", kernel_string, problem_size, args, params) row_idx = answer[0] col_idx = answer[1] print("computed") print("row_idx", row_idx) print("col_idx", col_idx) #obtain Python objects for the sparse representations of both matrices answer = csr_matrix((np.ones_like(row_idx), (row_idx, col_idx)), shape=(N,N)) reference = csr_matrix(dense_matrix) print("reference") print("row_idx", reference.nonzero()[0]) print("col_idx", reference.nonzero()[1]) #subtract both sparse matrices and test #if number of non zero elements is zero, i.e. matrix is empty diff = reference - answer test_result = diff.nnz == 0 print("diff") print(diff) #verify if not test_result == True: print("test dense2sparse FAILED, attempting to create a plot for visual comparison") create_plot(answer.todense(), reference.todense()) assert test_result
def generate_large_correlations_table(N, sliding_window_width): """ generate a larget set of input data with an expected density of correlated hits This function is for testing purposes. It generates a large correlations table of size N by sliding_window_width, which is filled with zeros or ones when two hits are considered correlated. This function has no cutoff parameter but uses generate_input_data() to get input data. The correlations table is reconstructed on the GPU, for which a kernel is compiled and ran on the fly. :param N: The number of hits to be considerd by this correlation table :type N: int :param sliding_window_width: The sliding window width used for this correlation table. :type sliding_window_width: int :returns: correlations table of size N by sliding_window_width and an array storing the number of correlated hits per hit of size N. :rtype: numpy ndarray of type numpy.uint8, a numpy array of type numpy.int32 """ #generating a very large correlations table takes hours on the CPU #reconstruct input data on the GPU x,y,z,ct = generate_input_data(N) problem_size = (N,1) correlations = np.zeros((sliding_window_width, N), 'uint8') sums = np.zeros(N).astype(np.int32) args = [correlations, sums, N, sliding_window_width, x, y, z, ct] with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f: qd_string = f.read() data = run_kernel("quadratic_difference_linear", qd_string, problem_size, args, {"block_size_x": 512, "write_sums": 1}) correlations = data[0] sums = data[1] #now I cant compute the node degrees on the CPU anymore, so using another GPU kernel with open(get_kernel_path()+'degrees.cu', 'r') as f: degrees_string = f.read() args = [sums, correlations, N] data = run_kernel("degrees_dense", degrees_string, problem_size, args, {"block_size_x": 512}) sums = data[0] return correlations, sums
def tune_pnpoly(): #change to dir with source files because of includes in pnpoly_host.cu os.chdir(get_kernel_path()) with open('pnpoly_host.cu', 'r') as f: host_string = f.read() with open('pnpoly.cu', 'r') as f: kernel_string = f.read() size = numpy.int32(2e7) problem_size = (size, 1) vertices = 600 points = numpy.random.randn(2*size).astype(numpy.float32) bitmap = numpy.zeros(size).astype(numpy.int32) #as test input we use a circle with radius 1 as polygon and #a large set of normally distributed points around 0,0 vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1] points_x = points[::2] points_y = points[1::2] vertex_x = numpy.cos(vertex_seeds) vertex_y = numpy.sin(vertex_seeds) vertex_xy = numpy.array( zip(vertex_x, vertex_y) ).astype(numpy.float32) args = [bitmap, points, vertex_xy, size] tune_params = OrderedDict() #tune_params["block_size_x"] = [2**i for i in range(6,10)] #powers of two tune_params["block_size_x"] = [32*i for i in range(1,32)] #multiple of 32 tune_params["tile_size"] = [2**i for i in range(6)] tune_params["f_unroll"] = [i for i in range(1,20) if float(vertices)/i==vertices//i] tune_params["between_method"] = [0, 1, 2, 3] tune_params["use_precomputed_slopes"] = [0, 1] tune_params["use_method"] = [0, 1] grid_div_x = ["block_size_x", "tile_size"] #compute a reference answer using naive kernel params = {"block_size_x": 512} result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string, problem_size, [bitmap, points, size], params, cmem_args={"d_vertices": vertex_xy}) result = [result[0], None, None] #start tuning results = kernel_tuner.tune_kernel("cn_pnpoly_host", host_string, problem_size, args, tune_params, grid_div_x=grid_div_x, answer=result, lang="C", verbose=True) return results, tune_params
def call_reference_function(size, ndim, A, B, scale_A, scale_B, cost): arguments = [cost, A, B, size, size, ndim, scale_A, scale_B] with open(get_kernel_path() + 'expdist_c.cpp', 'r') as f: kernel_string = f.read() answer = run_kernel("call_expdist", kernel_string, size, arguments, {}, lang="C", compiler_options=['-I' + get_kernel_path()]) ref_cost = answer[0][0] return ref_cost
def test_fastnoise(): with open(get_kernel_path()+'fastnoisefilter.cu', 'r') as f: kernel_string = f.read() image = imread(get_testdata_path() + "test.jpg", mode="F") height = np.int32(image.shape[0]) width = np.int32(image.shape[1]) problem_size = (width, height) output1 = np.zeros_like(image) output2 = np.zeros_like(image) output3 = np.zeros_like(image) args = [height, width, output1, output2, image] params = OrderedDict() params["block_size_x"] = 32 params["block_size_y"] = 16 d = np.gradient(image) norm = np.sqrt( (d[0]*d[0]) + (d[1]*d[1]) ) scale = 1.0 / (1.0 + norm) dys = d[0] * scale dxs = d[1] * scale answer = run_kernel("normalized_gradient", kernel_string, problem_size, args, params) assert np.allclose(answer[2], dxs, atol=1e-6) assert np.allclose(answer[3], dys, atol=1e-6) args = [height, width, output3, dxs, dys] answer = run_kernel("gradient", kernel_string, problem_size, args, params) reference = np.gradient(dys, axis=0) + np.gradient(dxs, axis=1) assert np.allclose(answer[2], reference, atol=1e-6)
def call_reference_kernel(Nelem, r1, r2, r3, x, y, z, tar): with open('predict_model_snippet.cu', 'r') as f: kernel_string = f.read() blockDim_2 = np.int32(power_bit_length(Nelem)) args = [np.int32(Nelem), r1, r2, r3, x, y, z, tar, blockDim_2] params = {"block_size_x": int(Nelem)} reference = kernel_tuner.run_kernel( "kernel_array_beam_slave_sincos_original", kernel_string, 1, args, params, grid_div_x=[]) return reference[7]
def tune_correlate_full_kernel(kernel_name): with open(get_kernel_path()+'correlate_full.cu', 'r') as f: kernel_string = f.read() N = np.int32(1e6) sliding_window_width = np.int32(1500) problem_size = (N, 1) #generate input data with an expected density of correlated hits x,y,z,ct = generate_input_data(N, factor=1750.0) #setup kernel arguments row_idx = np.zeros(10).astype(np.int32) #not used in first kernel col_idx = np.zeros(10).astype(np.int32) #not used in first kernel prefix_sums = np.zeros(10).astype(np.int32) #not used in first kernel sums = np.zeros(N).astype(np.int32) args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct] #run the sums kernel once params = {"block_size_x": 256, "write_sums": 1} answer = run_kernel(kernel_name, kernel_string, problem_size, args, params) reference = [None for _ in range(len(args))] reference[3] = answer[3] sums = reference[3].astype(np.int32) #setup tuning parameters tune_params = OrderedDict() tune_params["block_size_x"] = [32*i for i in range(1,33)] #multiples of 32 tune_params["write_sums"] = [1] tune_params["write_spm"] = [0] kernel_1 = tune_kernel(kernel_name, kernel_string, problem_size, args, tune_params, verbose=True) #tune kernel #2 total_correlated_hits = sums.sum() print("total_correlated_hits", total_correlated_hits) print("density", total_correlated_hits/(float(N)*sliding_window_width)) col_idx = np.zeros(total_correlated_hits).astype(np.int32) row_idx = np.zeros(total_correlated_hits).astype(np.int32) prefix_sums = np.cumsum(sums).astype(np.int32) args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct] tune_params["write_sums"] = [0] tune_params["write_spm"] = [1] kernel_2 = tune_kernel(kernel_name, kernel_string, problem_size, args, tune_params, verbose=True) return kernel_1, kernel_2
def test_fft_1024(): x = np.random.normal(size=(1024, 2)).astype(np.float32) y = np.zeros_like(x) _, y = run_kernel("fft_1024", "fft1024_mc_fma.cl", 1, [x, y], { "TESTING": 1, "block_size_x": 1024 }) y_Z = y[..., 0] + 1j * y[..., 1] y_ref = np.fft.fft(x[..., 0] + 1j * x[..., 1]) print(y_Z) print(y_ref) assert abs(y_Z - y_ref).max() < 1e-3
def call_reference_kernel(N, T, K, F, args, cp): problem_size = (T * K * F, N) params = {"block_size_x": 32, "use_kernel": 1} answer = run_kernel("kernel_tuner_host_array_beam", [get_kernel_path() + "predict_model.cu"], problem_size, args, params, lang="C", compiler_options=cp) ref = [None for _ in answer] ref[17] = answer[17] return ref
def tune(): with open('convolution.cu', 'r') as f: kernel_string = f.read() filter_size = (17, 17) problem_size = (4096, 4096) size = numpy.prod(problem_size) border_size = (filter_size[0]//2*2, filter_size[1]//2*2) input_size = ((problem_size[0]+border_size[0]) * (problem_size[1]+border_size[1])) output = numpy.zeros(size).astype(numpy.float32) input = numpy.random.randn(input_size).astype(numpy.float32) filter = numpy.random.randn(filter_size[0]*filter_size[1]).astype(numpy.float32) cmem_args= {'d_filter': filter } args = [output, input, filter] tune_params = OrderedDict() tune_params["filter_width"] = [filter_size[0]] tune_params["filter_height"] = [filter_size[1]] tune_params["block_size_x"] = [16*i for i in range(1,9)] tune_params["block_size_y"] = [2**i for i in range(1,6)] tune_params["tile_size_x"] = [2**i for i in range(3)] tune_params["tile_size_y"] = [2**i for i in range(3)] tune_params["use_padding"] = [0,1] #toggle the insertion of padding in shared memory tune_params["read_only"] = [0,1] #toggle using the read-only cache grid_div_x = ["block_size_x", "tile_size_x"] grid_div_y = ["block_size_y", "tile_size_y"] #compute the answer using a naive kernel params = { "block_size_x": 16, "block_size_y": 16} tune_params["filter_width"] = [filter_size[0]] tune_params["filter_height"] = [filter_size[1]] results = kernel_tuner.run_kernel("convolution_naive", kernel_string, problem_size, args, params, grid_div_y=["block_size_y"], grid_div_x=["block_size_x"]) #set non-output fields to None answer = [results[0], None, None] #start kernel tuning with correctness verification return kernel_tuner.tune_kernel("convolution_kernel", kernel_string, problem_size, args, tune_params, grid_div_y=grid_div_y, grid_div_x=grid_div_x, verbose=True, cmem_args=cmem_args, answer=answer)
def test_parity(parity_splitting: ParitySplitting): kernel = generate_preprocessor( parity_splitting, False) + generate_parity_function(parity_splitting) x = np.arange(parity_splitting.N, dtype=np.int32) y = np.zeros_like(x) kernel_args = [x, y] results = run_kernel("test_parity_{}".format(parity_splitting.radix), kernel, parity_splitting.N, kernel_args, {}, compiler_options=["-DTESTING"]) y_ref = np.array( [parity(parity_splitting.radix, i) for i in range(parity_splitting.N)]) assert np.all(results[1] == y_ref)
def test_fft_4(): x = np.random.normal(size=(1024, 4, 2)).astype(np.float32) y = np.zeros_like(x) for cycle in range(4): y_ref = np.fft.fft(np.roll(x[..., 0] + 1j * x[..., 1], -cycle, axis=1)) _, _, y = run_kernel("test_fft_4", "fft1024_mc_fma.cl", 1, [np.int32(cycle), x, y], { "TESTING": 1, "block_size_x": 1024 }) y_Z = np.roll(y[..., 0] + 1j * y[..., 1], -cycle, axis=1) print(y_Z) print(y_ref) assert abs(y_Z - y_ref).max() < 1e-4
def test_transpose(parity_splitting: ParitySplitting): kernel = generate_preprocessor( parity_splitting, False) + generate_transpose_function(parity_splitting) x = np.arange(parity_splitting.N, dtype=np.int32) y = np.zeros_like(x) kernel_args = [x, y] results = run_kernel("test_transpose_{}".format(parity_splitting.radix), kernel, parity_splitting.N, kernel_args, {}, compiler_options=["-DTESTING"]) y_ref = x.reshape(parity_splitting.factors).T.flatten() assert np.all(results[1] == y_ref)
def test_quadratic_difference_full_sums(kernel_name, mode="qd"): skip_if_no_cuda_device() with open(get_kernel_path()+"correlate_full.cu", 'r') as f: kernel_string = f.read() N = np.int32(600) sliding_window_width = np.int32(150) problem_size = (N, 1) x,y,z,ct = generate_input_data(N, factor=18.0) correlations_ref = np.zeros((sliding_window_width, N), 'uint8') #compute reference answer if mode == "qd": correlations_ref = correlations_cpu(correlations_ref, x, y, z, ct) elif mode == "3b": ct = ct / 0.299792458 correlations_ref = correlations_cpu_3B(correlations_ref, x, y, z, ct) corr_matrix = get_full_matrix(correlations_ref) sums = np.zeros(N).astype(np.int32) row_idx = np.zeros(10).astype(np.int32) #not used in this test col_idx = np.zeros(10).astype(np.int32) #not used in this test prefix_sums = np.zeros(10).astype(np.int32) #not used in this test #call the CUDA kernel params = { "block_size_x": 256, "write_sums": 1, 'window_width': sliding_window_width, 'tile_size_x': 1 } args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct] answer = run_kernel(kernel_name, kernel_string, problem_size, args, params, compiler_options=["--std=c++11"]) sums_ref = np.sum(corr_matrix, axis=0) #sums_ref = np.sum(correlations_ref, axis=0) print("reference", sums_ref.sum()) print(sums_ref) sums = answer[3] print("answer", sums.sum()) print(sums) diff = (sums_ref - sums).astype(np.int8) print("diff") print(diff) assert all(diff == 0)
def test_degrees_kernel(): skip_if_no_cuda_device() def in_degrees(correlations): degrees = np.zeros(correlations.shape[1]) for i in range(correlations.shape[1]): in_degree = 0 for j in range(correlations.shape[0]): col = i-j-1 if col>=0: in_degree += correlations[j, col] degrees[i] = in_degree return degrees with open(get_kernel_path()+'degrees.cu', 'r') as f: kernel_string = f.read() N = np.int32(400) sliding_window_width = np.int32(150) problem_size = (N, 1) #generate input data with an expected density of correlated hits correlations = generate_correlations_table(N, sliding_window_width, cutoff=2.87) #compute reference answer in_degree = in_degrees(correlations) out_degree = np.sum(correlations, axis=0).astype(np.int32) reference = (in_degree+out_degree) #call the CUDA kernel args = [out_degree, correlations, N] params = { "block_size_x": 256, 'window_width': sliding_window_width } answer = run_kernel("degrees_dense", kernel_string, problem_size, args, params) print("answer", answer[0]) print("reference", reference) #verify test_result = np.sum(answer[0] - reference) == 0 if not test_result == True: print("test degrees_dense FAILED, attempting to create a plot for visual comparison") create_plot(reference.reshape(20,20), answer[0].reshape(20,20)) assert test_result
def create_sparse_matrix(correlations, sums): """ call GPU kernel to transform a correlations table into a spare matrix This function compiles the dense2sparse GPU kernel and calls it convert a densely stored correlations table into a sparsely stored correlation matrix. The sparse notation used is CSR. This routine uses a transposed correlations table of N by window_width, whereas most other routines use window_width by N, this needs to be fixed. :param correlations: A correlations table of size N by sliding_window_width :type correlations: a 2d numpy array of type numpy.uint8 :param sums: An array with the number of correlated hits per hit :type sums: numpy array of type numpy.int32 :returns: This function returns three arrays that together form the sparse matrix * row_idx: the row index of each entry in the column index array * col_idx: the column index of each correlation in the sparse matrix * prefix_sums: the offset into the column index array for each row :rtype: numpy ndarray of type numpy.int32 """ N = np.int32(correlations.shape[0]) prefix_sums = np.cumsum(sums).astype(np.int32) total_correlated_hits = np.sum(sums.sum()) row_idx = np.zeros(total_correlated_hits).astype(np.int32) col_idx = np.zeros(total_correlated_hits).astype(np.int32) with open(get_kernel_path()+'dense2sparse.cu', 'r') as f: kernel_string = f.read() args = [row_idx, col_idx, prefix_sums, correlations, N] params = { "block_size_x": 256, "window_width": correlations.shape[1], "write_sums": 1, "use_shared": 1} data = run_kernel("dense2sparse_kernel", kernel_string, (N,1), args, params) return data[0], data[1], prefix_sums
def tune_pnpoly_kernel(): with open(get_kernel_path()+'pnpoly.cu', 'r') as f: kernel_string = f.read() size = numpy.int32(2e7) problem_size = (size, 1) vertices = 600 points = numpy.random.randn(2*size).astype(numpy.float32) bitmap = numpy.zeros(size).astype(numpy.int32) #as test input we use a circle with radius 1 as polygon and #a large set of normally distributed points around 0,0 vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1] points_x = points[::2] points_y = points[1::2] vertex_x = numpy.cos(vertex_seeds) vertex_y = numpy.sin(vertex_seeds) vertex_xy = numpy.array( zip(vertex_x, vertex_y) ).astype(numpy.float32) args = [bitmap, points, size] # (vk.x-vj.x) / (vk.y-vj.y) slopes = numpy.zeros(vertices).astype(numpy.float32) for i in range(len(slopes)): if i == 0: slopes[i] = (vertex_x[-1] - vertex_x[i]) / (vertex_y[-1] - vertex_y[i]) else: slopes[i] = (vertex_x[i-1] - vertex_x[i]) / (vertex_y[i-1] - vertex_y[i]) cmem_args= {'d_vertices': vertex_xy, "d_slopes": slopes } tune_params = OrderedDict() tune_params["block_size_x"] = [2**i for i in range(6,10)] #powers of two #tune_params["block_size_x"] = [32*i for i in range(1,32)] #multiple of 32 #tune_params["block_size_x"] = [256] #fixed size tune_params["tile_size"] = [2**i for i in range(6)] #tune_params["f_unroll"] = [i for i in range(1,20) if float(vertices)/i==vertices//i] tune_params["between_method"] = [0, 1, 2, 3] tune_params["use_precomputed_slopes"] = [0, 1] tune_params["use_method"] = [0, 1] grid_div_x = ["block_size_x", "tile_size"] #compute a reference answer using naive kernel params = {"block_size_x": 512} result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string, problem_size, args, params, cmem_args=cmem_args) result = [result[0], None, None] #start tuning results = kernel_tuner.tune_kernel("cn_pnpoly", kernel_string, problem_size, args, tune_params, grid_div_x=grid_div_x, cmem_args=cmem_args, answer=result) return results, tune_params
args_old = [correlations, N, sliding_window_width, x, y, z, ct] args = [correlations, sums, N, sliding_window_width, x, y, z, ct] tune_params = dict() tune_params["block_size_x"] = [2**i for i in range(7)] tune_params["block_size_y"] = [2**i for i in range(7)] grid_div_x = ["block_size_x"] grid_div_y = ["block_size_y"] restrict = ["block_size_x*block_size_y >= 32"] #run the kernel once for with parameters known to produce correct output #the result list can be used to verify the output of the quadratic_difference_linear kernel params = { "block_size_x": 16, "block_size_y": 16 } result = run_kernel("quadratic_difference", kernel_string, problem_size, args_old, params, grid_div_x=grid_div_x, grid_div_y=grid_div_y) #uncomment the following to tune the old kernel #tune_kernel("quadratic_difference", kernel_string, problem_size, args, tune_params, # grid_div_x=grid_div_x, grid_div_y=grid_div_y, restrictions=restrict) #now tune the quadratic_difference_linear kernel kernel_name = "quadratic_difference_full_shfl" args = [col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct]
max_temp = numpy.zeros(num_blocks).astype(numpy.int32) locations = numpy.zeros(size).astype(numpy.int32) use_index = numpy.int32(1) n = numpy.int32(size) args = [max_loc, max_temp, locations, x, use_index, n] params = dict() params["block_size_x"] = 64 params["num_blocks"] = 8 params["use_shuffle"] = 1 params["vector"] = 4 #call the first kernel that computes the incomplete max locs result = run_kernel("max_loc", kernel_string, problem_size, args, params, grid_div_x=[]) #then call the kernel again on the intermediate result with 1 thread block args = [max_loc, max_temp, result[0], result[1], numpy.int32(0), num_blocks] params["num_blocks"] = 1 result_final = run_kernel("max_loc", kernel_string, (1, 1), args, params, grid_div_x=[]) print "expected", numpy.argmax(x), x.max() print "intermediate answer", result[0], result[1] print "kernel answer", result_final[0][0], result_final[1][0]
x = numpy.ones(size).astype(numpy.float32) print x prefix_sums = numpy.zeros(size).astype(numpy.float32) block_carry = numpy.zeros(max_blocks).astype(numpy.float32) n = numpy.int32(size) args = [prefix_sums, block_carry, x, n] params = dict() params["block_size_x"] = 64 #call the first kernel that computes the incomplete prefix sums #and outputs the block carry values result = run_kernel("prefix_sum_block", kernel_string, problem_size, args, params, grid_div_x=["block_size_x"]) prefix_sums = result[0] print result[0] print result[1] block_filler = numpy.zeros(max_blocks).astype(numpy.float32) block_out = numpy.zeros(max_blocks).astype(numpy.float32) args = [block_out, block_filler, result[1], numpy.int32(max_blocks)] #call the kernel again, but this time on the block carry values #one thread block should be sufficient if max_blocks > params["block_size_x"]: print("warning: block size too small")
def test_pnpoly_kernel(): skip_if_no_cuda_device() with open(get_kernel_path()+'pnpoly.cu', 'r') as f: kernel_string = f.read() problem_size = (int(2e6), 1) size = numpy.int32(numpy.prod(problem_size)) vertices = 600 points = numpy.random.randn(2*size).astype(numpy.float32) bitmap = numpy.zeros(size).astype(numpy.int32) #to verify the output of the gpu kernel #we use a circle with radius 1 as polygon and #do a simple distance to 0,0 check for all points vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1] points_x = points[::2] points_y = points[1::2] print "points_x min max", points_x.min(), points_x.max() print "points_y min max", points_y.min(), points_y.max() vertex_x = numpy.cos(vertex_seeds) vertex_x[-1] = vertex_x[0] vertex_y = numpy.sin(vertex_seeds) vertex_y[-1] = vertex_y[0] vertex_xy = numpy.array( zip(vertex_x, vertex_y) ).astype(numpy.float32) args = [bitmap, points, size] print "vertex_x min max", vertex_x.min(), vertex_x.max() print "vertex_y min max", vertex_y.min(), vertex_y.max() #from matplotlib import pyplot #plot all points #pyplot.scatter(points_x, points_y) #plot the outline of the polygon #pyplot.plot(vertex_x, vertex_y) #pyplot.show() cmem_args= {'d_vertices': vertex_xy } params = dict() params["block_size_x"] = 64 params["tile_size"] = 1 params["between_method"] = 2 params["use_method"] = 0 kernel_name = "cn_pnpoly" #compute kernel output result = kernel_tuner.run_kernel(kernel_name, kernel_string, problem_size, args, params, cmem_args=cmem_args) answer = result[0] answer_sum = numpy.sum(answer) print("answer sum=", answer_sum) print(result[0]) #compute reference answer using naive kernel reference = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string, problem_size, args, params, cmem_args=cmem_args) reference = reference[0] #reference = [numpy.sqrt(x*x + y*y) < 1.0 for x,y in zip(points_x, points_y)] #reference = numpy.array(reference).astype(numpy.int32) reference_sum = numpy.sum(reference) print("reference sum =", reference_sum) print(reference) diff = answer - reference print("diff abs sum=", numpy.sum(numpy.absolute(diff)) ) for i in range(len(diff)): if diff[i] != 0: x = points[i*2] y = points[i*2+1] print("diff=",diff[i],"error on point i=", i, "(x,y)=", (x,y), "dist to 0,0=", numpy.sqrt(x*x+y*y) ) if y in vertex_y: print ("y equals y-coordinate of a vertex") #we assert with a small margin because the test #and the kernel compute different things assert numpy.sum(numpy.absolute(answer - reference)) < 5
#!/usr/bin/env python import numpy import kernel_tuner problem_size = (4096, 4096) size = numpy.prod(problem_size) A = numpy.random.randn(size).astype(numpy.float32) B = numpy.random.randn(size).astype(numpy.float32) C = numpy.zeros_like(A) args = [C, A, B] params = {"block_size_x": 32, "block_size_y": 8, "tile_size_x": 4, "tile_size_y": 4} grid_div_x = ["block_size_x", "tile_size_x"] grid_div_y = ["block_size_y", "tile_size_y"] results = kernel_tuner.run_kernel("matmul_kernel", "../examples/cuda/matmul.cu", problem_size, args, params, grid_div_x=grid_div_x, grid_div_y=grid_div_y)
grid_div_x = ["block_size_x", "tile_size"] params = dict() params["block_size_x"] = 512 params["prefetch"] = 0 #params["use_bitmap"] = 0 #params["coalesce_bitmap"] = 0 params["tile_size"] = 1 #kernel_name = "cn_PnPoly" kernel_name = "cn_PnPoly_naive" #kernel_name = "pnpoly_cn_gpu" result = kernel_tuner.run_kernel(kernel_name, kernel_string, problem_size, args, params, grid_div_x=grid_div_x, cmem_args=cmem_args) result = [result[0], None, None] #result = kernel_tuner.run_kernel("pnpoly_cn", kernel_string, # problem_size, args, params, # grid_div_x=grid_div_x, lang="C") print "sum=" + str(numpy.sum(result[0])) params["prefetch"] = 1 res = kernel_tuner.run_kernel("cn_PnPoly", kernel_string, problem_size, args, params, grid_div_x=grid_div_x, cmem_args=cmem_args)
#!/usr/bin/env python import numpy import kernel_tuner problem_size = (4096, 4096) size = numpy.prod(problem_size) A = numpy.random.randn(*problem_size).astype(numpy.float32) B = numpy.random.randn(*problem_size).astype(numpy.float32) C = numpy.zeros_like(A) args = [C, A, B] answer = [numpy.dot(A,B), None, None] params = {"block_size_x": 16, "block_size_y": 32} results = kernel_tuner.run_kernel("matmul_kernel", "matmul_naive.cu", problem_size, args, params) # answer = run_kernel("matmul_kernel", [get_kernel_path()+"matmul_naive.cu"], problem_size, args, params, lang="C", compiler_options=cp)
prefix_sum = np.cumsum(sum_rows).astype(np.int32) print prefix_sum.shape print prefix_sum print "bliep", num_correlated_hits, np.sum(sum_rows) row_idx = np.zeros(num_correlated_hits).astype(np.int32) col_idx = np.zeros(num_correlated_hits).astype(np.int32) args = [row_idx, col_idx, prefix_sum, correlations, N] params = dict() params["block_size_x"] = 256 result = run_kernel("dense2sparse_kernel", kernel_string, problem_size, args, params, grid_div_x=["block_size_x"]) row_idx = result[0] col_idx = result[1] correlations_restored = np.zeros((sliding_window_width, N), 'uint8') correlations_restored[col_idx, row_idx] = 1 #for i in range(num_correlated_hits): # correlations_restored[col_idx[i], row_idx[i]] = 1 print "restored_hits", np.sum(correlations_restored) if False: from matplotlib import pyplot
def test_quadratic_difference_full_sparse_matrix(kernel_name, mode): skip_if_no_cuda_device() with open(get_kernel_path()+"correlate_full.cu", 'r') as f: kernel_string = f.read() #N,x,y,z,ct = get_real_input_data("/var/scratch/bwn200/KM3Net/event1-crop.txt") N = np.int32(600) sliding_window_width = np.int32(150) problem_size = (N, 1) x,y,z,ct = generate_input_data(N, factor=18.0) #compute reference answer correlations_ref = np.zeros((sliding_window_width, N), 'uint8') if mode == "qd": correlations_ref = correlations_cpu(correlations_ref, x, y, z, ct) elif mode == "3b": ct = ct / 0.299792458 correlations_ref = correlations_cpu_3B(correlations_ref, x, y, z, ct) corr_matrix = get_full_matrix(correlations_ref) sums_ref = np.sum(corr_matrix, axis=1) total_correlated_hits = corr_matrix.sum() sums = sums_ref.astype(np.int32) row_idx = np.zeros(total_correlated_hits).astype(np.int32) col_idx = np.zeros(total_correlated_hits).astype(np.int32) prefix_sums = np.cumsum(sums_ref).astype(np.int32) args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct] #call the CUDA kernel params = { "block_size_x": 256, "write_spm": 1, 'write_rows': 1, 'window_width': sliding_window_width, 'tile_size_x': 1 } answer = run_kernel(kernel_name, kernel_string, problem_size, args, params) reference = csr_matrix(corr_matrix) col_idx_ref = reference.nonzero()[1] row_idx = answer[0] print("row_idx") print(row_idx) col_idx = answer[1] print("col_idx") print(col_idx) print("reference") print(list(zip(reference.nonzero()[0], reference.nonzero()[1]))) answer = csr_matrix((np.ones_like(row_idx), (row_idx, col_idx)), shape=(N,N)) print("answer") print(list(zip(answer.nonzero()[0], answer.nonzero()[1]))) diff = reference - answer print("diff") print(list(zip(diff.nonzero()[0], diff.nonzero()[1]))) print("diff.nnz", diff.nnz) answer2 = csr_matrix(sparse_to_dense(prefix_sums, col_idx), shape=(N,N)) diff2 = reference - answer2 print("diff2") print(list(zip(diff2.nonzero()[0], diff2.nonzero()[1]))) print("diff2.nnz", diff2.nnz) if False: create_plot(get_full_matrix(reference), get_full_matrix(answer)) assert diff.nnz == 0 assert diff2.nnz == 0
def test_pnpoly_naive_kernel(): skip_if_no_cuda_device() with open(get_kernel_path()+'pnpoly.cu', 'r') as f: kernel_string = f.read() problem_size = (20000, 1) size = numpy.int32(numpy.prod(problem_size)) vertices = 600 points = numpy.random.randn(2*size).astype(numpy.float32) bitmap = numpy.zeros(size).astype(numpy.int32) #to verify the output of the gpu kernel #we use a circle with radius 1 as polygon and #do a simple distance to 0,0 check for all points vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1] points_x = points[::2] points_y = points[1::2] print "points_x min max", points_x.min(), points_x.max() print "points_y min max", points_y.min(), points_y.max() vertex_x = numpy.cos(vertex_seeds) vertex_x[-1] = vertex_x[0] vertex_y = numpy.sin(vertex_seeds) vertex_y[-1] = vertex_y[0] vertex_xy = numpy.array( zip(vertex_x, vertex_y) ).astype(numpy.float32) args = [bitmap, points, size] print "vertex_x min max", vertex_x.min(), vertex_x.max() print "vertex_y min max", vertex_y.min(), vertex_y.max() #from matplotlib import pyplot #plot all points #pyplot.scatter(points_x, points_y) #plot the outline of the polygon #pyplot.plot(vertex_x, vertex_y) #pyplot.show() cmem_args= {'d_vertices': vertex_xy } params = dict() params["block_size_x"] = 512 kernel_name = "cn_pnpoly_naive" #compute kernel output result = kernel_tuner.run_kernel(kernel_name, kernel_string, problem_size, args, params, cmem_args=cmem_args) answer = result[0] answer_sum = numpy.sum(answer) print("answer sum=", answer_sum) print(result[0]) #compute reference answer reference = [numpy.sqrt(x*x + y*y) < 1.0 for x,y in zip(points_x, points_y)] reference = numpy.array(reference).astype(numpy.int32) reference_sum = numpy.sum(reference) print("reference sum =", reference_sum) print(reference) #we assert with a small margin because the test #and the kernel compute different things assert numpy.sum(numpy.absolute(answer - reference)) < 5