def tune_degrees_dense():
    """Tune the ``degrees_dense`` kernel from ``degrees.cu``.

    The correlations table and the sums array used as tuning input are
    obtained by running the ``quadratic_difference_linear`` kernel once on
    generated input data.

    Returns:
        The value returned by ``tune_kernel`` for ``degrees_dense``.
    """
    with open(get_kernel_path()+'degrees.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    x, y, z, ct = generate_input_data(N)

    #run the quadratic difference kernel once to fill correlations and sums
    correlations = np.zeros((sliding_window_width, N), 'uint8')
    sums = np.zeros(N).astype(np.int32)
    args = [correlations, sums, N, sliding_window_width, x, y, z, ct]
    with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f:
        qd_string = f.read()
    data = run_kernel("quadratic_difference_linear", qd_string, problem_size, args,
                      {"block_size_x": 512, "write_sums": 1})
    correlations = data[0]
    sums = data[1]  #partial sum of the # of correlated hits to hits later in time

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5, 11)]
    tune_params["window_width"] = [sliding_window_width]

    args = [sums, correlations, N]
    return tune_kernel("degrees_dense", kernel_string, problem_size, args,
                       tune_params, verbose=True)
def tune_dense2sparse():
    """Tune the ``dense2sparse_kernel`` kernel from ``dense2sparse.cu``.

    Input is a correlations table produced by
    ``generate_large_correlations_table``; the row/column index outputs are
    sized to the total of the per-hit sums.

    Returns:
        The value returned by ``tune_kernel`` for ``dense2sparse_kernel``.
    """
    with open(get_kernel_path()+'dense2sparse.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input
    correlations, sums = generate_large_correlations_table(N, sliding_window_width)

    #setup all kernel inputs
    prefix_sums = np.cumsum(sums).astype(np.int32)
    total_correlated_hits = sums.sum()
    row_idx = np.zeros(total_correlated_hits).astype(np.int32)
    col_idx = np.zeros(total_correlated_hits).astype(np.int32)

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1, 33)]  #multiples of 32 up to 1024
    tune_params["window_width"] = [sliding_window_width]
    tune_params["use_shared"] = [0, 1]
    #unroll factors must divide the window width; derive them from the
    #actual width instead of repeating the constant
    window = int(sliding_window_width)
    tune_params["f_unroll"] = [i for i in range(1, 5) if window % i == 0]

    #call the tuner
    args = [row_idx, col_idx, prefix_sums, correlations, N]
    return tune_kernel("dense2sparse_kernel", kernel_string, problem_size, args,
                       tune_params, verbose=True)
def tune_quadratic_difference_kernel():
    """Tune the ``quadratic_difference_linear`` kernel.

    The kernel source is read from ``quadratic_difference_linear.cu`` and
    tuned on input produced by ``generate_input_data``.

    Returns:
        The value returned by ``tune_kernel``.
    """
    with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    x, y, z, ct = generate_input_data(N)

    #setup kernel arguments
    correlations = np.zeros((sliding_window_width, N), 'uint8')
    sums = np.zeros(N).astype(np.int32)
    args = [correlations, sums, N, sliding_window_width, x, y, z, ct]

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1, 33)]  #multiples of 32
    #unroll factors must divide the window width; derive them from the
    #actual width instead of repeating the constant
    window = int(sliding_window_width)
    tune_params["f_unroll"] = [i for i in range(1, 20) if window % i == 0]
    tune_params["tile_size_x"] = [2**i for i in range(5)]  #powers of 2
    tune_params["write_sums"] = [1]

    return tune_kernel("quadratic_difference_linear", kernel_string, problem_size,
                       args, tune_params, verbose=True)
def test_wiener():
    """Check ``computeVarianceEstimates`` against its naive counterpart.

    Both kernels come from ``wienerfilter.cu`` and are run on ``test.jpg``
    with identical arguments and parameters; their third output argument
    must agree within an absolute tolerance of 1e-6.
    """
    source_path = get_kernel_path() + 'wienerfilter.cu'
    with open(source_path, 'r') as src:
        kernel_string = src.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")
    rows, cols = image.shape[0], image.shape[1]
    height = np.int32(rows)
    width = np.int32(cols)
    problem_size = (width, height)

    #NOTE(review): the output buffer is allocated as (width, height) while the
    #image is indexed as (height, width); element counts match - confirm intent
    output = np.zeros(problem_size, dtype=np.float32)
    args = [height, width, output, image]

    params = OrderedDict([
        ("block_size_x", 32),
        ("block_size_y", 8),
        ("reuse_computation", 1),
    ])

    #run the optimized kernel and the naive kernel with the same setup
    results = {}
    for kernel_name in ("computeVarianceEstimates", "computeVarianceEstimates_naive"):
        results[kernel_name] = run_kernel(kernel_name, kernel_string, problem_size,
                                          args, params, grid_div_y=["block_size_y"])

    answer = results["computeVarianceEstimates"]
    reference = results["computeVarianceEstimates_naive"]
    assert np.allclose(answer[2], reference[2], atol=1e-6)
def tune_wiener():
    """Tune ``computeVarianceEstimates`` from ``wienerfilter.cu`` on ``test.jpg``.

    The search space covers the thread block dimensions and the
    ``reuse_computation`` switch. Nothing is returned.
    """
    source_path = get_kernel_path() + 'wienerfilter.cu'
    with open(source_path, 'r') as src:
        kernel_string = src.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")
    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    problem_size = (width, height)

    output = np.zeros(problem_size, dtype=np.float32)
    args = [height, width, output, image]

    #the naive kernel can be tuned with only the two block size parameters:
    #tune_kernel("computeVarianceEstimates_naive", kernel_string, problem_size, args, tune_params, grid_div_y=["block_size_y"])

    #more sophisticated kernel
    tune_params = OrderedDict([
        ("block_size_x", list(range(32, 1025, 32))),
        ("block_size_y", [1, 2, 4, 8, 16, 32]),
        ("reuse_computation", [0, 1]),
    ])
    tune_kernel("computeVarianceEstimates", kernel_string, problem_size, args,
                tune_params, grid_div_y=["block_size_y"])
def tune_fastnoise():
    """Tune five kernels from ``fastnoisefilter.cu`` on ``test.jpg``.

    The kernels ``normalized_gradient``, ``gradient``,
    ``convolveHorizontally``, ``convolveVertically`` and ``normalize`` are
    tuned one after another over the same block-size search space. Nothing is
    returned.
    """
    source_path = get_kernel_path() + 'fastnoisefilter.cu'
    with open(source_path, 'r') as src:
        kernel_string = src.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")
    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    problem_size = (width, height)

    output = np.zeros(problem_size, dtype=np.float32)
    #NOTE(review): this one argument list is handed to every kernel below;
    #verify that each kernel's signature actually matches it
    args = [height, width, output, image]

    tune_params = OrderedDict([
        ("block_size_x", list(range(32, 1025, 32))),
        ("block_size_y", [1, 2, 4, 8, 16, 32]),
    ])

    kernel_names = (
        "normalized_gradient",
        "gradient",
        "convolveHorizontally",
        "convolveVertically",
        "normalize",
    )
    for kernel_name in kernel_names:
        tune_kernel(kernel_name, kernel_string, problem_size, args, tune_params)
def tune_minimum_degree():
    """Tune the ``minimum_degree`` kernel from ``minimum_degree.cu``.

    The per-block output buffers are sized for the number of blocks needed
    when the largest block size in the search space is used.

    Returns:
        The value returned by ``tune_kernel``.
    """
    source_path = get_kernel_path()+'minimum_degree.cu'
    with open(source_path, 'r') as src:
        kernel_string = src.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #tune params here
    block_sizes = [32, 64, 128, 256, 512, 1024]
    tune_params = OrderedDict([
        ("block_size_x", block_sizes),
        ("threshold", [3]),
    ])
    largest_block = float(max(block_sizes))
    max_blocks = int(np.ceil(N / largest_block))

    #generate input data with an expected density of correlated hits
    correlations, sums = generate_large_correlations_table(N, sliding_window_width)
    row_idx, col_idx, prefix_sums = create_sparse_matrix(correlations, sums)

    #setup all kernel inputs
    minimum = np.zeros(max_blocks, dtype=np.int32)
    num_nodes = np.zeros(max_blocks, dtype=np.int32)

    #call the tuner
    args = [minimum, num_nodes, sums, row_idx, col_idx, prefix_sums, N]
    return tune_kernel("minimum_degree", kernel_string, problem_size, args,
                       tune_params, verbose=True)
def tune_variance_zero_mean():
    """Tune ``computeVarianceZeroMean`` from ``wienerfilter.cu`` on ``test.jpg``.

    The grid size is itself a tunable parameter (``num_blocks``), so the
    output buffer is sized for the largest number of blocks in the search
    space. Nothing is returned.
    """
    source_path = get_kernel_path() + 'wienerfilter.cu'
    with open(source_path, 'r') as src:
        kernel_string = src.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")
    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    size = np.int32(height * width)

    powers_of_two = [32, 64, 128, 256, 512, 1024]
    tune_params = OrderedDict()
    tune_params["block_size_x"] = list(powers_of_two)
    tune_params["num_blocks"] = list(powers_of_two)

    #one output slot per thread block, for the largest grid tried
    max_blocks = max(tune_params["num_blocks"])
    output = np.zeros(max_blocks, dtype=np.float32)
    args = [size, output, image]

    #the grid x-dimension is taken from the num_blocks parameter
    problem_size = ("num_blocks", 1)
    tune_kernel("computeVarianceZeroMean", kernel_string, problem_size, args,
                tune_params, grid_div_x=[], verbose=True)
def test_find_peak():
    """Check ``findPeak`` followed by ``maxlocFloats`` against numpy.

    Random cross-correlation data is generated with the dimensions of
    ``test_small.jpg``. The reference is the flat index and absolute value of
    the largest-magnitude entry of channel 0, computed with numpy. The two
    kernels from ``peaktocorrelationenergy.cu`` are run in sequence and their
    final location and value must match the reference.
    """
    with open(get_kernel_path() + 'peaktocorrelationenergy.cu', 'r') as f:
        kernel_string = f.read()

    #the image is only used for its dimensions
    image = imread(get_testdata_path() + "test_small.jpg", mode="F")
    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])

    #generate some bogus crosscorr data
    crosscorr = np.random.randn(height, width, 2).astype(np.float32)

    #compute reference in Python
    peak_index = np.argmax(np.absolute(crosscorr[:, :, 0]))
    peak_value = np.absolute(crosscorr[:, :, 0].flatten()[peak_index])

    params = {"block_size_x": 512, "num_blocks": 64}
    #the grid x-dimension is taken from the num_blocks parameter
    problem_size = ("num_blocks", 1)
    num_blocks = np.int32(params["num_blocks"])

    peakval = np.zeros((1), dtype=np.float32)
    peakvals = np.zeros((num_blocks), dtype=np.float32)
    peakindx = np.zeros((num_blocks), dtype=np.int32)
    loc = np.zeros((1), dtype=np.int32)
    val = np.zeros((1), dtype=np.float32)

    #first kernel: per-block peak values and indices
    args = [height, width, peakval, peakvals, peakindx, crosscorr]
    output1 = run_kernel("findPeak", kernel_string, problem_size, args, params,
                         grid_div_x=[])
    peakvals = output1[3]
    peakindx = output1[4]

    #second kernel: reduce the per-block results to a single location/value
    args = [loc, val, peakindx, peakvals, num_blocks]
    output2 = run_kernel("maxlocFloats", kernel_string, (1, 1), args, params,
                         grid_div_x=[])
    loc = output2[0][0]
    val = output2[1][0]

    print("answer")
    print("loc=", loc, "val=", val)
    print("reference")
    print("loc=", peak_index, "val=", peak_value)

    assert loc == peak_index
    assert np.isclose(val, peak_value, atol=1e-6)
def tune_pnpoly():
    """Tune the ``cn_pnpoly_host`` host code from ``pnpoly_host.cu``.

    A reference bitmap is computed first with the ``cn_pnpoly_naive`` kernel
    from ``pnpoly.cu`` and passed to the tuner as the expected answer.

    Note that this function changes the process working directory to the
    kernel directory and does not restore it.

    Returns:
        A tuple ``(results, tune_params)`` with the value returned by
        ``kernel_tuner.tune_kernel`` and the search space that was used.
    """
    #change to dir with source files because of includes in pnpoly_host.cu
    os.chdir(get_kernel_path())

    with open('pnpoly_host.cu', 'r') as f:
        host_string = f.read()
    with open('pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]
    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    #column_stack builds the (vertices, 2) array directly; numpy.array(zip(...))
    #does not produce a numeric array on Python 3
    vertex_xy = numpy.column_stack((vertex_x, vertex_y)).astype(numpy.float32)

    args = [bitmap, points, vertex_xy, size]

    tune_params = OrderedDict()
    #tune_params["block_size_x"] = [2**i for i in range(6,10)]   #powers of two
    tune_params["block_size_x"] = [32*i for i in range(1, 32)]  #multiple of 32
    tune_params["tile_size"] = [2**i for i in range(6)]
    tune_params["f_unroll"] = [i for i in range(1, 20) if vertices % i == 0]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
                                     problem_size, [bitmap, points, size], params,
                                     cmem_args={"d_vertices": vertex_xy})
    #only the bitmap is verified; one entry per kernel argument, None = skip
    result = [result[0]] + [None] * (len(args) - 1)

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly_host", host_string,
                                       problem_size, args, tune_params,
                                       grid_div_x=grid_div_x, answer=result,
                                       lang="C", verbose=True)

    return results, tune_params
def tune_zeromean():
    """Run the three zero-mean tuning steps on ``test.jpg``.

    Reads the kernel source from ``zeromeantotalfilter.cu`` and passes it,
    with the image and its dimensions, to ``tune_vertical``,
    ``tune_horizontal`` and ``tune_transpose`` in that order. Nothing is
    returned.
    """
    source_path = get_kernel_path() + 'zeromeantotalfilter.cu'
    with open(source_path, 'r') as src:
        kernel_string = src.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")
    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])

    for tune_step in (tune_vertical, tune_horizontal, tune_transpose):
        tune_step(kernel_string, image, height, width)
def tune_correlate_full_kernel(kernel_name):
    """Tune a kernel from ``correlate_full.cu`` in its two modes.

    The kernel is run once to obtain the per-hit sums. It is then tuned with
    ``write_sums=1, write_spm=0``, and tuned again with
    ``write_sums=0, write_spm=1`` using index arrays sized to the total of
    the sums.

    Args:
        kernel_name: Name of the kernel in ``correlate_full.cu`` to run and tune.

    Returns:
        A tuple with the ``tune_kernel`` results of the first and the second
        tuning run.
    """
    source_path = get_kernel_path()+'correlate_full.cu'
    with open(source_path, 'r') as src:
        kernel_string = src.read()

    N = np.int32(1e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    x, y, z, ct = generate_input_data(N, factor=1750.0)

    #setup kernel arguments; the three small arrays are not used in first kernel
    row_idx = np.zeros(10, dtype=np.int32)
    col_idx = np.zeros(10, dtype=np.int32)
    prefix_sums = np.zeros(10, dtype=np.int32)
    sums = np.zeros(N, dtype=np.int32)
    args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct]

    #run the sums kernel once
    params = {"block_size_x": 256, "write_sums": 1}
    answer = run_kernel(kernel_name, kernel_string, problem_size, args, params)
    reference = [None] * len(args)
    reference[3] = answer[3]
    sums = reference[3].astype(np.int32)

    #setup tuning parameters
    tune_params = OrderedDict([
        ("block_size_x", list(range(32, 1025, 32))),  #multiples of 32
        ("write_sums", [1]),
        ("write_spm", [0]),
    ])

    first_result = tune_kernel(kernel_name, kernel_string, problem_size, args,
                               tune_params, verbose=True)

    #tune kernel #2
    total_correlated_hits = sums.sum()
    print("total_correlated_hits", total_correlated_hits)
    print("density", total_correlated_hits/(float(N)*sliding_window_width))

    col_idx = np.zeros(total_correlated_hits, dtype=np.int32)
    row_idx = np.zeros(total_correlated_hits, dtype=np.int32)
    prefix_sums = np.cumsum(sums).astype(np.int32)
    args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct]

    #switch the same search space over to the second mode
    tune_params["write_sums"] = [0]
    tune_params["write_spm"] = [1]

    second_result = tune_kernel(kernel_name, kernel_string, problem_size, args,
                                tune_params, verbose=True)

    return first_result, second_result
def test_fastnoise():
    """Check ``normalized_gradient`` and ``gradient`` against numpy.

    Both kernels come from ``fastnoisefilter.cu`` and are run on
    ``test.jpg``. The references are built with ``np.gradient``; all
    comparisons use an absolute tolerance of 1e-6.
    """
    source_path = get_kernel_path()+'fastnoisefilter.cu'
    with open(source_path, 'r') as src:
        kernel_string = src.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")
    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    problem_size = (width, height)

    output1, output2, output3 = (np.zeros_like(image) for _ in range(3))

    params = OrderedDict([("block_size_x", 32), ("block_size_y", 16)])

    #reference: gradient along each axis, scaled by 1 / (1 + gradient norm)
    grad = np.gradient(image)
    grad_axis0, grad_axis1 = grad[0], grad[1]
    norm = np.sqrt((grad_axis0*grad_axis0) + (grad_axis1*grad_axis1))
    scale = 1.0 / (1.0 + norm)
    dys = grad_axis0 * scale
    dxs = grad_axis1 * scale

    args = [height, width, output1, output2, image]
    answer = run_kernel("normalized_gradient", kernel_string, problem_size, args, params)

    assert np.allclose(answer[2], dxs, atol=1e-6)
    assert np.allclose(answer[3], dys, atol=1e-6)

    #second kernel takes the reference gradients as its input
    args = [height, width, output3, dxs, dys]
    answer = run_kernel("gradient", kernel_string, problem_size, args, params)

    reference = np.gradient(dys, axis=0) + np.gradient(dxs, axis=1)

    assert np.allclose(answer[2], reference, atol=1e-6)
def test_complex_and_flip2():
    """Check the ``toComplexAndFlip2`` kernel against numpy.

    The kernel from ``peaktocorrelationenergy.cu`` is run on
    ``test_small.jpg``, passed in twice. The first output, read as
    interleaved (real, imaginary) pairs, must equal the image with a zero
    imaginary part. The second output must have the reversed (flattened)
    image as its real part and zeros as its imaginary part.
    """
    with open(get_kernel_path() + 'peaktocorrelationenergy.cu', 'r') as f:
        kernel_string = f.read()

    image = imread(get_testdata_path() + "test_small.jpg", mode="F")
    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    problem_size = (width, height)

    output = np.zeros((height, width, 2), dtype=np.float32)

    args = [height, width, output, output, image, image]

    params = OrderedDict()
    params["block_size_x"] = 32
    params["block_size_y"] = 16

    answer = run_kernel("toComplexAndFlip2", kernel_string, problem_size, args,
                        params, grid_div_y=["block_size_y"],
                        grid_div_x=["block_size_x"])

    #both the real and the imaginary part must come from the kernel result,
    #not from the zero-initialized input buffer
    output1 = answer[2].reshape(height, width, 2)
    output1 = output1[:, :, 0] + 1j * output1[:, :, 1]
    reference1 = image + 1j * np.zeros((height, width), dtype=np.float32)
    assert np.allclose(output1, reference1, atol=1e-6)

    reference2 = image.flatten()[::-1].reshape(height, width)
    output2 = answer[3].reshape(height, width, 2)
    assert np.allclose(output2[:, :, 0], reference2, atol=1e-6)
    assert np.allclose(output2[:, :, 1],
                       np.zeros((height, width), dtype=np.float32), atol=1e-6)
def tune_prefix_sum_kernel():
    """Tune the ``prefix_sum_block`` kernel from ``prefixsum.cu``.

    Only the first kernel is tuned: the one that computes the thread
    block-wide prefix sums and outputs the block carry values. The input is
    an array of ones.

    Returns:
        The value returned by ``tune_kernel``.
    """
    source_path = get_kernel_path()+'prefixsum.cu'
    with open(source_path, 'r') as src:
        kernel_string = src.read()

    N = np.int32(4.5e6)
    problem_size = (N, 1)

    #setup tuning parameters
    block_sizes = list(range(32, 1025, 32))
    tune_params = OrderedDict([("block_size_x", block_sizes)])

    #block carry buffer is sized using the largest block size in the search space
    largest_block = float(max(tune_params["block_size_x"]))
    max_blocks = np.ceil(N/largest_block).astype(np.int32)

    #setup kernel arguments
    x = np.ones(N).astype(np.int32)
    prefix_sums = np.zeros(N, dtype=np.int32)
    block_carry = np.zeros(max_blocks, dtype=np.int32)
    args = [prefix_sums, block_carry, x, N]

    return tune_kernel("prefix_sum_block", kernel_string, problem_size, args,
                       tune_params, verbose=True)
def test_variance_zero_mean():
    """Check ``computeVarianceZeroMean`` against a numpy sum of squares.

    The kernel from ``wienerfilter.cu`` writes one value per block; their sum
    must match ``np.sum(image * image)`` for ``test.jpg`` according to
    ``np.isclose`` with ``atol=1e-6``.
    """
    source_path = get_kernel_path() + 'wienerfilter.cu'
    with open(source_path, 'r') as src:
        kernel_string = src.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")
    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    size = np.int32(height * width)

    params = OrderedDict([("block_size_x", 512), ("num_blocks", 64)])
    num_blocks = params["num_blocks"]

    #one output slot per thread block
    output = np.zeros(num_blocks, dtype=np.float32)
    args = [size, output, image]

    #the grid x-dimension is taken from the num_blocks parameter
    problem_size = ("num_blocks", 1)
    answer = run_kernel("computeVarianceZeroMean", kernel_string, problem_size,
                        args, params, grid_div_x=[])

    per_block = answer[1]
    ans = np.sum(per_block)
    print("answer:")
    print(ans, per_block)

    reference = np.sum(image * image)
    print("reference:")
    print(reference)

    assert np.isclose(ans, reference, atol=1e-6)
def tune_pce():
    """Tune the peak-to-correlation-energy steps on two test images.

    Both images are loaded and passed through ``fastnoise``. The helper
    functions ``tune_complex_and_flip``, ``tune_crosscorr``,
    ``tune_find_peak`` and ``tune_energy`` are then called in that order,
    each receiving the output of the previous one. The resulting score,
    ``val[0]**2 / energy``, is printed. Nothing is returned.
    """
    source_path = get_kernel_path()+'peaktocorrelationenergy.cu'
    with open(source_path, 'r') as src:
        kernel_string = src.read()

    testdata_dir = get_testdata_path()
    image = fastnoise(imread(testdata_dir + "Pentax_OptioA40_0_30731.JPG", mode="F"))
    image2 = fastnoise(imread(get_testdata_path() + "Pentax_OptioA40_0_30757.JPG", mode="F"))

    #dimensions are taken from the first image
    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])

    image_freq, image2_freq = tune_complex_and_flip(kernel_string, height, width,
                                                    image, image2)
    crosscorr = tune_crosscorr(kernel_string, height, width, image_freq, image2_freq)
    loc, val = tune_find_peak(kernel_string, height, width, crosscorr)
    energy = tune_energy(kernel_string, height, width, crosscorr, loc)

    peak = val[0]
    pce_score = (peak * peak) / energy
    print("Finished tuning PCE, pce_score=", pce_score)
def tune_pnpoly():
    """Tune the ``cn_pnpoly_host`` host code from ``pnpoly_host.cu``.

    A reference bitmap is computed first with the ``cn_pnpoly_naive`` kernel
    from ``pnpoly.cu`` and passed to the tuner as the expected answer.

    Note that this function changes the process working directory to the
    kernel directory and does not restore it.

    Returns:
        A tuple ``(results, tune_params)`` with the value returned by
        ``kernel_tuner.tune_kernel`` and the search space that was used.
    """
    #change to dir with source files because of includes in pnpoly_host.cu
    os.chdir(get_kernel_path())

    with open('pnpoly_host.cu', 'r') as f:
        host_string = f.read()
    with open('pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2 * size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices) * 2.0 * numpy.pi)[::-1]
    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    #column_stack builds the (vertices, 2) array directly; numpy.array(zip(...))
    #does not produce a numeric array on Python 3
    vertex_xy = numpy.column_stack((vertex_x, vertex_y)).astype(numpy.float32)

    args = [bitmap, points, vertex_xy, size]

    tune_params = OrderedDict()
    #tune_params["block_size_x"] = [2**i for i in range(6,10)]   #powers of two
    tune_params["block_size_x"] = [32 * i for i in range(1, 32)]  #multiple of 32
    tune_params["tile_size"] = [2**i for i in range(6)]
    tune_params["f_unroll"] = [i for i in range(1, 20) if vertices % i == 0]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
                                     problem_size, [bitmap, points, size], params,
                                     cmem_args={"d_vertices": vertex_xy})
    #only the bitmap is verified; one entry per kernel argument, None = skip
    result = [result[0]] + [None] * (len(args) - 1)

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly_host", host_string,
                                       problem_size, args, tune_params,
                                       grid_div_x=grid_div_x, answer=result,
                                       lang="C", verbose=True)

    return results, tune_params
def tune_pnpoly_kernel():
    """Tune the ``cn_pnpoly`` kernel from ``pnpoly.cu``.

    The polygon vertices and their precomputed edge slopes are passed via
    ``cmem_args``. A reference bitmap is computed first with the
    ``cn_pnpoly_naive`` kernel and passed to the tuner as the expected answer.

    Returns:
        A tuple ``(results, tune_params)`` with the value returned by
        ``kernel_tuner.tune_kernel`` and the search space that was used.
    """
    with open(get_kernel_path() + 'pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2 * size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices) * 2.0 * numpy.pi)[::-1]
    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    #column_stack builds the (vertices, 2) array directly; numpy.array(zip(...))
    #does not produce a numeric array on Python 3
    vertex_xy = numpy.column_stack((vertex_x, vertex_y)).astype(numpy.float32)

    args = [bitmap, points, size]

    # (vk.x-vj.x) / (vk.y-vj.y), where vk is the vertex preceding vj;
    # numpy.roll(a, 1)[i] == a[i-1], so index 0 wraps around to the last vertex
    delta_x = numpy.roll(vertex_x, 1) - vertex_x
    delta_y = numpy.roll(vertex_y, 1) - vertex_y
    slopes = (delta_x / delta_y).astype(numpy.float32)

    cmem_args = {'d_vertices': vertex_xy, "d_slopes": slopes}

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(6, 10)]  #powers of two
    #tune_params["block_size_x"] = [32*i for i in range(1,32)]  #multiple of 32
    #tune_params["block_size_x"] = [256]                        #fixed size
    tune_params["tile_size"] = [2**i for i in range(6)]
    #tune_params["f_unroll"] = [i for i in range(1,20) if float(vertices)/i==vertices//i]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
                                     problem_size, args, params,
                                     cmem_args=cmem_args)
    #only the bitmap is verified; None entries are skipped
    result = [result[0], None, None]

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly", kernel_string, problem_size,
                                       args, tune_params, grid_div_x=grid_div_x,
                                       cmem_args=cmem_args, answer=result)

    return results, tune_params
def tune_pnpoly_kernel():
    """Tune the ``cn_pnpoly`` kernel from ``pnpoly.cu``.

    The polygon vertices and their precomputed edge slopes are passed via
    ``cmem_args``. A reference bitmap is computed first with the
    ``cn_pnpoly_naive`` kernel and passed to the tuner as the expected answer.

    Returns:
        A tuple ``(results, tune_params)`` with the value returned by
        ``kernel_tuner.tune_kernel`` and the search space that was used.
    """
    with open(get_kernel_path()+'pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]
    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    #column_stack builds the (vertices, 2) array directly; numpy.array(zip(...))
    #does not produce a numeric array on Python 3
    vertex_xy = numpy.column_stack((vertex_x, vertex_y)).astype(numpy.float32)

    args = [bitmap, points, size]

    # (vk.x-vj.x) / (vk.y-vj.y), where vk is the vertex preceding vj;
    # numpy.roll(a, 1)[i] == a[i-1], so index 0 wraps around to the last vertex
    delta_x = numpy.roll(vertex_x, 1) - vertex_x
    delta_y = numpy.roll(vertex_y, 1) - vertex_y
    slopes = (delta_x / delta_y).astype(numpy.float32)

    cmem_args = {'d_vertices': vertex_xy, "d_slopes": slopes}

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(6, 10)]  #powers of two
    #tune_params["block_size_x"] = [32*i for i in range(1,32)]  #multiple of 32
    #tune_params["block_size_x"] = [256]                        #fixed size
    tune_params["tile_size"] = [2**i for i in range(6)]
    #tune_params["f_unroll"] = [i for i in range(1,20) if float(vertices)/i==vertices//i]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
                                     problem_size, args, params,
                                     cmem_args=cmem_args)
    #only the bitmap is verified; None entries are skipped
    result = [result[0], None, None]

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly", kernel_string, problem_size,
                                       args, tune_params, grid_div_x=grid_div_x,
                                       cmem_args=cmem_args, answer=result)

    return results, tune_params