示例#1
0
def tune_degrees_dense():
    """Tune the ``degrees_dense`` kernel on a dense correlations table.

    The table and the per-hit sums are produced first by a single run of the
    ``quadratic_difference_linear`` kernel on generated input data. After
    that ``block_size_x`` is tuned over the powers of two from 32 to 1024.

    Returns:
        Whatever ``tune_kernel`` returns for the ``degrees_dense`` kernel.
    """
    with open(get_kernel_path() + 'degrees.cu', 'r') as source_file:
        degrees_source = source_file.read()

    num_hits = np.int32(4.5e6)
    window = np.int32(1500)
    grid = (num_hits, 1)

    #generate input data with an expected density of correlated hits
    x, y, z, ct = generate_input_data(num_hits)

    #a single run of the quadratic difference kernel fills the dense table
    qd_args = [
        np.zeros((window, num_hits), 'uint8'),
        np.zeros(num_hits).astype(np.int32),
        num_hits, window, x, y, z, ct,
    ]
    with open(get_kernel_path() + 'quadratic_difference_linear.cu', 'r') as source_file:
        qd_source = source_file.read()
    qd_params = {"block_size_x": 512, "write_sums": 1}
    qd_output = run_kernel("quadratic_difference_linear", qd_source, grid, qd_args, qd_params)

    correlations = qd_output[0]
    #partial sum of the # of correlated hits to hits later in time
    sums = qd_output[1]

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [1 << exponent for exponent in range(5, 11)]
    tune_params["window_width"] = [window]

    return tune_kernel("degrees_dense", degrees_source, grid,
                       [sums, correlations, num_hits], tune_params, verbose=True)
示例#2
0
def tune_dense2sparse():
    """Tune the ``dense2sparse_kernel`` kernel on a generated correlations table.

    The search space covers the thread block size, whether ``use_shared`` is
    enabled, and the unroll factor ``f_unroll``. Unroll factors are limited
    to values that evenly divide the sliding window width.

    Returns:
        Whatever ``tune_kernel`` returns for the ``dense2sparse_kernel`` kernel.
    """
    with open(get_kernel_path()+'dense2sparse.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input
    correlations, sums = generate_large_correlations_table(N, sliding_window_width)

    #setup all kernel inputs
    prefix_sums = np.cumsum(sums).astype(np.int32)
    total_correlated_hits = sums.sum()
    #allocate directly as int32 to avoid a temporary float64 array
    row_idx = np.zeros(total_correlated_hits, dtype=np.int32)
    col_idx = np.zeros(total_correlated_hits, dtype=np.int32)

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)] #multiples of 32 up to 1024
    tune_params["window_width"] = [sliding_window_width]
    tune_params["use_shared"] = [0, 1]
    #unroll factors 1..4 that evenly divide the window width; derived from
    #sliding_window_width so the two cannot drift apart
    window = int(sliding_window_width)
    tune_params["f_unroll"] = [i for i in range(1,5) if window % i == 0]

    #call the tuner
    args = [row_idx, col_idx, prefix_sums, correlations, N]
    return tune_kernel("dense2sparse_kernel", kernel_string, problem_size, args, tune_params, verbose=True)
示例#3
0
def tune_quadratic_difference_kernel():
    """Tune the ``quadratic_difference_linear`` kernel on generated input data.

    The search space covers the thread block size, the unroll factor
    ``f_unroll`` (only values that evenly divide the sliding window width),
    and ``tile_size_x``. ``write_sums`` is fixed to 1.

    Returns:
        Whatever ``tune_kernel`` returns for the kernel.
    """
    with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    x,y,z,ct = generate_input_data(N)

    #setup kernel arguments
    correlations = np.zeros((sliding_window_width, N), 'uint8')
    sums = np.zeros(N, dtype=np.int32)
    args = [correlations, sums, N, sliding_window_width, x, y, z, ct]

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)] #multiples of 32
    #divisors of the window width below 20; derived from sliding_window_width
    #instead of repeating the literal so the two cannot drift apart
    window = int(sliding_window_width)
    tune_params["f_unroll"] = [i for i in range(1,20) if window % i == 0]
    tune_params["tile_size_x"] = [2**i for i in range(5)] #powers of 2
    tune_params["write_sums"] = [1]

    return tune_kernel("quadratic_difference_linear", kernel_string, problem_size, args, tune_params, verbose=True)
示例#4
0
def test_wiener():
    """Check ``computeVarianceEstimates`` against its naive counterpart.

    Both kernels are run on the test image with the same arguments and
    parameters; their third argument (the output buffer) must agree within
    an absolute tolerance of 1e-6.
    """
    with open(get_kernel_path() + 'wienerfilter.cu', 'r') as f:
        kernel_string = f.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")

    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    problem_size = (width, height)

    #image.shape is (height, width); give the output buffer the same layout
    #rather than the (x, y)-ordered problem_size. The element count is unchanged.
    output = np.zeros((height, width), dtype=np.float32)

    args = [height, width, output, image]

    params = OrderedDict()
    params["block_size_x"] = 32
    params["block_size_y"] = 8
    params["reuse_computation"] = 1

    answer = run_kernel("computeVarianceEstimates",
                        kernel_string,
                        problem_size,
                        args,
                        params,
                        grid_div_y=["block_size_y"])

    reference = run_kernel("computeVarianceEstimates_naive",
                           kernel_string,
                           problem_size,
                           args,
                           params,
                           grid_div_y=["block_size_y"])

    #index 2 is the position of the output buffer in args
    assert np.allclose(answer[2], reference[2], atol=1e-6)
示例#5
0
def tune_wiener():
    """Tune the ``computeVarianceEstimates`` kernel on the test image.

    The search space covers ``block_size_x`` (multiples of 32 up to 1024),
    ``block_size_y`` (powers of two up to 32) and ``reuse_computation``.
    """
    with open(get_kernel_path() + 'wienerfilter.cu', 'r') as f:
        kernel_string = f.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")

    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    problem_size = (width, height)

    #image.shape is (height, width); give the output buffer the same layout
    #rather than the (x, y)-ordered problem_size. The element count is unchanged.
    output = np.zeros((height, width), dtype=np.float32)

    args = [height, width, output, image]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32 * i for i in range(1, 33)]
    tune_params["block_size_y"] = [2**i for i in range(6)]

    #first the naive kernel
    #tune_kernel("computeVarianceEstimates_naive", kernel_string, problem_size, args, tune_params, grid_div_y=["block_size_y"])

    #more sophisticated kernel
    tune_params["reuse_computation"] = [0, 1]
    tune_kernel("computeVarianceEstimates",
                kernel_string,
                problem_size,
                args,
                tune_params,
                grid_div_y=["block_size_y"])
示例#6
0
def tune_fastnoise():
    """Tune the thread block dimensions of the fastnoise filter kernels.

    Each kernel listed below is tuned in turn over the same ``block_size_x``
    / ``block_size_y`` search space, with the test image as input.
    """
    with open(get_kernel_path() + 'fastnoisefilter.cu', 'r') as f:
        kernel_string = f.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")

    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    problem_size = (width, height)

    #image.shape is (height, width); give the output buffer the same layout
    #rather than the (x, y)-ordered problem_size. The element count is unchanged.
    output = np.zeros((height, width), dtype=np.float32)

    #NOTE(review): all five kernels are tuned with this single argument list;
    #confirm it matches each kernel's signature in fastnoisefilter.cu
    args = [height, width, output, image]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32 * i for i in range(1, 33)]
    tune_params["block_size_y"] = [2**i for i in range(6)]

    kernels = [
        "normalized_gradient", "gradient", "convolveHorizontally",
        "convolveVertically", "normalize"
    ]
    for k in kernels:
        tune_kernel(k, kernel_string, problem_size, args, tune_params)
示例#7
0
def tune_minimum_degree():
    """Tune the ``minimum_degree`` kernel on a generated sparse matrix.

    A dense correlations table is generated and converted to a sparse
    representation. ``block_size_x`` is then tuned over the powers of two
    from 32 to 1024, with ``threshold`` fixed to 3.

    Returns:
        Whatever ``tune_kernel`` returns for the ``minimum_degree`` kernel.
    """
    with open(get_kernel_path()+'minimum_degree.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #tune params here
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5,11)]
    tune_params["threshold"] = [3]

    #The number of thread blocks is ceil(N / block_size_x), so it is largest
    #for the SMALLEST block size. Size the per-block buffers for that case so
    #that every configuration in the search space fits.
    #NOTE(review): assumes the kernel writes one entry per thread block -- confirm
    max_blocks = int(np.ceil(N / float(min(tune_params["block_size_x"]))))

    #generate input data with an expected density of correlated hits
    correlations, sums = generate_large_correlations_table(N, sliding_window_width)
    row_idx, col_idx, prefix_sums = create_sparse_matrix(correlations, sums)

    #setup all kernel inputs
    minimum = np.zeros(max_blocks, dtype=np.int32)
    num_nodes = np.zeros(max_blocks, dtype=np.int32)

    #call the CUDA kernel
    args = [minimum, num_nodes, sums, row_idx, col_idx, prefix_sums, N]
    return tune_kernel("minimum_degree", kernel_string, problem_size, args, tune_params, verbose=True)
示例#8
0
def tune_variance_zero_mean():
    """Tune the ``computeVarianceZeroMean`` kernel on the test image.

    Both the thread block size and the number of thread blocks are tuned
    over the powers of two from 32 to 1024. The grid size is taken directly
    from the ``num_blocks`` parameter (no grid divisors).
    """
    with open(get_kernel_path() + 'wienerfilter.cu', 'r') as source_file:
        source = source_file.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")

    rows = np.int32(image.shape[0])
    cols = np.int32(image.shape[1])
    size = np.int32(rows * cols)

    tune_params = OrderedDict()
    for name in ("block_size_x", "num_blocks"):
        tune_params[name] = [2**exponent for exponent in range(5, 11)]

    #the output buffer is sized for the largest block count in the search space
    output = np.zeros(max(tune_params["num_blocks"]), dtype=np.float32)

    tune_kernel("computeVarianceZeroMean",
                source,
                ("num_blocks", 1),
                [size, output, image],
                tune_params,
                grid_div_x=[],
                verbose=True)
示例#9
0
def test_find_peak():
    """Check ``findPeak`` followed by ``maxlocFloats`` against NumPy.

    Random cross-correlation data is generated with the dimensions of the
    small test image. The location and absolute value of the peak in the
    first channel are computed with NumPy and compared to the result of the
    two-stage GPU reduction.
    """
    with open(get_kernel_path() + 'peaktocorrelationenergy.cu', 'r') as source_file:
        source = source_file.read()

    image = imread(get_testdata_path() + "test_small.jpg", mode="F")
    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])

    #generate some bogus crosscorr data
    crosscorr = np.random.randn(height, width, 2).astype(np.float32)

    #compute reference in Python
    first_channel = crosscorr[:, :, 0]
    peak_index = np.argmax(np.absolute(first_channel))
    peak_value = np.absolute(first_channel.flatten()[peak_index])

    params = {"block_size_x": 512, "num_blocks": 64}
    num_blocks = np.int32(params["num_blocks"])

    #stage one: every thread block reports its own candidate peak
    stage1_args = [
        height,
        width,
        np.zeros((1), dtype=np.float32),
        np.zeros((num_blocks), dtype=np.float32),
        np.zeros((num_blocks), dtype=np.int32),
        crosscorr,
    ]
    stage1 = run_kernel("findPeak",
                        source,
                        ("num_blocks", 1),
                        stage1_args,
                        params,
                        grid_div_x=[])
    peakvals, peakindx = stage1[3], stage1[4]

    #stage two: reduce the per-block candidates to a single location and value
    stage2_args = [
        np.zeros((1), dtype=np.int32),
        np.zeros((1), dtype=np.float32),
        peakindx,
        peakvals,
        num_blocks,
    ]
    stage2 = run_kernel("maxlocFloats",
                        source, (1, 1),
                        stage2_args,
                        params,
                        grid_div_x=[])

    loc = stage2[0][0]
    val = stage2[1][0]

    print("answer")
    print("loc=", loc, "val=", val)

    print("reference")
    print("loc=", peak_index, "val=", peak_value)

    assert loc == peak_index
    assert np.isclose(val, peak_value, atol=1e-6)
def tune_pnpoly():
    """Tune the host-code variant of the point-in-polygon kernel.

    The polygon has 600 vertices on the unit circle and the input consists
    of 2*size normally distributed float32 values. A reference bitmap is
    computed first with the naive kernel and handed to the tuner as the
    expected answer.

    Side effect: changes the working directory to the kernel directory.

    Returns:
        A ``(results, tune_params)`` tuple holding the value returned by
        ``kernel_tuner.tune_kernel`` and the search space that was used.
    """
    #change to dir with source files because of includes in pnpoly_host.cu
    os.chdir(get_kernel_path())

    with open('pnpoly_host.cu', 'r') as f:
        host_string = f.read()
    with open('pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size, dtype=numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]

    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    #column_stack builds the (vertices, 2) array directly; numpy.array(zip(...))
    #only works on Python 2, where zip returns a list
    vertex_xy = numpy.column_stack((vertex_x, vertex_y)).astype(numpy.float32)

    args = [bitmap, points, vertex_xy, size]

    tune_params = OrderedDict()

    #tune_params["block_size_x"] = [2**i for i in range(6,10)]   #powers of two
    tune_params["block_size_x"] = [32*i for i in range(1,32)]  #multiple of 32

    tune_params["tile_size"] = [2**i for i in range(6)]
    #only unroll factors that evenly divide the number of vertices
    tune_params["f_unroll"] = [i for i in range(1,20) if vertices % i == 0]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    reference = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
        problem_size, [bitmap, points, size], params, cmem_args={"d_vertices": vertex_xy})
    #the answer list needs one entry per kernel argument; only the bitmap
    #(first argument) is verified, None marks the input-only arguments
    answer = [reference[0]] + [None] * (len(args) - 1)

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly_host", host_string,
        problem_size, args, tune_params,
        grid_div_x=grid_div_x, answer=answer, lang="C", verbose=True)

    return results, tune_params
示例#11
0
def tune_zeromean():
    """Tune the vertical, horizontal and transpose zero-mean steps in turn.

    The kernel source and the test image are loaded once and shared by all
    three tuning steps.
    """
    with open(get_kernel_path() + 'zeromeantotalfilter.cu', 'r') as source_file:
        source = source_file.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")
    rows = np.int32(image.shape[0])
    cols = np.int32(image.shape[1])

    for tune_step in (tune_vertical, tune_horizontal, tune_transpose):
        tune_step(source, image, rows, cols)
示例#12
0
def tune_correlate_full_kernel(kernel_name):
    """Tune both modes of a kernel from ``correlate_full.cu``.

    The kernel is first run once with ``write_sums`` enabled to obtain the
    per-hit sums. It is then tuned twice over the thread block size: once
    writing sums only, and once writing the sparse matrix into index arrays
    sized from those sums.

    Args:
        kernel_name: Name of the kernel inside ``correlate_full.cu`` to tune.

    Returns:
        A ``(kernel_1, kernel_2)`` tuple with the ``tune_kernel`` results of
        the sums pass and the sparse-matrix pass respectively.
    """
    with open(get_kernel_path() + 'correlate_full.cu', 'r') as source_file:
        source = source_file.read()

    N = np.int32(1e6)
    window = np.int32(1500)
    grid = (N, 1)

    #generate input data with an expected density of correlated hits
    x, y, z, ct = generate_input_data(N, factor=1750.0)

    #the three index arrays are placeholders, not used in the first kernel
    placeholders = [np.zeros(10).astype(np.int32) for _ in range(3)]
    zero_sums = np.zeros(N).astype(np.int32)
    sums_args = placeholders + [zero_sums, N, window, x, y, z, ct]

    #run the sums kernel once; the sums are at position 3 of the argument list
    first_run = run_kernel(kernel_name, source, grid, sums_args,
                           {"block_size_x": 256, "write_sums": 1})
    sums = first_run[3].astype(np.int32)

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32 * k for k in range(1, 33)]  #multiples of 32
    tune_params["write_sums"] = [1]
    tune_params["write_spm"] = [0]

    kernel_1 = tune_kernel(kernel_name, source, grid, sums_args, tune_params, verbose=True)

    #tune kernel #2
    total_correlated_hits = sums.sum()
    print("total_correlated_hits", total_correlated_hits)
    print("density", total_correlated_hits/(float(N)*window))

    col_idx = np.zeros(total_correlated_hits).astype(np.int32)
    row_idx = np.zeros(total_correlated_hits).astype(np.int32)
    prefix_sums = np.cumsum(sums).astype(np.int32)
    spm_args = [row_idx, col_idx, prefix_sums, sums, N, window, x, y, z, ct]

    #same search space, but now writing the sparse matrix instead of the sums
    tune_params["write_sums"] = [0]
    tune_params["write_spm"] = [1]

    kernel_2 = tune_kernel(kernel_name, source, grid, spm_args, tune_params, verbose=True)

    return kernel_1, kernel_2
示例#13
0
def test_fastnoise():
    """Check the ``normalized_gradient`` and ``gradient`` kernels against NumPy.

    The reference is built from ``np.gradient`` of the test image, scaled by
    ``1 / (1 + gradient magnitude)``. The second kernel is then checked
    against the sum of the gradients of those scaled components.
    """
    with open(get_kernel_path() + 'fastnoisefilter.cu', 'r') as source_file:
        source = source_file.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")

    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    grid = (width, height)

    params = OrderedDict()
    params["block_size_x"] = 32
    params["block_size_y"] = 16

    #reference: image gradient normalized by (1 + its magnitude)
    grads = np.gradient(image)
    grad_axis0 = grads[0]
    grad_axis1 = grads[1]
    magnitude = np.sqrt((grad_axis0 * grad_axis0) + (grad_axis1 * grad_axis1))
    scale = 1.0 / (1.0 + magnitude)
    dys = grad_axis0 * scale
    dxs = grad_axis1 * scale

    first_args = [height, width, np.zeros_like(image), np.zeros_like(image), image]
    answer = run_kernel("normalized_gradient", source, grid, first_args, params)

    assert np.allclose(answer[2], dxs, atol=1e-6)
    assert np.allclose(answer[3], dys, atol=1e-6)

    second_args = [height, width, np.zeros_like(image), dxs, dys]
    answer = run_kernel("gradient", source, grid, second_args, params)

    reference = np.gradient(dys, axis=0) + np.gradient(dxs, axis=1)

    assert np.allclose(answer[2], reference, atol=1e-6)
示例#14
0
def test_complex_and_flip2():
    """Check both outputs of the ``toComplexAndFlip2`` kernel.

    The first output must equal the input image as complex numbers with a
    zero imaginary part. The second output must equal the image with its
    element order reversed, again with a zero imaginary part.
    """
    with open(get_kernel_path() + 'peaktocorrelationenergy.cu', 'r') as f:
        kernel_string = f.read()

    image = imread(get_testdata_path() + "test_small.jpg", mode="F")

    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    problem_size = (width, height)

    output = np.zeros((height, width, 2), dtype=np.float32)

    args = [height, width, output, output, image, image]

    params = OrderedDict()
    params["block_size_x"] = 32
    params["block_size_y"] = 16

    answer = run_kernel("toComplexAndFlip2",
                        kernel_string,
                        problem_size,
                        args,
                        params,
                        grid_div_y=["block_size_y"],
                        grid_div_x=["block_size_x"])

    #Build the complex result from the kernel's OWN output for both parts.
    #Using the zero-filled input buffer for the imaginary part would make
    #that half of the check pass unconditionally.
    output1 = answer[2].reshape(height, width, 2)
    output1 = output1[:, :, 0] + 1j * output1[:, :, 1]
    reference1 = image + 1j * np.zeros((height, width), dtype=np.float32)
    assert np.allclose(output1, reference1, atol=1e-6)

    #second output: the image with its flattened element order reversed
    reference2 = image.flatten()[::-1].reshape(height, width)
    output2 = answer[3].reshape(height, width, 2)
    assert np.allclose(output2[:, :, 0], reference2, atol=1e-6)
    assert np.allclose(output2[:, :, 1],
                       np.zeros((height, width), dtype=np.float32),
                       atol=1e-6)
示例#15
0
def tune_prefix_sum_kernel():
    """Tune the ``prefix_sum_block`` kernel on an array of ones.

    Only the first kernel, which computes the thread block-wide prefix sums
    and outputs the block carry values, is tuned. ``block_size_x`` ranges
    over the multiples of 32 up to 1024.

    Returns:
        Whatever ``tune_kernel`` returns for the ``prefix_sum_block`` kernel.
    """
    with open(get_kernel_path()+'prefixsum.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    problem_size = (N, 1)

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)]

    #The number of thread blocks is ceil(N / block_size_x), so it is largest
    #for the SMALLEST block size. Size the block carry buffer for that case
    #so that every configuration in the search space fits.
    #NOTE(review): assumes one carry value is written per thread block -- confirm
    max_blocks = int(np.ceil(N / float(min(tune_params["block_size_x"]))))
    x = np.ones(N, dtype=np.int32)

    #setup kernel arguments
    prefix_sums = np.zeros(N, dtype=np.int32)
    block_carry = np.zeros(max_blocks, dtype=np.int32)
    args = [prefix_sums, block_carry, x, N]

    #tune only the first kernel that computes the thread block-wide prefix sums
    #and outputs the block carry values
    return tune_kernel("prefix_sum_block", kernel_string, problem_size, args, tune_params, verbose=True)
示例#16
0
def test_variance_zero_mean():
    """Check ``computeVarianceZeroMean`` against a NumPy sum of squares.

    The kernel runs with 64 thread blocks of 512 threads. The sum of its
    output buffer must match ``np.sum(image * image)`` for the test image.
    """
    with open(get_kernel_path() + 'wienerfilter.cu', 'r') as source_file:
        source = source_file.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")

    rows = np.int32(image.shape[0])
    cols = np.int32(image.shape[1])
    size = np.int32(rows * cols)

    params = OrderedDict([("block_size_x", 512), ("num_blocks", 64)])

    #one output slot per thread block; the grid size comes from "num_blocks"
    partial_sums = np.zeros(params["num_blocks"], dtype=np.float32)

    answer = run_kernel("computeVarianceZeroMean",
                        source,
                        ("num_blocks", 1),
                        [size, partial_sums, image],
                        params,
                        grid_div_x=[])

    print("answer:")
    ans = np.sum(answer[1])
    print(ans, answer[1])
    print("reference:")
    reference = np.sum(image * image)
    print(reference)

    assert np.isclose(ans, reference, atol=1e-6)
示例#17
0
def tune_pce():
    """Tune every stage of the peak-to-correlation-energy pipeline.

    Two test images are loaded and passed through ``fastnoise``. The stages
    (complex-and-flip, cross correlation, peak finding, energy) are then
    tuned one after another, each consuming the previous stage's output.
    The resulting PCE score is printed at the end.
    """
    with open(get_kernel_path() + 'peaktocorrelationenergy.cu', 'r') as source_file:
        source = source_file.read()

    first = fastnoise(
        imread(get_testdata_path() + "Pentax_OptioA40_0_30731.JPG", mode="F"))
    second = fastnoise(
        imread(get_testdata_path() + "Pentax_OptioA40_0_30757.JPG", mode="F"))

    #dimensions are taken from the first image
    height = np.int32(first.shape[0])
    width = np.int32(first.shape[1])

    first_freq, second_freq = tune_complex_and_flip(source, height, width, first, second)
    crosscorr = tune_crosscorr(source, height, width, first_freq, second_freq)
    loc, val = tune_find_peak(source, height, width, crosscorr)
    energy = tune_energy(source, height, width, crosscorr, loc)

    peak = val[0]
    pce_score = (peak * peak) / energy
    print("Finished tuning PCE, pce_score=", pce_score)
示例#18
0
def tune_pnpoly():
    """Tune the host-code variant of the point-in-polygon kernel.

    The polygon has 600 vertices on the unit circle and the input consists
    of 2*size normally distributed float32 values. A reference bitmap is
    computed first with the naive kernel and handed to the tuner as the
    expected answer.

    Side effect: changes the working directory to the kernel directory.

    Returns:
        A ``(results, tune_params)`` tuple holding the value returned by
        ``kernel_tuner.tune_kernel`` and the search space that was used.
    """
    #change to dir with source files because of includes in pnpoly_host.cu
    os.chdir(get_kernel_path())

    with open('pnpoly_host.cu', 'r') as f:
        host_string = f.read()
    with open('pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2 * size).astype(numpy.float32)
    bitmap = numpy.zeros(size, dtype=numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices) * 2.0 *
                              numpy.pi)[::-1]

    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    #column_stack builds the (vertices, 2) array directly; numpy.array(zip(...))
    #only works on Python 2, where zip returns a list
    vertex_xy = numpy.column_stack((vertex_x, vertex_y)).astype(numpy.float32)

    args = [bitmap, points, vertex_xy, size]

    tune_params = OrderedDict()

    #tune_params["block_size_x"] = [2**i for i in range(6,10)]   #powers of two
    tune_params["block_size_x"] = [32 * i
                                   for i in range(1, 32)]  #multiple of 32

    tune_params["tile_size"] = [2**i for i in range(6)]
    #only unroll factors that evenly divide the number of vertices
    tune_params["f_unroll"] = [i for i in range(1, 20) if vertices % i == 0]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    reference = kernel_tuner.run_kernel("cn_pnpoly_naive",
                                        kernel_string,
                                        problem_size, [bitmap, points, size],
                                        params,
                                        cmem_args={"d_vertices": vertex_xy})
    #the answer list needs one entry per kernel argument; only the bitmap
    #(first argument) is verified, None marks the input-only arguments
    answer = [reference[0]] + [None] * (len(args) - 1)

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly_host",
                                       host_string,
                                       problem_size,
                                       args,
                                       tune_params,
                                       grid_div_x=grid_div_x,
                                       answer=answer,
                                       lang="C",
                                       verbose=True)

    return results, tune_params
def tune_pnpoly_kernel():
    """Tune the ``cn_pnpoly`` point-in-polygon kernel.

    The polygon has 600 vertices on the unit circle; the vertices and the
    precomputed edge slopes are passed through ``cmem_args``. A reference
    bitmap is computed first with the naive kernel and handed to the tuner
    as the expected answer.

    Returns:
        A ``(results, tune_params)`` tuple holding the value returned by
        ``kernel_tuner.tune_kernel`` and the search space that was used.
    """
    with open(get_kernel_path() + 'pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2 * size).astype(numpy.float32)
    bitmap = numpy.zeros(size, dtype=numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices) * 2.0 *
                              numpy.pi)[::-1]

    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    #column_stack builds the (vertices, 2) array directly; numpy.array(zip(...))
    #only works on Python 2, where zip returns a list
    vertex_xy = numpy.column_stack((vertex_x, vertex_y)).astype(numpy.float32)

    args = [bitmap, points, size]

    # (vk.x-vj.x) / (vk.y-vj.y), where vk is the vertex preceding vj;
    # numpy.roll pairs vertex 0 with the last vertex, closing the polygon
    prev_x = numpy.roll(vertex_x, 1)
    prev_y = numpy.roll(vertex_y, 1)
    slopes = ((prev_x - vertex_x) / (prev_y - vertex_y)).astype(numpy.float32)

    cmem_args = {'d_vertices': vertex_xy, "d_slopes": slopes}

    tune_params = OrderedDict()

    tune_params["block_size_x"] = [2**i for i in range(6, 10)]  #powers of two
    #tune_params["block_size_x"] = [32*i for i in range(1,32)]  #multiple of 32
    #tune_params["block_size_x"] = [256]                        #fixed size

    tune_params["tile_size"] = [2**i for i in range(6)]
    #tune_params["f_unroll"] = [i for i in range(1,20) if float(vertices)/i==vertices//i]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    reference = kernel_tuner.run_kernel("cn_pnpoly_naive",
                                        kernel_string,
                                        problem_size,
                                        args,
                                        params,
                                        cmem_args=cmem_args)
    #one answer entry per kernel argument; only the bitmap (first argument)
    #is verified, None marks the input-only arguments
    answer = [reference[0]] + [None] * (len(args) - 1)

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly",
                                       kernel_string,
                                       problem_size,
                                       args,
                                       tune_params,
                                       grid_div_x=grid_div_x,
                                       cmem_args=cmem_args,
                                       answer=answer)

    return results, tune_params
def tune_pnpoly_kernel():
    """Tune the ``cn_pnpoly`` point-in-polygon kernel.

    The polygon has 600 vertices on the unit circle; the vertices and the
    precomputed edge slopes are passed through ``cmem_args``. A reference
    bitmap is computed first with the naive kernel and handed to the tuner
    as the expected answer.

    Returns:
        A ``(results, tune_params)`` tuple holding the value returned by
        ``kernel_tuner.tune_kernel`` and the search space that was used.
    """
    with open(get_kernel_path()+'pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size, dtype=numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]

    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    #column_stack builds the (vertices, 2) array directly; numpy.array(zip(...))
    #only works on Python 2, where zip returns a list
    vertex_xy = numpy.column_stack((vertex_x, vertex_y)).astype(numpy.float32)

    args = [bitmap, points, size]

    # (vk.x-vj.x) / (vk.y-vj.y), where vk is the vertex preceding vj;
    # numpy.roll pairs vertex 0 with the last vertex, closing the polygon
    prev_x = numpy.roll(vertex_x, 1)
    prev_y = numpy.roll(vertex_y, 1)
    slopes = ((prev_x - vertex_x) / (prev_y - vertex_y)).astype(numpy.float32)

    cmem_args= {'d_vertices': vertex_xy, "d_slopes": slopes }

    tune_params = OrderedDict()

    tune_params["block_size_x"] = [2**i for i in range(6,10)]   #powers of two
    #tune_params["block_size_x"] = [32*i for i in range(1,32)]  #multiple of 32
    #tune_params["block_size_x"] = [256]                        #fixed size

    tune_params["tile_size"] = [2**i for i in range(6)]
    #tune_params["f_unroll"] = [i for i in range(1,20) if float(vertices)/i==vertices//i]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    reference = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
        problem_size, args, params, cmem_args=cmem_args)
    #one answer entry per kernel argument; only the bitmap (first argument)
    #is verified, None marks the input-only arguments
    answer = [reference[0]] + [None] * (len(args) - 1)

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly", kernel_string,
        problem_size, args, tune_params,
        grid_div_x=grid_div_x, cmem_args=cmem_args, answer=answer)

    return results, tune_params