Python stop_profiler示例，pycuda.driver.stop_profiler Python示例

示例#1

0

显示文件

文件： inference.py 项目： landersson/misc

def benchmark(datasrc, model):
    start = time.time()
    label, data = datasrc.get_item()
    print("Data load time: %.2fms" % ((time.time() - start) * 1000.0))

    start = time.time()
    data = np.ascontiguousarray(np.expand_dims(np.rollaxis(data,2), 0)).astype(model.dtype)
    data = model.normalize(data)
    print("Data prep time: %.2fms" % ((time.time() - start) * 1000.0))

    input_tensor = GPUTensor(data)
    # warmup...
    for i in range(1):
         model.evaluate(input_tensor)
    start = time.time()
    num_iterations = 100
    print("Timing %d iterations..." % num_iterations)
    for i in range(num_iterations):
        if i == num_iterations - 1:
            drv.start_profiler()
        y = model.evaluate(input_tensor)
        print(y)
    drv.stop_profiler()

    et = (time.time() - start) * 1000 / num_iterations
    print("Model eval time: %.2fms = %.1ffps" % (et, 1000.0 / et))

示例#2

0

显示文件

def benchmark(datasrc, model):
    start = time.time()
    label, data = datasrc.get_item()
    print("Data load time: %.2fms" % ((time.time() - start) * 1000.0))

    start = time.time()
    data = np.ascontiguousarray(np.expand_dims(np.rollaxis(data, 2),
                                               0)).astype(model.dtype)
    data = model.normalize(data)
    print("Data prep time: %.2fms" % ((time.time() - start) * 1000.0))

    input_tensor = GPUTensor(data)
    # warmup...
    for i in range(1):
        model.evaluate(input_tensor)
    start = time.time()
    num_iterations = 100
    print("Timing %d iterations..." % num_iterations)
    for i in range(num_iterations):
        if i == num_iterations - 1:
            drv.start_profiler()
        y = model.evaluate(input_tensor)
        print(y)
    drv.stop_profiler()

    et = (time.time() - start) * 1000 / num_iterations
    print("Model eval time: %.2fms = %.1ffps" % (et, 1000.0 / et))

示例#3

0

显示文件

文件： pycuda_3dlut.py 项目： toru-ver4/sip

def check_time_3dlut(grid_num=17, img_file='../Matrix/figure/src_img.png'):
    # profiler関連の設定
    # ----------------------------------------
    config_file = "./data/config.txt"
    log_file = "./data/profile_out.csv"
    output_mode = cuda.profiler_output_mode.KEY_VALUE_PAIR
    cuda.initialize_profiler(config_file, log_file, output_mode)

    # 3DLUTデータの作成
    # ----------------------------------------
    matrix_param = np.array([[0.2126, 0.7152, 0.0722],
                             [-0.114572, -0.385428, 0.5],
                             [0.5, -0.454153, -0.045847]])
    kwargs = {'mtx': matrix_param}
    lut = make_3dlut_data(grid_num=17, func=rgb2yuv_for_3dlut, **kwargs)

    # 3DLUTの適用
    # ----------------------------------------
    img = img_open_and_normalize(img_file)
    img_x86 = exec_3dlut_on_x86(img=img, lut=lut)
    cuda.start_profiler()
    img_gpu = exec_3dlut_on_gpu(img=img, lut=lut)
    cuda.stop_profiler()

    return img_x86, img_gpu

示例#4

0

显示文件

def main():
    # Parse command line
    args = parse_arguments()
    # Read input image
    img = misc.imread(args.input)
    # Initialize the figure to show images
    figure = plt.figure()
    figure.suptitle('Bloom shader effect: luminance threshold =  ' + str(args.lum_threshold) +
                    ', Gaussian kernel standard deviation = ' + str(args.sigma) +
                    ', standard deviations taken = ' + str(args.sigma_n))
    subplots_total = 3 if args.mode == 'both' else 2
    subplot_current = 1
    # Set original image to subplot
    subplot = figure.add_subplot(1, subplots_total, subplot_current)
    subplot.set_title('Original image')
    subplot_current += 1
    plt.imshow(img, cmap='gray')
    # Print parameters
    print('Bloom shader effect: ' + args.input + ' (' + str(img.shape[1]) + 'x' + str(img.shape[0]) +
          '), t = ' + str(args.lum_threshold) + ', s = ' + str(args.sigma) + ', n = ' + str(args.sigma_n) +
          (', o = ' + args.output if args.output is not None else '') + '\n')
    # Run CPU implementation
    if args.mode == 'cpu' or args.mode == 'both':
        t0 = time.perf_counter()
        img_bloomed_c = bloom_cpu(img, args.lum_threshold, args.sigma, args.sigma_n)
        t1 = time.perf_counter()
        elapsed_time = t1 - t0
        print('\tCPU elapsed time: ' + str(elapsed_time) + ' seconds')
        # Set CPU-bloomed image to subplot
        subplot = figure.add_subplot(1, subplots_total, subplot_current)
        plt.imshow(img_bloomed_c, cmap='gray')
        subplot.set_title('CPU-bloomed image: %.3f seconds' % elapsed_time)
        subplot_current += 1
    # Run GPU implementation
    if args.mode == 'both' or args.mode == 'gpu':
        t0 = time.perf_counter()
        img_bloomed_g = bloom_gpu(img, args.lum_threshold, args.sigma, args.sigma_n)
        t1 = time.perf_counter()
        elapsed_time = t1 - t0
        print('\tGPU elapsed time: ' + str(elapsed_time) + ' seconds')
        drv.stop_profiler()
        # Set GPU-bloomed image to subplot
        subplot = figure.add_subplot(1, subplots_total, subplot_current)
        plt.imshow(img_bloomed_g, cmap='gray')
        subplot.set_title('GPU-bloomed image: %.3f seconds' % elapsed_time)
        subplot_current += 1
    # Save bloomed image to file
    if args.output is not None:
        if args.mode == 'cpu':
            misc.imsave(args.output, img_bloomed_c)
        else:
            misc.imsave(args.output, img_bloomed_g)
    # Figure original and bloomed images
    if args.ui is True:
        plt.show()

示例#5

0

显示文件

文件： gpuFDMT.py 项目： ajosephy/FDMT

 def execute(self):
     cuda.start_profiler()
     timestamp = time()
     for i,(f,a) in enumerate(zip(self.funcs,self.args)):
         f(*a)
     gpu_transpose.prepared_call(grid2,block2,A)
     gpu_get_sigma.prepared_call(grid3,block3,A)
     cuda.Context.synchronize()
     read_results()
     cuda.stop_profiler()
     print "Time for DM tranform, transpose, sigma calculate, get max sigma: %.3f s" % (time()-timestamp)

示例#6

0

显示文件

文件： FrostTrap.py 项目： khoste/PP-Assignment-3

def frost_trap(kernel_func, args, ver):
    """ Solves the FrostTrap on a GPU with CUDA

    frost_trap creates and allocates the trap data on the GPU, initialize
    graphics and nvidia profiler, runs the kernel function until the delta
    is smaller than a certain limit, finally, it dumps the execution time
    and parameters.

    Args:
            kernel_func : <simple|rb|dbuf> from kernel.cu
            args        : packed command-line options and arguments as a kwarg
    """

    verbose.info("create trap data and init visualizer")
    trap = gpuarray.to_gpu(create_trap(args.width, args.height))

    # delta   - total change
    # epsilon - minimum convergence limit
    epsilon = np.float32(0.001 * args.width * args.height)
    delta = np.array(epsilon)

    # init visualizer
    vis = Visualizer(trap, do_vis=args.graphic)

    # start nvidia profiler
    if args.profile:
        cuda.start_profiler()

    # start
    verbose.info("start")
    t1 = timeit.default_timer()
    
    while delta <= epsilon:
        # experiment with different block and grid sizes
        kernel_func(trap, args.height, args.width, args.omega, epsilon, args.iter, delta, block=(1, 300, 1), grid=(300,1,1))
        vis.update()

    t2 = timeit.default_timer()
    verbose.info("done")
    # stop

    # stop nvidia profiler
    if args.profile:
        cuda.stop_profiler()

    # dump result and parameters as json
    with open(args.result, "a+") as f:
        verbose.info("dumping results to {args.result}")
        args.secs = t2-t1
        json.dump(vars(args), f, default=lambda x: eval(str(x)), indent=2)

示例#7

0

显示文件

def simulate_gpu():
    driver.start_profiler()
    particle_pointer = cuda.mem_alloc(NUM_PARTICLES * ParticleStruct.mem_size)

    particles = []
    for i in range(NUM_PARTICLES):
        p = ParticleStruct(
            numpy.random.randn(3).astype(numpy.float32),
            numpy.random.randn(3).astype(numpy.float32),
            int(particle_pointer) + i * ParticleStruct.mem_size)
        particles.append(p)

    print("GPU initial:", list(map(lambda p: str(p), particles)))
    func = mod.get_function("simulate")
    func(particle_pointer,
         numpy.int32(NUM_ITERATIONS),
         grid=(GRID_SIZE, 1),
         block=(BLOCK_SIZE, 1, 1))
    print("GPU result:", list(map(lambda p: str(p), particles)))
    driver.stop_profiler()

示例#8

0

显示文件

文件： rbf_benchmark.py 项目： mondus/GPyBenchmark

start = cuda.Event()
end = cuda.Event()

start.record()
r1gdev = psicomp_gpu.psicomputations(kern, Z, qX, return_psi2_n=False)
end.record()
end.synchronize()
print('RBF psi-stat computation time: '+'%.2f'%(start.time_till(end))+' msec.')


#st_time = time.time()
#r2g = psicomp_gpu.psiDerivativecomputations(kern, w1, w2, w3, Z, qX)
#print('RBF psi-stat derivative computation time: '+'%.2f'%(time.time()-st_time)+' sec.')

#st_time = time.time()
#r3g = psicomp_gpu.psicomputations(kern, Z, qX, return_psi2_n=True)
#print('RBF psi-stat (psi2n) computation time: '+'%.2f'%(time.time()-st_time)+' sec.')

#st_time = time.time()
#r4g = psicomp_gpu.psiDerivativecomputations(kern, w1, w2, w3n, Z, qX)
#print('RBF psi-stat derivative (psi2n) computation time: '+'%.2f'%(time.time()-st_time)+' sec.')

assert np.all([np.allclose(a,b) for a,b in zip(r1g,r1gdev)])
#assert np.all([np.allclose(a,b) for a,b in zip(r1,r1g)])
#assert np.all([np.allclose(a,b) for a,b in zip(r2,r2g)])
#assert np.all([np.allclose(a,b) for a,b in zip(r3,r3g)])
#assert np.all([np.allclose(a,b) for a,b in zip(r4,r4g)])

cuda.stop_profiler()

示例#9

0

显示文件

    print ' and py product = ', c_py
    print ' and naive_gpu_prod = ', c1_gpu
    print ' and local_scalar_opt_prod = ', c2_gpu
    print ' and pvt mem scalar opt prod = ', c3_gpu
    print 'matrix product symmetric?   ', np.allclose(c_py, np.transpose(c_py))
    #    print 'All matrix products equal ', np.allclose(c_py,c1_gpu)
    print 'All matrix products equal ', (np.allclose(c_py, c1_gpu)
                                         and np.allclose(c1_gpu, c2_gpu)
                                         and np.allclose(c2_gpu, c3_gpu))
    print 'matrix dimansion=', L, 'X', M, '   mult py time:', mult_py_times[
        r], ' gpu naive time:', naive_cu_times[r]
    print ' gpu local scalar time:', local_cu_times[
        r], ' gpu private mem time:', pvt_cu_times[r]
    print '***********part 2 ends here *****************'

driver.stop_profiler()

# Optional: if you want to plot the function, set MAKE_PLOT to
# True:
MAKE_PLOT = True
if MAKE_PLOT:
    import matplotlib as mpl
    mpl.use('agg')
    import matplotlib.pyplot as plt

    plt.gcf()
    plt.subplot(311)
    plt.plot(x, transp_py_times, 'r')
    plt.plot(x, transp_cu_times, 'g')
    plt.legend(['python transpose', 'cuda transpose'], loc='upper left')
    plt.xlabel('matrix ratio increase factor')

示例#10

0

显示文件

    def test_diagonal_bl_tr(self):

        IMAGE_DIR = "Backpack-perfect"
        im1 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im1.png"))
        im2 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im0.png"))

        stereo = SemiGlobalMatching(im1,
                                    im2,
                                    os.path.join("../data", IMAGE_DIR,
                                                 "calib.txt"),
                                    window_size=3,
                                    resize=(640, 480))

        params = {
            "p1": 5,
            "p2": 90000,
            "census_kernel_size": 7,
            "reversed": True
        }
        stereo.set_params(params)
        stereo.params['ndisp'] = 50
        t1 = time()

        assert stereo.p1 is not None, "parameters have not been set"
        t1 = time()
        cim1 = stereo.census_transform(stereo.im1)
        cim2 = stereo.census_transform(stereo.im2)
        #print(f"census transform time {time() - t1}")

        if not stereo.reversed:
            D = range(int(stereo.params['ndisp']))
        else:
            D = reversed(range(int(-stereo.params['ndisp']), 1))
        cost_images = stereo.compute_disparity_img(cim1, cim2, D)
        cost_images = np.float32(cost_images)
        m, n, D = cost_images.shape
        # direction == (1,0)
        stereo.directions = [(-1, 1)]
        t1 = time()
        L = stereo.aggregate_cost(cost_images)
        print("python aggregate cost %f" % (time() - t1))
        L = L.transpose((2, 0, 1))

        cost_images = cost_images.transpose((2, 0, 1))
        cost_images = np.ascontiguousarray(cost_images, dtype=np.float32)
        d, rows, cols = cost_images.shape
        d_step = 1

        compiler_constants = {
            'D_STEP': d_step,
            'D': d,
            'ARR_SIZE': math.floor(d / d_step),
            'P1': 5,
            'P2': 90000,
            'SHMEM_SIZE': 64
        }
        build_options = [format_compiler_constants(compiler_constants)]
        mod = SourceModule(open("../lib/sgbm_helper.cu").read(),
                           options=build_options)

        diagonal_aggregate = mod.get_function("diagonal_bl_tr_aggregate")
        out = np.zeros_like(L)
        out = np.ascontiguousarray(out, dtype=np.float32)
        t1 = time()
        diagonal_aggregate(drv.Out(out),
                           drv.In(cost_images),
                           np.int32(rows),
                           np.int32(cols),
                           block=(256, 1, 1),
                           grid=(1, 1))
        print("cuda aggregate cost %f" % (time() - t1))
        drv.stop_profiler()
        s1 = np.sum(np.float64(L))
        s2 = np.sum(np.float64(out))

        print("L sum: %f" % s1)
        print("out sum: %f" % s2)
        self.assertTrue(np.all(np.isclose(out, L)))

示例#11

0

显示文件

def run_model(args, graph, inputs, outputs, data):
    # must use level0 to avoid unintended opr modification
    graph.options.graph_opt_level = 0

    logger.info("input tensors: ")
    for k, v in data.items():
        logger.info("  {}: {}".format(k, v.shape))

    G.modify_opr_algo_strategy_inplace(outputs, get_execution_strategy(args))

    if args.optimize_for_inference:
        opt_kwargs = get_opt_kwargs(args)
        outputs = G.optimize_for_inference(outputs, **opt_kwargs)

    # embed inputs must be on the last, to avoid const fold
    if args.embed_input:
        outputs, inp_dict = tools.embed_inputs(outputs, data.values(), inputs=inputs)
    else:
        outputs, inp_dict = tools.convert_inputs(outputs, inputs=inputs)

    if args.dump_cpp_model:
        dump_content, _ = G.dump_graph(outputs, keep_var_name=2)
        with open(args.dump_cpp_model, "wb") as file:
            file.write(dump_content)
        logger.info("C++ model written to {}".format(args.dump_cpp_model))

    outputs, output_dict = tools.convert_outputs(outputs)

    if args.profile:
        profiler = tools.GraphProfiler(graph)

    func = graph.compile(outputs)

    def run():
        if not args.embed_input:
            for key in inp_dict:
                inp_dict[key].set_value(mge.Tensor(data[key])._dev_tensor())
        func.execute()
        func.wait()
        return [oup_node.get_value().numpy() for oup_node in output_dict.values()]

    if args.warm_up:
        logger.info("warming up")
        run()

    total_time = 0

    for i in range(args.iter):
        logger.info("iter {}".format(i))
        start_time = time.time()
        retval = run()
        cur_time = time.time() - start_time
        total_time += cur_time

        avg_speed = (i + 1) / total_time
        if "data" in data:
            avg_speed *= data["data"].shape[0]
            avg_speed_txt = "{:.3f}sample/s".format(avg_speed)
        else:
            avg_speed_txt = "{:.3f}batch/s".format(avg_speed)

        msg = (
            "iter {}: duration={:.4f}({:.4f})s average={:.4f}s "
            "avg_speed={} time={:.4f}s"
        ).format(
            i,
            cur_time,
            func.get_prev_exec_time(),
            total_time / (i + 1),
            avg_speed_txt,
            total_time,
        )
        if args.calc_output_rms:
            rms = []
            for v in retval:
                rms.append("{:.3g}".format(float(((v ** 2).mean()) ** 0.5)))
            msg += " output_rms=[{}]".format(", ".join(rms))
        if logger.level > logging.INFO:
            print(msg)
        else:
            logger.info(msg)

    if args.focused_nvprof:
        if get_device_count("gpu") < 1:
            logger.warning(
                "No cuda device detected. ``focused_nvprof`` will be ignored."
            )
        else:
            try:
                import pycuda.driver as D

                D.start_profiler()
                func.execute()
                func.wait()
                D.stop_profiler()
            except ImportError:
                logger.error("`focused_nvprof need pycuda`", exc_info=True)

    if args.profile:
        with open(args.profile, "w") as fout:
            fout.write(profiler.get())

    return avg_speed

示例#12

0

显示文件

 def __del__(self):
     # flush profiling data to file
     cuda.stop_profiler()

示例#13

0

显示文件

def counting_vowels_in_text(text):
    """Returns the number of vowels found in the text?"""

    mod = SourceModule("""
    __global__ void count_vowels(char *text, int *results, int text_size, int chunk_size, int threads_per_block, int blocks_per_grid)
    {
        int index = blockDim.x * blockIdx.x + threadIdx.x;

        int start = index * chunk_size;
        int end = ( index + 1 ) * chunk_size;

        end = min( end, text_size );
        if (end < start)
        {
            return; 
        }

        int i = 0; 

        start = start * 4;
        end = end * 4;
        
        for(i = start; i <= end; i++){

            if (text[i] == 'a' || text[i] == 'A' || text[i] == 'e' || text[i] == 'E' || text[i] == 'i' 
                || text[i] == 'I' || text[i] =='o' || text[i] =='O' || text[i] == 'u' || text[i] == 'U' || text[i] == 'y' || text[i] == 'Y')
            {
                results[i] = 1;
            }
        }

    }
    """)
    cuda.start_profiler()

    max_text_size_in_mb = 100

    text_chunks = []
    text_chunks_count = math.ceil(
        len(text) / (max_text_size_in_mb * (1024**2)))

    while (len(text) > (max_text_size_in_mb * 1024**2)):
        text_chunk = text[:math.ceil(len(text) / (text_chunks_count))]
        text2 = text[math.ceil(len(text) / (text_chunks_count)):]

        text = text2
        text_chunks.append(text_chunk)

    text_chunks.append(text)

    cumulative_results = 0
    for text_chunk in text_chunks:
        device_text = gpuarray.to_gpu(numpy.array([text_chunk], dtype=str))
        device_results = gpuarray.zeros(len(text_chunk) * 4, dtype=numpy.int32)

        chunk_size = 1000
        threads_per_block = 512
        blocks_per_grid = numpy.int(
            math.ceil(len(text_chunk) / (chunk_size * threads_per_block)))

        device_text_size = numpy.int32(len(text_chunk))

        function = mod.get_function("count_vowels")
        function(device_text,
                 device_results,
                 device_text_size,
                 numpy.int32(chunk_size),
                 numpy.int32(blocks_per_grid),
                 block=(threads_per_block, 1, 1),
                 grid=(blocks_per_grid, 1, 1))

        host_results = device_results.get()

        results = numpy.count_nonzero(host_results == 1)
        cumulative_results += results

    cuda.stop_profiler()
    return cumulative_results

示例#14

0

显示文件

    def test_two_directions(self):

        IMAGE_DIR = "Backpack-perfect"
        im1 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im1.png"))
        im2 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im0.png"))

        stereo = SemiGlobalMatching(im1,
                                    im2,
                                    os.path.join("../data", IMAGE_DIR,
                                                 "calib.txt"),
                                    window_size=3,
                                    resize=(640, 480))

        params = {
            "p1": 5,
            "p2": 90000,
            "census_kernel_size": 7,
            "reversed": True
        }
        stereo.set_params(params)
        stereo.params['ndisp'] = 50
        t1 = time()

        assert stereo.p1 is not None, "parameters have not been set"
        t1 = time()
        cim1 = stereo.census_transform(stereo.im1)
        cim2 = stereo.census_transform(stereo.im2)
        #print(f"census transform time {time() - t1}")

        if not stereo.reversed:
            D = range(int(stereo.params['ndisp']))
        else:
            D = reversed(range(int(-stereo.params['ndisp']), 1))
        cost_images = stereo.compute_disparity_img(cim1, cim2, D)
        cost_images = np.float32(cost_images)
        m, n, D = cost_images.shape
        # direction == (1,0)
        stereo.directions = [(1, 0), (-1, 0), (1, 1), (-1, 1), (1, -1),
                             (-1, -1)]
        #stereo.directions = [(0,1)]
        t1 = time()
        L = stereo.aggregate_cost(cost_images)
        print("python aggregate cost %f" % (time() - t1))
        L = L.transpose((2, 0, 1))

        cost_images = cost_images.transpose((2, 0, 1))
        cost_images = np.ascontiguousarray(cost_images, dtype=np.float32)
        d, rows, cols = cost_images.shape
        d_step = 1
        rows = np.int32(rows)
        cols = np.int32(cols)

        compiler_constants = {
            'D_STEP': d_step,
            'D': d,
            'ARR_SIZE': math.floor(d / d_step),
            'P1': 5,
            'P2': 90000,
            'SHMEM_SIZE': 64
        }
        build_options = [format_compiler_constants(compiler_constants)]
        mod = SourceModule(open("../lib/sgbm_helper.cu").read(),
                           options=build_options)

        shmem_size = 16
        vertical_blocks = int(math.ceil(rows / shmem_size))

        #r_aggregate = mod.get_function('r_aggregate')
        vertical_aggregate_down = mod.get_function('vertical_aggregate_down')
        vertical_aggregate_up = mod.get_function('vertical_aggregate_up')
        diagonal_br_tl_aggregate = mod.get_function('diagonal_br_tl_aggregate')
        diagonal_tl_br_aggregate = mod.get_function('diagonal_tl_br_aggregate')
        diagonal_tr_bl_aggregate = mod.get_function('diagonal_tr_bl_aggregate')
        diagonal_bl_tr_aggregate = mod.get_function('diagonal_bl_tr_aggregate')
        #l_aggregate = mod.get_function('l_aggregate')

        t1 = time()
        cost_images_ptr = drv.to_device(cost_images)
        dp_ptr = drv.mem_alloc(cost_images.nbytes)

        vertical_aggregate_down(dp_ptr,
                                cost_images_ptr,
                                rows,
                                cols,
                                block=(256, 1, 1),
                                grid=(1, 1))
        vertical_aggregate_up(dp_ptr,
                              cost_images_ptr,
                              rows,
                              cols,
                              block=(256, 1, 1),
                              grid=(1, 1))

        #r_aggregate(dp_ptr, cost_images_ptr, rows, cols, block = (shmem_size, shmem_size, 1), grid = (1, vertical_blocks))
        #l_aggregate(dp_ptr, cost_images_ptr, rows, cols, block = (shmem_size, shmem_size, 1), grid = (1, vertical_blocks))

        diagonal_tl_br_aggregate(dp_ptr,
                                 cost_images_ptr,
                                 rows,
                                 cols,
                                 block=(256, 1, 1),
                                 grid=(1, 1))
        diagonal_bl_tr_aggregate(dp_ptr,
                                 cost_images_ptr,
                                 rows,
                                 cols,
                                 block=(256, 1, 1),
                                 grid=(1, 1))

        diagonal_tr_bl_aggregate(dp_ptr,
                                 cost_images_ptr,
                                 rows,
                                 cols,
                                 block=(256, 1, 1),
                                 grid=(1, 1))
        diagonal_br_tl_aggregate(dp_ptr,
                                 cost_images_ptr,
                                 rows,
                                 cols,
                                 block=(256, 1, 1),
                                 grid=(1, 1))

        print("cuda aggregate cost %f" % (time() - t1))
        drv.stop_profiler()

        agg_image = drv.from_device(dp_ptr,
                                    cost_images.shape,
                                    dtype=np.float32)
        s1 = np.sum(np.float64(L))
        s2 = np.sum(np.float64(agg_image))

        print("L sum: %f" % s1)
        print("out sum: %f" % s2)
        self.assertTrue(np.all(np.isclose(agg_image, L)))

示例#15

0

显示文件

文件： cuda_l1_wvd.py 项目： scarab187/pymack04

    return run


if __name__ == '__main__':
    import sys
    N = int(sys.argv[2])
    Nb = int(sys.argv[1])
    i = int(sys.argv[3])
    eps = float(sys.argv[4])
    runner = l1_wvd(N, Nb)
    np.random.seed(5432)
    z = np.random.randn(N) + 1j * np.random.randn(N)
    print('warming up?', end=' ')
    for _ in range(3):
        c, dcr, dci = runner(z)
        print('.', end=' ')
    print('result =', c)
    print('calling...', end=' ')
    t0 = time.time()
    c1, dcr, dci = runner(z)
    t1 = time.time() - t0
    print('finished:', t1)
    dz = np.eye(N)[i]
    c2, _, _ = runner(z + eps * dz)
    c3, _, _ = runner(z + 1j * eps * dz)
    dcr2 = (c2 - c1) / eps
    dci2 = (c3 - c1) / eps
    print('grad re: calc =', dcr[i], 'appx =', dcr2)
    print('grad im: calc =', dci[i], 'appx =', dci2)
    drv.stop_profiler()

示例#16

0

显示文件

文件： benchmarks.py 项目： renlliang3/cuvarbase

 def profiled_func(*args, **kwargs):
     cuda.start_profiler()
     func(*args, **kwargs)
     cuda.stop_profiler()
     #pycuda.autoinit.context.detach()
     sys.exit()

示例#17

0

显示文件

文件： cuda_render.py 项目： rokroskar/cuda_render

def cu_template_render_image_single(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128):
    """
    CPU part of the SPH render code that executes the rendering on the GPU
    
    does some basic particle set prunning and sets up the image
    tiles. It launches cuda kernels for rendering the individual sections of the image
    """
    import pycuda.driver as drv
    import pycuda.tools
    import pycuda.autoinit
    from pycuda.compiler import SourceModule
    from radix_sort import radix_sort

    global_start = time.clock()

    start = time.clock()
    # construct an array of particles
    Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')]
    ps = drv.pagelocked_empty(len(s),dtype=Partstruct)
    
    with s.immediate_mode : 
        ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']]

    if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start)

    ymin,ymax = xmin,xmax

    # ----------------------
    # setup the global image
    # ----------------------
    image = np.zeros((nx,ny),dtype=np.float32)
    
    dx = float32((xmax-xmin)/nx)
    dy = float32((ymax-ymin)/ny)
    
    x_start = xmin+dx/2
    y_start = ymin+dy/2

    zplane = 0.0

    start = time.clock()

    # ------------------
    # set up the kernels
    # ------------------
    code = file('/home/itp/roskar/homegrown/template_kernel.cu').read()
    mod = SourceModule(code)
    tile_histogram = mod.get_function("tile_histogram")
    distribute_particles = mod.get_function("distribute_particles")
    tile_render_kernel = mod.get_function("tile_render_kernel")
    calculate_keys = mod.get_function("calculate_keys")

    # allocate histogram array
    hist = np.zeros(Ntiles,dtype=np.int32)
    
    # transfer histogram array and particle data to GPU
    hist_gpu = drv.mem_alloc(hist.nbytes)
    drv.memcpy_htod(hist_gpu,hist)
    
    start_g = drv.Event()
    end_g = drv.Event()

    start_g.record()
    ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes)
    drv.memcpy_htod(ps_on_gpu,ps_gpu)
    end_g.record()
    end_g.synchronize()

    if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g))

    # make everything the right size
    xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax])
    nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles])

    start_g.record()
    tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles,
                   block=(nthreads,1,1),grid=(32,1,1))

    drv.Context.synchronize()
    drv.memcpy_dtoh(hist,hist_gpu)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g))
    print "<<< Total particle array = %d"%(hist.sum())

    # ---------------------------------------------------------------------------------
    # figured out the numbers of particles per tile -- set up the tile particle buffers
    # ---------------------------------------------------------------------------------
    ps_tiles = np.empty(hist.sum(),dtype=Partstruct)
    ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes)

    tile_offsets = np.array([0],dtype=np.int32)
    tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32))
    tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes)
    drv.memcpy_htod(tile_offsets_gpu,tile_offsets)

    start_g.record()
    distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)), 
                         xmin, xmax, ymin, ymax, nx, ny, Ntiles, 
                         block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g))
    drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu)

    
    # -------------------------
    # start going through tiles
    # -------------------------
   
    # initialize the image on the device
    im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes)
    drv.memcpy_htod(im_gpu,image.astype(np.float32))
   

    # allocate key arrays -- these will be keys to sort particles into softening bins
    start_g.record()
    keys_gpu = drv.mem_alloc(int(4*hist.sum()))
    calculate_keys(ps_tiles_gpu, keys_gpu, np.int32(hist.sum()), np.float32(dx), 
                   block=(nthreads,1,1),grid=(32,1,1))
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g))

    keys = np.empty(hist.sum(), dtype=np.int32)


    # ----------------------------------------
    # sort particles by their softening length
    # ----------------------------------------
    for i in xrange(Ntiles) : 
        n_per_tile = tile_offsets[i+1] - tile_offsets[i]
        if n_per_tile > 0 : 
            radix_sort(int(keys_gpu), int(ps_tiles_gpu), tile_offsets[i], n_per_tile)

    drv.memcpy_dtoh(keys,keys_gpu)
    drv.memcpy_dtoh(ps_tiles,ps_tiles_gpu)
#    return keys,ps_tiles,tile_offsets,dx
        
    drv.Context.synchronize()

    tile_start = time.clock()
    for i in xrange(Ntiles) :
        n_per_tile = tile_offsets[i+1] - tile_offsets[i]
        if n_per_tile > 0 : 
            my_stream = streams[i%16]
            
            xmin_p, xmax_p, ymin_p, ymax_p  = tiles_physical[i]
            xmin_t, xmax_t, ymin_t, ymax_t  = tiles_pix[i]
            
            nx_tile = xmax_t-xmin_t+1
            ny_tile = ymax_t-ymin_t+1
                    
                
            # make everything the right size
            xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t])
            xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p])
            
            tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i),
                               xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t,
                               im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]),
                               block=(nthreads,1,1),stream=my_stream)

    if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start)
    
    # ----------------------------------------------------------------------------------
    # process the particles with large smoothing lengths concurrently with GPU execution
    # ----------------------------------------------------------------------------------
    #if ind[1] != len(xs) : 
    #    start = time.clock()
    #    image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:],
    #                                  nx,ny,xmin,xmax,ymin,ymax)).T
    #    if timing: print '<<< Processing %d particles with large smoothing lengths took %e s'%(len(xs)-ind[1],
    #                                                                                           time.clock()-start)
    drv.Context.synchronize()
    if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start)

    drv.memcpy_dtoh(image,im_gpu)
    drv.stop_profiler()
    
    if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start)

    del(start_g)
    del(end_g)
    
    return image

示例#18

0

显示文件

start = time.time()
y_py = numpy.transpose(a)
time1 = time.time() - start

a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

b_gpu = gpuarray.empty((n, m), a.dtype)

t_result = numpy.empty((n, m), a.dtype)

func = mod.get_function("transpose")
start = time.time()
func(a_gpu,
     b_gpu,
     numpy.uint32(m),
     numpy.uint32(n),
     block=(16, 16, 1),
     grid=((numpy.uint32(n) - 1) / 16 + 1, (numpy.uint32(m) - 1) / 16 + 1, 1))
time2 = time.time() - start
t_result = b_gpu.get()

start = time.time()
time3 = time.time() - start

#cuda.memcpy_dtoh(t_result, b_gpu)
print time1
print time2
print time3
cuda.stop_profiler()

示例#19

0

显示文件

文件： cuda_render.py 项目： rokroskar/cuda_render

def cu_template_render_image(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128, tile_size=100):
    """
    CPU part of the SPH render code that executes the rendering on the GPU
    
    does some basic particle set prunning and sets up the image
    tiles. It launches cuda kernels for rendering the individual sections of the image
    """
    import pycuda.driver as drv
    import pycuda.tools
    import pycuda.autoinit
    from pycuda.compiler import SourceModule
    from radix_sort import radix_sort

    global_start = time.clock()

    

    start = time.clock()
    # construct an array of particles
    Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')]
    ps = drv.pagelocked_empty(len(s),dtype=Partstruct)
    
    with s.immediate_mode : 
        ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']]

    if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start)

    ymin,ymax = xmin,xmax

    # ----------------------
    # setup the global image
    # ----------------------
    image = np.zeros((nx,ny),dtype=np.float32)
    
    dx = float32((xmax-xmin)/nx)
    dy = float32((ymax-ymin)/ny)
    
    x_start = xmin+dx/2
    y_start = ymin+dy/2

    zplane = 0.0

    # ------------------------------------------------------------------------------------------------
    # trim particles based on smoothing length -- the GPU will only render those that need < 32 pixels
    # ------------------------------------------------------------------------------------------------

    start = time.clock()
  #  gpu_bool = 2*ps['h'] < 15.*dx
    
    ps_gpu = ps#[gpu_bool]
   # ps_cpu = ps[~gpu_bool]
    #del(ps)
    if timing: '<<< Setting up gpu/cpu particle struct arrays took %f s'%(time.clock()-start)

    # -----------------------------------------------------------------
    # set up the image slices -- max. size is 100x100 pixels 
    # in this step only process particles that need kernels < 40 pixels
    # tiles are 100x100 = 1e4 pixels x 4 bytes = 40k
    # kernels are 31x31 pixels max = 3844 bytes
    # max shared memory size is 48k
    # -----------------------------------------------------------------
    
    start = time.clock()
    tiles_pix, tiles_physical = make_tiles(nx,ny,xmin,xmax,ymin,ymax,tile_size)
    if timing: print '<<< Tiles made in %f s'%(time.clock()-start)

    Ntiles = tiles_pix.shape[0]

     
    
    # ------------------
    # set up the kernels
    # ------------------
    code = file(os.path.join(os.path.dirname(__file__),'template_kernel.cu')).read()
    mod = SourceModule(code,options=["--ptxas-options=-v"])
    tile_histogram = mod.get_function("tile_histogram")
    distribute_particles = mod.get_function("distribute_particles")
    tile_render_kernel = mod.get_function("tile_render_kernel")
    calculate_keys = mod.get_function("calculate_keys")


    # -------------------------------------------------------------
    # set up streams and figure out particle distributions per tile 
    # -------------------------------------------------------------
   

    # allocate histogram array
    hist = np.zeros(Ntiles,dtype=np.int32)
    
    # transfer histogram array and particle data to GPU
    hist_gpu = drv.mem_alloc(hist.nbytes)
    drv.memcpy_htod(hist_gpu,hist)
    
    start_g = drv.Event()
    end_g = drv.Event()

    start_g.record()
    ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes)
    drv.memcpy_htod(ps_on_gpu,ps_gpu)
    end_g.record()
    end_g.synchronize()

    if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g))

    # make everything the right size
    xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax])
    nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles])

    # -----------------------------
    # calculate pixels per particle
    # -----------------------------

    # allocate key arrays -- these will be keys to sort particles into softening bins
    start_g.record()
    keys_gpu = drv.mem_alloc(int(4*len(s)))
    calculate_keys(ps_on_gpu, keys_gpu, np.int32(len(s)), np.float32(dx), 
                   block=(nthreads,1,1),grid=(1024,1,1))
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g))

    # ----------------------------------------
    # sort particles by their softening length
    # ----------------------------------------
    start_g.record()
    radix_sort(int(keys_gpu), int(ps_on_gpu), np.int32(0), np.int32(len(s)))
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Radix sorting all tiles took %f ms'%(start_g.time_till(end_g))

    start_g.record()
    tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles,
                   block=(nthreads,1,1),grid=(1024,1,1))

    drv.Context.synchronize()
    drv.memcpy_dtoh(hist,hist_gpu)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g))
    print "<<< Total particle array = %d"%(hist.sum())

    # ---------------------------------------------------------------------------------
    # figured out the numbers of particles per tile -- set up the tile particle buffers
    # ---------------------------------------------------------------------------------
    ps_tiles = np.empty(hist.sum(),dtype=Partstruct)
    ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes)

    tile_offsets = np.array([0],dtype=np.int32)
    tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32))
    tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes)
    drv.memcpy_htod(tile_offsets_gpu,tile_offsets)

    start_g.record()
    distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)), 
                         xmin, xmax, ymin, ymax, nx, ny, Ntiles, 
                         block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g))
    drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu)

    
    # -------------------------
    # start going through tiles
    # -------------------------
   
    # initialize the image on the device
    im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes)
    drv.memcpy_htod(im_gpu,image.astype(np.float32))
   

    

  
    tile_start = time.clock()
    
    streams = [drv.Stream() for i in range(16)]    
    
    for i in xrange(Ntiles) :
        n_per_tile = tile_offsets[i+1] - tile_offsets[i]
        if n_per_tile > 0 : 
            my_stream = streams[i%(16)]
            
            xmin_p, xmax_p, ymin_p, ymax_p  = tiles_physical[i]
            xmin_t, xmax_t, ymin_t, ymax_t  = tiles_pix[i]
            
            nx_tile = xmax_t-xmin_t+1
            ny_tile = ymax_t-ymin_t+1
                    
                
            # make everything the right size
            xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t])
            xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p])
            
            if n_per_tile > nthreads*256: ngrid=128
            else : ngrid = 64
            
            tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i),
                               xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t,
                               im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]),
                               block=(nthreads,1,1),grid=(ngrid,1,1),stream=my_stream)

    if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start)
    
    # ----------------------------------------------------------------------------------
    # process the particles with large smoothing lengths concurrently with GPU execution
    # ----------------------------------------------------------------------------------
    #if ind[1] != len(xs) : 
    #    start = time.clock()
    #    image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:],
    #                                  nx,ny,xmin,xmax,ymin,ymax)).T
    #    if timing: print '<<< Processing %d particles with large smoothing lengths took %e s'%(len(xs)-ind[1],
    #                                                                                           time.clock()-start)
    drv.Context.synchronize()
    if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start)

    drv.memcpy_dtoh(image,im_gpu)
    drv.stop_profiler()
    
    if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start)

    del(start_g)
    del(end_g)
    
    return image

示例#20

0

显示文件

def _worker(pid, did, dispatcher, temp_storage, total_edge_count, log_lock,
            merge_lock, exit_signal):
    try:
        logging_setup()
        with log_lock:
            logging.debug(
                'Clustering subprocess {} acquiring device {} started.'.format(
                    pid, did))

        drv.init()
        cuda_device = drv.Device(did)
        cuda_context = cuda_device.make_context()
        compiler_option = [
            '--fmad=true',
        ] if use_fmad else [
            '--fmad=false',
        ]
        cuda_module = SourceModule(get_source_code(), options=compiler_option)
        cuda_kernel = cuda_module.get_function('compute_dot_product')
        cuda_kernel.prepare('PPPPPPPPPP')

        threads = []
        exit_state = mp.Value(ctypes.c_uint32, 0)
        with log_lock:
            logging.debug(
                'Subprocess {}: Spawning {} threads for CUDA stream concurrency.'
                .format(pid, threads_per_device))
        for tid in range(threads_per_device):
            threads.append(
                Thread(target=_thread,
                       args=(pid, tid, cuda_context, cuda_kernel, dispatcher,
                             temp_storage, total_edge_count, log_lock,
                             merge_lock, exit_signal, exit_state)))
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        if exit_state.value == 1:
            err_msg = '\nSubprocess {}: Threads exited with abnormal exitcode.'.format(
                pid)
            logging.error(err_msg)
            raise Exception(err_msg)

        drv.stop_profiler()
        cuda_context.pop()
    except (Exception, KeyboardInterrupt) as e:
        if type(e) is KeyboardInterrupt:
            with log_lock:
                logging.debug(
                    'Subprocess {}: Received KeyboardInterrupt, exits now.'.
                    format(pid))
                logging.debug(
                    'Subprocess {}: Waiting threads to exit.'.format(pid))
            for t in threads:
                t.join()
            drv.stop_profiler()
            cuda_context.pop()
        else:
            with log_lock:
                logging.exception(
                    '\nSubprocess {}: Ended unexpectedly. Logging traceback:\n'
                    '==========TRACEBACK==========\n'.format(pid))
            drv.stop_profiler()
            cuda_context.pop()
            exit_signal.value = True
            sys.exit(1)
    return

示例#21

0

显示文件

    def test_horizontal_left(self):

        IMAGE_DIR = "Backpack-perfect"
        im1 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im1.png"))
        im2 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im0.png"))
        #im1 = im1[:32,:,:]
        #im2 = im2[:32,:,:]

        stereo = SemiGlobalMatching(im1,
                                    im2,
                                    os.path.join("../data", IMAGE_DIR,
                                                 "calib.txt"),
                                    window_size=3,
                                    resize=(640, 480))

        params = {
            "p1": 5,
            "p2": 90000,
            "census_kernel_size": 7,
            "reversed": True
        }
        stereo.set_params(params)
        stereo.params['ndisp'] = 50
        t1 = time()

        assert stereo.p1 is not None, "parameters have not been set"
        t1 = time()
        cim1 = stereo.census_transform(stereo.im1)
        cim2 = stereo.census_transform(stereo.im2)
        #print(f"census transform time {time() - t1}")
        #pdb.set_trace()
        if not stereo.reversed:
            D = range(int(stereo.params['ndisp']))
        else:
            D = reversed(range(int(-stereo.params['ndisp']), 1))
        cost_images = stereo.compute_disparity_img(cim1, cim2, D)
        cost_images = np.float32(cost_images)
        ##cost_images = cost_images[:,:,8:45]
        #cost_images = cost_images[:,:,9:50]
        #shape = (480,640,36)
        #cost_images = np.float32(np.random.normal(loc=100, size=shape))
        m, n, D = cost_images.shape
        # direction == (1,0)
        stereo.directions = [(0, -1)]
        t1 = time()
        L = stereo.aggregate_cost_optimization_test(cost_images)
        print("python aggregate cost %f" % (time() - t1))
        L = L.transpose((2, 0, 1))

        cost_images = cost_images.transpose((2, 0, 1))
        cost_images = np.ascontiguousarray(cost_images, dtype=np.float32)
        d, rows, cols = cost_images.shape
        d_step = 1

        shmem_size = 16
        compiler_constants = {
            'D_STEP': d_step,
            'D': d,
            'ARR_SIZE': math.floor(d / d_step),
            'P1': 5,
            'P2': 90000,
            'SHMEM_SIZE': shmem_size
        }
        build_options = [format_compiler_constants(compiler_constants)]
        mod = SourceModule(open("../lib/sgbm_helper.cu").read(),
                           options=build_options)

        l_aggregate = mod.get_function("l_aggregate")
        out = np.zeros_like(L)
        out = np.ascontiguousarray(out, dtype=np.float32)
        vertical_blocks = int(math.ceil(rows / shmem_size))
        t1 = time()
        # pycuda complains when block size is greater than 32 x 32
        l_aggregate(drv.Out(out),
                    drv.In(cost_images),
                    np.int32(rows),
                    np.int32(cols),
                    block=(shmem_size, shmem_size, 1),
                    grid=(1, vertical_blocks))
        print("cuda aggregate cost %f" % (time() - t1))
        drv.stop_profiler()
        s1 = np.sum(np.float64(L))
        s2 = np.sum(np.float64(out))

        print("L sum: %f" % s1)
        print("out sum: %f" % s2)

        #pdb.set_trace()
        self.assertTrue(np.all(np.isclose(out, L)))