def benchmark(datasrc, model): start = time.time() label, data = datasrc.get_item() print("Data load time: %.2fms" % ((time.time() - start) * 1000.0)) start = time.time() data = np.ascontiguousarray(np.expand_dims(np.rollaxis(data,2), 0)).astype(model.dtype) data = model.normalize(data) print("Data prep time: %.2fms" % ((time.time() - start) * 1000.0)) input_tensor = GPUTensor(data) # warmup... for i in range(1): model.evaluate(input_tensor) start = time.time() num_iterations = 100 print("Timing %d iterations..." % num_iterations) for i in range(num_iterations): if i == num_iterations - 1: drv.start_profiler() y = model.evaluate(input_tensor) print(y) drv.stop_profiler() et = (time.time() - start) * 1000 / num_iterations print("Model eval time: %.2fms = %.1ffps" % (et, 1000.0 / et))
def benchmark(datasrc, model): start = time.time() label, data = datasrc.get_item() print("Data load time: %.2fms" % ((time.time() - start) * 1000.0)) start = time.time() data = np.ascontiguousarray(np.expand_dims(np.rollaxis(data, 2), 0)).astype(model.dtype) data = model.normalize(data) print("Data prep time: %.2fms" % ((time.time() - start) * 1000.0)) input_tensor = GPUTensor(data) # warmup... for i in range(1): model.evaluate(input_tensor) start = time.time() num_iterations = 100 print("Timing %d iterations..." % num_iterations) for i in range(num_iterations): if i == num_iterations - 1: drv.start_profiler() y = model.evaluate(input_tensor) print(y) drv.stop_profiler() et = (time.time() - start) * 1000 / num_iterations print("Model eval time: %.2fms = %.1ffps" % (et, 1000.0 / et))
def check_time_3dlut(grid_num=17, img_file='../Matrix/figure/src_img.png'): # profiler関連の設定 # ---------------------------------------- config_file = "./data/config.txt" log_file = "./data/profile_out.csv" output_mode = cuda.profiler_output_mode.KEY_VALUE_PAIR cuda.initialize_profiler(config_file, log_file, output_mode) # 3DLUTデータの作成 # ---------------------------------------- matrix_param = np.array([[0.2126, 0.7152, 0.0722], [-0.114572, -0.385428, 0.5], [0.5, -0.454153, -0.045847]]) kwargs = {'mtx': matrix_param} lut = make_3dlut_data(grid_num=17, func=rgb2yuv_for_3dlut, **kwargs) # 3DLUTの適用 # ---------------------------------------- img = img_open_and_normalize(img_file) img_x86 = exec_3dlut_on_x86(img=img, lut=lut) cuda.start_profiler() img_gpu = exec_3dlut_on_gpu(img=img, lut=lut) cuda.stop_profiler() return img_x86, img_gpu
def main(): # Parse command line args = parse_arguments() # Read input image img = misc.imread(args.input) # Initialize the figure to show images figure = plt.figure() figure.suptitle('Bloom shader effect: luminance threshold = ' + str(args.lum_threshold) + ', Gaussian kernel standard deviation = ' + str(args.sigma) + ', standard deviations taken = ' + str(args.sigma_n)) subplots_total = 3 if args.mode == 'both' else 2 subplot_current = 1 # Set original image to subplot subplot = figure.add_subplot(1, subplots_total, subplot_current) subplot.set_title('Original image') subplot_current += 1 plt.imshow(img, cmap='gray') # Print parameters print('Bloom shader effect: ' + args.input + ' (' + str(img.shape[1]) + 'x' + str(img.shape[0]) + '), t = ' + str(args.lum_threshold) + ', s = ' + str(args.sigma) + ', n = ' + str(args.sigma_n) + (', o = ' + args.output if args.output is not None else '') + '\n') # Run CPU implementation if args.mode == 'cpu' or args.mode == 'both': t0 = time.perf_counter() img_bloomed_c = bloom_cpu(img, args.lum_threshold, args.sigma, args.sigma_n) t1 = time.perf_counter() elapsed_time = t1 - t0 print('\tCPU elapsed time: ' + str(elapsed_time) + ' seconds') # Set CPU-bloomed image to subplot subplot = figure.add_subplot(1, subplots_total, subplot_current) plt.imshow(img_bloomed_c, cmap='gray') subplot.set_title('CPU-bloomed image: %.3f seconds' % elapsed_time) subplot_current += 1 # Run GPU implementation if args.mode == 'both' or args.mode == 'gpu': t0 = time.perf_counter() img_bloomed_g = bloom_gpu(img, args.lum_threshold, args.sigma, args.sigma_n) t1 = time.perf_counter() elapsed_time = t1 - t0 print('\tGPU elapsed time: ' + str(elapsed_time) + ' seconds') drv.stop_profiler() # Set GPU-bloomed image to subplot subplot = figure.add_subplot(1, subplots_total, subplot_current) plt.imshow(img_bloomed_g, cmap='gray') subplot.set_title('GPU-bloomed image: %.3f seconds' % elapsed_time) subplot_current += 1 # Save bloomed image to file if args.output is not None: if args.mode == 'cpu': misc.imsave(args.output, img_bloomed_c) else: misc.imsave(args.output, img_bloomed_g) # Figure original and bloomed images if args.ui is True: plt.show()
def execute(self): cuda.start_profiler() timestamp = time() for i,(f,a) in enumerate(zip(self.funcs,self.args)): f(*a) gpu_transpose.prepared_call(grid2,block2,A) gpu_get_sigma.prepared_call(grid3,block3,A) cuda.Context.synchronize() read_results() cuda.stop_profiler() print "Time for DM tranform, transpose, sigma calculate, get max sigma: %.3f s" % (time()-timestamp)
def frost_trap(kernel_func, args, ver): """ Solves the FrostTrap on a GPU with CUDA frost_trap creates and allocates the trap data on the GPU, initialize graphics and nvidia profiler, runs the kernel function until the delta is smaller than a certain limit, finally, it dumps the execution time and parameters. Args: kernel_func : <simple|rb|dbuf> from kernel.cu args : packed command-line options and arguments as a kwarg """ verbose.info("create trap data and init visualizer") trap = gpuarray.to_gpu(create_trap(args.width, args.height)) # delta - total change # epsilon - minimum convergence limit epsilon = np.float32(0.001 * args.width * args.height) delta = np.array(epsilon) # init visualizer vis = Visualizer(trap, do_vis=args.graphic) # start nvidia profiler if args.profile: cuda.start_profiler() # start verbose.info("start") t1 = timeit.default_timer() while delta <= epsilon: # experiment with different block and grid sizes kernel_func(trap, args.height, args.width, args.omega, epsilon, args.iter, delta, block=(1, 300, 1), grid=(300,1,1)) vis.update() t2 = timeit.default_timer() verbose.info("done") # stop # stop nvidia profiler if args.profile: cuda.stop_profiler() # dump result and parameters as json with open(args.result, "a+") as f: verbose.info("dumping results to {args.result}") args.secs = t2-t1 json.dump(vars(args), f, default=lambda x: eval(str(x)), indent=2)
def simulate_gpu(): driver.start_profiler() particle_pointer = cuda.mem_alloc(NUM_PARTICLES * ParticleStruct.mem_size) particles = [] for i in range(NUM_PARTICLES): p = ParticleStruct( numpy.random.randn(3).astype(numpy.float32), numpy.random.randn(3).astype(numpy.float32), int(particle_pointer) + i * ParticleStruct.mem_size) particles.append(p) print("GPU initial:", list(map(lambda p: str(p), particles))) func = mod.get_function("simulate") func(particle_pointer, numpy.int32(NUM_ITERATIONS), grid=(GRID_SIZE, 1), block=(BLOCK_SIZE, 1, 1)) print("GPU result:", list(map(lambda p: str(p), particles))) driver.stop_profiler()
start = cuda.Event() end = cuda.Event() start.record() r1gdev = psicomp_gpu.psicomputations(kern, Z, qX, return_psi2_n=False) end.record() end.synchronize() print('RBF psi-stat computation time: '+'%.2f'%(start.time_till(end))+' msec.') #st_time = time.time() #r2g = psicomp_gpu.psiDerivativecomputations(kern, w1, w2, w3, Z, qX) #print('RBF psi-stat derivative computation time: '+'%.2f'%(time.time()-st_time)+' sec.') #st_time = time.time() #r3g = psicomp_gpu.psicomputations(kern, Z, qX, return_psi2_n=True) #print('RBF psi-stat (psi2n) computation time: '+'%.2f'%(time.time()-st_time)+' sec.') #st_time = time.time() #r4g = psicomp_gpu.psiDerivativecomputations(kern, w1, w2, w3n, Z, qX) #print('RBF psi-stat derivative (psi2n) computation time: '+'%.2f'%(time.time()-st_time)+' sec.') assert np.all([np.allclose(a,b) for a,b in zip(r1g,r1gdev)]) #assert np.all([np.allclose(a,b) for a,b in zip(r1,r1g)]) #assert np.all([np.allclose(a,b) for a,b in zip(r2,r2g)]) #assert np.all([np.allclose(a,b) for a,b in zip(r3,r3g)]) #assert np.all([np.allclose(a,b) for a,b in zip(r4,r4g)]) cuda.stop_profiler()
print ' and py product = ', c_py print ' and naive_gpu_prod = ', c1_gpu print ' and local_scalar_opt_prod = ', c2_gpu print ' and pvt mem scalar opt prod = ', c3_gpu print 'matrix product symmetric? ', np.allclose(c_py, np.transpose(c_py)) # print 'All matrix products equal ', np.allclose(c_py,c1_gpu) print 'All matrix products equal ', (np.allclose(c_py, c1_gpu) and np.allclose(c1_gpu, c2_gpu) and np.allclose(c2_gpu, c3_gpu)) print 'matrix dimansion=', L, 'X', M, ' mult py time:', mult_py_times[ r], ' gpu naive time:', naive_cu_times[r] print ' gpu local scalar time:', local_cu_times[ r], ' gpu private mem time:', pvt_cu_times[r] print '***********part 2 ends here *****************' driver.stop_profiler() # Optional: if you want to plot the function, set MAKE_PLOT to # True: MAKE_PLOT = True if MAKE_PLOT: import matplotlib as mpl mpl.use('agg') import matplotlib.pyplot as plt plt.gcf() plt.subplot(311) plt.plot(x, transp_py_times, 'r') plt.plot(x, transp_cu_times, 'g') plt.legend(['python transpose', 'cuda transpose'], loc='upper left') plt.xlabel('matrix ratio increase factor')
def test_diagonal_bl_tr(self): IMAGE_DIR = "Backpack-perfect" im1 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im1.png")) im2 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im0.png")) stereo = SemiGlobalMatching(im1, im2, os.path.join("../data", IMAGE_DIR, "calib.txt"), window_size=3, resize=(640, 480)) params = { "p1": 5, "p2": 90000, "census_kernel_size": 7, "reversed": True } stereo.set_params(params) stereo.params['ndisp'] = 50 t1 = time() assert stereo.p1 is not None, "parameters have not been set" t1 = time() cim1 = stereo.census_transform(stereo.im1) cim2 = stereo.census_transform(stereo.im2) #print(f"census transform time {time() - t1}") if not stereo.reversed: D = range(int(stereo.params['ndisp'])) else: D = reversed(range(int(-stereo.params['ndisp']), 1)) cost_images = stereo.compute_disparity_img(cim1, cim2, D) cost_images = np.float32(cost_images) m, n, D = cost_images.shape # direction == (1,0) stereo.directions = [(-1, 1)] t1 = time() L = stereo.aggregate_cost(cost_images) print("python aggregate cost %f" % (time() - t1)) L = L.transpose((2, 0, 1)) cost_images = cost_images.transpose((2, 0, 1)) cost_images = np.ascontiguousarray(cost_images, dtype=np.float32) d, rows, cols = cost_images.shape d_step = 1 compiler_constants = { 'D_STEP': d_step, 'D': d, 'ARR_SIZE': math.floor(d / d_step), 'P1': 5, 'P2': 90000, 'SHMEM_SIZE': 64 } build_options = [format_compiler_constants(compiler_constants)] mod = SourceModule(open("../lib/sgbm_helper.cu").read(), options=build_options) diagonal_aggregate = mod.get_function("diagonal_bl_tr_aggregate") out = np.zeros_like(L) out = np.ascontiguousarray(out, dtype=np.float32) t1 = time() diagonal_aggregate(drv.Out(out), drv.In(cost_images), np.int32(rows), np.int32(cols), block=(256, 1, 1), grid=(1, 1)) print("cuda aggregate cost %f" % (time() - t1)) drv.stop_profiler() s1 = np.sum(np.float64(L)) s2 = np.sum(np.float64(out)) print("L sum: %f" % s1) print("out sum: %f" % s2) self.assertTrue(np.all(np.isclose(out, L)))
def run_model(args, graph, inputs, outputs, data): # must use level0 to avoid unintended opr modification graph.options.graph_opt_level = 0 logger.info("input tensors: ") for k, v in data.items(): logger.info(" {}: {}".format(k, v.shape)) G.modify_opr_algo_strategy_inplace(outputs, get_execution_strategy(args)) if args.optimize_for_inference: opt_kwargs = get_opt_kwargs(args) outputs = G.optimize_for_inference(outputs, **opt_kwargs) # embed inputs must be on the last, to avoid const fold if args.embed_input: outputs, inp_dict = tools.embed_inputs(outputs, data.values(), inputs=inputs) else: outputs, inp_dict = tools.convert_inputs(outputs, inputs=inputs) if args.dump_cpp_model: dump_content, _ = G.dump_graph(outputs, keep_var_name=2) with open(args.dump_cpp_model, "wb") as file: file.write(dump_content) logger.info("C++ model written to {}".format(args.dump_cpp_model)) outputs, output_dict = tools.convert_outputs(outputs) if args.profile: profiler = tools.GraphProfiler(graph) func = graph.compile(outputs) def run(): if not args.embed_input: for key in inp_dict: inp_dict[key].set_value(mge.Tensor(data[key])._dev_tensor()) func.execute() func.wait() return [oup_node.get_value().numpy() for oup_node in output_dict.values()] if args.warm_up: logger.info("warming up") run() total_time = 0 for i in range(args.iter): logger.info("iter {}".format(i)) start_time = time.time() retval = run() cur_time = time.time() - start_time total_time += cur_time avg_speed = (i + 1) / total_time if "data" in data: avg_speed *= data["data"].shape[0] avg_speed_txt = "{:.3f}sample/s".format(avg_speed) else: avg_speed_txt = "{:.3f}batch/s".format(avg_speed) msg = ( "iter {}: duration={:.4f}({:.4f})s average={:.4f}s " "avg_speed={} time={:.4f}s" ).format( i, cur_time, func.get_prev_exec_time(), total_time / (i + 1), avg_speed_txt, total_time, ) if args.calc_output_rms: rms = [] for v in retval: rms.append("{:.3g}".format(float(((v ** 2).mean()) ** 0.5))) msg += " output_rms=[{}]".format(", ".join(rms)) if logger.level > logging.INFO: print(msg) else: logger.info(msg) if args.focused_nvprof: if get_device_count("gpu") < 1: logger.warning( "No cuda device detected. ``focused_nvprof`` will be ignored." ) else: try: import pycuda.driver as D D.start_profiler() func.execute() func.wait() D.stop_profiler() except ImportError: logger.error("`focused_nvprof need pycuda`", exc_info=True) if args.profile: with open(args.profile, "w") as fout: fout.write(profiler.get()) return avg_speed
def __del__(self): # flush profiling data to file cuda.stop_profiler()
def counting_vowels_in_text(text): """Returns the number of vowels found in the text?""" mod = SourceModule(""" __global__ void count_vowels(char *text, int *results, int text_size, int chunk_size, int threads_per_block, int blocks_per_grid) { int index = blockDim.x * blockIdx.x + threadIdx.x; int start = index * chunk_size; int end = ( index + 1 ) * chunk_size; end = min( end, text_size ); if (end < start) { return; } int i = 0; start = start * 4; end = end * 4; for(i = start; i <= end; i++){ if (text[i] == 'a' || text[i] == 'A' || text[i] == 'e' || text[i] == 'E' || text[i] == 'i' || text[i] == 'I' || text[i] =='o' || text[i] =='O' || text[i] == 'u' || text[i] == 'U' || text[i] == 'y' || text[i] == 'Y') { results[i] = 1; } } } """) cuda.start_profiler() max_text_size_in_mb = 100 text_chunks = [] text_chunks_count = math.ceil( len(text) / (max_text_size_in_mb * (1024**2))) while (len(text) > (max_text_size_in_mb * 1024**2)): text_chunk = text[:math.ceil(len(text) / (text_chunks_count))] text2 = text[math.ceil(len(text) / (text_chunks_count)):] text = text2 text_chunks.append(text_chunk) text_chunks.append(text) cumulative_results = 0 for text_chunk in text_chunks: device_text = gpuarray.to_gpu(numpy.array([text_chunk], dtype=str)) device_results = gpuarray.zeros(len(text_chunk) * 4, dtype=numpy.int32) chunk_size = 1000 threads_per_block = 512 blocks_per_grid = numpy.int( math.ceil(len(text_chunk) / (chunk_size * threads_per_block))) device_text_size = numpy.int32(len(text_chunk)) function = mod.get_function("count_vowels") function(device_text, device_results, device_text_size, numpy.int32(chunk_size), numpy.int32(blocks_per_grid), block=(threads_per_block, 1, 1), grid=(blocks_per_grid, 1, 1)) host_results = device_results.get() results = numpy.count_nonzero(host_results == 1) cumulative_results += results cuda.stop_profiler() return cumulative_results
def test_two_directions(self): IMAGE_DIR = "Backpack-perfect" im1 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im1.png")) im2 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im0.png")) stereo = SemiGlobalMatching(im1, im2, os.path.join("../data", IMAGE_DIR, "calib.txt"), window_size=3, resize=(640, 480)) params = { "p1": 5, "p2": 90000, "census_kernel_size": 7, "reversed": True } stereo.set_params(params) stereo.params['ndisp'] = 50 t1 = time() assert stereo.p1 is not None, "parameters have not been set" t1 = time() cim1 = stereo.census_transform(stereo.im1) cim2 = stereo.census_transform(stereo.im2) #print(f"census transform time {time() - t1}") if not stereo.reversed: D = range(int(stereo.params['ndisp'])) else: D = reversed(range(int(-stereo.params['ndisp']), 1)) cost_images = stereo.compute_disparity_img(cim1, cim2, D) cost_images = np.float32(cost_images) m, n, D = cost_images.shape # direction == (1,0) stereo.directions = [(1, 0), (-1, 0), (1, 1), (-1, 1), (1, -1), (-1, -1)] #stereo.directions = [(0,1)] t1 = time() L = stereo.aggregate_cost(cost_images) print("python aggregate cost %f" % (time() - t1)) L = L.transpose((2, 0, 1)) cost_images = cost_images.transpose((2, 0, 1)) cost_images = np.ascontiguousarray(cost_images, dtype=np.float32) d, rows, cols = cost_images.shape d_step = 1 rows = np.int32(rows) cols = np.int32(cols) compiler_constants = { 'D_STEP': d_step, 'D': d, 'ARR_SIZE': math.floor(d / d_step), 'P1': 5, 'P2': 90000, 'SHMEM_SIZE': 64 } build_options = [format_compiler_constants(compiler_constants)] mod = SourceModule(open("../lib/sgbm_helper.cu").read(), options=build_options) shmem_size = 16 vertical_blocks = int(math.ceil(rows / shmem_size)) #r_aggregate = mod.get_function('r_aggregate') vertical_aggregate_down = mod.get_function('vertical_aggregate_down') vertical_aggregate_up = mod.get_function('vertical_aggregate_up') diagonal_br_tl_aggregate = mod.get_function('diagonal_br_tl_aggregate') diagonal_tl_br_aggregate = mod.get_function('diagonal_tl_br_aggregate') diagonal_tr_bl_aggregate = mod.get_function('diagonal_tr_bl_aggregate') diagonal_bl_tr_aggregate = mod.get_function('diagonal_bl_tr_aggregate') #l_aggregate = mod.get_function('l_aggregate') t1 = time() cost_images_ptr = drv.to_device(cost_images) dp_ptr = drv.mem_alloc(cost_images.nbytes) vertical_aggregate_down(dp_ptr, cost_images_ptr, rows, cols, block=(256, 1, 1), grid=(1, 1)) vertical_aggregate_up(dp_ptr, cost_images_ptr, rows, cols, block=(256, 1, 1), grid=(1, 1)) #r_aggregate(dp_ptr, cost_images_ptr, rows, cols, block = (shmem_size, shmem_size, 1), grid = (1, vertical_blocks)) #l_aggregate(dp_ptr, cost_images_ptr, rows, cols, block = (shmem_size, shmem_size, 1), grid = (1, vertical_blocks)) diagonal_tl_br_aggregate(dp_ptr, cost_images_ptr, rows, cols, block=(256, 1, 1), grid=(1, 1)) diagonal_bl_tr_aggregate(dp_ptr, cost_images_ptr, rows, cols, block=(256, 1, 1), grid=(1, 1)) diagonal_tr_bl_aggregate(dp_ptr, cost_images_ptr, rows, cols, block=(256, 1, 1), grid=(1, 1)) diagonal_br_tl_aggregate(dp_ptr, cost_images_ptr, rows, cols, block=(256, 1, 1), grid=(1, 1)) print("cuda aggregate cost %f" % (time() - t1)) drv.stop_profiler() agg_image = drv.from_device(dp_ptr, cost_images.shape, dtype=np.float32) s1 = np.sum(np.float64(L)) s2 = np.sum(np.float64(agg_image)) print("L sum: %f" % s1) print("out sum: %f" % s2) self.assertTrue(np.all(np.isclose(agg_image, L)))
return run if __name__ == '__main__': import sys N = int(sys.argv[2]) Nb = int(sys.argv[1]) i = int(sys.argv[3]) eps = float(sys.argv[4]) runner = l1_wvd(N, Nb) np.random.seed(5432) z = np.random.randn(N) + 1j * np.random.randn(N) print('warming up?', end=' ') for _ in range(3): c, dcr, dci = runner(z) print('.', end=' ') print('result =', c) print('calling...', end=' ') t0 = time.time() c1, dcr, dci = runner(z) t1 = time.time() - t0 print('finished:', t1) dz = np.eye(N)[i] c2, _, _ = runner(z + eps * dz) c3, _, _ = runner(z + 1j * eps * dz) dcr2 = (c2 - c1) / eps dci2 = (c3 - c1) / eps print('grad re: calc =', dcr[i], 'appx =', dcr2) print('grad im: calc =', dci[i], 'appx =', dci2) drv.stop_profiler()
def profiled_func(*args, **kwargs): cuda.start_profiler() func(*args, **kwargs) cuda.stop_profiler() #pycuda.autoinit.context.detach() sys.exit()
def cu_template_render_image_single(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128): """ CPU part of the SPH render code that executes the rendering on the GPU does some basic particle set prunning and sets up the image tiles. It launches cuda kernels for rendering the individual sections of the image """ import pycuda.driver as drv import pycuda.tools import pycuda.autoinit from pycuda.compiler import SourceModule from radix_sort import radix_sort global_start = time.clock() start = time.clock() # construct an array of particles Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')] ps = drv.pagelocked_empty(len(s),dtype=Partstruct) with s.immediate_mode : ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']] if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start) ymin,ymax = xmin,xmax # ---------------------- # setup the global image # ---------------------- image = np.zeros((nx,ny),dtype=np.float32) dx = float32((xmax-xmin)/nx) dy = float32((ymax-ymin)/ny) x_start = xmin+dx/2 y_start = ymin+dy/2 zplane = 0.0 start = time.clock() # ------------------ # set up the kernels # ------------------ code = file('/home/itp/roskar/homegrown/template_kernel.cu').read() mod = SourceModule(code) tile_histogram = mod.get_function("tile_histogram") distribute_particles = mod.get_function("distribute_particles") tile_render_kernel = mod.get_function("tile_render_kernel") calculate_keys = mod.get_function("calculate_keys") # allocate histogram array hist = np.zeros(Ntiles,dtype=np.int32) # transfer histogram array and particle data to GPU hist_gpu = drv.mem_alloc(hist.nbytes) drv.memcpy_htod(hist_gpu,hist) start_g = drv.Event() end_g = drv.Event() start_g.record() ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes) drv.memcpy_htod(ps_on_gpu,ps_gpu) end_g.record() end_g.synchronize() if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g)) # make everything the right size xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax]) nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles]) start_g.record() tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles, block=(nthreads,1,1),grid=(32,1,1)) drv.Context.synchronize() drv.memcpy_dtoh(hist,hist_gpu) end_g.record() end_g.synchronize() if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g)) print "<<< Total particle array = %d"%(hist.sum()) # --------------------------------------------------------------------------------- # figured out the numbers of particles per tile -- set up the tile particle buffers # --------------------------------------------------------------------------------- ps_tiles = np.empty(hist.sum(),dtype=Partstruct) ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes) tile_offsets = np.array([0],dtype=np.int32) tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32)) tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes) drv.memcpy_htod(tile_offsets_gpu,tile_offsets) start_g.record() distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)), xmin, xmax, ymin, ymax, nx, ny, Ntiles, block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4) end_g.record() end_g.synchronize() if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g)) drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu) # ------------------------- # start going through tiles # ------------------------- # initialize the image on the device im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes) drv.memcpy_htod(im_gpu,image.astype(np.float32)) # allocate key arrays -- these will be keys to sort particles into softening bins start_g.record() keys_gpu = drv.mem_alloc(int(4*hist.sum())) calculate_keys(ps_tiles_gpu, keys_gpu, np.int32(hist.sum()), np.float32(dx), block=(nthreads,1,1),grid=(32,1,1)) end_g.record() end_g.synchronize() if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g)) keys = np.empty(hist.sum(), dtype=np.int32) # ---------------------------------------- # sort particles by their softening length # ---------------------------------------- for i in xrange(Ntiles) : n_per_tile = tile_offsets[i+1] - tile_offsets[i] if n_per_tile > 0 : radix_sort(int(keys_gpu), int(ps_tiles_gpu), tile_offsets[i], n_per_tile) drv.memcpy_dtoh(keys,keys_gpu) drv.memcpy_dtoh(ps_tiles,ps_tiles_gpu) # return keys,ps_tiles,tile_offsets,dx drv.Context.synchronize() tile_start = time.clock() for i in xrange(Ntiles) : n_per_tile = tile_offsets[i+1] - tile_offsets[i] if n_per_tile > 0 : my_stream = streams[i%16] xmin_p, xmax_p, ymin_p, ymax_p = tiles_physical[i] xmin_t, xmax_t, ymin_t, ymax_t = tiles_pix[i] nx_tile = xmax_t-xmin_t+1 ny_tile = ymax_t-ymin_t+1 # make everything the right size xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t]) xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p]) tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i), xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t, im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]), block=(nthreads,1,1),stream=my_stream) if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start) # ---------------------------------------------------------------------------------- # process the particles with large smoothing lengths concurrently with GPU execution # ---------------------------------------------------------------------------------- #if ind[1] != len(xs) : # start = time.clock() # image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:], # nx,ny,xmin,xmax,ymin,ymax)).T # if timing: print '<<< Processing %d particles with large smoothing lengths took %e s'%(len(xs)-ind[1], # time.clock()-start) drv.Context.synchronize() if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start) drv.memcpy_dtoh(image,im_gpu) drv.stop_profiler() if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start) del(start_g) del(end_g) return image
start = time.time() y_py = numpy.transpose(a) time1 = time.time() - start a_gpu = cuda.mem_alloc(a.nbytes) cuda.memcpy_htod(a_gpu, a) b_gpu = gpuarray.empty((n, m), a.dtype) t_result = numpy.empty((n, m), a.dtype) func = mod.get_function("transpose") start = time.time() func(a_gpu, b_gpu, numpy.uint32(m), numpy.uint32(n), block=(16, 16, 1), grid=((numpy.uint32(n) - 1) / 16 + 1, (numpy.uint32(m) - 1) / 16 + 1, 1)) time2 = time.time() - start t_result = b_gpu.get() start = time.time() time3 = time.time() - start #cuda.memcpy_dtoh(t_result, b_gpu) print time1 print time2 print time3 cuda.stop_profiler()
def cu_template_render_image(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128, tile_size=100): """ CPU part of the SPH render code that executes the rendering on the GPU does some basic particle set prunning and sets up the image tiles. It launches cuda kernels for rendering the individual sections of the image """ import pycuda.driver as drv import pycuda.tools import pycuda.autoinit from pycuda.compiler import SourceModule from radix_sort import radix_sort global_start = time.clock() start = time.clock() # construct an array of particles Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')] ps = drv.pagelocked_empty(len(s),dtype=Partstruct) with s.immediate_mode : ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']] if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start) ymin,ymax = xmin,xmax # ---------------------- # setup the global image # ---------------------- image = np.zeros((nx,ny),dtype=np.float32) dx = float32((xmax-xmin)/nx) dy = float32((ymax-ymin)/ny) x_start = xmin+dx/2 y_start = ymin+dy/2 zplane = 0.0 # ------------------------------------------------------------------------------------------------ # trim particles based on smoothing length -- the GPU will only render those that need < 32 pixels # ------------------------------------------------------------------------------------------------ start = time.clock() # gpu_bool = 2*ps['h'] < 15.*dx ps_gpu = ps#[gpu_bool] # ps_cpu = ps[~gpu_bool] #del(ps) if timing: '<<< Setting up gpu/cpu particle struct arrays took %f s'%(time.clock()-start) # ----------------------------------------------------------------- # set up the image slices -- max. size is 100x100 pixels # in this step only process particles that need kernels < 40 pixels # tiles are 100x100 = 1e4 pixels x 4 bytes = 40k # kernels are 31x31 pixels max = 3844 bytes # max shared memory size is 48k # ----------------------------------------------------------------- start = time.clock() tiles_pix, tiles_physical = make_tiles(nx,ny,xmin,xmax,ymin,ymax,tile_size) if timing: print '<<< Tiles made in %f s'%(time.clock()-start) Ntiles = tiles_pix.shape[0] # ------------------ # set up the kernels # ------------------ code = file(os.path.join(os.path.dirname(__file__),'template_kernel.cu')).read() mod = SourceModule(code,options=["--ptxas-options=-v"]) tile_histogram = mod.get_function("tile_histogram") distribute_particles = mod.get_function("distribute_particles") tile_render_kernel = mod.get_function("tile_render_kernel") calculate_keys = mod.get_function("calculate_keys") # ------------------------------------------------------------- # set up streams and figure out particle distributions per tile # ------------------------------------------------------------- # allocate histogram array hist = np.zeros(Ntiles,dtype=np.int32) # transfer histogram array and particle data to GPU hist_gpu = drv.mem_alloc(hist.nbytes) drv.memcpy_htod(hist_gpu,hist) start_g = drv.Event() end_g = drv.Event() start_g.record() ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes) drv.memcpy_htod(ps_on_gpu,ps_gpu) end_g.record() end_g.synchronize() if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g)) # make everything the right size xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax]) nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles]) # ----------------------------- # calculate pixels per particle # ----------------------------- # allocate key arrays -- these will be keys to sort particles into softening bins start_g.record() keys_gpu = drv.mem_alloc(int(4*len(s))) calculate_keys(ps_on_gpu, keys_gpu, np.int32(len(s)), np.float32(dx), block=(nthreads,1,1),grid=(1024,1,1)) end_g.record() end_g.synchronize() if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g)) # ---------------------------------------- # sort particles by their softening length # ---------------------------------------- start_g.record() radix_sort(int(keys_gpu), int(ps_on_gpu), np.int32(0), np.int32(len(s))) end_g.record() end_g.synchronize() if timing: print '<<< Radix sorting all tiles took %f ms'%(start_g.time_till(end_g)) start_g.record() tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles, block=(nthreads,1,1),grid=(1024,1,1)) drv.Context.synchronize() drv.memcpy_dtoh(hist,hist_gpu) end_g.record() end_g.synchronize() if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g)) print "<<< Total particle array = %d"%(hist.sum()) # --------------------------------------------------------------------------------- # figured out the numbers of particles per tile -- set up the tile particle buffers # --------------------------------------------------------------------------------- ps_tiles = np.empty(hist.sum(),dtype=Partstruct) ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes) tile_offsets = np.array([0],dtype=np.int32) tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32)) tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes) drv.memcpy_htod(tile_offsets_gpu,tile_offsets) start_g.record() distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)), xmin, xmax, ymin, ymax, nx, ny, Ntiles, block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4) end_g.record() end_g.synchronize() if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g)) drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu) # ------------------------- # start going through tiles # ------------------------- # initialize the image on the device im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes) drv.memcpy_htod(im_gpu,image.astype(np.float32)) tile_start = time.clock() streams = [drv.Stream() for i in range(16)] for i in xrange(Ntiles) : n_per_tile = tile_offsets[i+1] - tile_offsets[i] if n_per_tile > 0 : my_stream = streams[i%(16)] xmin_p, xmax_p, ymin_p, ymax_p = tiles_physical[i] xmin_t, xmax_t, ymin_t, ymax_t = tiles_pix[i] nx_tile = xmax_t-xmin_t+1 ny_tile = ymax_t-ymin_t+1 # make everything the right size xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t]) xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p]) if n_per_tile > nthreads*256: ngrid=128 else : ngrid = 64 tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i), xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t, im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]), block=(nthreads,1,1),grid=(ngrid,1,1),stream=my_stream) if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start) # ---------------------------------------------------------------------------------- # process the particles with large smoothing lengths concurrently with GPU execution # ---------------------------------------------------------------------------------- #if ind[1] != len(xs) : # start = time.clock() # image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:], # nx,ny,xmin,xmax,ymin,ymax)).T # if timing: print '<<< Processing %d particles with large smoothing lengths took %e s'%(len(xs)-ind[1], # time.clock()-start) drv.Context.synchronize() if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start) drv.memcpy_dtoh(image,im_gpu) drv.stop_profiler() if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start) del(start_g) del(end_g) return image
def _worker(pid, did, dispatcher, temp_storage, total_edge_count, log_lock, merge_lock, exit_signal): try: logging_setup() with log_lock: logging.debug( 'Clustering subprocess {} acquiring device {} started.'.format( pid, did)) drv.init() cuda_device = drv.Device(did) cuda_context = cuda_device.make_context() compiler_option = [ '--fmad=true', ] if use_fmad else [ '--fmad=false', ] cuda_module = SourceModule(get_source_code(), options=compiler_option) cuda_kernel = cuda_module.get_function('compute_dot_product') cuda_kernel.prepare('PPPPPPPPPP') threads = [] exit_state = mp.Value(ctypes.c_uint32, 0) with log_lock: logging.debug( 'Subprocess {}: Spawning {} threads for CUDA stream concurrency.' .format(pid, threads_per_device)) for tid in range(threads_per_device): threads.append( Thread(target=_thread, args=(pid, tid, cuda_context, cuda_kernel, dispatcher, temp_storage, total_edge_count, log_lock, merge_lock, exit_signal, exit_state))) for t in threads: t.start() for t in threads: t.join() if exit_state.value == 1: err_msg = '\nSubprocess {}: Threads exited with abnormal exitcode.'.format( pid) logging.error(err_msg) raise Exception(err_msg) drv.stop_profiler() cuda_context.pop() except (Exception, KeyboardInterrupt) as e: if type(e) is KeyboardInterrupt: with log_lock: logging.debug( 'Subprocess {}: Received KeyboardInterrupt, exits now.'. format(pid)) logging.debug( 'Subprocess {}: Waiting threads to exit.'.format(pid)) for t in threads: t.join() drv.stop_profiler() cuda_context.pop() else: with log_lock: logging.exception( '\nSubprocess {}: Ended unexpectedly. Logging traceback:\n' '==========TRACEBACK==========\n'.format(pid)) drv.stop_profiler() cuda_context.pop() exit_signal.value = True sys.exit(1) return
def test_horizontal_left(self): IMAGE_DIR = "Backpack-perfect" im1 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im1.png")) im2 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im0.png")) #im1 = im1[:32,:,:] #im2 = im2[:32,:,:] stereo = SemiGlobalMatching(im1, im2, os.path.join("../data", IMAGE_DIR, "calib.txt"), window_size=3, resize=(640, 480)) params = { "p1": 5, "p2": 90000, "census_kernel_size": 7, "reversed": True } stereo.set_params(params) stereo.params['ndisp'] = 50 t1 = time() assert stereo.p1 is not None, "parameters have not been set" t1 = time() cim1 = stereo.census_transform(stereo.im1) cim2 = stereo.census_transform(stereo.im2) #print(f"census transform time {time() - t1}") #pdb.set_trace() if not stereo.reversed: D = range(int(stereo.params['ndisp'])) else: D = reversed(range(int(-stereo.params['ndisp']), 1)) cost_images = stereo.compute_disparity_img(cim1, cim2, D) cost_images = np.float32(cost_images) ##cost_images = cost_images[:,:,8:45] #cost_images = cost_images[:,:,9:50] #shape = (480,640,36) #cost_images = np.float32(np.random.normal(loc=100, size=shape)) m, n, D = cost_images.shape # direction == (1,0) stereo.directions = [(0, -1)] t1 = time() L = stereo.aggregate_cost_optimization_test(cost_images) print("python aggregate cost %f" % (time() - t1)) L = L.transpose((2, 0, 1)) cost_images = cost_images.transpose((2, 0, 1)) cost_images = np.ascontiguousarray(cost_images, dtype=np.float32) d, rows, cols = cost_images.shape d_step = 1 shmem_size = 16 compiler_constants = { 'D_STEP': d_step, 'D': d, 'ARR_SIZE': math.floor(d / d_step), 'P1': 5, 'P2': 90000, 'SHMEM_SIZE': shmem_size } build_options = [format_compiler_constants(compiler_constants)] mod = SourceModule(open("../lib/sgbm_helper.cu").read(), options=build_options) l_aggregate = mod.get_function("l_aggregate") out = np.zeros_like(L) out = np.ascontiguousarray(out, dtype=np.float32) vertical_blocks = int(math.ceil(rows / shmem_size)) t1 = time() # pycuda complains when block size is greater than 32 x 32 l_aggregate(drv.Out(out), drv.In(cost_images), np.int32(rows), np.int32(cols), block=(shmem_size, shmem_size, 1), grid=(1, vertical_blocks)) print("cuda aggregate cost %f" % (time() - t1)) drv.stop_profiler() s1 = np.sum(np.float64(L)) s2 = np.sum(np.float64(out)) print("L sum: %f" % s1) print("out sum: %f" % s2) #pdb.set_trace() self.assertTrue(np.all(np.isclose(out, L)))