def test_streamed_kernel(self): # this differs from the "simple_kernel" case in that *all* computation # and data copying is asynchronous. Observe how this necessitates the # use of page-locked memory. mod = drv.SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x*blockDim.y + threadIdx.y; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") import numpy shape = (32,8) a = drv.pagelocked_zeros(shape, dtype=numpy.float32) b = drv.pagelocked_zeros(shape, dtype=numpy.float32) a[:] = numpy.random.randn(*shape) b[:] = numpy.random.randn(*shape) strm = drv.Stream() dest = drv.pagelocked_empty_like(a) multiply_them( drv.Out(dest), drv.In(a), drv.In(b), block=shape+(1,), stream=strm) strm.synchronize() self.assert_(la.norm(dest-a*b) == 0)
def set_array(self, array, copy=True): """ Sets the local array in the Surface to the first argument, array, a a numpy array. Parameters ---------- array : 2D numpy array Container defining the surface. copy: boolean When True, will immediately copy local data to device. """ self.local_array = drv.pagelocked_empty_like(array) self.bounds = array.shape if copy == True: self.local_to_device()
def function(in_array): nz, ny, nx = in_array.shape out_array = driver.pagelocked_empty_like( in_array, mem_flags=driver.host_alloc_flags.DEVICEMAP ) block_size = (32, 32, 1) grid_size = (-(-nx//32), -(-ny//32), 1) kernel( driver.Out(out_array), driver.In(in_array), np.int32(nx), np.int32(ny), np.int32(nz), grid=grid_size, block=block_size ) return out_array
def test_streamed_kernel(self): # this differs from the "simple_kernel" case in that *all* computation # and data copying is asynchronous. Observe how this necessitates the # use of page-locked memory. mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x*blockDim.y + threadIdx.y; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") import numpy shape = (32, 8) a = drv.pagelocked_zeros(shape, dtype=numpy.float32) b = drv.pagelocked_zeros(shape, dtype=numpy.float32) a[:] = numpy.random.randn(*shape) b[:] = numpy.random.randn(*shape) a_gpu = drv.mem_alloc(a.nbytes) b_gpu = drv.mem_alloc(b.nbytes) strm = drv.Stream() drv.memcpy_htod_async(a_gpu, a, strm) drv.memcpy_htod_async(b_gpu, b, strm) strm.synchronize() dest = drv.pagelocked_empty_like(a) multiply_them(drv.Out(dest), a_gpu, b_gpu, block=shape + (1, ), stream=strm) strm.synchronize() drv.memcpy_dtoh_async(a, a_gpu, strm) drv.memcpy_dtoh_async(b, b_gpu, strm) strm.synchronize() assert la.norm(dest - a * b) == 0
def run(self, array): shape = array.shape if len(shape) != 3: raise ValueError("only 3D array is allowed") nz, ny, nx = shape logger.debug("shape, nx={}, ny={}, nz={}".format(nx, ny, nz)) self._prepare_workspace(array.dtype) locked_in_array = driver.pagelocked_empty_like(array) locked_in_array[:] = array locked_out_array = self._copy(locked_in_array) out_array = np.empty_like(locked_out_array) out_array[:] = locked_out_array return out_array
def test_streamed_kernel(self): # this differs from the "simple_kernel" case in that *all* computation # and data copying is asynchronous. Observe how this necessitates the # use of page-locked memory. mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x*blockDim.y + threadIdx.y; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") shape = (32, 8) a = drv.pagelocked_zeros(shape, dtype=np.float32) b = drv.pagelocked_zeros(shape, dtype=np.float32) a[:] = np.random.randn(*shape) b[:] = np.random.randn(*shape) a_gpu = drv.mem_alloc(a.nbytes) b_gpu = drv.mem_alloc(b.nbytes) strm = drv.Stream() drv.memcpy_htod_async(a_gpu, a, strm) drv.memcpy_htod_async(b_gpu, b, strm) strm.synchronize() dest = drv.pagelocked_empty_like(a) multiply_them( drv.Out(dest), a_gpu, b_gpu, block=shape+(1,), stream=strm) strm.synchronize() drv.memcpy_dtoh_async(a, a_gpu, strm) drv.memcpy_dtoh_async(b, b_gpu, strm) strm.synchronize() assert la.norm(dest-a*b) == 0
def convert_validate_save( onnx_model_filename: str, golden_data_filename: 'Optional[str]' = '', atol: float = 1e-3, rtol: float = 1e-3, batch_size: int = 1, # debug: bool = False, **kwargs) -> bool: r""" inference model in 'tensorrt' validate with given golden data save if accuracy passed """ import numpy as np import pycuda.autoinit # noqa: just import, no code check import pycuda.driver as cuda import tensorrt as trt trt_logger = trt.Logger( trt.Logger.VERBOSE if debug else trt.Logger.WARNING) builder = trt.Builder(trt_logger) network = builder.create_network() parser = trt.OnnxParser(network, trt_logger) logger.info('loading ONNX model: %s ...', onnx_model_filename) with open(onnx_model_filename, 'rb') as fp: onnx_model_proto_str = fp.read() success = parser.parse(onnx_model_proto_str) if not success: logger.error('model parsing failed:') for idx_error in range(parser.num_errors): logger.error('\t%s', parser.get_error(idx_error)) return False logger.info('model parsing passed') workspace_size = kwargs.pop('workspace_size', 1024 * 1024 * 16) # default to 1024*1024*16 fp16_mode = kwargs.pop('fp16_mode', builder.platform_has_fast_fp16) int8_mode = kwargs.pop('int8_mode', builder.platform_has_fast_int8) builder.debug_sync = debug builder.fp16_mode = fp16_mode builder.max_batch_size = batch_size builder.max_workspace_size = workspace_size builder.refittable = False builder.strict_type_constraints = True logger.info('using batch_size: %d', builder.max_batch_size) logger.info('I/O type-shape info:') if int8_mode: default_range = (-127, +127) builder.int8_mode = int8_mode for layer in network: for idx_out in range(layer.num_outputs): var_out = layer.get_output(idx_out) var_out.set_dynamic_range(-127, +127) dynamic_ranges = kwargs.pop('io_dynamic_ranges', dict()) for idx_inp in range(network.num_inputs): var_inp = network.get_input(idx_inp) dr_lo, dr_hi = dynamic_ranges.get(var_inp.name, default_range) var_inp.set_dynamic_range(dr_lo, dr_hi) logger.info('\t input %d (%12s): %s%s in [%d, %d]', idx_inp, var_inp.name, var_inp.dtype, var_inp.shape, dr_lo, dr_hi) for idx_out in range(network.num_outputs): var_out = network.get_output(idx_out) dr_lo, dr_hi = dynamic_ranges.get(var_out.name, default_range) var_out.set_dynamic_range(dr_lo, dr_hi) logger.info('\toutput %d (%12s): %s%s in [%d, %d]', idx_out, var_out.name, var_out.dtype, var_out.shape, dr_lo, dr_hi) # TODO: int8 calibrate else: for idx_inp in range(network.num_inputs): var_inp = network.get_input(idx_inp) logger.info('\t input %d (%12s): %s%s', idx_inp, var_inp.name, var_inp.dtype, var_inp.shape) for idx_out in range(network.num_outputs): var_out = network.get_output(idx_out) logger.info('\toutput %d (%12s): %s%s', idx_out, var_out.name, var_out.dtype, var_out.shape) # not exposed # builder.getNbDLACores() > 0 # builder.allowGPUFallback(True) # builder.setDefaultDeviceType(kDLA) # builder.setDLACore(1) engine = builder.build_cuda_engine(network) if engine is None: logger.info('engine building failed') return False logger.info('engine building passed') # globals().update(locals()) if golden_data_filename: logger.info('using golden data %s', golden_data_filename) if golden_data_filename.endswith('.npz'): test_data = np.load( golden_data_filename, encoding='bytes', allow_pickle=True, ) input_data = test_data['inputs'].tolist() output_data = test_data['outputs'].tolist() else: test_data = np.load( golden_data_filename, encoding='bytes', allow_pickle=True, ).tolist() input_data = test_data['inputs'] output_data = test_data['outputs'] input_data = flatten_dict(input_data) output_data = flatten_dict(output_data) # input_names = input_data.keys() output_names = output_data.keys() logger.info('with %d inputs and %d outputs', len(input_data), len(output_data)) input_device_data = { name: cuda.to_device(value) for name, value in input_data.items() } output_device_data = { name: cuda.mem_alloc(value.nbytes) for name, value in output_data.items() } output_host_data = { name: cuda.pagelocked_empty_like(value) for name, value in output_data.items() } logger.info('data transfered to device') profiler = trt.Profiler() with engine.create_execution_context() as context: if debug: context.profiler = profiler stream = cuda.Stream() # for name in input_names: # cuda.memcpy_htod_async( # input_data[name], input_device_data[name], # stream=stream) device_data = list(input_device_data.values()) + list( output_device_data.values()) success = context.execute_async(batch_size, bindings=list(map( int, device_data)), stream_handle=stream.handle, input_consumed=None) if not success: logger.error('execution failed') return False for name in output_names: cuda.memcpy_dtoh_async( output_host_data[name], output_device_data[name], stream=stream, ) stream.synchronize() logger.info('execution passed') # output_host_data[name] = onnx2trt_inference( # onnx_model_filename, list(input_data.values()), # batch_size, workspace_size)[0] # validate passed = True if golden_data_filename: for name in output_names: pr = output_host_data[name] gt = output_data[name] logger.info('testing on output %s ...', name) try: np.testing.assert_allclose( pr, gt, rtol=rtol, atol=atol, equal_nan=False, verbose=True, ) except AssertionError as e: passed = False logger.error('failed: %s\n', e) logger.info('accuracy %spassed', '' if passed else 'not ') globals().update(locals()) if passed: trt_engine_filename = onnx_model_filename[:-len('.onnx' )] + '.bin' # or .trt with open(trt_engine_filename, 'wb') as fp: fp.write(engine.serialize()) logger.info('engine saved to %s', trt_engine_filename) return passed
def trikmeans_gpu(data, clusters, iterations, return_times = 0): """trikmeans_gpu(data, clusters, iterations) returns (clusters, labels) K-means using triangle inequality algorithm and PyCuda Input arguments are the data, intial cluster values, and number of iterations to repeat. The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and nPts = number of data points. The shape of clusters is (nDim, nClusters) The return values are the updated clusters and labels for the data """ #--------------------------------------------------------------- # get problem parameters #--------------------------------------------------------------- (nDim, nPts) = data.shape nClusters = clusters.shape[1] #--------------------------------------------------------------- # set calculation control variables #--------------------------------------------------------------- useTextureForData = 0 usePageLockedMemory = 0 if(nPts > 32768): useTextureForData = 0 # block and grid sizes for the ccdist kernel (also for hdclosest) blocksize_ccdist = min(512, 16*(1+(nClusters-1)/16)) gridsize_ccdist = 1 + (nClusters-1)/blocksize_ccdist #block and grid sizes for the init module threads_desired = 16*(1+(max(nPts, nDim*nClusters)-1)/16) #blocksize_init = min(512, threads_desired) blocksize_init = min(128, threads_desired) gridsize_init = 1 + (threads_desired - 1)/blocksize_init #block and grid sizes for the step3 module blocksize_step3 = blocksize_init if not useTextureForData: blocksize_step3 = min(256, blocksize_step3) gridsize_step3 = gridsize_init #block and grid sizes for the step4 module # Each block of threads will handle seqcount times the data # eg blocksize of 512 and seqcount of 4, each block reduces 4*512 = 2048 elements blocksize_step4 = 2 while(blocksize_step4 < min(512, nPts)): blocksize_step4 *= 2 maxblocks = 512 seqcount_step4 = 1 + (nPts-1)/(blocksize_step4*maxblocks) gridsize_step4 = 1 + (nPts-1)/(seqcount_step4*blocksize_step4) blocksize_step4part2 = 1 while(blocksize_step4part2 < gridsize_step4): blocksize_step4part2 *= 2 #block and grid sizes for the calc_movement module for blocksize_calcm in range(32, 512, 32): if blocksize_calcm >= nClusters: break; gridsize_calcm = 1 + (nClusters-1)/blocksize_calcm #block and grid sizes for the step56 module blocksize_step56 = blocksize_init gridsize_step56 = gridsize_init #--------------------------------------------------------------- # prepare source modules #--------------------------------------------------------------- t1 = time.time() mod_ccdist = kernels.get_big_module(nDim, nPts, nClusters, blocksize_step4, seqcount_step4, gridsize_step4, blocksize_step4part2, useTextureForData) ccdist = mod_ccdist.get_function("ccdist") calc_hdclosest = mod_ccdist.get_function("calc_hdclosest") init = mod_ccdist.get_function("init") step3 = mod_ccdist.get_function("step3") step4 = mod_ccdist.get_function("step4") step4part2 = mod_ccdist.get_function("step4part2") calc_movement = mod_ccdist.get_function("calc_movement") step56 = mod_ccdist.get_function("step56") pycuda.autoinit.context.synchronize() t2 = time.time() module_time = t2-t1 #--------------------------------------------------------------- # setup data on GPU #--------------------------------------------------------------- t1 = time.time() data = np.array(data).astype(np.float32) clusters = np.array(clusters).astype(np.float32) if useTextureForData: # copy the data to the texture texrefData = mod_ccdist.get_texref("texData") cuda.matrix_to_texref(data, texrefData, order="F") else: if usePageLockedMemory: data_pl = cuda.pagelocked_empty_like(data) data_pl[:,:] = data; gpu_data = gpuarray.to_gpu(data_pl) else: gpu_data = gpuarray.to_gpu(data) if usePageLockedMemory: clusters_pl = cuda.pagelocked_empty_like(clusters) clusters_pl[:,:] = clusters gpu_clusters = gpuarray.to_gpu(clusters_pl) else: gpu_clusters = gpuarray.to_gpu(clusters) gpu_assignments = gpuarray.zeros((nPts,), np.int32) # cluster assignment gpu_lower = gpuarray.zeros((nClusters, nPts), np.float32) # lower bounds on distance between # point and each cluster gpu_upper = gpuarray.zeros((nPts,), np.float32) # upper bounds on distance between # point and any cluster gpu_ccdist = gpuarray.zeros((nClusters, nClusters), np.float32) # cluster-cluster distances gpu_hdClosest = gpuarray.zeros((nClusters,), np.float32) # half distance to closest gpu_hdClosest.fill(1.0e10) # set to large value // **TODO** get the acutal float max gpu_badUpper = gpuarray.zeros((nPts,), np.int32) # flag to indicate upper bound needs recalc gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32); gpu_cluster_movement = gpuarray.zeros((nClusters,), np.float32); gpu_cluster_changed = gpuarray.zeros((nClusters,), np.int32) gpu_cluster_changed.fill(1) gpu_reduction_out = gpuarray.zeros((nDim, nClusters*gridsize_step4), np.float32) gpu_reduction_counts = gpuarray.zeros((nClusters*gridsize_step4,), np.int32) pycuda.autoinit.context.synchronize() t2 = time.time() data_time = t2-t1 #--------------------------------------------------------------- # do calculations #--------------------------------------------------------------- ccdist_time = 0. hdclosest_time = 0. init_time = 0. step3_time = 0. step4_time = 0. step56_time = 0. t1 = time.time() ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest, block = (blocksize_ccdist, 1, 1), grid = (gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() ccdist_time += t2-t1 t1 = time.time() calc_hdclosest(gpu_ccdist, gpu_hdClosest, block = (blocksize_ccdist, 1, 1), grid = (gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() hdclosest_time += t2-t1 t1 = time.time() if useTextureForData: init(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, block = (blocksize_init, 1, 1), grid = (gridsize_init, 1), texrefs=[texrefData]) else: init(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, block = (blocksize_init, 1, 1), grid = (gridsize_init, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() init_time += t2-t1 for i in range(iterations): if i>0: t1 = time.time() ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest, block = (blocksize_ccdist, 1, 1), grid = (gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() ccdist_time += t2-t1 t1 = time.time() calc_hdclosest(gpu_ccdist, gpu_hdClosest, block = (blocksize_ccdist, 1, 1), grid = (gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() hdclosest_time += t2-t1 t1 = time.time() if i > 0: gpu_cluster_changed.fill(0) if useTextureForData: step3(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed, block = (blocksize_step3, 1, 1), grid = (gridsize_step3, 1), texrefs=[texrefData]) else: step3(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed, block = (blocksize_step3, 1, 1), grid = (gridsize_step3, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() step3_time += t2-t1 t1 = time.time() if useTextureForData: step4(gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts, gpu_assignments, block = (blocksize_step4, 1, 1), grid = (gridsize_step4, nDim), texrefs=[texrefData]) else: step4(gpu_data, gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts, gpu_assignments, block = (blocksize_step4, 1, 1), grid = (gridsize_step4, nDim)) step4part2(gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts, gpu_clusters2, gpu_clusters, block = (blocksize_step4part2, 1, 1), grid = (1, nDim)) calc_movement(gpu_clusters, gpu_clusters2, gpu_cluster_movement, gpu_cluster_changed, block = (blocksize_calcm, 1, 1), grid = (gridsize_calcm, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() step4_time += t2-t1 t1 = time.time() if useTextureForData: step56(gpu_assignments, gpu_lower, gpu_upper, gpu_cluster_movement, gpu_badUpper, block = (blocksize_step56, 1, 1), grid = (gridsize_step56, 1), texrefs=[texrefData]) else: step56(gpu_assignments, gpu_lower, gpu_upper, gpu_cluster_movement, gpu_badUpper, block = (blocksize_step56, 1, 1), grid = (gridsize_step56, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() step56_time += t2-t1 # prepare for next iteration temp = gpu_clusters gpu_clusters = gpu_clusters2 gpu_clusters2 = temp if return_times: return gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, \ gpu_clusters.get(), gpu_cluster_movement, \ data_time, module_time, init_time, \ ccdist_time/iterations, hdclosest_time/iterations, \ step3_time/iterations, step4_time/iterations, step56_time/iterations else: return gpu_clusters.get(), gpu_assignments.get()
def trikmeans_gpu(data, clusters, iterations, return_times=0): """trikmeans_gpu(data, clusters, iterations) returns (clusters, labels) K-means using triangle inequality algorithm and PyCuda Input arguments are the data, intial cluster values, and number of iterations to repeat. The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and nPts = number of data points. The shape of clusters is (nDim, nClusters) The return values are the updated clusters and labels for the data """ #--------------------------------------------------------------- # get problem parameters #--------------------------------------------------------------- (nDim, nPts) = data.shape nClusters = clusters.shape[1] #--------------------------------------------------------------- # set calculation control variables #--------------------------------------------------------------- useTextureForData = 0 usePageLockedMemory = 0 if (nPts > 32768): useTextureForData = 0 # block and grid sizes for the ccdist kernel (also for hdclosest) blocksize_ccdist = min(512, 16 * (1 + (nClusters - 1) / 16)) gridsize_ccdist = 1 + (nClusters - 1) / blocksize_ccdist #block and grid sizes for the init module threads_desired = 16 * (1 + (max(nPts, nDim * nClusters) - 1) / 16) #blocksize_init = min(512, threads_desired) blocksize_init = min(128, threads_desired) gridsize_init = 1 + (threads_desired - 1) / blocksize_init #block and grid sizes for the step3 module blocksize_step3 = blocksize_init if not useTextureForData: blocksize_step3 = min(256, blocksize_step3) gridsize_step3 = gridsize_init #block and grid sizes for the step4 module # Each block of threads will handle seqcount times the data # eg blocksize of 512 and seqcount of 4, each block reduces 4*512 = 2048 elements blocksize_step4 = 2 while (blocksize_step4 < min(512, nPts)): blocksize_step4 *= 2 maxblocks = 512 seqcount_step4 = 1 + (nPts - 1) / (blocksize_step4 * maxblocks) gridsize_step4 = 1 + (nPts - 1) / (seqcount_step4 * blocksize_step4) blocksize_step4part2 = 1 while (blocksize_step4part2 < gridsize_step4): blocksize_step4part2 *= 2 """ print "blocksize_step4 =", blocksize_step4 print "gridsize_step4 =", gridsize_step4 print "seqcount_step4 =", seqcount_step4 """ #block and grid sizes for the calc_movement module for blocksize_calcm in range(32, 512, 32): if blocksize_calcm >= nClusters: break gridsize_calcm = 1 + (nClusters - 1) / blocksize_calcm #block and grid sizes for the step56 module blocksize_step56 = blocksize_init gridsize_step56 = gridsize_init #--------------------------------------------------------------- # prepare source modules #--------------------------------------------------------------- t1 = time.time() mod_ccdist = kernels.get_big_module(nDim, nPts, nClusters, blocksize_step4, seqcount_step4, gridsize_step4, blocksize_step4part2, useTextureForData, BOUNDS) ccdist = mod_ccdist.get_function("ccdist") calc_hdclosest = mod_ccdist.get_function("calc_hdclosest") init = mod_ccdist.get_function("init") step3 = mod_ccdist.get_function("step3") step4 = mod_ccdist.get_function("step4") step4part2 = mod_ccdist.get_function("step4part2") calc_movement = mod_ccdist.get_function("calc_movement") step56 = mod_ccdist.get_function("step56") pycuda.autoinit.context.synchronize() t2 = time.time() module_time = t2 - t1 #--------------------------------------------------------------- # setup data on GPU #--------------------------------------------------------------- t1 = time.time() data = np.array(data).astype(np.float32) clusters = np.array(clusters).astype(np.float32) if useTextureForData: # copy the data to the texture texrefData = mod_ccdist.get_texref("texData") cuda.matrix_to_texref(data, texrefData, order="F") else: if usePageLockedMemory: data_pl = cuda.pagelocked_empty_like(data) data_pl[:, :] = data gpu_data = gpuarray.to_gpu(data_pl) else: gpu_data = gpuarray.to_gpu(data) if usePageLockedMemory: clusters_pl = cuda.pagelocked_empty_like(clusters) clusters_pl[:, :] = clusters gpu_clusters = gpuarray.to_gpu(clusters_pl) else: gpu_clusters = gpuarray.to_gpu(clusters) gpu_assignments = gpuarray.zeros((nPts, ), np.int32) # cluster assignment gpu_lower = gpuarray.zeros((nClusters, nPts), np.float32) # lower bounds on distance between # point and each cluster gpu_upper = gpuarray.zeros((nPts, ), np.float32) # upper bounds on distance between # point and any cluster gpu_ccdist = gpuarray.zeros((nClusters, nClusters), np.float32) # cluster-cluster distances gpu_hdClosest = gpuarray.zeros((nClusters, ), np.float32) # half distance to closest gpu_hdClosest.fill( 1.0e10) # set to large value // **TODO** get the acutal float max gpu_badUpper = gpuarray.zeros( (nPts, ), np.int32) # flag to indicate upper bound needs recalc gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32) gpu_cluster_movement = gpuarray.zeros((nClusters, ), np.float32) gpu_cluster_changed = gpuarray.zeros((nClusters, ), np.int32) gpu_cluster_changed.fill(1) gpu_reduction_out = gpuarray.zeros((nDim, nClusters * gridsize_step4), np.float32) gpu_reduction_counts = gpuarray.zeros((nClusters * gridsize_step4, ), np.int32) pycuda.autoinit.context.synchronize() t2 = time.time() data_time = t2 - t1 #--------------------------------------------------------------- # do calculations #--------------------------------------------------------------- ccdist_time = 0. hdclosest_time = 0. init_time = 0. step3_time = 0. step4_time = 0. step56_time = 0. t1 = time.time() ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest, block=(blocksize_ccdist, 1, 1), grid=(gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() ccdist_time += t2 - t1 t1 = time.time() calc_hdclosest(gpu_ccdist, gpu_hdClosest, block=(blocksize_ccdist, 1, 1), grid=(gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() hdclosest_time += t2 - t1 t1 = time.time() if useTextureForData: init(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, block=(blocksize_init, 1, 1), grid=(gridsize_init, 1), texrefs=[texrefData]) else: init(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, block=(blocksize_init, 1, 1), grid=(gridsize_init, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() init_time += t2 - t1 for i in range(iterations): if i > 0: t1 = time.time() ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest, block=(blocksize_ccdist, 1, 1), grid=(gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() ccdist_time += t2 - t1 t1 = time.time() calc_hdclosest(gpu_ccdist, gpu_hdClosest, block=(blocksize_ccdist, 1, 1), grid=(gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() hdclosest_time += t2 - t1 t1 = time.time() if i > 0: gpu_cluster_changed.fill(0) if useTextureForData: step3(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed, block=(blocksize_step3, 1, 1), grid=(gridsize_step3, 1), texrefs=[texrefData]) else: step3(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed, block=(blocksize_step3, 1, 1), grid=(gridsize_step3, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() step3_time += t2 - t1 t1 = time.time() if useTextureForData: step4(gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts, gpu_assignments, block=(blocksize_step4, 1, 1), grid=(gridsize_step4, nDim), texrefs=[texrefData]) else: step4(gpu_data, gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts, gpu_assignments, block=(blocksize_step4, 1, 1), grid=(gridsize_step4, nDim)) step4part2(gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts, gpu_clusters2, gpu_clusters, block=(blocksize_step4part2, 1, 1), grid=(1, nDim)) calc_movement(gpu_clusters, gpu_clusters2, gpu_cluster_movement, gpu_cluster_changed, block=(blocksize_calcm, 1, 1), grid=(gridsize_calcm, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() step4_time += t2 - t1 t1 = time.time() if useTextureForData: step56(gpu_assignments, gpu_lower, gpu_upper, gpu_cluster_movement, gpu_badUpper, block=(blocksize_step56, 1, 1), grid=(gridsize_step56, 1), texrefs=[texrefData]) else: step56(gpu_assignments, gpu_lower, gpu_upper, gpu_cluster_movement, gpu_badUpper, block=(blocksize_step56, 1, 1), grid=(gridsize_step56, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() step56_time += t2 - t1 # prepare for next iteration temp = gpu_clusters gpu_clusters = gpu_clusters2 gpu_clusters2 = temp if return_times: return gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, \ gpu_clusters.get(), gpu_cluster_movement, \ data_time, module_time, init_time, \ ccdist_time/iterations, hdclosest_time/iterations, \ step3_time/iterations, step4_time/iterations, step56_time/iterations else: return gpu_clusters.get(), gpu_assignments.get()
def main(): parser = argparse.ArgumentParser(description="running inference ") parser.add_argument("-e", "--engine", type=str, help="location of runtime engine") parser.add_argument("-n", "--num_params", default=300, type=int, help="number of transformation parameters") parser.add_argument("-o", "--output", default="results/output_0.gif", help="The serialized engine file, ex inference.engine") parser.add_argument("-s", "--source_image", required=True, help="The source image to animate") parser.add_argument("-f", "--fps", default=10, help="fps parameter for the generated gif", type=int) parser.add_argument( "--store_frames", action='store_true', help="store generated PNGs instead of generating a gif") parser.add_argument("-si", "--size", default=[256, 256], action="append", type=int, help="spatial size of input image") args = parser.parse_args() frames = [] source_image = extract_numpy_image_from_filelike(args.source_image) morpher_params, rotator_params = gen_params(args.num_params) identity = torch.Tensor([[1, 0, 0], [0, 1, 0]]).unsqueeze(0).repeat(1, 1, 1) base_grid = F.affine_grid(identity, [1, 4, args.size[-2], args.size[-1]], align_corners=True) base_grid = np.array(base_grid, dtype=np.float32) TRT_LOGGER = trt.Logger(trt.Logger.INFO) #Load Custom plugin libraries ctypes.CDLL("./gridSamplerPlugin/libgridsampler.so", mode=ctypes.RTLD_GLOBAL) trt.init_libnvinfer_plugins(TRT_LOGGER, '') with open(args.engine, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: engine = runtime.deserialize_cuda_engine(f.read()) print("Loaded engine:{}".format(args.engine)) context = engine.create_execution_context() context.debug_sync = True context.active_optimization_profile = 0 print("Active Optimization Profile:{}".format( context.active_optimization_profile)) input_binding_idxs, output_binding_idxs = get_binding_idxs( engine, context.active_optimization_profile) input_names = [ engine.get_binding_name(binding_idx) for binding_idx in input_binding_idxs ] output_names = [ engine.get_binding_name(binding_idx) for binding_idx in output_binding_idxs ] # Allocate device memory for inputs. This can be easily re-used if the # input shapes don't change host_inputs = [ source_image, morpher_params[0], base_grid, rotator_params[0], rotator_params[0] ] host_inputs_buffers = [ cuda.pagelocked_empty_like(i) for i in host_inputs ] [np.copyto(i, j) for i, j in zip(host_inputs_buffers, host_inputs)] device_inputs = [ cuda.mem_alloc(h_input.nbytes) for h_input in host_inputs ] input_binding_idxs, output_binding_idxs = get_binding_idxs( engine, context.active_optimization_profile) input_names = [ engine.get_binding_name(binding_idx) for binding_idx in input_binding_idxs ] output_names = [ engine.get_binding_name(binding_idx) for binding_idx in output_binding_idxs ] for h_input, d_input in zip(host_inputs_buffers, device_inputs): cuda.memcpy_htod(d_input, h_input) host_outputs, device_outputs = setup_binding_shapes( engine, context, host_inputs, input_binding_idxs, output_binding_idxs) print("Input Metadata") print("\tNumber of Inputs: {}".format(len(input_binding_idxs))) print("\tInput Bindings for Profile {}: {}".format( context.active_optimization_profile, input_binding_idxs)) print("\tInput names: {}".format(input_names)) print("\tInput shapes: {}".format([inp.shape for inp in host_inputs])) print("Output Metadata") print("\tNumber of Outputs: {}".format(len(output_binding_idxs))) print("\tOutput names: {}".format(output_names)) print("\tOutput shapes: {}".format([out.shape for out in host_outputs])) print("\tOutput Bindings for Profile {}: {}".format( context.active_optimization_profile, output_binding_idxs)) stream = cuda.Stream() for i in tqdm(range(args.num_params)): if i > 0: host_outputs, device_outputs = setup_binding_shapes( engine, context, host_inputs, input_binding_idxs, output_binding_idxs) [ np.copyto(host_inputs_buffers[1], morpher_params[i]), np.copyto(host_inputs_buffers[3], rotator_params[i]), np.copyto(host_inputs_buffers[4], rotator_params[i]) ] cuda.memcpy_htod_async(device_inputs[1], host_inputs_buffers[1], stream) cuda.memcpy_htod_async(device_inputs[3], host_inputs_buffers[3], stream) cuda.memcpy_htod_async(device_inputs[4], host_inputs_buffers[4], stream) # Bindings are a list of device pointers for inputs and outputs bindings = device_inputs + device_outputs bindings = [int(binding) for binding in bindings] # Inference context.execute_async_v2(bindings, stream.handle) # Copy outputs back to host to view results cuda.memcpy_dtoh_async(host_outputs[-1], device_outputs[-1], stream) stream.synchronize() # View outputs combined = np.reshape(host_outputs[-1], (4, args.size[-2], args.size[-1])) frames.append(combined) del context del engine print("\n Generating GIF ....") final_frames = [rgba_to_numpy_image(i) for i in frames] pil_images = [ Image.fromarray(np.uint8(np.rint(f * 255.0)), mode='RGBA') for f in final_frames ] pil_images[0].save(args.output, save_all=True, append_images=pil_images[1:], optimize=False, duration=200, loop=1, transparency=255, disposal=2) print("\n Done !") if args.store_frames: if not os.path.isdir("results/frames"): os.mkdir("results/frames") [ pil_image.save("results/frames/result_{}.png".format(i)) for i, pil_image in enumerate(pil_images) ]
def kmeans_gpu(data, clusters, iterations, return_times = 0): # kmeans_gpu(data, clusters, iterations) returns (clusters, labels) # kmeans using standard algorithm and cuda # input arguments are the data, intial cluster values, and number of iterations to repeat # The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and # nPts = number of data points # The shape of clusters is (nDim, nClusters) # # The return values are the updated clusters and labels for the data #--------------------------------------------------------------- # get problem parameters #--------------------------------------------------------------- (nDim, nPts) = data.shape nClusters = clusters.shape[1] #--------------------------------------------------------------- # set calculation control variables #--------------------------------------------------------------- useTextureForData = 0 usePageLockedMemory = 0 if(nPts > 32768): useTextureForData = 0 # block and grid sizes for the cluster_assign kernel threads_desired = 16*(1+(max(nPts, nDim*nClusters)-1)/16) blocksize_assign = min(256, threads_desired) gridsize_assign = 1 + (threads_desired - 1)/blocksize_assign """ print "\nblocksize_assign =", blocksize_assign print "gridsize_assign =", gridsize_assign """ # block and grid sizes for the cluster_calc kernel blocksize_calc = 2 while(blocksize_calc < min(512, nPts)): blocksize_calc *= 2 maxblocks = 512 seqcount_calc = 1 + (nPts-1)/(blocksize_calc * maxblocks) gridsize_calc = 1 + (nPts-1)/(seqcount_calc * blocksize_calc) """ print "blocksize_calc =", blocksize_calc print "gridsize_calc =", gridsize_calc print "seqcount_calc =", seqcount_calc """ blocksize_calc_part2 = 1 while(blocksize_calc_part2 < gridsize_calc): blocksize_calc_part2 *= 2 #--------------------------------------------------------------- # prepare source modules #--------------------------------------------------------------- t1 = time.time() mod_cuda = kernels.get_cuda_module(nDim, nPts, nClusters, blocksize_calc, seqcount_calc, gridsize_calc, blocksize_calc_part2, useTextureForData, BOUNDS) cuda_assign = mod_cuda.get_function("assign") cuda_calc = mod_cuda.get_function("calc") cuda_calc_part2 = mod_cuda.get_function("calc_part2") pycuda.autoinit.context.synchronize() t2 = time.time() module_time = t2-t1 #--------------------------------------------------------------- # setup data on GPU #--------------------------------------------------------------- t1 = time.time() data = np.array(data).astype(np.float32) clusters = np.array(clusters).astype(np.float32) if useTextureForData: # copy the data to the texture texrefData = mod_cuda.get_texref("texData") cuda.matrix_to_texref(data, texrefData, order="F") else: if usePageLockedMemory: data_pl = cuda.pagelocked_empty_like(data) data_pl[:,:] = data; gpu_data = gpuarray.to_gpu(data_pl) else: gpu_data = gpuarray.to_gpu(data) if usePageLockedMemory: clusters_pl = cuda.pagelocked_empty_like(clusters) clusters_pl[:,:] = clusters gpu_clusters = gpuarray.to_gpu(clusters_pl) else: gpu_clusters = gpuarray.to_gpu(clusters) gpu_assignments = gpuarray.zeros((nPts,), np.int32) gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32); gpu_reduction_out = gpuarray.zeros((nDim, nClusters*gridsize_calc), np.float32) gpu_reduction_counts = gpuarray.zeros((nClusters*gridsize_calc,), np.int32) pycuda.autoinit.context.synchronize() t2 = time.time() data_time = t2-t1 #--------------------------------------------------------------- # do calculations #--------------------------------------------------------------- assign_time = 0. calc_time = 0. for i in range(iterations): # assign data to clusters t1 = time.time() if useTextureForData: cuda_assign(gpu_clusters, gpu_assignments, block = (blocksize_assign, 1, 1), grid = (gridsize_assign, 1), texrefs=[texrefData]) else: cuda_assign(gpu_data, gpu_clusters, gpu_assignments, block = (blocksize_assign, 1, 1), grid = (gridsize_assign, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() assign_time += t2-t1 # calculate new cluster centers t1 = time.time() if useTextureForData: cuda_calc(gpu_reduction_out, gpu_reduction_counts, gpu_assignments, block = (blocksize_calc, 1, 1), grid = (gridsize_calc, nDim), texrefs=[texrefData]) else: cuda_calc(gpu_data, gpu_reduction_out, gpu_reduction_counts, gpu_assignments, block = (blocksize_calc, 1, 1), grid = (gridsize_calc, nDim)) cuda_calc_part2(gpu_reduction_out, gpu_reduction_counts, gpu_clusters2, gpu_clusters, block = (blocksize_calc_part2, 1, 1), grid = (1, nDim)) pycuda.autoinit.context.synchronize() t2 = time.time() calc_time += t2-t1 # prepare for next iteration temp = gpu_clusters gpu_clusters = gpu_clusters2 gpu_clusters2 = temp if return_times: return gpu_assignments, gpu_clusters.get(), \ data_time, module_time, assign_time/iterations, calc_time/iterations else: return gpu_clusters.get(), gpu_assignments.get()
def MedianFilter(input=None, kernel_size=3, bw=32, bh=32): #s = cuda.Event() #e = cuda.Event() input_list = input BLOCK_WIDTH = bw BLOCK_HEIGHT = bh if isinstance(kernel_size, (int, long)): kernel_size = [kernel_size]*2 WS_x, WS_y = kernel_size padding_y = WS_x/2 padding_x = WS_y/2 input_list = np.asarray(input_list) if input_list.ndim == 3: _, N, M = input_list.shape elif input_list.ndim == 2: N, M = input_list.shape input_list = [input_list] expanded_N = N + (2 * padding_y) expanded_M = M + (2 * padding_x) gridx = int(np.ceil((expanded_N)/BLOCK_WIDTH))+1 gridy = int(np.ceil((expanded_M)/BLOCK_HEIGHT))+1 grid = (gridx,gridy, 1) block = (BLOCK_WIDTH, BLOCK_HEIGHT, 1) code = """ #pragma comment(linker, "/HEAP:4000000") /* Some sample C code for the quickselect algorithm, taken from Numerical Recipes in C. */ #define SWAP(a,b) temp=(a);(a)=(b);(b)=temp; __device__ float quickselect(float *arr, int n, int k) { unsigned long i,ir,j,l,mid; float a,temp; l=0; ir=n-1; for(;;) { if (ir <= l+1) { if (ir == l+1 && arr[ir] < arr[l]) { SWAP(arr[l],arr[ir]); } return arr[k]; } else { mid=(l+ir) >> 1; SWAP(arr[mid],arr[l+1]); if (arr[l] > arr[ir]) { SWAP(arr[l],arr[ir]); } if (arr[l+1] > arr[ir]) { SWAP(arr[l+1],arr[ir]); } if (arr[l] > arr[l+1]) { SWAP(arr[l],arr[l+1]); } i=l+1; j=ir; a=arr[l+1]; for (;;) { do i++; while (arr[i] < a); do j--; while (arr[j] > a); if (j < i) break; SWAP(arr[i],arr[j]); } arr[l+1]=arr[j]; arr[j]=a; if (j >= k) ir=j-1; if (j <= k) l=i; } } } /* https://softwareengineering.stackexchange.com/questions/284767/kth-selection-routine-floyd-algorithm-489 * Implementation from Stack Exchange user: Andy Dansby */ __device__ float FloydWirth_kth(float arr[], const int kTHvalue) { #define F_SWAP(a,b) { float temp=(a);(a)=(b);(b)=temp; } #define SIGNUM(x) ((x) < 0 ? -1 : ((x) > 0 ? 1 : (x))) int left = 0; int right = %(WS^2)s - 1; int left2 = 0; int right2 = %(WS^2)s - 1; while (left < right) { if( arr[right2] < arr[left2] ) F_SWAP(arr[left2],arr[right2]); if( arr[right2] < arr[kTHvalue] ) F_SWAP(arr[kTHvalue],arr[right2]); if( arr[kTHvalue] < arr[left2] ) F_SWAP(arr[left2],arr[kTHvalue]); int rightleft = right - left; if (rightleft < kTHvalue) { int n = right - left + 1; int ii = kTHvalue - left + 1; int s = (n + n) / 3; int sd = (n * s * (n - s) / n) * SIGNUM(ii - n / 2); int left2 = max(left, kTHvalue - ii * s / n + sd); int right2 = min(right, kTHvalue + (n - ii) * s / n + sd); } float x=arr[kTHvalue]; while ((right2 > kTHvalue) && (left2 < kTHvalue)) { do { left2++; }while (arr[left2] < x); do { right2--; }while (arr[right2] > x); F_SWAP(arr[left2],arr[right2]); } left2++; right2--; if (right2 < kTHvalue) { while (arr[left2]<x) { left2++; } left = left2; right2 = right; } if (kTHvalue < left2) { while (x < arr[right2]) { right2--; } right = right2; left2 = left; } if( arr[left] < arr[right] ) F_SWAP(arr[right],arr[left]); } #undef F_SWAP #undef SIGNUM return arr[kTHvalue]; } texture<float, 2> tex; __global__ void mf(float* in, float* out, int imgDimY, int imgDimX) { float window[%(WS^2)s]; int x_thread_offset = %(BY)s * blockIdx.x + threadIdx.x; int y_thread_offset = %(BX)s * blockIdx.y + threadIdx.y; for (int y = %(WSx/2)s + y_thread_offset; y < imgDimX - %(WSx/2)s; y += %(y_stride)s) { for (int x = %(WSy/2)s + x_thread_offset; x < imgDimY - %(WSy/2)s; x += %(x_stride)s) { int i = 0; for (int fx = 0; fx < %(WSy)s; ++fx) { for (int fy = 0; fy < %(WSx)s; ++fy) { //window[i] = tex2D(tex, (float) (x + fx - %(WSy/2)s), (float) (y + fy - %(WSx/2)s)); window[i] = in[(x + fx - %(WSy/2)s) + (y + fy - %(WSx/2)s)*imgDimY]; i += 1; } } // Sort to find the median //for (int j = 0; j < %(WS^2)s/2 + 1; j++) //{ // for (int k = j + 1; k < %(WS^2)s; k++) // { // if (window[j] > window[k]) // { // float tmp = window[j]; // window[j] = window[k]; // window[k] = tmp; // } // } //} //out[y*imgDimY + x] = window[%(WS^2)s/2]; out[y*imgDimY + x] = FloydWirth_kth(window, %(WS^2)s/2); //out[y*imgDimY + x] = quickselect(window, %(WS^2)s, %(WS^2)s/2); } } } __global__ void mf_shared(float *in, float* out, int imgDimY, int imgDimX) { const int TSx = %(BX)s + %(WSx)s - 1; const int TSy = %(BY)s + %(WSy)s - 1; __shared__ float tile[TSx][TSy]; float window[%(WS^2)s]; const int x_thread_offset = %(BX)s * blockIdx.x + threadIdx.x; const int y_thread_offset = %(BY)s * blockIdx.y + threadIdx.y; const int thread_index = blockDim.y * threadIdx.x + threadIdx.y; int imgX = blockIdx.x * blockDim.x + thread_index; int imgY; // Load into the tile for this block if (thread_index < TSx && imgX < imgDimX) { for (int i = 0; i < TSy && i < imgDimY - blockIdx.y * blockDim.y; i++) { imgY = blockIdx.y * blockDim.y + i; tile[thread_index][i] = in[imgX * imgDimY + imgY]; //tile[thread_index][i] = tex2D(tex, (float) imgY, (float) imgX); } } __syncthreads(); int x = %(WSx/2)s + x_thread_offset; int y = %(WSy/2)s + y_thread_offset; if (x >= imgDimX - %(WSx/2)s || y >= imgDimY - %(WSy/2)s) { return; } int i = 0; for (int fx = 0; fx < %(WSx)s; ++fx) { for (int fy = 0; fy < %(WSy)s; ++fy) { window[i++] = tile[threadIdx.x + fx][threadIdx.y + fy]; } } // Sort to find the median //for (int j = 0; j <= %(WS^2)s/2; j++) //{ // for (int k = j + 1; k < %(WS^2)s; k++) // { // if (window[j] > window[k]) // { // float tmp = window[j]; // window[j] = window[k]; // window[k] = tmp; // } // } //} //out[x*imgDimY + y] = window[%(WS^2)s/2]; out[x*imgDimY + y] = FloydWirth_kth(window, %(WS^2)s/2); //forgetfulSelection(window, %(WSx)s); //out[x*imgDimY + y] = window[%(WS^2)s/2]; //out[x*imgDimY + y] = myForgetfulSelection(window); } """ code = code % { 'BY' : BLOCK_WIDTH, 'BX' : BLOCK_HEIGHT, 'WS^2' : WS_x * WS_y, 'x_stride' : BLOCK_WIDTH * gridx, 'y_stride' : BLOCK_HEIGHT * gridy, 'WSx' : WS_x, 'WSy' : WS_y, 'WSx/2' : WS_x/2, 'WSy/2' : WS_y/2, } mod = SourceModule(code) #mf_shared = mod.get_function('mf_shared') mf = mod.get_function('mf') texref = mod.get_texref("tex") # NSTREAMS := NUMBER OF INPUT IMAGES nStreams = len(input_list) # Initialize the streams stream = [cuda.Stream()]*nStreams # Pad all the images with zeros input_list = [np.array( np.pad(img, ( (padding_y, padding_y), (padding_x, padding_x) ), 'constant', constant_values=0) , dtype=np.float32) for img in input_list] # Use pinned memory for all the images in_pin_list = [cuda.register_host_memory(img) for img in input_list] imgBytes = in_pin_list[0].nbytes # Initialize the outputs to empty images (assuming all images are of the same shape) outdata_list = [cuda.pagelocked_empty_like(img) for img in input_list] # Malloc on the GPU for each input and output image #in_gpu_list = [cuda.mem_alloc(pinnedImg.nbytes) for pinnedImg in in_pin_list] in_gpu_list = [None]*nStreams #out_gpu_list = [cuda.mem_alloc(pinnedImg.nbytes) for pinnedImg in in_pin_list] out_gpu_list = [None]*nStreams mf.prepare("PPii") for i in xrange(nStreams + 2): ii = i - 1 iii = i - 2 if 0 <= iii < nStreams: st = stream[iii] cuda.memcpy_dtoh_async(outdata_list[iii], out_gpu_list[iii], stream=st) if 0 <= ii < nStreams: st = stream[ii] out_gpu_list[ii] = cuda.mem_alloc(imgBytes) # s.record(stream=stream[0]) # mf_shared.prepare("Pii") # mf_shared.prepared_async_call(grid, block, st, out_gpu_list[ii], expanded_M, expanded_N) #mf.prepare("PPii") mf.prepared_async_call(grid, block, st, in_gpu_list[ii], out_gpu_list[ii], expanded_M, expanded_N) # e.record(stream=stream[0]) # e.synchronize() # print s.time_till(e), "ms for the kernel" if 0 <= i < nStreams: st = stream[i] #cuda.matrix_to_texref(in_pin_list[i], texref, order="C") in_gpu_list[i] = cuda.mem_alloc(imgBytes) cuda.memcpy_htod_async(in_gpu_list[i], in_pin_list[i], stream=st) if (padding_y > 0): outdata_list = [out[padding_y:-padding_y] for out in outdata_list] if (padding_x > 0): outdata_list = [out[:, padding_x:-padding_x] for out in outdata_list] return outdata_list
def kmeans_gpu(data, clusters, iterations, return_times=0): # kmeans_gpu(data, clusters, iterations) returns (clusters, labels) # kmeans using standard algorithm and cuda # input arguments are the data, intial cluster values, and number of iterations to repeat # The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and # nPts = number of data points # The shape of clusters is (nDim, nClusters) # # The return values are the updated clusters and labels for the data #--------------------------------------------------------------- # get problem parameters #--------------------------------------------------------------- (nDim, nPts) = data.shape nClusters = clusters.shape[1] #--------------------------------------------------------------- # set calculation control variables #--------------------------------------------------------------- useTextureForData = 0 usePageLockedMemory = 0 if (nPts > 32768): useTextureForData = 0 # block and grid sizes for the cluster_assign kernel threads_desired = 16 * (1 + (max(nPts, nDim * nClusters) - 1) / 16) blocksize_assign = min(256, threads_desired) gridsize_assign = 1 + (threads_desired - 1) / blocksize_assign """ print "\nblocksize_assign =", blocksize_assign print "gridsize_assign =", gridsize_assign """ # block and grid sizes for the cluster_calc kernel blocksize_calc = 2 while (blocksize_calc < min(512, nPts)): blocksize_calc *= 2 maxblocks = 512 seqcount_calc = 1 + (nPts - 1) / (blocksize_calc * maxblocks) gridsize_calc = 1 + (nPts - 1) / (seqcount_calc * blocksize_calc) """ print "blocksize_calc =", blocksize_calc print "gridsize_calc =", gridsize_calc print "seqcount_calc =", seqcount_calc """ blocksize_calc_part2 = 1 while (blocksize_calc_part2 < gridsize_calc): blocksize_calc_part2 *= 2 #--------------------------------------------------------------- # prepare source modules #--------------------------------------------------------------- t1 = time.time() mod_cuda = kernels.get_cuda_module(nDim, nPts, nClusters, blocksize_calc, seqcount_calc, gridsize_calc, blocksize_calc_part2, useTextureForData, BOUNDS) cuda_assign = mod_cuda.get_function("assign") cuda_calc = mod_cuda.get_function("calc") cuda_calc_part2 = mod_cuda.get_function("calc_part2") pycuda.autoinit.context.synchronize() t2 = time.time() module_time = t2 - t1 #--------------------------------------------------------------- # setup data on GPU #--------------------------------------------------------------- t1 = time.time() data = np.array(data).astype(np.float32) clusters = np.array(clusters).astype(np.float32) if useTextureForData: # copy the data to the texture texrefData = mod_cuda.get_texref("texData") cuda.matrix_to_texref(data, texrefData, order="F") else: if usePageLockedMemory: data_pl = cuda.pagelocked_empty_like(data) data_pl[:, :] = data gpu_data = gpuarray.to_gpu(data_pl) else: gpu_data = gpuarray.to_gpu(data) if usePageLockedMemory: clusters_pl = cuda.pagelocked_empty_like(clusters) clusters_pl[:, :] = clusters gpu_clusters = gpuarray.to_gpu(clusters_pl) else: gpu_clusters = gpuarray.to_gpu(clusters) gpu_assignments = gpuarray.zeros((nPts, ), np.int32) gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32) gpu_reduction_out = gpuarray.zeros((nDim, nClusters * gridsize_calc), np.float32) gpu_reduction_counts = gpuarray.zeros((nClusters * gridsize_calc, ), np.int32) pycuda.autoinit.context.synchronize() t2 = time.time() data_time = t2 - t1 #--------------------------------------------------------------- # do calculations #--------------------------------------------------------------- assign_time = 0. calc_time = 0. for i in range(iterations): # assign data to clusters t1 = time.time() if useTextureForData: cuda_assign(gpu_clusters, gpu_assignments, block=(blocksize_assign, 1, 1), grid=(gridsize_assign, 1), texrefs=[texrefData]) else: cuda_assign(gpu_data, gpu_clusters, gpu_assignments, block=(blocksize_assign, 1, 1), grid=(gridsize_assign, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() assign_time += t2 - t1 # calculate new cluster centers t1 = time.time() if useTextureForData: cuda_calc(gpu_reduction_out, gpu_reduction_counts, gpu_assignments, block=(blocksize_calc, 1, 1), grid=(gridsize_calc, nDim), texrefs=[texrefData]) else: cuda_calc(gpu_data, gpu_reduction_out, gpu_reduction_counts, gpu_assignments, block=(blocksize_calc, 1, 1), grid=(gridsize_calc, nDim)) cuda_calc_part2(gpu_reduction_out, gpu_reduction_counts, gpu_clusters2, gpu_clusters, block=(blocksize_calc_part2, 1, 1), grid=(1, nDim)) pycuda.autoinit.context.synchronize() t2 = time.time() calc_time += t2 - t1 # prepare for next iteration temp = gpu_clusters gpu_clusters = gpu_clusters2 gpu_clusters2 = temp if return_times: return gpu_assignments, gpu_clusters.get(), \ data_time, module_time, assign_time/iterations, calc_time/iterations else: return gpu_clusters.get(), gpu_assignments.get()