def set(self, ary, device=None):
    """
    Copy host array to device.

    Arguments:
        ary: host array, needs to be contiguous
        device: device id, if not the one attached to current context

    Returns:
        self
    """
    stream = self.backend.stream
    assert ary.size == self.size
    assert self.is_contiguous, "Array in set() must be contiguous"
    if ary.dtype is not self.dtype:
        ary = ary.astype(self.dtype)
    assert ary.strides == self.strides

    if device is None:
        drv.memcpy_htod_async(self.gpudata, ary, stream)
    else:
        # With multithreaded datasets, make a context before copying
        # and destroy it again once done.
        lctx = drv.Device(device).make_context()
        drv.memcpy_htod_async(self.gpudata, ary, stream)
        lctx.pop()
        del lctx

    return self
def set_refsmiles(self, refsmilesmat, refcountsmat, reflengths, refmags=None):  # {{{
    """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix
    *refcountsmat*, and length vector *reflengths*. If *refmags* is provided, it will
    be used as the magnitude vector; else, the magnitude vector will be computed (on
    the GPU) from the count matrix.

    Because of hardware limitations, the reference matrices (*refsmilesmat* and
    *refcountsmat*) must have no more than 32,768 rows (molecules) and 65,536
    columns (Lingos). Larger computations must be performed in tiles.
    """
    # Set up lingo and count matrices on device #{{{
    if self.usePycudaArray:
        # Set up using PyCUDA CUDAArray support
        self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat, order='C')
        self.gpu.rcounts = cuda.matrix_to_array(refcountsmat, order='C')
        self.gpu.tex2lr.set_array(self.gpu.rsmiles)
        self.gpu.tex2cr.set_array(self.gpu.rcounts)
    else:
        # Manually handle setup
        temprlmat = self._padded_array(refsmilesmat)
        if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
            raise ValueError("Error: reference matrix is not allowed to have more than "
                             "64K columns (LINGOs) or 32K rows (molecules) (both padded "
                             "to multiple of 16). Dimensions = (%d,%d)." % temprlmat.shape)
        self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
        cuda.memcpy_htod_async(self.gpu.rsmiles, temprlmat, stream=self.gpu.stream)

        temprcmat = self._padded_array(refcountsmat)
        self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
        cuda.memcpy_htod_async(self.gpu.rcounts, temprcmat, stream=self.gpu.stream)

        descriptor = cuda.ArrayDescriptor()
        descriptor.width = temprcmat.shape[1]
        descriptor.height = temprcmat.shape[0]
        descriptor.format = cuda.array_format.UNSIGNED_INT32
        descriptor.num_channels = 1
        self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles, descriptor, temprlmat.strides[0])
        self.gpu.tex2cr.set_address_2d(self.gpu.rcounts, descriptor, temprcmat.strides[0])
        self.gpu.stream.synchronize()
        del temprlmat
        del temprcmat
    # }}}

    self.rlengths = reflengths
    self.rshape = refsmilesmat.shape
    self.nref = refsmilesmat.shape[0]

    # Copy reference lengths to GPU
    self.gpu.rl_gpu = cuda.to_device(reflengths)

    # Allocate buffer for reference set magnitudes
    self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
    if refmags is not None:
        cuda.memcpy_htod(self.gpu.rmag_gpu, refmags)
    else:
        # Calculate reference set magnitudes on GPU
        magthreads = 256
        self.gpu.refMagKernel(self.gpu.rmag_gpu, self.gpu.rl_gpu, numpy.int32(self.nref),
                              block=(magthreads, 1, 1), grid=(30, 1),
                              shared=magthreads * 4, texrefs=[self.gpu.tex2cr])
    return
def set_async(self, ary, stream=None):
    assert ary.size == self.size
    assert ary.dtype == self.dtype
    assert self.flags.forc

    if not ary.flags.forc:
        raise RuntimeError("cannot asynchronously set from "
                           "non-contiguous array")

    if self.size:
        drv.memcpy_htod_async(self.gpudata, ary, stream)
def set(self, tensor, data):
    assert isinstance(tensor, MGPUTensor)
    if tensor.ptype == 'replica':
        for dest, strm, ctx in zip(tensor.tlist, self.strms, self.ctxs):
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, data, strm)
            ctx.pop()
        # tensor.copy_from(data)
    else:
        self.scatter(data, tensor)
def pre_execution(self, solver, stream=None):
    super(RimeEBeam, self).pre_execution(solver, stream)

    if stream is not None:
        cuda.memcpy_htod_async(
            self.rime_const_data[0],
            solver.const_data().ndary(),
            stream=stream)
    else:
        cuda.memcpy_htod(
            self.rime_const_data[0],
            solver.const_data().ndary())
def exchange(nx, ny, a_gpu, b_gpu, dev1, dev2):
    ctx1 = cuda.Device(dev1).make_context()
    a = cuda.from_device(int(a_gpu) + (nx - 2) * ny * nof, (ny,), np.float32)
    ctx1.pop()

    ctx2 = cuda.Device(dev2).make_context()
    cuda.memcpy_htod(int(b_gpu), a)
    b = cuda.from_device(int(b_gpu) + ny * nof, (ny,), np.float32)
    ctx2.pop()

    ctx1 = cuda.Device(dev1).make_context()
    cuda.memcpy_htod_async(int(a_gpu) + (nx - 1) * ny * nof, b)
    ctx1.pop()
def kernel_write(function_name, dest_devptr, dest_info, source_devptr, source_info, work_range, stream=None):
    global KD
    # initialize variables
    global tb_cnt
    tb_cnt = 0

    # dest
    cuda_args = [dest_devptr]
    cuda_args += [dest_info]
    # source
    cuda_args += [source_devptr]
    cuda_args += [source_info]
    # work_range
    cuda_args += make_cuda_list(work_range)

    # initialize model view
    eye = numpy.eye(4, dtype=numpy.float32)
    cuda.memcpy_htod_async(mmtx, eye, stream=stream)
    cuda.memcpy_htod_async(inv_mmtx, eye, stream=stream)

    try:
        if Debug:
            print("Function name: ", function_name)
        func = mod.get_function(function_name)  # cutting function
    except Exception:
        print("Function not found ERROR")
        print("Function name: ", function_name)
        assert False

    # set work range
    block, grid = range_to_block_grid(work_range)

    if log_type in ['time', 'all']:
        st = time.time()

    func(*cuda_args, block=block, grid=grid, stream=stream)
    # ctx.synchronize()

    KD.append((dest_info, source_info))

    if log_type in ['time', 'all']:
        bytes = make_bytes(work_range, 3)
        t = time.time() - st
        ms = 1000 * t
        bw = bytes / GIGA / t
        log("rank%d, GPU%d, , kernel write time, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec "
            % (rank, device_number, bytes / MEGA, ms, bw), 'time', log_type)
def set_async(self, ary, stream=None):
    assert ary.ndim <= 3
    assert ary.dtype == self.dtype
    assert ary.size == self.size

    if ary.base.__class__ != cuda.HostAllocation:
        raise TypeError("asynchronous memory transfer requires pagelocked numpy array")

    if self.size:
        if self.M == 1:
            cuda.memcpy_htod_async(self.gpudata, ary, stream)
        else:
            PitchTrans(self.shape, self.gpudata, self.ld, ary, _pd(self.shape),
                       self.dtype, async=True, stream=stream)
def synchronize_isdone(self):
    """Complete synchronization process."""

    # Use shorter, easier names for class variables.
    bufs = self._sync_buffers
    ptrs = self._sync_ptrs
    streams = self._sync_streams
    adj = self._sync_adj
    part2_start = self._sync_part2_start

    is_done = [False, False, False, False]

    # Forward send.
    if streams[0].is_done():  # Device-to-host copy completed.
        if not part2_start[0]:  # Initialize MPI send.
            comm.Isend(bufs[0], dest=adj['forw'], tag=self._sync_tags[0])
            part2_start[0] = True
            is_done[0] = True
        else:  # No more work to do.
            is_done[0] = True

    # Backward send.
    if streams[1].is_done():  # Device-to-host copy completed.
        if not part2_start[1]:  # Initialize MPI send.
            comm.Isend(bufs[1], dest=adj['back'], tag=self._sync_tags[1])
            part2_start[1] = True
            is_done[1] = True
        else:  # No more work to do.
            is_done[1] = True

    # Forward receive.
    if self._sync_req_forw.Test():  # MPI receive completed.
        if not part2_start[2]:  # Initialize host-to-device copy.
            drv.memcpy_htod_async(ptrs['back_dest'], bufs[2],
                                  stream=streams[2])  # Host-to-device.
            part2_start[2] = True
        elif streams[2].is_done():  # Host-to-device copy completed.
            is_done[2] = True

    # Backward receive.
    if self._sync_req_back.Test():  # MPI receive completed.
        if not part2_start[3]:  # Initialize host-to-device copy.
            drv.memcpy_htod_async(ptrs['forw_dest'], bufs[3],
                                  stream=streams[3])  # Host-to-device.
            part2_start[3] = True
        elif streams[3].is_done():  # Host-to-device copy completed.
            is_done[3] = True

    # print '~', is_done[0:4],

    # Return true only when all four transfers are complete.
    return all(is_done)
def load_data_on_gpu(tl_args, module):
    d_V = module.get_global('d_V')[0]
    cuda.memcpy_htod_async(d_V, tl_args.V)
    d_c = module.get_global('d_c')[0]
    cuda.memcpy_htod_async(d_c, tl_args.c)
    d_I = module.get_global('d_I')[0]
    cuda.memcpy_htod_async(d_I, tl_args.I)
    d_E = module.get_global('d_E')[0]
    cuda.memcpy_htod_async(d_E, tl_args.E)
    d_x_0 = module.get_global('d_x_0')[0]
    cuda.memcpy_htod_async(d_x_0, tl_args.x_0)
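The d_V, d_c, ... pointers above come from __device__ globals in the compiled module. A minimal, self-contained sketch of that pattern (the kernel body, array size, and names below are illustrative assumptions, not from the original source):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__device__ float d_V[4];

__global__ void scale(float *out)
{
    out[threadIdx.x] = 2.0f * d_V[threadIdx.x];
}
""")

# get_global() returns a (device_pointer, size_in_bytes) pair for the symbol
d_V, size_V = mod.get_global('d_V')
host_V = cuda.pagelocked_empty((4,), np.float32)  # pagelocked so the copy can be async
host_V[:] = [1, 2, 3, 4]
cuda.memcpy_htod_async(d_V, host_V)  # stream omitted: uses the default stream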
def execute(self, solver, stream=None):
    slvr = solver

    if stream is not None:
        cuda.memcpy_htod_async(
            self.rime_const_data[0],
            slvr.const_data().ndary(),
            stream=stream)
    else:
        cuda.memcpy_htod(
            self.rime_const_data[0],
            slvr.const_data().ndary())

    self.kernel(slvr.uvw, slvr.lm, slvr.frequency,
                slvr.B_sqrt, slvr.jones,
                stream=stream, **self.launch_params)
def test_register_host_memory(self):
    if drv.get_version() < (4,):
        from py.test import skip
        skip("register_host_memory only exists on CUDA 4.0 and later")

    import sys
    if sys.platform == "darwin":
        from py.test import skip
        skip("register_host_memory is not supported on OS X")

    a = drv.aligned_empty((2**20,), np.float64)
    a_pin = drv.register_host_memory(a)

    gpu_ary = drv.mem_alloc_like(a)
    stream = drv.Stream()
    drv.memcpy_htod_async(gpu_ary, a_pin, stream)
    drv.Context.synchronize()
def set_async(self, ary, stream=None):
    assert ary.size == self.size
    assert ary.dtype == self.dtype
    if ary.strides != self.strides:
        from warnings import warn
        warn("Setting array from one with different strides/storage order. "
             "This will cease to work in 2013.x.",
             stacklevel=2)

    assert self.flags.forc

    if not ary.flags.forc:
        raise RuntimeError("cannot asynchronously set from "
                           "non-contiguous array")

    if self.size:
        drv.memcpy_htod_async(self.gpudata, ary, stream)
def run(self, scomp, scopy):
    # If we are unpacking then copy the host buffer to the GPU
    if op == 'unpack':
        cuda.memcpy_htod_async(m.data, m.hdata, scopy)
        event.record(scopy)
        scomp.wait_for_event(event)

    # Call the CUDA kernel (pack or unpack)
    fn.prepared_async_call(grid, block, scomp, v.nrow, v.ncol,
                           v.mapping, v.strides, m,
                           v.mapping.leaddim, v.strides.leaddim,
                           m.leaddim)

    # If we have been packing then copy the GPU buffer to the host
    if op == 'pack':
        event.record(scomp)
        scopy.wait_for_event(event)
        cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
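A minimal standalone sketch of the cross-stream ordering used above (pure pycuda; buffer names and sizes are illustrative): record an event on the copy stream, then make the compute stream wait on it before launching kernels that read the data.

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

scopy, scomp = cuda.Stream(), cuda.Stream()  # copy stream, compute stream
event = cuda.Event(cuda.event_flags.DISABLE_TIMING)

hbuf = cuda.pagelocked_empty((1024,), np.float32)  # pagelocked host buffer
dbuf = cuda.mem_alloc(hbuf.nbytes)

cuda.memcpy_htod_async(dbuf, hbuf, scopy)  # H2D on the copy stream
event.record(scopy)                        # mark the copy's completion
scomp.wait_for_event(event)                # compute stream waits for the copy
# ... kernel launches on scomp can now safely read dbuf ...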
def set(self, ary):
    """
    Copy host array to device.

    Arguments:
        ary: host array, needs to be contiguous

    Returns:
        self
    """
    stream = self.backend.stream
    assert ary.size == self.size
    assert self.is_contiguous, "Array in set() must be contiguous"
    if ary.dtype is not self.dtype:
        ary = ary.astype(self.dtype)
    assert ary.strides == tuple(self.dtype.itemsize * s for s in self.strides)

    drv.memcpy_htod_async(self.gpudata, ary, stream)
    return self
def _interp(self, rdr, gnm, dim, ts, td):
    d_acc_size = rdr.mod.get_global('acc_size')[0]
    p_dim = self.fb.pool.allocate((len(dim),), u32)
    p_dim[:] = dim
    cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

    tref = self.mod.get_surfref('flatpal')
    tref.set_array(self.info_a.d_pal_array, 0)
    launch('interp_palette_flat', self.mod, self.stream_a,
           256, self.info_a.palette_height,
           self.fb.d_rb, self.fb.d_seeds,
           self.src_a.d_ptimes, self.src_a.d_pals,
           f32(ts), f32(td / self.info_a.palette_height))

    nts = self.info_a.ntemporal_samples
    launch('interp_iter_params', rdr.mod, self.stream_a,
           256, np.ceil(nts / 256.),
           self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
           f32(ts), f32(td / nts), i32(nts))
def execute(self, solver, stream=None):
    slvr = solver

    if stream is not None:
        cuda.memcpy_htod_async(
            self.rime_const_data[0],
            slvr.const_data().ndary(),
            stream=stream)
    else:
        cuda.memcpy_htod(
            self.rime_const_data[0],
            slvr.const_data().ndary())

    self.kernel(slvr.lm, slvr.parallactic_angles,
                slvr.point_errors, slvr.antenna_scaling,
                slvr.frequency, slvr.E_beam, slvr.jones,
                slvr.beam_ll, slvr.beam_lm, slvr.beam_lfreq,
                slvr.beam_ul, slvr.beam_um, slvr.beam_ufreq,
                stream=stream, **self.launch_params)
def inference(self, img):
    # copy img to input memory
    # self.inputs[0]['host'] = np.ascontiguousarray(img)
    self.inputs[0]['host'] = np.ravel(img)

    # transfer data to the gpu
    for inp in self.inputs:
        cuda.memcpy_htod_async(inp['device'], inp['host'], self.stream)

    # run inference
    start = time.time()
    self.context.execute_async_v2(
        bindings=self.bindings,
        stream_handle=self.stream.handle)
    end = time.time()
    # print('execution time:', end - start)

    # fetch outputs from gpu
    for out in self.outputs:
        cuda.memcpy_dtoh_async(out['host'], out['device'], self.stream)

    # synchronize stream
    self.stream.synchronize()

    return [out['host'] for out in self.outputs]
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
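A hedged sketch of the buffer setup this helper assumes, modeled on the common TensorRT sample pattern; the HostDeviceMem class and allocate_buffers helper below are illustrative assumptions, not part of the original source:

import pycuda.driver as cuda
import tensorrt as trt

class HostDeviceMem(object):
    """Pair of a pagelocked host buffer and its device allocation."""
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

def allocate_buffers(engine):
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        # assumes static shapes; dynamic shapes need the context's shapes instead
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)   # pagelocked: required for async copies
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream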
def main():
    # initialize TensorRT engine and parse ONNX model
    engine, context = build_engine(ONNX_FILE_PATH)

    # get sizes of input and output and allocate memory required for input data and for output data
    for binding in engine:
        if engine.binding_is_input(binding):  # we expect only one input
            input_shape = engine.get_binding_shape(binding)
            input_size = trt.volume(input_shape) * engine.max_batch_size * \
                np.dtype(np.float32).itemsize  # in bytes
            device_input = cuda.mem_alloc(input_size)
        else:  # and one output
            output_shape = engine.get_binding_shape(binding)
            # create page-locked memory buffers (i.e. won't be swapped to disk)
            host_output = cuda.pagelocked_empty(
                trt.volume(output_shape) * engine.max_batch_size,
                dtype=np.float32)
            device_output = cuda.mem_alloc(host_output.nbytes)

    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()

    # preprocess input data
    host_input = np.array(preprocess_image("input.jpeg").numpy(),
                          dtype=np.float32, order='C')
    cuda.memcpy_htod_async(device_input, host_input, stream)

    # run inference
    context.execute_async(bindings=[int(device_input), int(device_output)],
                          stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_output, device_output, stream)
    stream.synchronize()

    # postprocess results
    output_data = torch.Tensor(host_output).reshape(engine.max_batch_size,
                                                    output_shape[1])
    postprocess(output_data)
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width):
    """
    This is the function to run the inference.

    Args:
        engine: The TensorRT engine.
        pics_1: Input images to the model.
        h_input_1: Input in the host.
        d_input_1: Input in the device.
        h_output: Output in the host.
        d_output: Output in the device.
        stream: CUDA stream.
        batch_size: Batch size for execution time.
        height: Height of the output image.
        width: Width of the output image.

    Output:
        The list of output images.
    """
    print('load images to buffer')
    load_images_to_buffer(pics_1, h_input_1)

    with engine.create_execution_context() as context:
        context.debug_sync = False
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

        # Run inference.
        print('load profiler')
        context.profiler = trt.Profiler()
        print('execute')
        context.execute(batch_size=1, bindings=[int(d_input_1), int(d_output)])

        print('Transfer predictions back from the GPU.')
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)

        # Synchronize the stream.
        stream.synchronize()

        # Return the host output.
        print(h_output.shape)
        out = h_output.reshape((1, -1))
    return out
def loadONNX2TensorRT(self, filepath):
    '''
    Build a TensorRT runtime engine from an ONNX file.
    :param filepath: path to the ONNX file
    '''
    engine = self.ONNX_build_engine(filepath)

    # Load the test set
    datas = DataLoaders()
    test_loader = datas.testDataLoader()
    img, target = next(iter(test_loader))
    img = img.numpy()
    target = target.numpy()
    img = img.ravel()

    context = engine.create_execution_context()
    output = np.empty((100, 10), dtype=np.float32)

    # Allocate device memory
    d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize)
    d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]

    # pycuda stream for buffer operations
    stream = cuda.Stream()
    # Copy the input data to the device
    cuda.memcpy_htod_async(d_input, img, stream)
    # Execute the model
    context.execute_async(100, bindings, stream.handle, None)
    # Copy the predictions back from the buffer
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream
    stream.synchronize()

    print("Test Case: " + str(target))
    print("Prediction 100: " + str(np.argmax(output, axis=1)))

    del context
    del engine
def test_streamed_kernel(self):
    # this differs from the "simple_kernel" case in that *all* computation
    # and data copying is asynchronous. Observe how this necessitates the
    # use of page-locked memory.
    mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

    multiply_them = mod.get_function("multiply_them")

    import numpy
    shape = (32, 8)
    a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
    a[:] = numpy.random.randn(*shape)
    b[:] = numpy.random.randn(*shape)

    a_gpu = drv.mem_alloc(a.nbytes)
    b_gpu = drv.mem_alloc(b.nbytes)

    strm = drv.Stream()
    drv.memcpy_htod_async(a_gpu, a, strm)
    drv.memcpy_htod_async(b_gpu, b, strm)
    strm.synchronize()

    dest = drv.pagelocked_empty_like(a)
    multiply_them(drv.Out(dest), a_gpu, b_gpu,
                  block=shape + (1,), stream=strm)
    strm.synchronize()

    drv.memcpy_dtoh_async(a, a_gpu, strm)
    drv.memcpy_dtoh_async(b, b_gpu, strm)
    strm.synchronize()

    assert la.norm(dest - a * b) == 0
def trt_inference(stream, trt_ctx, d_input, d_output, input_signal, input_signal_length):
    print("infer with shape: {}".format(input_signal.shape))

    trt_ctx.set_binding_shape(0, input_signal.shape)
    assert trt_ctx.all_binding_shapes_specified

    h_output = cuda.pagelocked_empty(tuple(trt_ctx.get_binding_shape(1)),
                                     dtype=np.float32)

    h_input_signal = cuda.register_host_memory(
        np.ascontiguousarray(input_signal.cpu().numpy().ravel()))
    cuda.memcpy_htod_async(d_input, h_input_signal, stream)
    trt_ctx.execute_async_v2(bindings=[int(d_input), int(d_output)],
                             stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()

    greedy_predictions = torch.tensor(h_output).argmax(dim=-1, keepdim=False)

    return greedy_predictions
def run_inference(self):
    """
    This function is generalized for multiple inputs/outputs.
    inputs and outputs are expected to be lists of HostDeviceMem objects.
    """
    # Transfer input data to the GPU.
    for inp in self.inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
    # Run inference.
    self.context.execute_async(
        batch_size=self.batch_size,
        bindings=self.bindings,
        stream_handle=self.stream.handle,
    )
    # Transfer predictions back from the GPU.
    for out in self.outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
    # Synchronize the stream.
    self.stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in self.outputs]
def asy_cpy(a, a_gpu, auto_init_context=True):
    """Data transfer from host to device.

    Asynchronous transfer is used when auto_init_context is True;
    otherwise a normal (blocking) transfer is used.
    """
    import pycuda.driver as drv

    if auto_init_context:
        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)

        # Test correctness
        # ctx.synchronize()
        # b = numpy.zeros_like(a, a.dtype)
        # drv.memcpy_dtoh(b, a_gpu)
        # print numpy.allclose(a, b)

        return strm
    else:
        drv.memcpy_htod(a_gpu, a)
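A minimal usage sketch for asy_cpy above (array name and size are illustrative): the host array should be pagelocked for the copy to truly overlap with other work, and the returned stream must be synchronized before the host buffer is reused.

import numpy
import pycuda.autoinit
import pycuda.driver as drv

a = drv.pagelocked_empty((1024,), numpy.float32)  # pagelocked host array
a[:] = numpy.random.randn(*a.shape)
a_gpu = drv.mem_alloc(a.nbytes)
strm = asy_cpy(a, a_gpu)  # returns the stream used for the transfer
strm.synchronize()        # wait before reusing or freeing the host buffer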
def scatter(self, hbuf, dbuf):
    '''
    Scatters the array data in hbuf to the mgpu tensor.
    Assumes that dbuf is M x N and hbuf is M x (N x k), where k is the
    number of replicas. Also assumes that the dtypes of hbuf and dbuf
    are the same.
    '''
    assert hbuf.size == dbuf.size * dbuf.num_dev
    assert isinstance(dbuf, MGPUTensor)
    assert hbuf.dtype == dbuf.dtype
    ndata = dbuf.size
    starts = [i * ndata for i in range(self.num_dev)]

    for dest, strm, ctx, doff in zip(dbuf.tlist, self.strms, self.ctxs, starts):
        src = hbuf.reshape((hbuf.size))[doff:(doff + ndata)]
        ctx.push()
        drv.memcpy_htod_async(dest.ptr, src, strm)
        ctx.pop()

    self.synchronize()
def test_register_host_memory(self):
    if drv.get_version() < (4,):
        from py.test import skip
        skip("register_host_memory only exists on CUDA 4.0 and later")

    import sys
    if sys.platform == "darwin":
        from py.test import skip
        skip("register_host_memory is not supported on OS X")

    import resource
    a = drv.aligned_empty((2**20,), np.float64,
                          alignment=resource.getpagesize())
    a_pin = drv.register_host_memory(a)

    gpu_ary = drv.mem_alloc_like(a)
    stream = drv.Stream()
    drv.memcpy_htod_async(gpu_ary, a_pin, stream)
    drv.Context.synchronize()
def execute(self, *inputs):
    image = inputs[0]
    np.copyto(self.host_inputs[0], image.ravel())

    if self.cuda_ctx:
        self.cuda_ctx.push()

    cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
    self.context.execute_async(batch_size=1,
                               bindings=self.bindings,
                               stream_handle=self.stream.handle)
    cuda.memcpy_dtoh_async(self.host_outputs[1], self.cuda_outputs[1], self.stream)
    cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
    self.stream.synchronize()

    if self.cuda_ctx:
        self.cuda_ctx.pop()

    output = self.host_outputs[0]
    return output
def _enqueue_const_data_htod(self, subslvr, device_ptr):
    """
    Enqueue an async copy of the constant data array
    from the sub-solver into the constant memory buffer
    referenced by the device pointer.
    """
    # Get sub-solver constant data array
    host_ary = subslvr.const_data().ndary()

    # Allocate pinned memory with the same size
    pinned_ary = subslvr.pinned_mem_pool.allocate(
        shape=host_ary.shape, dtype=host_ary.dtype)

    # Copy into pinned memory
    pinned_ary[:] = host_ary

    # Enqueue the asynchronous transfer
    cuda.memcpy_htod_async(device_ptr, pinned_ary,
                           stream=subslvr.stream)

    return pinned_ary
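A hedged, standalone sketch of the pinned-memory staging pattern used above, here with pycuda's PageLockedMemoryPool standing in for subslvr.pinned_mem_pool (shapes and dtypes are illustrative assumptions):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.tools import PageLockedMemoryPool

pool = PageLockedMemoryPool()
stream = cuda.Stream()

host_ary = np.arange(16, dtype=np.float64)   # ordinary pageable data
pinned = pool.allocate(host_ary.shape, host_ary.dtype)
pinned[:] = host_ary                          # stage into pinned memory
dev_ptr = cuda.mem_alloc(pinned.nbytes)
cuda.memcpy_htod_async(dev_ptr, pinned, stream=stream)
stream.synchronize()                          # pinned buffer may be reused after this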
def inference(self, input_image):
    # threading.Thread.__init__(self)
    # Make self the active context, pushing it on top of the context stack.
    self.cfx.push()

    # Do image preprocess
    # batch_image_raw = []
    # batch_origin_h = []
    # batch_origin_w = []
    # batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
    # for i, image_raw in enumerate(raw_image_generator):
    #     input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
    #     batch_image_raw.append(image_raw)
    #     batch_origin_h.append(origin_h)
    #     batch_origin_w.append(origin_w)
    #     np.copyto(batch_input_image[i], input_image)
    # input_image = np.ascontiguousarray(input_image)

    # Copy input image to host buffer
    self.inputs[0]['host'] = np.ravel(input_image)

    # Transfer input data to the GPU.
    for inp in self.inputs:
        cuda.memcpy_htod_async(inp['device'], inp['host'], self.stream)

    # Run inference.
    self.context.execute_async_v2(bindings=self.bindings,
                                  stream_handle=self.stream.handle)

    # Transfer predictions back from the GPU.
    for out in self.outputs:
        cuda.memcpy_dtoh_async(out['host'], out['device'], self.stream)

    # Synchronize the stream
    self.stream.synchronize()

    # Remove any context from the top of the context stack, deactivating it.
    self.cfx.pop()

    # Here we use the first row of output in that batch_size = 1
    outputs = [out['host'] for out in self.outputs]
    reshaped = []
    for output, shape in zip(outputs, self.output_shapes):
        reshaped.append(output.reshape(shape))

    # Do postprocess
    return reshaped
def inference(features):
    global h_output

    print("\nRunning Inference...")
    eval_start_time = time.time()

    # Copy inputs
    cuda.memcpy_htod_async(d_inputs[0], features["input_ids"], stream)
    cuda.memcpy_htod_async(d_inputs[1], features["segment_ids"], stream)
    cuda.memcpy_htod_async(d_inputs[2], features["input_mask"], stream)

    # Run inference
    context.execute_async_v2(
        bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)],
        stream_handle=stream.handle)

    # Transfer predictions back from GPU
    cuda.memcpy_dtoh_async(h_output, d_output, stream)

    # Synchronize the stream
    stream.synchronize()

    h_output = h_output.transpose((1, 0, 2, 3, 4))
    eval_time_elapsed = time.time() - eval_start_time

    print("------------------------")
    print("Running inference in {:.3f} Sentences/Sec".format(1.0 / eval_time_elapsed))
    print("------------------------")

    for index, batch in enumerate(h_output):
        # Data Post-processing
        start_logits = batch[:, 0]
        end_logits = batch[:, 1]

        prediction, nbest_json, scores_diff_json = dp.get_predictions(
            doc_tokens, features, start_logits, end_logits,
            args.n_best_size, args.max_answer_length)

        print("Processing output {:} in batch".format(index))
        print("Answer: '{}'".format(prediction))
        print("With probability: {:.3f}%".format(nbest_json[0]['probability'] * 100.0))
def infer(self, raw_image):
    self.ctx.push()

    # Restore
    stream = self.stream
    context = self.context
    host_inputs = self.host_inputs
    cuda_inputs = self.cuda_inputs
    host_outputs = self.host_outputs
    cuda_outputs = self.cuda_outputs
    bindings = self.bindings

    # Do image preprocess
    ori_shape = raw_image.shape
    batch_input_image = np.empty(
        shape=[self.batch_size, 3, self.input_h, self.input_w])
    input_image = self.preprocess_one(raw_image)
    np.copyto(batch_input_image, input_image)
    batch_input_image = np.ascontiguousarray(batch_input_image)

    # Copy input image to host buffer
    np.copyto(host_inputs[0], batch_input_image.ravel())
    start = time.time()

    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
    # Run inference.
    context.execute_async(batch_size=self.batch_size,
                          bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
    cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)
    cuda.memcpy_dtoh_async(host_outputs[2], cuda_outputs[2], stream)
    # Synchronize the stream
    stream.synchronize()

    # Remove any context from the top of the context stack, deactivating it.
    self.ctx.pop()

    # Here we use the first row of output in that batch_size = 1
    output = host_outputs

    # Do postprocess
    bboxes = self.post_process(output, ori_shape)
    return bboxes
def test_streamed_kernel(self):
    # this differs from the "simple_kernel" case in that *all* computation
    # and data copying is asynchronous. Observe how this necessitates the
    # use of page-locked memory.
    mod = SourceModule(
        """
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """
    )

    multiply_them = mod.get_function("multiply_them")

    shape = (32, 8)
    a = drv.pagelocked_zeros(shape, dtype=np.float32)
    b = drv.pagelocked_zeros(shape, dtype=np.float32)
    a[:] = np.random.randn(*shape)
    b[:] = np.random.randn(*shape)

    a_gpu = drv.mem_alloc(a.nbytes)
    b_gpu = drv.mem_alloc(b.nbytes)

    strm = drv.Stream()
    drv.memcpy_htod_async(a_gpu, a, strm)
    drv.memcpy_htod_async(b_gpu, b, strm)
    strm.synchronize()

    dest = drv.pagelocked_empty_like(a)
    multiply_them(drv.Out(dest), a_gpu, b_gpu,
                  block=shape + (1,), stream=strm)
    strm.synchronize()

    drv.memcpy_dtoh_async(a, a_gpu, strm)
    drv.memcpy_dtoh_async(b, b_gpu, strm)
    strm.synchronize()

    assert la.norm(dest - a * b) == 0
def predict_labels_for_batch(batch_data):
    global d_inputs, h_d_outputs, h_output, model_bindings, cuda_stream
    global num_labels, model_classes
    global trt_context
    global max_batch_size

    batch_size = len(batch_data)
    flat_float_batch = np.ravel(batch_data)

    begin_time = time.time()

    # assuming one input layer for image classification
    cuda.memcpy_htod_async(d_inputs[0], flat_float_batch, cuda_stream)
    trt_context.execute_async(bindings=model_bindings,
                              batch_size=batch_size,
                              stream_handle=cuda_stream.handle)
    for output in h_d_outputs:
        cuda.memcpy_dtoh_async(output['host_mem'], output['dev_mem'], cuda_stream)
    cuda_stream.synchronize()

    classification_time = time.time() - begin_time
    print("[batch of {}] inference={:.2f} ms".format(batch_size, classification_time * 1000))

    # where each row is a softmax_vector for one sample
    batch_results = np.split(h_output, max_batch_size)

    if model_classes == 1:
        batch_predicted_labels = batch_results[:batch_size]
    else:
        batch_predicted_labels = [
            np.argmax(batch_results[k][-num_labels:]) for k in range(batch_size)
        ]

    return batch_predicted_labels
def run(nRow, nCol):
    print("test: nRow=%d,nCol=%d" % (nRow, nCol))
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)

    engine = buildEngine(logger, nRow, nCol)
    if engine is None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    stream = cuda.Stream()

    data = np.full((nRow, nCol), 1, dtype=np.float32)  # uniform distribution
    # data = np.tile(np.arange(0,nCol,1,dtype=np.float32),[nRow,1])  # non-uniform distribution
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    outputH1 = np.empty(context.get_binding_shape(2),
                        dtype=trt.nptype(engine.get_binding_dtype(2)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)
    outputD1 = cuda.mem_alloc(outputH1.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    context.execute_async(1, [int(inputD0), int(outputD0), int(outputD1)],
                          stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    cuda.memcpy_dtoh_async(outputH1, outputD1, stream)
    stream.synchronize()

    print("outputH0")
    print(np.shape(outputH0),
          "mean=%.2f,var=%.2f,max=%d,min=%d" %
          (np.mean(outputH0), np.var(outputH0), np.max(outputH0), np.min(outputH0)))
    print("outputH1")
    print(np.shape(outputH1), "mean=%.2f" % (np.mean(outputH1)))
def model_infer(inputs, context, d_inputs, h_output0, h_output1, d_output0, d_output1, stream):
    input_ids = np.asarray(inputs["input_ids"], dtype=np.int32)
    attention_mask = np.asarray(inputs["attention_mask"], dtype=np.int32)
    token_type_ids = np.asarray(inputs["token_type_ids"], dtype=np.int32)

    # Copy inputs
    cuda.memcpy_htod_async(d_inputs[0], input_ids.ravel(), stream)
    cuda.memcpy_htod_async(d_inputs[1], attention_mask.ravel(), stream)
    cuda.memcpy_htod_async(d_inputs[2], token_type_ids.ravel(), stream)

    # start time
    start_time = time.time()

    # Run inference
    context.execute_async(
        bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output0), int(d_output1)],
        stream_handle=stream.handle)

    # Transfer predictions back from GPU
    cuda.memcpy_dtoh_async(h_output0, d_output0, stream)
    cuda.memcpy_dtoh_async(h_output1, d_output1, stream)

    # Synchronize the stream and take time
    stream.synchronize()

    # end time
    end_time = time.time()
    infer_time = end_time - start_time

    outputs = (h_output0, h_output1)
    # print(outputs)
    return outputs, infer_time
def predict(self, input_data):
    """
    Predict with the async api: data -> cpu -> GPU -> cpu.

    :param input_data:
    :return:
    """
    if self.pre_processing_fn is not None:
        input_data = self.pre_processing_fn(input_data)

    if str(input_data.dtype) != self.input_dtype.__name__:
        logging.warning(
            'dtype of input data:{} is not compatible with engine input:{}, '
            'enforcing dtype conversion'.format(str(input_data.dtype),
                                                self.input_dtype.__name__))
        input_data = self.input_dtype(input_data)

    # input data -> cpu
    np.copyto(self.host_input, input_data.ravel())

    # cpu -> gpu
    cuda.memcpy_htod_async(self.cuda_input, self.host_input, self.stream)

    # Run inference. The execution api differs by the way the engine
    # was built (implicit/explicit batch size).
    if self.trt_engine.has_implicit_batch_dimension:
        self.context.execute_async(
            bindings=[int(self.cuda_input), int(self.cuda_output)],
            stream_handle=self.stream.handle)
    else:
        self.context.execute_async_v2(
            bindings=[int(self.cuda_input), int(self.cuda_output)],
            stream_handle=self.stream.handle)

    # gpu -> cpu
    cuda.memcpy_dtoh_async(self.host_output, self.cuda_output, self.stream)

    # Synchronize the stream
    self.stream.synchronize()

    output = self.host_output
    if self.post_processing_fn is not None:
        output = self.post_processing_fn(output)

    # Return the host output.
    return output
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    return [out.host for out in outputs]
def detect(self, img, conf_th=0.3):
    """Detect objects in the input image."""
    img_resized = _preprocess_trt(img, self.input_shape)
    np.copyto(self.host_inputs[0], img_resized.ravel())

    if self.cuda_ctx:
        self.cuda_ctx.push()

    cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
    self.context.execute_async(batch_size=1,
                               bindings=self.bindings,
                               stream_handle=self.stream.handle)
    cuda.memcpy_dtoh_async(self.host_outputs[1], self.cuda_outputs[1], self.stream)
    cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
    self.stream.synchronize()

    if self.cuda_ctx:
        self.cuda_ctx.pop()

    output = self.host_outputs[0]
    return _postprocess_trt(img, output, conf_th)
def do_inference(context, h_input, d_input, h_output, d_output, stream):
    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # Run inference.
    context.execute_async(bindings=[int(d_input), int(d_output)],
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Synchronize the stream
    stream.synchronize()

    # Additional test (chenrong06)
    n = 10
    d_in = []
    for i in range(n):
        p = cuda.mem_alloc(h_input.nbytes)
        cuda.memcpy_htod_async(p, h_input, stream)
        start = time.perf_counter()
        context.execute_async(bindings=[int(p), int(d_output)],
                              stream_handle=stream.handle)
        stream.synchronize()
        end = time.perf_counter()
        print("Time used:", end - start)
def run():
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)

    engine = buildEngine(logger)
    if engine is None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    stream = cuda.Stream()

    inputH0 = np.ascontiguousarray(np.random.rand(height, width).reshape(-1))
    inputH1 = np.ascontiguousarray(np.random.rand(8, height, width).reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    outputH0 = np.empty(context.get_binding_shape(2),
                        dtype=trt.nptype(engine.get_binding_dtype(2)))
    outputH1 = np.empty(context.get_binding_shape(3),
                        dtype=trt.nptype(engine.get_binding_dtype(3)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)
    outputD1 = cuda.mem_alloc(outputH1.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    stream.synchronize()
    context.execute_async(1, [int(inputD0), int(inputD1), int(outputD0), int(outputD1)],
                          stream.handle)
    stream.synchronize()
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    cuda.memcpy_dtoh_async(outputH1, outputD1, stream)
    stream.synchronize()

    print(np.shape(outputH0), np.shape(outputH1))
def infer(self, image_raw):
    threading.Thread.__init__(self)
    # Make self the active context, pushing it on top of the context stack.
    self.cfx.push()

    # Restore
    stream = self.stream
    context = self.context
    engine = self.engine
    host_inputs = self.host_inputs
    cuda_inputs = self.cuda_inputs
    host_outputs = self.host_outputs
    cuda_outputs = self.cuda_outputs
    bindings = self.bindings

    print('ori_shape: ', image_raw.shape)
    # if image_raw is constant, image_raw.shape[1] != self.input_w
    w_ori, h_ori = image_raw.shape[1], image_raw.shape[0]

    # Do image preprocess
    input_image = self.preprocess_image(image_raw)

    # Copy input image to host buffer
    np.copyto(host_inputs[0], input_image.ravel())
    start = time.time()

    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
    # Run inference.
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
    # Synchronize the stream
    stream.synchronize()
    end = time.time()

    # Remove any context from the top of the context stack, deactivating it.
    self.cfx.pop()

    # Here we use the first row of output in that batch_size = 1
    output = host_outputs[0]

    # Do postprocess
    output = output.reshape(self.input_h, self.input_w).astype('uint8')
    print('output_shape: ', output.shape)
    output = cv2.resize(output, (w_ori, h_ori))
    return output, end - start
def run(inDim, outDatatype):
    print("test", inDim, outDatatype)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)

    engine = buildEngine(logger, outDatatype)
    if engine is None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    context.set_binding_shape(0, inDim)
    context.set_binding_shape(1, inDim[:1])
    context.set_binding_shape(2, inDim[:1])
    # print("Bind0->", engine.get_binding_shape(0), context.get_binding_shape(0))
    # print("Bind1->", engine.get_binding_shape(1), context.get_binding_shape(1))
    # print("Bind2->", engine.get_binding_shape(2), context.get_binding_shape(2))
    print("All bind:", context.all_binding_shapes_specified)

    stream = cuda.Stream()

    data0 = np.full(inDim, 1, dtype=np.float32)
    data1 = np.random.randint(1, inDim[2], inDim[:1], dtype=np.int32)
    data2 = np.random.randint(1, inDim[3], inDim[:1], dtype=np.int32)
    inputH0 = np.ascontiguousarray(data0)
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputH1 = np.ascontiguousarray(data1)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    inputH2 = np.ascontiguousarray(data2)
    inputD2 = cuda.mem_alloc(inputH2.nbytes)
    outputH0 = np.empty(context.get_binding_shape(3),
                        dtype=trt.nptype(engine.get_binding_dtype(3)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    cuda.memcpy_htod_async(inputD2, inputH2, stream)
    context.execute_async_v2(
        [int(inputD0), int(inputD1), int(inputD2), int(outputD0)],
        stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    stream.synchronize()

    outputH0CPU = mask2DCPU(inputH0, inputH1, inputH2,
                            globalMask2DTrueValue, globalMask2DFalseValue)

    # print("InputH0->", inputH0.shape, engine.get_binding_dtype(0))
    # print(inputH0)
    # print("InputH1->", inputH1.shape, engine.get_binding_dtype(1))
    # print(inputH1)
    # print("InputH2->", inputH2.shape, engine.get_binding_dtype(2))
    # print(inputH2)
    # print("OutputH0->", outputH0.shape, engine.get_binding_dtype(3))
    # print(outputH0)
    # print("OutputH0CPU->", outputH0CPU.shape)
    # print(outputH0CPU)
    print("Check result:", "True" if np.all(outputH0 == outputH0CPU) else "False")
def inference(self, img):
    """
    Detect objects in the input image.

    Args:
        img: uint8 numpy array with shape (img_height, img_width, channels)

    Returns:
        result: a list of dictionaries, each of the form
            {"id": 0, "bbox": [x1, y1, x2, y2], "score": s%}
    """
    img_resized = self._preprocess_trt(img)

    # Transfer the data to the GPU, run inference, and copy the results back.
    np.copyto(self.host_inputs[0], img_resized.ravel())

    # Start inference time
    t_begin = time.perf_counter()
    cuda.memcpy_htod_async(
        self.cuda_inputs[0], self.host_inputs[0], self.stream)
    self.engine_context.execute_async(
        batch_size=1,
        bindings=self.bindings,
        stream_handle=self.stream.handle)
    cuda.memcpy_dtoh_async(
        self.host_outputs[1], self.cuda_outputs[1], self.stream)
    cuda.memcpy_dtoh_async(
        self.host_outputs[0], self.cuda_outputs[0], self.stream)
    self.stream.synchronize()
    inference_time = time.perf_counter() - t_begin  # seconds

    # Calculate frame rate (fps)
    self.fps = convert_infr_time_to_fps(inference_time)

    output = self.host_outputs[0]
    boxes, scores, classes = self._postprocess_trt(img, output)

    result = []
    for i in range(len(boxes)):  # number of boxes
        if classes[i] == self.class_id + 1:
            result.append({"id": str(classes[i] - 1) + '-' + str(i),
                           "bbox": boxes[i],
                           "score": scores[i]})

    return result
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # 1. Transfer input data to the GPU if needed.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # 2. Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    # 3. Transfer predictions back from the GPU if needed.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # 4. Synchronize the stream.
    stream.synchronize()
    # 5. Return only the host outputs.
    return [out.host for out in outputs]
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    # cuda.memcpy_htod_async(d_input, h_input, stream)
    print("inputs", inputs[0])
    print("device", inputs[0].device)
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    # cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    start = time.time()
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    print("engine inference time: %.4f" % (time.time() - start))
    return [out.host for out in outputs]
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width):
    """
    This is the function to run the inference.

    Args:
        engine: The TensorRT engine.
        pics_1: Input images to the model.
        h_input_1: Input in the host.
        d_input_1: Input in the device.
        h_output: Output in the host.
        d_output: Output in the device.
        stream: CUDA stream.
        batch_size: Batch size for execution time.
        height: Height of the output image.
        width: Width of the output image.

    Output:
        The list of output images.
    """
    load_images_to_buffer(pics_1, h_input_1)

    with engine.create_execution_context() as context:
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

        # Run inference.
        context.profiler = trt.Profiler()
        context.execute(batch_size=1, bindings=[int(d_input_1), int(d_output)])

        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream.
        stream.synchronize()

        # Return the host output.
        out = h_output.reshape((batch_size, 68, 64, 64))
    return out
def execute(self, solver, stream=None):
    slvr = solver

    if stream is not None:
        cuda.memcpy_htod_async(
            self.rime_const_data[0],
            slvr.const_data().ndary(),
            stream=stream)
    else:
        cuda.memcpy_htod(
            self.rime_const_data[0],
            slvr.const_data().ndary())

    # The gaussian shape array can be empty if
    # no gaussian sources were specified.
    gauss = np.intp(0) if np.product(slvr.gauss_shape.shape) == 0 \
        else slvr.gauss_shape
    sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
        else slvr.sersic_shape

    self.kernel(slvr.uvw, gauss, sersic,
                slvr.frequency, slvr.antenna1, slvr.antenna2,
                slvr.jones, slvr.flag, slvr.weight_vector,
                slvr.observed_vis, slvr.G_term,
                slvr.model_vis, slvr.chi_sqrd_result,
                stream=stream, **self.launch_params)

    # Call the pycuda reduction kernel.
    # Divide by the single sigma squared value if a weight vector
    # is not required. Otherwise the kernel will incorporate the
    # individual sigma squared values into the sum.
    gpu_sum = gpuarray.sum(slvr.chi_sqrd_result).get()

    if not slvr.use_weight_vector():
        slvr.set_X2(gpu_sum / slvr.sigma_sqrd)
    else:
        slvr.set_X2(gpu_sum)
def _copy(self, rdr, gnm):
    """
    Queue a copy of a host genome into a set of device interpolation
    source buffers.

    Note that for now, this is broken! It ignores ``gnm``, and only
    packs the genome that was used when creating the renderer.
    """
    times, knots = rdr.packer.pack(gnm, self.fb.pool)
    cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a)
    cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a)

    palsrc = dict([(v[0], palette_decode(v[1:])) for v in gnm["palette"]])
    ptimes, pvals = zip(*sorted(palsrc.items()))
    palettes = self.fb.pool.allocate((len(palsrc), 256, 4), f32)
    palettes[:] = pvals
    palette_times = self.fb.pool.allocate((self.src_a.max_knots,), f32)
    palette_times.fill(1e9)
    palette_times[:len(ptimes)] = ptimes
    cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a)
    cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times, self.stream_a)
def async_copy(self, dest, src, stream=None):
    drv.memcpy_htod_async(dest.gpudata, src, stream)
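A minimal usage sketch for async_copy above, assuming dest is a GPUArray-like object exposing .gpudata and src is a pagelocked host array (all names below are illustrative); this shows the underlying call the method wraps:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

src = drv.pagelocked_empty((256,), np.float32)  # pagelocked host source
dest = gpuarray.empty(src.shape, src.dtype)     # device destination
stream = drv.Stream()
drv.memcpy_htod_async(dest.gpudata, src, stream)  # what async_copy does internally
stream.synchronize()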
    if shape is not None:
        res = res.reshape(shape)
    return res

def toDevice(self, buf, shape=None, async=False, dest=None):
    if shape is not None:
        buf = buf.reshape(shape)

    if dest is None:
        if async:
            # FIXME: there must be a warning in docs that buf has to be pagelocked
            return gpuarray.to_gpu_async(buf, stream=self.stream)
        else:
            return gpuarray.to_gpu(buf)
    else:
        cuda.memcpy_htod_async(dest.gpudata, buf, stream=None)

def copyBuffer(self, buf, dest=None, src_offset=0, dest_offset=0, length=None):
    elem_size = buf.dtype.itemsize
    size = buf.nbytes if length is None else elem_size * length
    src_offset *= elem_size
    dest_offset *= elem_size

    if dest is None:
        ddest = self.allocate(buf.shape, buf.dtype)
    else:
        ddest = dest

    cuda.memcpy_dtod_async(int(ddest.gpudata) + dest_offset,
                           int(buf.gpudata) + src_offset,
                           size, stream=self.stream)
            else:
                drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)
        else:
            # The arrays might be contiguous in the sense of
            # having no gaps, but the axes could be transposed
            # so that the order is neither Fortran or C.
            # So, we attempt to get a contiguous view of dst.
            dst = _as_strided(dst, shape=(dst.size,),
                              strides=(dst.dtype.itemsize,))
            if async:
                drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
            else:
                drv.memcpy_dtoh(dst, src.gpudata)
    else:
        src = _as_strided(src, shape=(src.size,),
                          strides=(src.dtype.itemsize,))
        if async:
            drv.memcpy_htod_async(dst.gpudata, src, stream=stream)
        else:
            drv.memcpy_htod(dst.gpudata, src)
    return

if len(shape) == 2:
    copy = drv.Memcpy2D()
elif len(shape) == 3:
    copy = drv.Memcpy3D()
else:
    raise ValueError("more than 2 discontiguous axes not supported %s"
                     % (tuple(sorted(axes)),))

if isinstance(src, GPUArray):
    copy.set_src_device(src.gpudata)
else:
    copy.set_src_host(src)
def set_qsmiles(self, qsmilesmat, qcountsmat, querylengths, querymags=None):  # {{{
    """Sets the query SMILES set to use Lingo matrix *qsmilesmat*, count matrix
    *qcountsmat*, and length vector *querylengths*. If *querymags* is provided, it
    will be used as the magnitude vector; else, the magnitude vector will be
    computed (on the GPU) from the count matrix.

    Because of hardware limitations, the query matrices (*qsmilesmat* and
    *qcountsmat*) must have no more than 65,536 rows (molecules) and 32,768
    columns (Lingos). Larger computations must be performed in tiles.
    """
    # Set up lingo and count matrices on device #{{{
    if self.usePycudaArray:
        # Create CUDAarrays for lingo and count matrices
        print("Strides qsmilesmat:", numpy.ascontiguousarray(qsmilesmat.T).strides)
        self.gpu.qsmiles = cuda.matrix_to_array(numpy.ascontiguousarray(qsmilesmat.T), order='C')
        self.gpu.qcounts = cuda.matrix_to_array(numpy.ascontiguousarray(qcountsmat.T), order='C')
        print("qsmiles descriptor", dtos(self.gpu.qsmiles.get_descriptor()))
        print("qcounts descriptor", dtos(self.gpu.qcounts.get_descriptor()))
        self.gpu.tex2lq.set_array(self.gpu.qsmiles)
        self.gpu.tex2cq.set_array(self.gpu.qcounts)
    else:
        # Manually handle texture setup
        # _padded_array will handle making the matrix contiguous
        tempqlmat = self._padded_array(qsmilesmat.T)
        if tempqlmat.shape[1] > 65536 or tempqlmat.shape[0] > 32768:
            raise ValueError("Error: query matrix is not allowed to have more than "
                             "65536 rows (molecules) or 32768 columns (LINGOs) "
                             "(both padded to multiple of 16). Dimensions = (%d,%d)" % tempqlmat.shape)
        if self.gpu.qsmiles is None or self.gpu.qsmiles.nbytes < tempqlmat.nbytes:
            self.gpu.qsmiles = cuda.mem_alloc(tempqlmat.nbytes)
            self.gpu.qsmiles.nbytes = tempqlmat.nbytes
        cuda.memcpy_htod_async(self.gpu.qsmiles, tempqlmat, stream=self.gpu.stream)

        tempqcmat = self._padded_array(qcountsmat.T)
        if self.gpu.qcounts is None or self.gpu.qcounts.nbytes < tempqcmat.nbytes:
            self.gpu.qcounts = cuda.mem_alloc(tempqcmat.nbytes)
            self.gpu.qcounts.nbytes = tempqcmat.nbytes
        cuda.memcpy_htod_async(self.gpu.qcounts, tempqcmat, stream=self.gpu.stream)

        descriptor = cuda.ArrayDescriptor()
        descriptor.width = tempqcmat.shape[1]
        descriptor.height = tempqcmat.shape[0]
        descriptor.format = cuda.array_format.UNSIGNED_INT32
        descriptor.num_channels = 1
        self.gpu.tex2lq.set_address_2d(self.gpu.qsmiles, descriptor, tempqlmat.strides[0])
        self.gpu.tex2cq.set_address_2d(self.gpu.qcounts, descriptor, tempqcmat.strides[0])
        # print("Set up query textures with stride=", tempqmat.strides[0])
        self.gpu.stream.synchronize()
        del tempqlmat
        del tempqcmat
    # }}}

    self.qshape = qsmilesmat.shape
    self.nquery = qsmilesmat.shape[0]
    # print("Query shape=", self.qshape, ", nquery=", self.nquery)

    # Transfer query lengths array to GPU
    self.gpu.ql_gpu = cuda.to_device(querylengths)

    # Allocate buffers for query set magnitudes
    self.gpu.qmag_gpu = cuda.mem_alloc(querylengths.nbytes)
    if querymags is not None:
        cuda.memcpy_htod(self.gpu.qmag_gpu, querymags)
    else:
        # Calculate query set magnitudes on GPU
        magthreads = 256
        self.gpu.qMagKernel(self.gpu.qmag_gpu, self.gpu.ql_gpu, numpy.int32(self.nquery),
                            block=(magthreads, 1, 1), grid=(30, 1),
                            shared=magthreads * 4, texrefs=[self.gpu.tex2cq])
        # self.qmag_gpu = cuda.to_device(qcountsmat.sum(1).astype(numpy.int32))
    return
def run(self, queue):
    cuda.memcpy_htod_async(mv.data, mv.hdata, queue.cuda_stream_comp)
def step(self, t_end=0.0, apply_stochastic_term=True, write_now=True):
    """
    Function which steps n timesteps.

    apply_stochastic_term: Boolean value for whether the stochastic
    perturbation (if any) should be applied.
    """
    n = int(t_end / self.dt + 1)

    if self.t == 0:
        self.bc_kernel.boundaryCondition(self.gpu_stream,
                                         self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0)

    for i in range(0, n):
        # Get new random wind direction (emulating large-scale model error)
        if self.max_wind_direction_perturbation > 0.0 and self.wind_stress.type() == 1:
            # max perturbation +/- max_wind_direction_perturbation deg within
            # original wind direction (at t=0)
            perturbation = 2.0 * (np.random.rand() - 0.5) * self.max_wind_direction_perturbation
            new_wind_stress = WindStress.GenericUniformWindStress(
                rho_air=self.wind_stress.rho_air,
                wind_speed=self.wind_stress.wind_speed,
                wind_direction=self.wind_stress.wind_direction + perturbation)
            # Upload new wind stress params to device
            cuda.memcpy_htod_async(int(self.wind_stress_dev),
                                   new_wind_stress.tostruct(),
                                   stream=self.gpu_stream)

        local_dt = np.float32(min(self.dt, t_end - i * self.dt))
        if local_dt <= 0.0:
            break

        wind_stress_t = np.float32(self.update_wind_stress(self.kernel, self.cdklm_swe_2D))

        # self.bc_kernel.boundaryCondition(self.cl_queue,
        #     self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1)

        # 2nd order Runge-Kutta
        if self.rk_order == 2:
            self.callKernel(self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0,
                            self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1,
                            local_dt, wind_stress_t, 0)
            self.bc_kernel.boundaryCondition(self.gpu_stream,
                                             self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1)
            self.callKernel(self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1,
                            self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0,
                            local_dt, wind_stress_t, 1)
            self.bc_kernel.boundaryCondition(self.gpu_stream,
                                             self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0)
        elif self.rk_order == 1:
            self.callKernel(self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0,
                            self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1,
                            local_dt, wind_stress_t, 0)
            self.gpu_data.swap()
            self.bc_kernel.boundaryCondition(self.gpu_stream,
                                             self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0)
        # 3rd order RK method:
        elif self.rk_order == 3:
            self.callKernel(self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0,
                            self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1,
                            local_dt, wind_stress_t, 0)
            self.bc_kernel.boundaryCondition(self.gpu_stream,
                                             self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1)
            self.callKernel(self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1,
                            self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0,
                            local_dt, wind_stress_t, 1)
            self.bc_kernel.boundaryCondition(self.gpu_stream,
                                             self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1)
            self.callKernel(self.gpu_data.h1, self.gpu_data.hu1, self.gpu_data.hv1,
                            self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0,
                            local_dt, wind_stress_t, 2)
            self.bc_kernel.boundaryCondition(self.gpu_stream,
                                             self.gpu_data.h0, self.gpu_data.hu0, self.gpu_data.hv0)

        # Perturb ocean state with model error
        if self.small_scale_perturbation and apply_stochastic_term:
            self.small_scale_model_error.perturbSim(self)

        # Evolve drifters
        if self.hasDrifters:
            self.drifters.drift(self.gpu_data.h0, self.gpu_data.hu0,
                                self.gpu_data.hv0,
                                np.float32(self.constant_equilibrium_depth),
                                self.nx, self.ny, self.dx, self.dy,
                                local_dt,
                                np.int32(2), np.int32(2))

        self.t += np.float64(local_dt)
        self.num_iterations += 1

    if self.write_netcdf and write_now:
        self.sim_writer.writeTimestep(self)

    return self.t
def to_buf_async(self, cl_buf, stream=None):
    cuda.memcpy_htod_async(cl_buf, self.buffers[cl_buf], stream)