def benchmark(self, func, gpu_args, threads, grid):
    """Run the kernel repeatedly and collect the observers' results.

    The kernel is launched ``self.iterations`` times on ``self.stream``.
    Each launch is bracketed by the ``self.start`` and ``self.end`` CUDA
    events. Every observer in ``self.observers`` is notified before the
    launch, right after the launch has been enqueued, repeatedly while the
    kernel has not finished yet, and once after it has finished.

    No timing is computed in this method itself; the returned dictionary is
    the merge of whatever each observer reports through ``get_results()``.

    :param func: A PyCuda kernel compiled for this specific kernel
        configuration
    :type func: pycuda.driver.Function

    :param gpu_args: A list of arguments to the kernel, order should match
        the order in the code. Allowed values are either variables in
        global memory or single values passed by value.
    :type gpu_args: list( pycuda.driver.DeviceAllocation, numpy.int32, ...)

    :param threads: A tuple listing the number of threads in each dimension
        of the thread block
    :type threads: tuple(int, int, int)

    :param grid: A tuple listing the number of thread blocks in each
        dimension of the grid
    :type grid: tuple(int, int)

    :returns: A dictionary with benchmark results, merged from all
        observers (later observers overwrite equal keys of earlier ones).
    :rtype: dict()

    :raises RuntimeError: If a CUDA runtime call reports an error.
    """
    success = cudart.cudaError_t.cudaSuccess
    not_ready = cudart.cudaError_t.cudaErrorNotReady

    def _status(ret):
        # cuda-python returns a tuple whose first element is the status
        # code; accept a bare status as well to stay tolerant.
        return ret[0] if isinstance(ret, tuple) else ret

    def _check(ret, what):
        status = _status(ret)
        if status != success:
            raise RuntimeError("{} failed: {}".format(what, status))

    result = dict()
    _check(cudart.cudaDeviceSynchronize(), "cudaDeviceSynchronize")

    for _ in range(self.iterations):
        for obs in self.observers:
            obs.before_start()

        # Make sure the device is idle before the measured launch.
        _check(cudart.cudaDeviceSynchronize(), "cudaDeviceSynchronize")
        _check(cudart.cudaEventRecord(self.start, self.stream),
               "cudaEventRecord(start)")
        self.run_kernel(func, gpu_args, threads, grid, stream=self.stream)
        _check(cudart.cudaEventRecord(self.end, self.stream),
               "cudaEventRecord(end)")

        for obs in self.observers:
            obs.after_start()

        # Poll the end event so observers can sample while the kernel runs.
        while True:
            status = _status(cudart.cudaEventQuery(self.end))
            if status == success:
                break
            if status != not_ready:
                # Any status other than "not ready" would never turn into
                # success, so fail loudly instead of spinning forever.
                raise RuntimeError(
                    "cudaEventQuery failed: {}".format(status))
            for obs in self.observers:
                obs.during()
            time.sleep(1e-6)

        for obs in self.observers:
            obs.after_finish()

    for obs in self.observers:
        result.update(obs.get_results())

    return result
#
# Script that starts a TensorRT network definition in which the input
# tensor passes through a quantize layer followed by a dequantize layer.
import numpy as np
from cuda import cudart
import tensorrt as trt

# Input tensor dimensions (N, C, H, W).
nIn = 1
cIn = 1
hIn = 6
wIn = 9

# Weight dimensions (output channels, kernel height, kernel width).
cOut = 1
hW = 3
wW = 3

# Input data: a (hW, wW) patch holding 1 .. hW*wW, tiled to fill the whole
# (1, cIn, hIn, wIn) input.
data = np.tile(
    np.arange(1, 1 + hW * wW, dtype=np.float32).reshape(hW, wW),
    (cIn, hIn // hW, wIn // wW),
).reshape(1, cIn, hIn, wIn)

# All-ones weights and all-zero bias.
weight = np.ones([cOut, hW, wW], dtype=np.float32)
bias = np.zeros(cOut, dtype=np.float32)

np.set_printoptions(precision=8, linewidth=200, suppress=True)
cudart.cudaDeviceSynchronize()

logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
)
config = builder.create_builder_config()
# INT8 mode needs to be enabled; this assignment replaces the whole flag
# bitmask with the INT8 bit.
config.flags = 1 << int(trt.BuilderFlag.INT8)

inputT0 = network.add_input('inputT0', trt.float32, (nIn, cIn, hIn, wIn))

# Scale shared by the quantize and dequantize layers, stored as a constant
# tensor declared with an empty shape.
qValue = 1 / 1
qTensor = network.add_constant(
    [], np.array([qValue], dtype=np.float32)
).get_output(0)

inputQLayer = network.add_quantize(inputT0, qTensor)
inputQLayer.axis = 0
inputQDQLayer = network.add_dequantize(inputQLayer.get_output(0), qTensor)