Example #1
    def benchmark(self, func, gpu_args, threads, grid):
        """runs the kernel and measures time repeatedly, returns average time

        Runs the kernel and measures kernel execution time repeatedly, number of
        iterations is set during the creation of CudaFunctions. Benchmark returns
        a robust average, from all measurements the fastest and slowest runs are
        discarded and the rest is included in the returned average. The reason for
        this is to be robust against initialization artifacts and other exceptional
        cases.

        :param func: A CUDA kernel compiled for this specific kernel configuration
        :type func: cuda.CUfunction

        :param gpu_args: A list of arguments to the kernel, order should match the
            order in the code. Allowed values are either variables in global memory
            or single values passed by value.
        :type gpu_args: list( cuda.CUdeviceptr, numpy.int32, ... )

        :param threads: A tuple listing the number of threads in each dimension of
            the thread block
        :type threads: tuple(int, int, int)

        :param grid: A tuple listing the number of thread blocks in each dimension
            of the grid
        :type grid: tuple(int, int)

        :returns: A dictionary with benchmark results.
        :rtype: dict
        """

        result = dict()
        err = cudart.cudaDeviceSynchronize()
        for _ in range(self.iterations):
            for obs in self.observers:
                obs.before_start()
            err = cudart.cudaDeviceSynchronize()
            err = cudart.cudaEventRecord(self.start, self.stream)
            self.run_kernel(func, gpu_args, threads, grid, stream=self.stream)
            err = cudart.cudaEventRecord(self.end, self.stream)
            for obs in self.observers:
                obs.after_start()
            # cudaEventQuery returns a (cudaError_t,) tuple in cuda-python;
            # poll until the end event has completed
            while cudart.cudaEventQuery(
                    self.end)[0] != cudart.cudaError_t.cudaSuccess:
                for obs in self.observers:
                    obs.during()
                time.sleep(1e-6)
            for obs in self.observers:
                obs.after_finish()

        for obs in self.observers:
            result.update(obs.get_results())

        return result
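The timing itself is delegated to the observers. As a point of reference, here is a minimal sketch of an observer that would fit this loop; the class name TimeObserver and the result keys "time" and "times" are assumptions for illustration, not part of the original code. It reads the elapsed time between the two recorded events (cudaEventElapsedTime returns an (err, milliseconds) tuple in cuda-python) and applies the robust average described in the docstring:

from cuda import cudart

class TimeObserver:
    """Hypothetical observer: collects one kernel time per iteration and
    reports the robust average (fastest and slowest run discarded)."""

    def __init__(self, start, end):
        self.start = start  # cudaEvent_t recorded just before the kernel
        self.end = end      # cudaEvent_t recorded just after the kernel
        self.times = []

    def before_start(self):
        pass

    def after_start(self):
        pass

    def during(self):
        pass

    def after_finish(self):
        # the end event has completed once the polling loop exits
        _, ms = cudart.cudaEventElapsedTime(self.start, self.end)
        self.times.append(ms)

    def get_results(self):
        # drop the fastest and slowest measurement, average the rest
        t = sorted(self.times)
        t = t[1:-1] if len(t) > 2 else t
        return {"time": sum(t) / len(t), "times": self.times}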
Example #2

import numpy as np
from cuda import cudart
import tensorrt as trt

nIn, cIn, hIn, wIn = 1, 1, 6, 9  # input tensor shape (NCHW)
cOut, hW, wW = 1, 3, 3  # output channel count and convolution kernel height/width
# tile the 3x3 pattern 1..9 across the whole 6x9 input plane
data = np.tile(
    np.arange(1, 1 + hW * wW, dtype=np.float32).reshape(hW, wW),
    (cIn, hIn // hW, wIn // wW)).reshape(1, cIn, hIn, wIn)
weight = np.full([cOut, hW, wW], 1, dtype=np.float32)  # all-ones kernel
bias = np.zeros(cOut, dtype=np.float32)

np.set_printoptions(precision=8, linewidth=200, suppress=True)
cudart.cudaDeviceSynchronize()

logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
config = builder.create_builder_config()
config.flags = 1 << int(trt.BuilderFlag.INT8)  # 需要打开 int8 模式
inputT0 = network.add_input('inputT0', trt.float32, (nIn, cIn, hIn, wIn))

qValue = 1 / 1  # quantization scale (a scale of 1 leaves values unchanged)
qTensor = network.add_constant([], np.array([qValue],
                                            dtype=np.float32)).get_output(0)
inputQLayer = network.add_quantize(inputT0, qTensor)
inputQLayer.axis = 0  # scale is a scalar, so quantization is per-tensor
inputQDQLayer = network.add_dequantize(inputQLayer.get_output(0), qTensor)
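# Note on semantics: the Quantize layer computes q = clamp(round(x / scale),
# -128, 127) in INT8, and the Dequantize layer computes x' = q * scale. With
# scale = 1.0 the Q/DQ pair is (up to rounding and clamping) an identity whose
# purpose is to let TensorRT run the layers between them in INT8 precision.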