예제 #1
0
def run():
    """Build a refittable engine on the first call, refit it on later calls, then infer once.

    If ``trtFile`` does not exist, the network parsed from ``onnxFile0`` is built
    with the REFIT builder flag and serialized to ``trtFile``.  If it exists, the
    engine is loaded from it and its weights are overwritten with the weights of
    the network parsed from ``onnxFile1``.  In both cases one inference is run on
    the grayscale image ``inputImage`` and the output is printed.

    Relies on the module-level names ``trtFile``, ``onnxFile0``, ``onnxFile1`` and
    ``inputImage``.  Calls ``exit()`` when loading, finding/parsing the ONNX file
    or building fails, and returns early when refitting fails.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    # Decide once whether this call refits, so the "load" branch and the "refit"
    # branch below can never disagree about the existence of the plan file.
    doRefit = os.path.isfile(trtFile)
    if doRefit:
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
        onnxFile = onnxFile1  # model.plan already exists: parse model1.onnx as the refit source
    else:
        onnxFile = onnxFile0  # no model.plan yet: build it from model0.onnx first

    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()
    config.flags = 1 << int(trt.BuilderFlag.REFIT)
    config.max_workspace_size = 3 << 30
    parser = trt.OnnxParser(network, logger)
    if not os.path.exists(onnxFile):
        print("Failed finding .onnx file!")
        exit()
    print("Succeeded finding .onnx file!")
    with open(onnxFile, 'rb') as model:
        if not parser.parse(model.read()):
            print("Failed parsing .onnx file!")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            exit()
        print("Succeeded parsing .onnx file!")

    if doRefit:  # refit the loaded engine with the freshly parsed weights
        refitter = trt.Refitter(engine, logger)
        layerNameList, weightRoleList = refitter.get_all()
        for name, role in zip(layerNameList, weightRoleList):
            print("LayerName:%s,WeightRolw:%s"%(name, role))

        refittableNames = set(layerNameList)
        for i in range(network.num_layers):
            layer = network.get_layer(i)
            if layer.name not in refittableNames:
                continue

            # Depending on the actual network, more layer types may have to be handled here.
            # Reassigning __class__ downcasts the generic ILayer so the weight attributes are reachable.
            if layer.type == trt.LayerType.CONVOLUTION:
                layer.__class__ = trt.IConvolutionLayer
                refitter.set_weights(layer.name, trt.WeightsRole.KERNEL, layer.kernel)
                refitter.set_weights(layer.name, trt.WeightsRole.BIAS, layer.bias)
            elif layer.type == trt.LayerType.FULLY_CONNECTED:
                layer.__class__ = trt.IFullyConnectedLayer
                refitter.set_weights(layer.name, trt.WeightsRole.KERNEL, layer.kernel)
            elif layer.type == trt.LayerType.CONSTANT:
                layer.__class__ = trt.IConstantLayer
                refitter.set_weights(layer.name, trt.WeightsRole.CONSTANT, layer.weights)

        if not refitter.refit_cuda_engine():
            print("Failed refitting engine, missing weight:")
            missingLayers, missingRoles = refitter.get_missing()
            # Report the weights that are actually missing, not a leftover name from the loop above.
            for missingName, missingRole in zip(missingLayers, missingRoles):
                print("\tLayerName:%s,WeightRolw:%s"%(missingName, missingRole))
            return
        print("Succeeded refitting engine!")

    else:  # build model.plan
        inputTensor = network.get_input(0)
        inputTensor.shape = [1, 1, 28, 28]
        # Uncomment to print the network layer by layer:
        # for i in range(network.num_layers):
        #     layer = network.get_layer(i)
        #     print(i, "%s,in=%d,out=%d,%s" % (str(layer.type)[10:], layer.num_inputs, layer.num_outputs, layer.name))
        #     for j in range(layer.num_inputs):
        #         tensor = layer.get_input(j)
        #         if tensor == None:
        #             print("\tInput  %2d:" % j, "None")
        #         else:
        #             print("\tInput  %2d:%s,%s,%s" % (j, tensor.shape, str(tensor.dtype)[9:], tensor.name))
        #     for j in range(layer.num_outputs):
        #         tensor = layer.get_output(j)
        #         if tensor == None:
        #             print("\tOutput %2d:" % j, "None")
        #         else:
        #             print("\tOutput %2d:%s,%s,%s" % (j, tensor.shape, str(tensor.dtype)[9:], tensor.name))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            exit()
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, [1, 1, 28, 28])
    _, stream = cudart.cudaStreamCreate()
    print("Binding0->", engine.get_binding_shape(0), context.get_binding_shape(0), engine.get_binding_dtype(0))
    print("Binding1->", engine.get_binding_shape(1), context.get_binding_shape(1), engine.get_binding_dtype(1))

    data = cv2.imread(inputImage, cv2.IMREAD_GRAYSCALE).astype(np.float32)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    # Host-to-device copy, inference and device-to-host copy are all queued on the same stream.
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    print("inputH0 :", data.shape)
    #print(data)
    print("outputH0:", outputH0.shape)
    print(outputH0)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
    print("Succeeded running model in TensorRT!")
예제 #2
0
          engine.get_binding_name(i))

# Host buffers: the flattened input first, then one uninitialised array per output binding.
bufferH = [np.ascontiguousarray(data.reshape(-1))]
bufferH += [
    np.empty(context.get_binding_shape(idx),
             dtype=trt.nptype(engine.get_binding_dtype(idx)))
    for idx in range(nInput, nInput + nOutput)
]

# One device allocation per binding, sized like its host counterpart.
bufferD = [
    cudart.cudaMalloc(bufferH[idx].nbytes)[1]
    for idx in range(nInput + nOutput)
]

# Upload the inputs.
hostToDevice = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
for i in range(nInput):
    hostArray = bufferH[i]
    cudart.cudaMemcpy(bufferD[i], hostArray.ctypes.data, hostArray.nbytes,
                      hostToDevice)

context.execute_v2(bufferD)

# Download the outputs.
deviceToHost = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
for i in range(nInput, nInput + nOutput):
    hostArray = bufferH[i]
    cudart.cudaMemcpy(hostArray.ctypes.data, bufferD[i], hostArray.nbytes,
                      deviceToHost)

# Ten further synchronous executions on the same buffers.
for i in range(10):
    context.execute_v2(bufferD)

for i in range(nInput + nOutput):
    print(engine.get_binding_name(i))

for b in bufferD:
    cudart.cudaFree(b)
예제 #3
0
def run(useTimeCache):
    """Build a small convolutional network, optionally with a timing cache, and run it once.

    The network takes an ``N x 1 x 28 x 28`` float input (N between 1 and 8) and
    marks the index output of a final TopK layer as the network output.  The
    build time is printed in milliseconds.

    Args:
        useTimeCache: when true, a timing cache is attached to the builder
            config.  It is initialised from ``timeCacheFile`` if that file
            exists; otherwise it starts empty and is written to
            ``timeCacheFile`` after the build.

    Relies on the module-level names ``timeCacheFile`` and ``data``.  Returns
    early (after printing a message) when the engine cannot be built.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    timeCache = b""
    if useTimeCache and os.path.isfile(timeCacheFile):
        # f.read() always yields bytes, so there is no failure value to test for here.
        with open(timeCacheFile, 'rb') as f:
            timeCache = f.read()
        print("Succeeded getting serialized timing cache!")

    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    config.max_workspace_size = 6 << 30
    if useTimeCache:
        cache = config.create_timing_cache(timeCache)
        config.set_timing_cache(cache, False)

    inputTensor = network.add_input('inputT0', trt.float32, [-1, 1, 28, 28])
    profile.set_shape(inputTensor.name, (1, 1, 28, 28), (4, 1, 28, 28),
                      (8, 1, 28, 28))
    config.add_optimization_profile(profile)

    np.random.seed(97)  # keep the weights identical on every run
    w = np.random.rand(32, 1, 5, 5).astype(np.float32).reshape(-1)
    b = np.random.rand(32).astype(np.float32).reshape(-1)
    _0 = network.add_convolution_nd(inputTensor, 32, [5, 5], w, b)
    _0.padding_nd = [2, 2]
    _1 = network.add_activation(_0.get_output(0), trt.ActivationType.RELU)
    _2 = network.add_pooling_nd(_1.get_output(0), trt.PoolingType.MAX, [2, 2])
    _2.stride_nd = [2, 2]

    w = np.random.rand(64, 32, 5, 5).astype(np.float32).reshape(-1)
    b = np.random.rand(64).astype(np.float32).reshape(-1)
    _3 = network.add_convolution_nd(_2.get_output(0), 64, [5, 5], w, b)
    _3.padding_nd = [2, 2]
    _4 = network.add_activation(_3.get_output(0), trt.ActivationType.RELU)
    _5 = network.add_pooling_nd(_4.get_output(0), trt.PoolingType.MAX, [2, 2])
    _5.stride_nd = [2, 2]

    _6 = network.add_shuffle(_5.get_output(0))
    _6.first_transpose = (0, 2, 3, 1)
    _6.reshape_dims = (-1, 64 * 7 * 7, 1, 1)

    w = np.random.rand(1024, 64 * 7 * 7).astype(np.float32).reshape(-1)
    b = np.random.rand(1024).astype(np.float32).reshape(-1)
    _7 = network.add_fully_connected(_6.get_output(0), 1024, w, b)
    _8 = network.add_activation(_7.get_output(0), trt.ActivationType.RELU)

    w = np.random.rand(10, 1024).astype(np.float32).reshape(-1)
    b = np.random.rand(10).astype(np.float32).reshape(-1)
    _9 = network.add_fully_connected(_8.get_output(0), 10, w, b)
    _10 = network.add_activation(_9.get_output(0), trt.ActivationType.RELU)

    _11 = network.add_shuffle(_10.get_output(0))
    _11.reshape_dims = [-1, 10]

    _12 = network.add_softmax(_11.get_output(0))
    _12.axes = 1 << 1

    _13 = network.add_topk(_12.get_output(0), trt.TopKOperation.MAX, 1, 1 << 1)

    network.mark_output(_13.get_output(1))

    t0 = time()
    engineString = builder.build_serialized_network(network, config)
    t1 = time()
    print("%s timing cache, %f ms" % ("With" if useTimeCache else "Without",
                                      (t1 - t0) * 1000))
    if engineString is None:
        # Without this guard a failed build would only surface later as an
        # error while deserialising or creating the execution context.
        print("Failed building engine!")
        return

    if useTimeCache and not os.path.isfile(timeCacheFile):
        timeCache = config.get_timing_cache()
        timeCacheString = timeCache.serialize()
        with open(timeCacheFile, 'wb') as f:
            f.write(timeCacheString)
            print("Succeeded saving .cache file!")

    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, [1, 1, 28, 28])
    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput

    # Host buffers: flattened input first, then one array per output binding.
    bufferH = []
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(
            np.empty(context.get_binding_shape(i),
                     dtype=trt.nptype(engine.get_binding_dtype(i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data,
                          bufferH[i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)

    for i in range(nInput, nInput + nOutput):
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i],
                          bufferH[i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    #for i in range(nInput + nOutput):
    #    print(engine.get_binding_name(i))
    #    print(bufferH[i].reshape(context.get_binding_shape(i)))

    for b in bufferD:
        cudart.cudaFree(b)
def run():
    """Run a chain of matrix multiplications with and without a CUDA Graph.

    Loads the engine from ``trtFile`` or, if that file does not exist, builds a
    network of ``nGEMM`` chained ``add_matrix_multiply`` layers over
    ``nGEMM + 1`` inputs and saves it.  The copy-in / execute / copy-out
    sequence is then issued directly on a stream (once, then ``nInference``
    times), captured into a CUDA Graph, and launched as a graph (once, then
    ``nInference`` times).

    Relies on the module-level names ``trtFile``, ``nGEMM``, ``sizeGEMM`` and
    ``nInference``.  Returns early when loading or building the engine fails.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30

        inputList = []
        for i in range(nGEMM + 1):
            inputT = network.add_input('inputT' + str(i), trt.float32,
                                       [-1, 4, sizeGEMM, sizeGEMM])
            profile.set_shape(inputT.name, (1, 4, sizeGEMM, sizeGEMM),
                              (4, 4, sizeGEMM, sizeGEMM),
                              (sizeGEMM, 4, sizeGEMM, sizeGEMM))
            inputList.append(inputT)
        config.add_optimization_profile(profile)

        # Fold the inputs left to right: ((in0 @ in1) @ in2) @ ...
        tempTensor = inputList[0]
        for i in range(1, nGEMM + 1):
            tempLayer = network.add_matrix_multiply(tempTensor,
                                                    trt.MatrixOperation.NONE,
                                                    inputList[i],
                                                    trt.MatrixOperation.NONE)
            tempTensor = tempLayer.get_output(0)

        network.mark_output(tempLayer.get_output(0))

        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    for i in range(nGEMM + 1):
        context.set_binding_shape(i, [4, 4, sizeGEMM, sizeGEMM])
    stream = cudart.cudaStreamCreate()[1]

    # Byte size of every binding: element count times the element size of its dtype.
    bufferSize = [
        trt.volume(context.get_binding_shape(i)) *
        np.array([0], dtype=trt.nptype(engine.get_binding_dtype(i))).nbytes
        for i in range(engine.num_bindings)
    ]

    bufferH = []
    bufferD = []
    for i in range(nGEMM + 2):
        bufferH.append(
            cudart.cudaHostAlloc(bufferSize[i],
                                 cudart.cudaHostAllocWriteCombined)[1])
        bufferD.append(cudart.cudaMallocAsync(bufferSize[i], stream)[1])

    # Execute without a CUDA Graph
    for i in range(nGEMM + 1):
        cudart.cudaMemcpyAsync(bufferD[i], bufferH[i], bufferSize[i],
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
    context.execute_async_v2(bufferD, stream)
    cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1],
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    for n in range(nInference):
        for i in range(nGEMM + 1):
            cudart.cudaMemcpyAsync(
                bufferD[i], bufferH[i], bufferSize[i],
                cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
        context.execute_async_v2(bufferD, stream)
        cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1],
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)
    cudart.cudaStreamSynchronize(stream)

    # Capture a CUDA Graph and run it
    cudart.cudaStreamBeginCapture(
        stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    for i in range(nGEMM + 1):
        cudart.cudaMemcpyAsync(bufferD[i], bufferH[i], bufferSize[i],
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
    context.execute_async_v2(bufferD, stream)
    cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1],
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    #cudart.cudaStreamSynchronize(stream)                       # no synchronisation inside the graph
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)

    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)

    for n in range(nInference):
        cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)

    # Release everything acquired above: the graph objects, the pinned host
    # buffers from cudaHostAlloc, the device buffers, and finally the stream.
    cudart.cudaGraphExecDestroy(graphExe)
    cudart.cudaGraphDestroy(graph)
    for i in range(nGEMM + 2):
        cudart.cudaFreeHost(bufferH[i])
        cudart.cudaFree(bufferD[i])
    cudart.cudaStreamDestroy(stream)
예제 #5
0
def run():
    """Capture and launch a CUDA Graph for an identity network at two input shapes.

    Loads the engine from ``trtFile`` or builds and saves an identity network
    with a fully dynamic 3-D input (profile min (1,1,1), opt (3,4,5),
    max (6,8,10)).  For input shape [3, 4, 5] and then [2, 3, 4], one ordinary
    inference is run first, then the copy-in / execute / copy-out sequence is
    captured into a CUDA Graph, launched, and the output printed.

    Relies on the module-level name ``trtFile``.  Returns early when loading or
    building the engine fails.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30

        inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1])
        profile.set_shape(inputTensor.name, (1, 1, 1), (3, 4, 5), (6, 8, 10))
        config.add_optimization_profile(profile)

        identityLayer = network.add_identity(inputTensor)
        network.mark_output(identityLayer.get_output(0))

        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, [3, 4, 5])
    _, stream = cudart.cudaStreamCreate()

    data = np.arange(3 * 4 * 5, dtype=np.float32).reshape(3, 4, 5)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    # The device buffers are sized for the larger [3, 4, 5] shape and reused for [2, 3, 4].
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    # One ordinary inference has to run before a CUDA Graph is captured
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    # Capture the CUDA Graph and run it
    cudart.cudaStreamBeginCapture(stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    #cudart.cudaStreamSynchronize(stream)  # no synchronisation inside the graph
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)

    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)

    print("outputH0Big:", outputH0.shape)
    print(outputH0)

    # The first graph refers to the host arrays that are about to be replaced;
    # destroy it now instead of leaking it when the names are rebound below.
    cudart.cudaGraphExecDestroy(graphExe)
    cudart.cudaGraphDestroy(graph)

    # After the input shape changes, run one ordinary inference again, then re-capture the CUDA Graph and run it
    context.set_binding_shape(0, [2, 3, 4])
    # Flatten first, then take the leading 2*3*4 values; slicing the 3-D array
    # along axis 0 would keep all 60 elements.
    inputH0 = np.ascontiguousarray(-data.reshape(-1)[:2 * 3 * 4])
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))

    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    cudart.cudaStreamBeginCapture(stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)

    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)

    print("outputH0Small:", outputH0.shape)
    print(outputH0)

    cudart.cudaGraphExecDestroy(graphExe)
    cudart.cudaGraphDestroy(graph)
    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
예제 #6
0
def run(nRunTime):
    """Build or load a refittable single-convolution engine, optionally refit it, and infer once.

    When ``trtFile`` does not exist, a network with one convolution layer is
    built with the REFIT flag using all-zero placeholder weights whose kernel
    and bias are registered under the names "conv-w" and "conv-b".

    Args:
        nRunTime: ``0`` runs the engine as it is; any other value first refits
            the named weights "conv-w" / "conv-b" with the module-level
            ``weight`` and ``bias``.

    Relies on the module-level names ``trtFile``, ``nIn``, ``cIn``, ``hIn``,
    ``wIn``, ``cOut``, ``hW``, ``wW``, ``weight``, ``bias`` and ``data``.
    Returns early when loading, building or refitting fails.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        config = builder.create_builder_config()
        config.flags = 1 << int(trt.BuilderFlag.REFIT)

        inputT0 = network.add_input('inputT0', trt.float32,
                                    (nIn, cIn, hIn, wIn))
        # The placeholder kernel must match the (hW, wW) kernel size passed to
        # add_convolution_nd below, otherwise the weight count is wrong when hW != wW.
        fakeWeight = np.zeros([cOut, cIn, hW, wW], dtype=np.float32)
        fakeBias = np.zeros([cOut], dtype=np.float32)
        convolutionLayer = network.add_convolution_nd(inputT0, cOut, (hW, wW),
                                                      fakeWeight, fakeBias)
        #convolutionLayer.name = 'conv'
        # Name the weights so they can be addressed with set_named_weights when refitting.
        network.set_weights_name(convolutionLayer.kernel, "conv-w")
        network.set_weights_name(convolutionLayer.bias, "conv-b")

        network.mark_output(convolutionLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    if nRunTime == 0:
        print("Do not refit!")
    else:
        print("Refit!")
        refitter = trt.Refitter(engine, logger)
        refitter.set_named_weights("conv-w", weight)
        refitter.set_named_weights("conv-b", bias)

        # List whatever the refitter still reports as missing before refitting.
        missingLayers, missingRoles = refitter.get_missing()
        for layer, role in zip(missingLayers, missingRoles):
            print("[", layer, "-", role, "]")

        if not refitter.refit_cuda_engine():
            print("Failed Refitting engine!")
            return

    context = engine.create_execution_context()
    _, stream = cudart.cudaStreamCreate()
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    print("data:", data.shape)
    print(data)
    print("outputH0:", outputH0.shape)
    print(outputH0)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
예제 #7
0
def run1(engine):
    """Time H2D copy, inference, D2H copy and the combined pipeline on one CUDA stream.

    Each measurement issues 10 untimed warm-up calls, then times 30 calls
    followed by a stream synchronisation and prints the mean per-call time in
    milliseconds.

    Args:
        engine: engine to create the execution context from; binding 0 is set
            to ``[nIn, cIn, hIn, wIn]`` and binding 1 is used as the output.

    Relies on the module-level names ``nIn``, ``cIn``, ``hIn`` and ``wIn``.
    """
    nWarmUp, nTest = 10, 30

    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, cIn, hIn, wIn])
    _, stream = cudart.cudaStreamCreate()

    data = np.random.rand(nIn * cIn * hIn * wIn).astype(np.float32)
    data = data.reshape(nIn, cIn, hIn, wIn)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputDtype = trt.nptype(engine.get_binding_dtype(1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=outputDtype)
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    def copyHtoD():
        # Queue the input upload on the stream.
        cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)

    def infer():
        # Queue one inference on the stream.
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)

    def copyDtoH():
        # Queue the output download on the stream.
        cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)

    def pipeline():
        # Upload, infer and download back to back.
        copyHtoD()
        infer()
        copyDtoH()

    def measure(messageFormat, warmUp, timed):
        # Warm up without timing, then time nTest calls plus the final synchronisation.
        for _ in range(nWarmUp):
            warmUp()
        trtTimeStart = time()
        for _ in range(nTest):
            timed()
        cudart.cudaStreamSynchronize(stream)
        trtTimeEnd = time()
        print(messageFormat % ((trtTimeEnd - trtTimeStart) / nTest * 1000))

    # One complete inference
    pipeline()
    cudart.cudaStreamSynchronize(stream)

    # Host-to-device copy timing
    measure("%6.3fms - 1 stream, DataCopyHtoD", copyHtoD, copyHtoD)

    # Inference timing
    measure("%6.3fms - 1 stream, Inference", infer, infer)

    # Device-to-host copy timing
    measure("%6.3fms - 1 stream, DataCopyDtoH", copyDtoH, copyDtoH)

    # Total time; the warm-up here only runs inference
    measure("%6.3fms - 1 stream, DataCopy + Inference", infer, pipeline)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
def run(shape):
    """Run the LayerNorm plugin engine for one input shape and compare it with the CPU result.

    The engine is loaded from ``./model-<shape[2]>.plan`` or, if that file does
    not exist, built from a network containing only the plugin layer and saved.
    The input is ``np.arange`` reshaped to ``shape``; the first output is
    passed to ``check`` together with the result of ``layerNormCPU``.

    Args:
        shape: input shape; ``shape[2]`` selects the plan file name and is the
            only fixed dimension of the profile's minimum shape.

    Relies on the module-level names ``soFile``, ``epsilon``,
    ``getLayerNormPlugin``, ``layerNormCPU`` and ``check``.  Calls ``exit()``
    when loading fails and returns early when building fails.
    """
    testCase = "<shape=%s>" % (shape)
    trtFile = "./model-%d.plan" % (shape[2])
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)

    if not os.path.isfile(trtFile):
        builder = trt.Builder(logger)
        explicitBatch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        network = builder.create_network(explicitBatch)
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30

        # Every dimension is dynamic; the profile pins opt and max to `shape`.
        inputT0 = network.add_input('inputT0', trt.float32, [-1] * len(shape))
        profile.set_shape(inputT0.name, [1, 1, shape[2]], shape, shape)
        config.add_optimization_profile(profile)

        plugin = getLayerNormPlugin(epsilon)
        pluginLayer = network.add_plugin_v2([inputT0], plugin)
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as planFile:
            planFile.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    else:
        with open(trtFile, 'rb') as planFile:
            engine = trt.Runtime(logger).deserialize_cuda_engine(planFile.read())
        if engine is None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")

    context = engine.create_execution_context()
    context.set_binding_shape(0, shape)
    #print("Binding all? %s"%(["No","Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum(
        [engine.binding_is_input(k) for k in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    #for i in range(engine.num_bindings):
    #    print("Bind[%2d]:i[%d]->"%(i,i) if engine.binding_is_input(i) else "Bind[%2d]:o[%d]->"%(i,i-nInput),
    #            engine.get_binding_dtype(i),engine.get_binding_shape(i),context.get_binding_shape(i),engine.get_binding_name(i))

    # Host buffers: the single input first, then one empty array per output binding.
    bufferH = [np.arange(np.prod(shape), dtype=np.float32).reshape(shape)]
    for k in range(nInput, nInput + nOutput):
        outputDtype = trt.nptype(engine.get_binding_dtype(k))
        bufferH.append(np.empty(context.get_binding_shape(k), dtype=outputDtype))

    # One device allocation per binding, sized like the matching host buffer.
    bufferD = [
        cudart.cudaMalloc(bufferH[k].nbytes)[1]
        for k in range(engine.num_bindings)
    ]

    for hostArray, devicePtr in zip(bufferH[:nInput], bufferD[:nInput]):
        flatInput = np.ascontiguousarray(hostArray.reshape(-1))
        cudart.cudaMemcpy(devicePtr, flatInput.ctypes.data, hostArray.nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)

    for hostArray, devicePtr in zip(bufferH[nInput:], bufferD[nInput:]):
        cudart.cudaMemcpy(hostArray.ctypes.data, devicePtr, hostArray.nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    outputCPU = layerNormCPU(bufferH[:nInput], epsilon)
    # Uncomment to dump inputs, TensorRT outputs and CPU outputs:
    # for i in range(nInput):
    #     printArrayInfo(bufferH[i])
    # for i in range(nOutput):
    #     printArrayInfo(bufferH[nInput+i])
    # for i in range(nOutput):
    #     printArrayInfo(outputCPU[i])
    check(bufferH[nInput], outputCPU[0], True)

    for devicePtr in bufferD:
        cudart.cudaFree(devicePtr)
    print("Test %s finish!\n" % testCase)
        check1 = check(
            bufferH[indexEncoderOutLens],
            np.sum(ioData['encoder_out_lens'].astype(np.int32), axis=2)[:, 0],
            True)

        string = "%4d,%4d,%8.3f,%9.3e,%9.3e,%9.3e,%9.3e,%9.3e" % (
            batchSize, sequenceLength, timePerInference,
            batchSize * sequenceLength / timePerInference * 1000, check0[1],
            check0[2], check1[1], check1[2])
        print(string + ", %s" %
              ("Good" if check0[1] < 3.5e-2 and check0[2] < 2e-3
               and check1[2] < 1e-1 else "Bad"))
        f.write(string + "\n")

        for i in range(nInput + nOutput):
            cudart.cudaFree(bufferD[i])

#-------------------------------------------------------------------------------
print("Test Decoder Part!")

with open(decoderScoreFile, 'w') as f:

    if os.path.isfile(decoderPlanFile):
        with open(decoderPlanFile, 'rb') as decoderF:
            engine = trt.Runtime(logger).deserialize_cuda_engine(
                decoderF.read())
        if engine is None:
            print("Failed loading %s" % decoderPlanFile)
            exit()
        print("Succeeded loading %s" % decoderPlanFile)
    else:
예제 #10
0
 def __del__(self):
     """Pass ``self.dIn`` to ``cudart.cudaFree`` when the object is finalised."""
     # NOTE(review): presumably self.dIn is a device pointer allocated elsewhere in this class — confirm.
     cudart.cudaFree(self.dIn)
def test_tf_nn_conv2d():
    """Compare ``tf.nn.conv2d`` with an equivalent TensorRT convolution.

    The TensorFlow half builds an NHWC convolution with ``'SAME'`` padding,
    evaluates it on the module-level ``inputData`` and stores every global
    variable in ``para_tf_nn_conv2d.npz``.  The TensorRT half reloads the
    kernel from that file, wraps a convolution layer between NHWC->NCHW and
    NCHW->NHWC shuffles, runs it on the same input and passes both results to
    ``check``.

    Uses the module-level ``nIn``, ``hIn``, ``wIn``, ``cIn``, ``cOut``,
    ``hW``, ``wW`` and ``inputData``.  Returns ``None``; if the engine cannot
    be built a message is printed and the function returns early.
    """
    print(
        "\ntf.nn.conv2d ------------------------------------------------------"
    )
    # TensorFlow part ----------------------------------------------------------
    x = tf.compat.v1.placeholder(tf.float32, [None, hIn, wIn, cIn], name='x')
    weight = tf.compat.v1.get_variable(
        'w1',
        shape=[hW, wW, cIn, cOut],
        initializer=tf.truncated_normal_initializer(mean=0, stddev=0.1))
    y = tf.nn.conv2d(
        x,
        filter=weight,
        strides=None,
        padding='SAME',
        use_cudnn_on_gpu=True,
        data_format='NHWC',
        dilations=[1, 1, 1, 1],
        name='y',
        filters=None,
    )

    tfConfig = tf.compat.v1.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.compat.v1.Session(config=tfConfig)
    sess.run(tf.compat.v1.global_variables_initializer())

    outputTF = sess.run(y, feed_dict={x: inputData})
    tfPara = {}  # weights to be saved, keyed by TF variable name
    print("Weight:")
    for i in tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES):
        name, value = i.name, sess.run(i)
        print(name, value.shape)
        tfPara[name] = value
    np.savez("para_tf_nn_conv2d.npz", **tfPara)
    sess.close()

    # TensorRT part ------------------------------------------------------------
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    inputT0 = network.add_input('inputT0', trt.float32, (-1, -1, -1, cIn))
    # The profile range only has to cover the shape that is bound below.
    profile.set_shape(inputT0.name, (1, 1, 1, cIn), (nIn, hIn, wIn, cIn),
                      (nIn * 2, hIn * 2, wIn * 2, cIn))
    config.add_optimization_profile(profile)

    _h1 = network.add_shuffle(inputT0)  # NHWC to NCHW
    _h1.first_transpose = (0, 3, 1, 2)
    # Load the TF kernel (laid out [hW, wW, cIn, cOut]) and reorder it to
    # [cOut, cIn, hW, wW] before flattening.
    weight = np.load('./para_tf_nn_conv2d.npz')['w1:0'].transpose(
        3, 2, 0, 1).reshape(-1)
    _h2 = network.add_convolution_nd(_h1.get_output(0), cOut, [hW, wW], weight,
                                     None)
    # Mirror TF 'SAME' padding for whatever kernel size hW x wW is, rather
    # than a fixed (2, 2) that is only right for a 5x5 kernel.  SAME_UPPER
    # puts any odd padding element at the end, as TensorFlow does.
    _h2.padding_mode = trt.PaddingMode.SAME_UPPER
    _h3 = network.add_shuffle(_h2.get_output(0))  # NCHW to NHWC, as in TF
    _h3.first_transpose = (0, 2, 3, 1)

    network.mark_output(_h3.get_output(0))
    engineString = builder.build_serialized_network(network, config)
    if engineString is None:
        # Without this guard the failure would surface as an opaque error
        # inside deserialize_cuda_engine.
        print("Failed building engine!")
        return
    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, hIn, wIn, cIn])
    _, stream = cudart.cudaStreamCreate()

    inputH0 = np.ascontiguousarray(inputData.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    # Wait for the queued copies and inference before reading outputH0.
    cudart.cudaStreamSynchronize(stream)

    printArray(inputData, "input")
    printArray(outputTF, "TF output")
    printArray(outputH0, "TRT output")
    check(outputTF, outputH0, True)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
def run(shape, scalar):
    """Run the AddScalar plugin in implicit-batch mode and compare with the CPU result.

    A serialized engine is loaded from ``./model-Shape[...].plan`` when that
    file exists; otherwise an implicit-batch network holding only the plugin
    layer is built, saved to that file and deserialized.  The input is
    ``np.arange`` data of the requested shape; the plugin output is compared
    with ``addScalarCPU`` through ``check``.

    Args:
        shape: full input shape; ``shape[0]`` is used as the batch size and
            ``shape[1:]`` as the per-sample network input shape.
        scalar: value handed to ``getAddScalarPlugin`` and ``addScalarCPU``.

    Returns:
        None.  Failures to load or build the engine are reported with a
        printed message followed by an early return.
    """
    testCase = "<shape=%s,scalar=%f>" % (shape, scalar)
    trtFile = "./model-Shape[%s].plan" % "-".join(str(i) for i in shape)
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        builder.max_batch_size = 32
        network = builder.create_network()  # no flags: implicit batch
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30

        # Implicit batch: the network input omits the leading batch dimension.
        inputT0 = network.add_input('inputT0', trt.float32, shape[1:])
        pluginLayer = network.add_plugin_v2([inputT0],
                                            getAddScalarPlugin(scalar))
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
        if engine is None:
            # Previously unchecked: would crash in create_execution_context.
            print("Failed loading engine!")
            return

    context = engine.create_execution_context()
    nInput = sum(
        engine.binding_is_input(i) for i in range(engine.num_bindings))
    nOutput = engine.num_bindings - nInput

    # Host buffers: one input followed by one buffer per output binding.
    bufferH = [np.arange(np.prod(shape), dtype=np.float32).reshape(shape)]
    for i in range(nOutput):
        # Binding shapes exclude the batch dimension, so prepend it.
        bufferH.append(
            np.empty(
                (shape[0], ) + tuple(context.get_binding_shape(nInput + i)),
                dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = [
        cudart.cudaMalloc(bufferH[i].nbytes)[1]
        for i in range(engine.num_bindings)
    ]

    for i in range(nInput):
        cudart.cudaMemcpy(
            bufferD[i],
            np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data,
            bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute(shape[0], bufferD)

    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i],
                          bufferH[nInput + i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    outputCPU = addScalarCPU(bufferH[:nInput], scalar)
    check(bufferH[nInput:][0], outputCPU[0], True)

    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
def run(shape0, shape1, scalar):
    """Run the AddScalar plugin once under each of two optimization profiles.

    A serialized engine is loaded from ``./model-Dims<rank>.plan`` when that
    file exists; otherwise a dynamic-shape network holding only the plugin
    layer is built with two optimization profiles, saved and deserialized.
    Profile 0 is then executed with an input of shape ``shape0`` and
    profile 1 with an input of shape ``shape1``, both filled with random data.

    Args:
        shape0: input shape bound while profile 0 is active.
        shape1: input shape bound while profile 1 is active.
        scalar: value handed to ``getAddScalarPlugin``.

    Returns:
        None.  Failures to load or build the engine are reported with a
        printed message followed by an early return.
    """
    testCase = "<shape0:%s,shape1:%s,scalar=%f>" % (shape0, shape1, scalar)
    trtFile = "./model-Dims" + str(len(shape0)) + ".plan"
    print("\nTest", testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile0 = builder.create_optimization_profile()
        profile1 = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        config.flags = 1 << int(trt.BuilderFlag.FP16)  # comment this line out to restrict the plugin to FP32

        inputT0 = network.add_input('inputT0', trt.float32, [-1 for i in shape0])
        profile0.set_shape(inputT0.name, [1 for i in shape0], [8 for i in shape0], [32 for i in shape0])
        config.add_optimization_profile(profile0)
        profile1.set_shape(inputT0.name, [1 for i in shape1], [8 for i in shape1], [32 for i in shape1])
        config.add_optimization_profile(profile1)

        pluginLayer = network.add_plugin_v2([inputT0], getAddScalarPlugin(scalar))

        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    stream = 0  # use the default CUDA stream
    cudart.cudaStreamSynchronize(stream)

    # The engine exposes one set of bindings per profile, laid out back to
    # back: profile k owns indices [k * nBindingPerProfile, (k + 1) * nBindingPerProfile).
    nBindingPerProfile = engine.num_bindings // engine.num_optimization_profiles
    deviceBuffers = []
    for profileIndex, shape in enumerate((shape0, shape1)):
        inputIndex = profileIndex * nBindingPerProfile
        # The network has a single input followed by a single output.  The
        # profile-1 pass used to size its output buffer from the *input*
        # binding (index 2); use the real output binding instead.
        outputIndex = inputIndex + 1

        print("Use Profile %d" % profileIndex)
        context.set_optimization_profile_async(profileIndex, stream)
        cudart.cudaStreamSynchronize(stream)
        # context.active_optimization_profile = profileIndex is an equivalent
        # way to pick the profile that needs no stream, but it is deprecated.
        context.set_binding_shape(inputIndex, shape)
        print("Context binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
        for i in range(engine.num_bindings):
            print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))

        data = np.random.rand(np.prod(shape)).reshape(shape).astype(np.float32) * 2 - 1
        inputH = np.ascontiguousarray(data.reshape(-1))
        outputH = np.empty(context.get_binding_shape(outputIndex), dtype=trt.nptype(engine.get_binding_dtype(outputIndex)))
        _, inputD = cudart.cudaMalloc(inputH.nbytes)
        _, outputD = cudart.cudaMalloc(outputH.nbytes)
        deviceBuffers += [inputD, outputD]

        # Slots that belong to the inactive profile are passed as null pointers.
        bindings = [int(0)] * engine.num_bindings
        bindings[inputIndex] = int(inputD)
        bindings[outputIndex] = int(outputD)

        cudart.cudaMemcpy(inputD, inputH.ctypes.data, inputH.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
        print("before inference")
        context.execute_v2(bindings)
        print("after inference")
        cudart.cudaMemcpy(outputH.ctypes.data, outputD, outputH.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    for buffer in deviceBuffers:
        cudart.cudaFree(buffer)
def run(shape, scalar):
    """Run the AddScalar plugin in INT8 mode and compare with the CPU result.

    A serialized engine is loaded from ``./model-Dim<rank>.plan`` when that
    file exists; otherwise a dynamic-shape network is built with the INT8
    builder flag and a ``MyCalibrator`` instance, saved and deserialized.  The
    plugin layer is requested to run in int8 and is followed by an identity
    layer whose output is set to float32.  Random input data is pushed
    through the engine and the result is compared with ``addScalarCPU``
    through ``check``.

    Args:
        shape: input shape bound to the execution context.
        scalar: value handed to ``getAddScalarPlugin`` and ``addScalarCPU``.

    Returns:
        None.  Failures to load or build the engine are reported with a
        printed message followed by an early return.
    """
    testCase = "<shape=%s,scalar=%f>" % (shape, scalar)
    trtFile = "./model-Dim%s.plan" % str(len(shape))
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        config.flags = 1 << int(trt.BuilderFlag.INT8)
        config.int8_calibrator = MyCalibrator(1, shape, cacheFile)

        inputT0 = network.add_input('inputT0', trt.float32,
                                    [-1 for i in shape])
        profile.set_shape(inputT0.name, [1 for i in shape], [8 for i in shape],
                          [32 for i in shape])
        config.add_optimization_profile(profile)
        # Without a calibrator the dynamic range has to be set by hand, e.g.
        # inputT0.dynamic_range = [-100, 100]

        pluginLayer = network.add_plugin_v2([inputT0],
                                            getAddScalarPlugin(scalar))
        pluginLayer.precision = trt.int8
        pluginLayer.set_output_type(0, trt.int8)
        pluginLayer.get_output(0).dtype = trt.int8
        # Manual alternative: pluginLayer.get_output(0).dynamic_range = [-120, 120]

        # Convert back to float32 here; otherwise the caller would have to
        # deal with an int8 output tensor.
        identityLayer = network.add_identity(pluginLayer.get_output(0))
        identityLayer.get_output(0).dtype = trt.float32

        network.mark_output(identityLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
        if engine is None:
            # Previously unchecked: would crash in create_execution_context.
            print("Failed loading engine!")
            return

    context = engine.create_execution_context()
    context.set_binding_shape(0, shape)
    nInput = sum(
        engine.binding_is_input(i) for i in range(engine.num_bindings))
    nOutput = engine.num_bindings - nInput

    # Host buffers: one random input in [-100, 100), then one buffer per output.
    bufferH = [
        np.random.rand(np.prod(shape)).astype(np.float32).reshape(shape) *
        200 - 100
    ]
    for i in range(nOutput):
        bufferH.append(
            np.empty(context.get_binding_shape(nInput + i),
                     dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = [
        cudart.cudaMalloc(bufferH[i].nbytes)[1]
        for i in range(engine.num_bindings)
    ]

    for i in range(nInput):
        cudart.cudaMemcpy(
            bufferD[i],
            np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data,
            bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)

    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i],
                          bufferH[nInput + i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    outputCPU = addScalarCPU(bufferH[:nInput], scalar)
    check(bufferH[nInput:][0], outputCPU[0], True)

    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
예제 #15
0
def test(engine, context, nBatchSize):
    """Run and time inference for one batch size, choosing a profile by batch size.

    With a single-profile engine the only profile is used.  Otherwise
    profile 0 is selected for ``nBatchSize <= 4`` and profile 1 for larger
    batches.  A random ``(nBatchSize, 1)`` float32 input is copied to the
    device, one inference is run and copied back, then ``nWarm`` warm-up runs
    and ``nTest`` timed runs are executed and the mean latency is printed.

    Args:
        engine: deserialized TensorRT engine.
        context: execution context created from ``engine``.
        nBatchSize: size of the leading input dimension.

    Returns:
        None.
    """
    nProfile = engine.num_optimization_profiles
    # Bindings are laid out profile after profile, so each profile owns a
    # contiguous slice of this many indices.
    nBindingPerProfile = engine.num_bindings // nProfile
    profileIndex = 0 if nProfile == 1 or nBatchSize <= 4 else 1
    if nProfile > 1:
        context.set_optimization_profile_async(profileIndex, 0)
        cudart.cudaStreamSynchronize(0)
    bindingBias = profileIndex * nBindingPerProfile

    context.set_binding_shape(bindingBias, [nBatchSize, 1])

    # Label each binding by what it actually is.  Assuming "the first nInput
    # bindings are inputs" is wrong once several profiles interleave
    # inputs and outputs.
    nInput = 0
    nOutput = 0
    for i in range(engine.num_bindings):
        if engine.binding_is_input(i):
            label = "Bind[%2d]:i[%2d]->" % (i, nInput)
            nInput += 1
        else:
            label = "Bind[%2d]:o[%2d]->" % (i, nOutput)
            nOutput += 1
        print(label, engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))

    # From here on count only the bindings of a single profile.
    nInput = nInput // nProfile
    nOutput = nOutput // nProfile

    data = np.random.rand(nBatchSize).reshape(nBatchSize, 1).astype(np.float32)
    # Host buffers for the active profile only: the input, then the outputs.
    bufferH = [np.ascontiguousarray(data.reshape(-1))]
    for i in range(nInput, nInput + nOutput):
        bufferH.append(np.empty(context.get_binding_shape(bindingBias + i), dtype=trt.nptype(engine.get_binding_dtype(bindingBias + i))))
    deviceBuffers = [cudart.cudaMalloc(hostBuffer.nbytes)[1] for hostBuffer in bufferH]

    # execute_v2 wants one pointer per engine binding; slots belonging to the
    # inactive profile stay null.  The real allocations are kept in
    # deviceBuffers so the copies below never index into the null padding
    # (which is what happened for profile 1 before).
    bindings = [int(0)] * engine.num_bindings
    bindings[bindingBias:bindingBias + len(deviceBuffers)] = [int(pointer) for pointer in deviceBuffers]

    for i in range(nInput):
        cudart.cudaMemcpy(deviceBuffers[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bindings)
    for i in range(nInput, nInput + nOutput):
        cudart.cudaMemcpy(bufferH[i].ctypes.data, deviceBuffers[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    for i in range(nWarm):
        context.execute_v2(bindings)

    t0 = time_ns()
    for i in range(nTest):
        context.execute_v2(bindings)
    t1 = time_ns()
    print("+---- BatchSize=%2d: %.4fms\n" % (nBatchSize, (t1 - t0) / 1e6 / nTest))

    for pointer in deviceBuffers:
        cudart.cudaFree(pointer)
def run():
    """Run the LayerNorm plugin on random data and compare with ``layerNormCPU``.

    The engine is loaded from ``./model-<testCase>.plan`` when that file
    exists; otherwise a dynamic-shape network holding only the plugin layer is
    built, written to that file and deserialized.  A random
    ``(nBS, nSL, nEmbedding)`` input is pushed through the engine and the
    result is compared with the CPU reference through ``check``.

    Uses the module-level ``nBS``, ``nSL``, ``nEmbedding``, ``npDataType``,
    ``epsilon`` and ``soFilePath``.  Returns ``None``; the process exits if
    the engine cannot be loaded or built.
    """
    useFp16 = bool(npDataType == np.float16)
    testCase = "%d-%d-%d-fp%s" % (nBS, nSL, nEmbedding, '16' if useFp16 else '32')
    print("Test <%s>" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)

    trtFile = "./model-" + testCase + ".plan"
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engineString = f.read()
        # Deserialize once, here.  The old code read into a differently named
        # variable and then hit a NameError on a second, unconditional
        # deserialization after this branch.
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
        if engine is None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        config.flags = 1 << int(trt.BuilderFlag.FP16) if useFp16 else 0

        trtDataType = trt.float16 if useFp16 else trt.float32
        inputTensorList = [network.add_input('inputT', trtDataType, [-1, -1, -1])]

        profile = builder.create_optimization_profile()
        profile.set_shape('inputT', [1, 1, nEmbedding], [nBS, nSL, nEmbedding], [nBS * 2, nSL * 2, nEmbedding])
        config.add_optimization_profile(profile)

        pluginLayer = network.add_plugin_v2(inputTensorList, getLayerNormPlugin())
        pluginLayer.get_output(0).dtype = trtDataType

        network.mark_output(pluginLayer.get_output(0))

        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            exit()
        print("Succeeded building engine!")
        # Persist the plan so the cache branch above can be taken next time.
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
        if engine is None:
            print("Failed loading engine!")
            exit()

    context = engine.create_execution_context()
    context.set_binding_shape(0, [nBS, nSL, nEmbedding])

    print("Binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))

    nInput = sum(engine.binding_is_input(i) for i in range(engine.num_bindings))
    nOutput = engine.num_bindings - nInput
    for i in range(engine.num_bindings):
        print("input ->" if engine.binding_is_input(i) else "output->", engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i))

    # Host buffers: random input in [-1, 1), then the output buffer.
    # NOTE(review): the host input is always float32, even when the engine
    # input was declared float16 -- confirm the fp16 path is fed correctly.
    bufferH = [
        np.random.rand(nBS, nSL, nEmbedding).astype(np.float32) * 2 - 1,
        np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1))),
    ]

    bufferD = [cudart.cudaMalloc(bufferH[i].nbytes)[1] for i in range(engine.num_bindings)]

    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)

    for i in range(nInput, nInput + nOutput):
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    resCPU = layerNormCPU(bufferH, epsilon)[-1]
    check(bufferH[-1], resCPU, True)

    for b in bufferD:
        cudart.cudaFree(b)

    print("Test <%s> finish!" % testCase)
def run():
    """Build (or load) a dynamic-shape identity engine and run it on a 3x4x5 tensor.

    When ``trtFile`` exists its bytes are used as the serialized engine;
    otherwise a network consisting of a single identity layer with a
    three-dimensional dynamic input is built and the serialized engine is
    written to ``trtFile``.  The engine is then deserialized, fed
    ``np.arange(60)`` reshaped to ``(3, 4, 5)``, and every binding's name and
    contents are printed.

    Returns:
        None.  Each failure is reported with a printed message followed by an
        early return.
    """
    logger = trt.Logger(trt.Logger.ERROR)  # available levels: VERBOSE, INFO, WARNING, ERROR, INTERNAL_ERROR
    if os.path.isfile(trtFile):  # a .plan file exists: read it directly
        with open(trtFile, 'rb') as f:
            engineString = f.read()
        # read() never returns None, so the meaningful failure to detect here
        # is an empty file.
        if not engineString:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
    else:  # no .plan file: build the engine from scratch
        # Network meta objects: Builder / Network / BuilderConfig / Profile
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30

        inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1])  # declare the input tensor
        profile.set_shape(inputTensor.name, [1, 1, 1], [3, 4, 5], [6, 8, 10])  # dynamic-shape range of the input
        config.add_optimization_profile(profile)

        identityLayer = network.add_identity(inputTensor)  # identity transform
        network.mark_output(identityLayer.get_output(0))  # mark the output tensor

        engineString = builder.build_serialized_network(network, config)  # build the serialized network
        if engineString is None:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
        with open(trtFile, 'wb') as f:  # save the serialized network as a .plan file
            f.write(engineString)
            print("Succeeded saving .plan file!")

    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)  # create the engine through a Runtime
    if engine is None:
        print("Failed building engine!")
        return
    print("Succeeded building engine!")

    context = engine.create_execution_context()  # create the execution context
    context.set_binding_shape(0, [3, 4, 5])  # dynamic shape mode needs the actual input shape bound
    nInput = sum(engine.binding_is_input(i) for i in range(engine.num_bindings))  # engine binding information
    nOutput = engine.num_bindings - nInput
    for i in range(nInput):
        print("Bind[%2d]:i[%2d]->" % (i, i), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    for i in range(nInput, nInput + nOutput):
        print("Bind[%2d]:o[%2d]->" % (i, i - nInput), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))

    # Prepare the data plus host and device memory.
    data = np.arange(3 * 4 * 5, dtype=np.float32).reshape(3, 4, 5)
    bufferH = [np.ascontiguousarray(data.reshape(-1))]
    for i in range(nInput, nInput + nOutput):
        bufferH.append(np.empty(context.get_binding_shape(i), dtype=trt.nptype(engine.get_binding_dtype(i))))
    bufferD = [cudart.cudaMalloc(bufferH[i].nbytes)[1] for i in range(nInput + nOutput)]

    for i in range(nInput):  # copy the host input to the device first
        cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)  # run inference

    for i in range(nInput, nInput + nOutput):  # copy the results back to the host
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    for i in range(nInput + nOutput):
        print(engine.get_binding_name(i))
        # The input was stored flattened, so restore the bound shape for display.
        print(bufferH[i].reshape(context.get_binding_shape(i)))

    for b in bufferD:  # release device memory
        cudart.cudaFree(b)
# Tail of a top-level script: `network`, `convQLayer`, `qTensor`, `builder`,
# `config`, `logger` and `data` are created earlier in the script.
# Append a dequantize layer fed by convQLayer's output and qTensor, and set its
# axis attribute to 0.
convQDQLayer = network.add_dequantize(convQLayer.get_output(0), qTensor)
convQDQLayer.axis = 0

# Mark the dequantize output as the network output, build and deserialize the
# engine, then create an execution context and a CUDA stream.
network.mark_output(convQDQLayer.get_output(0))
engineString = builder.build_serialized_network(network, config)
engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
context = engine.create_execution_context()
_, stream = cudart.cudaStreamCreate()

# Host buffers: the flattened input, and an output buffer sized and typed from
# binding 1. The matching device buffers are allocated on the stream.
inputH0 = np.ascontiguousarray(data.reshape(-1))
outputH0 = np.empty(context.get_binding_shape(1),
                    dtype=trt.nptype(engine.get_binding_dtype(1)))
_, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
_, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

# Queue host-to-device copy, inference and device-to-host copy on the stream,
# then synchronize before outputH0 is read.
cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
context.execute_async_v2([int(inputD0), int(outputD0)], stream)
cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
cudart.cudaStreamSynchronize(stream)

# Show the shape and contents of the input and of the output.
print("inputH0 :", data.shape)
print(data)
print("outputH0:", outputH0.shape)
print(outputH0)

# Release the stream and both device buffers.
cudart.cudaStreamDestroy(stream)
cudart.cudaFree(inputD0)
cudart.cudaFree(outputD0)
def test_tf_nn_linalg_matmul():
    """Compare ``tf.linalg.matmul`` with a TensorRT fully connected layer.

    The TensorFlow half flattens the NHWC input, multiplies it by a
    ``[hIn * wIn * cIn, cOut]`` variable, evaluates the product on the
    module-level ``inputData`` and stores every global variable in
    ``para_tf_nn_linalg_matmul.npz``.  The TensorRT half reloads the weight
    from that file, applies a fully connected layer, reshapes its output to
    the first two dimensions, runs on the same input and passes both results
    to ``check``.

    Uses the module-level ``nIn``, ``hIn``, ``wIn``, ``cIn``, ``cOut`` and
    ``inputData``.  Returns ``None``.
    """
    print("\ntf.nn.linalg.matmul -----------------------------------------------")

    # ---- TensorFlow reference ------------------------------------------------
    tfInput = tf.compat.v1.placeholder(tf.float32, [None, hIn, wIn, cIn], name='x')
    tfWeight = tf.compat.v1.get_variable(
        'w1',
        shape=[hIn * wIn * cIn, cOut],
        initializer=tf.truncated_normal_initializer(mean=0, stddev=0.1),
    )
    tfFlat = tf.reshape(tfInput, [-1, hIn * wIn * cIn])
    tfProduct = tf.linalg.matmul(
        tfFlat,
        tfWeight,
        transpose_a=False,
        transpose_b=False,
        adjoint_a=False,
        adjoint_b=False,
        a_is_sparse=False,
        b_is_sparse=False,
        name='y',
    )

    sessionConfig = tf.compat.v1.ConfigProto()
    sessionConfig.gpu_options.per_process_gpu_memory_fraction = 0.5
    session = tf.compat.v1.Session(config=sessionConfig)
    session.run(tf.compat.v1.global_variables_initializer())

    outputTF = session.run(tfProduct, feed_dict={tfInput: inputData})

    # Collect the variable values so the TensorRT half can reload them.
    savedVariables = {}
    print("Weight:")
    allVariables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)
    for variable in allVariables:
        variableName = variable.name
        variableValue = session.run(variable)
        print(variableName, variableValue.shape)
        savedVariables[variableName] = variableValue
    np.savez("para_tf_nn_linalg_matmul.npz", **savedVariables)
    session.close()

    # ---- TensorRT network ----------------------------------------------------
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
    explicitBatchFlag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(explicitBatchFlag)
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    inputT0 = network.add_input('inputT0', trt.float32, (-1, hIn, wIn, cIn))
    # The profile range only has to cover the shape that is bound below.
    minShape = (1, hIn, wIn, cIn)
    optShape = (nIn, hIn, wIn, cIn)
    maxShape = (nIn * 2, hIn, wIn, cIn)
    profile.set_shape(inputT0.name, minShape, optShape, maxShape)
    config.add_optimization_profile(profile)

    # Reload the saved weight, transposed and flattened for the FC layer.
    storedWeight = np.load('./para_tf_nn_linalg_matmul.npz')['w1:0']
    fcKernel = storedWeight.transpose(1, 0).reshape(-1)
    fcLayer = network.add_fully_connected(inputT0, cOut, fcKernel, None)
    fcOutput = fcLayer.get_output(0)
    # Drop the trailing (1, 1) dimensions so the result lines up with TF:
    # take the first two entries of the FC output shape and reshape to them.
    shapeLayer = network.add_shape(fcOutput)
    leadingDims = network.add_slice(shapeLayer.get_output(0), [0], [2], [1])
    reshapeLayer = network.add_shuffle(fcOutput)
    reshapeLayer.set_input(1, leadingDims.get_output(0))

    network.mark_output(reshapeLayer.get_output(0))
    serializedEngine = builder.build_serialized_network(network, config)
    engine = trt.Runtime(logger).deserialize_cuda_engine(serializedEngine)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, hIn, wIn, cIn])
    _, stream = cudart.cudaStreamCreate()

    hostInput = np.ascontiguousarray(inputData.reshape(-1))
    outputDtype = trt.nptype(engine.get_binding_dtype(1))
    hostOutput = np.empty(context.get_binding_shape(1), dtype=outputDtype)
    _, deviceInput = cudart.cudaMallocAsync(hostInput.nbytes, stream)
    _, deviceOutput = cudart.cudaMallocAsync(hostOutput.nbytes, stream)

    toDevice = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    toHost = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    cudart.cudaMemcpyAsync(deviceInput, hostInput.ctypes.data, hostInput.nbytes, toDevice, stream)
    context.execute_async_v2([int(deviceInput), int(deviceOutput)], stream)
    cudart.cudaMemcpyAsync(hostOutput.ctypes.data, deviceOutput, hostOutput.nbytes, toHost, stream)
    # Wait for the queued copies and inference before reading hostOutput.
    cudart.cudaStreamSynchronize(stream)

    printArray(inputData, "input")
    printArray(outputTF, "TF output")
    printArray(hostOutput, "TRT output")
    check(outputTF, hostOutput, True)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(deviceInput)
    cudart.cudaFree(deviceOutput)