def run():
    """Build (or load) a single-Identity-layer TensorRT engine and run it,
    feeding page-locked (pinned) host memory directly as execution buffers.

    Relies on module-level globals: trtFile, cIn, hIn, wIn.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):  # a serialized engine already exists on disk
        with open(trtFile, 'rb') as f:
            engineString = f.read()
        if engineString is None:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
    else:  # build the engine from scratch
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30
        inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1])
        profile.set_shape(inputTensor.name, [1, 1, 1], [cIn, hIn, wIn],
                          [cIn * 2, hIn * 2, wIn * 2])
        config.add_optimization_profile(profile)
        identityLayer = network.add_identity(inputTensor)
        network.mark_output(identityLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
            print("Succeeded saving .plan file!")

    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    if engine is None:
        print("Failed building engine!")
        return
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    context.set_binding_shape(0, [cIn, hIn, wIn])
    _, stream = cudart.cudaStreamCreate()

    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    for i in range(nInput):
        print("Bind[%2d]:i[%2d]->" % (i, i), engine.get_binding_dtype(i),
              engine.get_binding_shape(i), context.get_binding_shape(i),
              engine.get_binding_name(i))
    for i in range(nInput, nInput + nOutput):
        print("Bind[%2d]:o[%2d]->" % (i, i - nInput),
              engine.get_binding_dtype(i), engine.get_binding_shape(i),
              context.get_binding_shape(i), engine.get_binding_name(i))

    npData = []
    bufferSize = []
    bufferH = []
    bufferD = []
    for i in range(nInput):  # input numpy arrays
        bufferSize.append(
            trt.volume(context.get_binding_shape(i)) *
            engine.get_binding_dtype(i).itemsize)
        npData.append(
            np.arange(cIn * hIn * wIn,
                      dtype=np.float32).reshape(cIn, hIn, wIn))
    for i in range(nInput, nInput + nOutput):  # output numpy arrays
        bufferSize.append(
            trt.volume(context.get_binding_shape(i)) *
            engine.get_binding_dtype(i).itemsize)
        npData.append(
            np.empty(context.get_binding_shape(i),
                     dtype=trt.nptype(engine.get_binding_dtype(i))))
    for i in range(nInput + nOutput):  # allocate pinned host and device memory
        bufferH.append(
            cudart.cudaHostAlloc(bufferSize[i],
                                 cudart.cudaHostAllocWriteCombined)[1])
        bufferD.append(cudart.cudaMallocAsync(bufferSize[i], stream)[1])

    for i in range(nInput):  # numpy array -> pinned host memory
        # NOTE(review): both pointers are host-side; presumably the
        # HostToDevice kind works here because pinned memory participates in
        # unified addressing — confirm, or use cudaMemcpyDefault.
        cudart.cudaMemcpyAsync(bufferH[i], npData[i].ctypes.data,
                               bufferSize[i],
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
    context.execute_async_v2(bufferH, stream)  # feed pinned host memory directly
    for i in range(nInput, nInput + nOutput):  # pinned host memory -> numpy array
        cudart.cudaMemcpyAsync(npData[i].ctypes.data, bufferH[i],
                               bufferSize[i],
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)
    cudart.cudaStreamSynchronize(stream)

    for i in range(nInput + nOutput):
        print(engine.get_binding_name(i))
        print(npData[i].reshape(context.get_binding_shape(i)))

    for b in bufferH:
        cudart.cudaFreeHost(b)
    for b in bufferD:
        cudart.cudaFreeAsync(b, stream)
    cudart.cudaStreamDestroy(stream)
dtype=np.float32)).get_output(0) convQLayer = network.add_quantize(convolutionLayer.get_output(0), qTensor) convQLayer.axis = 0 convQDQLayer = network.add_dequantize(convQLayer.get_output(0), qTensor) convQDQLayer.axis = 0 network.mark_output(convQDQLayer.get_output(0)) engineString = builder.build_serialized_network(network, config) engine = trt.Runtime(logger).deserialize_cuda_engine(engineString) context = engine.create_execution_context() _, stream = cudart.cudaStreamCreate() inputH0 = np.ascontiguousarray(data.reshape(-1)) outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1))) _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream) _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream) cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) context.execute_async_v2([int(inputD0), int(outputD0)], stream) cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream) cudart.cudaStreamSynchronize(stream) print("inputH0 :", data.shape) print(data) print("outputH0:", outputH0.shape) print(outputH0) cudart.cudaStreamDestroy(stream)
def run():
    """Build (or load) a chain-of-GEMM engine and run it both with plain
    stream launches and with CUDA graph capture/replay, for comparison.

    Relies on module-level globals: trtFile, nGEMM, sizeGEMM, nInference.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30
        inputList = []
        for i in range(nGEMM + 1):  # nGEMM products need nGEMM + 1 inputs
            inputT = network.add_input('inputT' + str(i), trt.float32,
                                       [-1, 4, sizeGEMM, sizeGEMM])
            profile.set_shape(inputT.name, (1, 4, sizeGEMM, sizeGEMM),
                              (4, 4, sizeGEMM, sizeGEMM),
                              (sizeGEMM, 4, sizeGEMM, sizeGEMM))
            inputList.append(inputT)
        config.add_optimization_profile(profile)
        tempTensor = inputList[0]
        for i in range(1, nGEMM + 1):  # chain the matrix multiplies
            tempLayer = network.add_matrix_multiply(tempTensor,
                                                    trt.MatrixOperation.NONE,
                                                    inputList[i],
                                                    trt.MatrixOperation.NONE)
            tempTensor = tempLayer.get_output(0)
        network.mark_output(tempLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    for i in range(nGEMM + 1):
        context.set_binding_shape(i, [4, 4, sizeGEMM, sizeGEMM])
    stream = cudart.cudaStreamCreate()[1]

    bufferSize = [
        trt.volume(context.get_binding_shape(i)) *
        np.array([0], dtype=trt.nptype(engine.get_binding_dtype(i))).nbytes
        for i in range(engine.num_bindings)
    ]
    bufferH = []
    bufferD = []
    for i in range(nGEMM + 2):  # nGEMM + 1 inputs plus one output
        bufferH.append(
            cudart.cudaHostAlloc(bufferSize[i],
                                 cudart.cudaHostAllocWriteCombined)[1])
        bufferD.append(cudart.cudaMallocAsync(bufferSize[i], stream)[1])

    # Execute WITHOUT CUDA graph
    for i in range(nGEMM + 1):
        cudart.cudaMemcpyAsync(bufferD[i], bufferH[i], bufferSize[i],
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
    context.execute_async_v2(bufferD, stream)
    cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1],
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)
    for n in range(nInference):
        for i in range(nGEMM + 1):
            cudart.cudaMemcpyAsync(
                bufferD[i], bufferH[i], bufferSize[i],
                cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
        context.execute_async_v2(bufferD, stream)
        cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1],
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)
        cudart.cudaStreamSynchronize(stream)

    # Capture the CUDA graph, then replay it
    cudart.cudaStreamBeginCapture(
        stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    for i in range(nGEMM + 1):
        cudart.cudaMemcpyAsync(bufferD[i], bufferH[i], bufferSize[i],
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
    context.execute_async_v2(bufferD, stream)
    cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1],
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    # No stream synchronization inside the capture region
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)
    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)
    for n in range(nInference):
        cudart.cudaGraphLaunch(graphExe, stream)
        cudart.cudaStreamSynchronize(stream)

    for i in range(nGEMM + 2):
        cudart.cudaFreeHost(bufferH[i])  # fixed: pinned host memory was leaked
        cudart.cudaFree(bufferD[i])
    cudart.cudaStreamDestroy(stream)
def run():
    """Build a TensorRT engine from model0.onnx, or — if model.plan already
    exists — refit the deserialized engine with weights parsed from
    model1.onnx; then run one inference on a grayscale input image.

    Relies on module-level globals: trtFile, onnxFile0, onnxFile1, inputImage.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
        onnxFile = onnxFile1  # model.plan exists: parse model1.onnx for refit
    else:
        onnxFile = onnxFile0  # no model.plan yet: build it from model0.onnx

    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()
    config.flags = 1 << int(trt.BuilderFlag.REFIT)  # engine must be refittable
    config.max_workspace_size = 3 << 30
    parser = trt.OnnxParser(network, logger)
    if not os.path.exists(onnxFile):
        print("Failed finding .onnx file!")
        exit()
    print("Succeeded finding .onnx file!")
    with open(onnxFile, 'rb') as model:
        if not parser.parse(model.read()):
            print("Failed parsing .onnx file!")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            exit()
        print("Succeeded parsing .onnx file!")

    if os.path.isfile(trtFile):  # refit path
        refitter = trt.Refitter(engine, logger)
        layerNameList, weightRoleList = refitter.get_all()
        for name, role in zip(layerNameList, weightRoleList):
            # fixed message typo: "WeightRolw" -> "WeightRole"
            print("LayerName:%s,WeightRole:%s" % (name, role))
        for i in range(network.num_layers):
            layer = network.get_layer(i)
            if layer.name in layerNameList:  # more layer types may need handling for other networks
                if layer.type == trt.LayerType.CONVOLUTION:
                    # downcast to the concrete layer class to reach kernel/bias
                    layer.__class__ = trt.IConvolutionLayer
                    refitter.set_weights(layer.name, trt.WeightsRole.KERNEL,
                                         layer.kernel)
                    refitter.set_weights(layer.name, trt.WeightsRole.BIAS,
                                         layer.bias)
                    # fixed: removed no-op statement `layerNameList.remove`
                    # (missing call parentheses made it dead code)
                if layer.type == trt.LayerType.FULLY_CONNECTED:
                    layer.__class__ = trt.IFullyConnectedLayer
                    refitter.set_weights(layer.name, trt.WeightsRole.KERNEL,
                                         layer.kernel)
                if layer.type == trt.LayerType.CONSTANT:
                    layer.__class__ = trt.IConstantLayer
                    refitter.set_weights(layer.name, trt.WeightsRole.CONSTANT,
                                         layer.weights)
        if not refitter.refit_cuda_engine():
            print("Failed refitting engine, missing weight:")
            [missingLayer, weightRole] = refitter.get_missing()
            for layer, role in zip(missingLayer, weightRole):
                # fixed: printed stale `name` from the earlier loop instead of
                # `layer`, and the "WeightRolw" typo
                print("\tLayerName:%s,WeightRole:%s" % (layer, role))
            return
        print("Succeeded refitting engine!")
    else:  # build path
        inputTensor = network.get_input(0)
        inputTensor.shape = [1, 1, 28, 28]
        '''  # dump the network layer by layer (debug helper, intentionally disabled)
        for i in range(network.num_layers):
            layer = network.get_layer(i)
            print(i, "%s,in=%d,out=%d,%s" % (str(layer.type)[10:], layer.num_inputs, layer.num_outputs, layer.name))
            for j in range(layer.num_inputs):
                tensor = layer.get_input(j)
                if tensor == None:
                    print("\tInput  %2d:" % j, "None")
                else:
                    print("\tInput  %2d:%s,%s,%s" % (j, tensor.shape, str(tensor.dtype)[9:], tensor.name))
            for j in range(layer.num_outputs):
                tensor = layer.get_output(j)
                if tensor == None:
                    print("\tOutput %2d:" % j, "None")
                else:
                    print("\tOutput %2d:%s,%s,%s" % (j, tensor.shape, str(tensor.dtype)[9:], tensor.name))
        '''
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            exit()
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, [1, 1, 28, 28])
    _, stream = cudart.cudaStreamCreate()
    print("Binding0->", engine.get_binding_shape(0),
          context.get_binding_shape(0), engine.get_binding_dtype(0))
    print("Binding1->", engine.get_binding_shape(1),
          context.get_binding_shape(1), engine.get_binding_dtype(1))

    data = cv2.imread(inputImage, cv2.IMREAD_GRAYSCALE).astype(np.float32)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)
    print("inputH0 :", data.shape)
    #print(data)
    print("outputH0:", outputH0.shape)
    print(outputH0)
    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
    print("Succeeded running model in TensorRT!")
def run1(engine):
    """Benchmark HtoD copy, inference, DtoH copy, and the full pipeline on a
    single stream (10 warm-up + 30 timed iterations per section).

    Relies on module-level globals: nIn, cIn, hIn, wIn.
    """
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, cIn, hIn, wIn])
    _, stream = cudart.cudaStreamCreate()

    data = np.random.rand(nIn * cIn * hIn * wIn).astype(np.float32).reshape(
        nIn, cIn, hIn, wIn)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    # One complete inference to warm the whole pipeline up
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    # Time HtoD data copy
    for i in range(10):  # warm-up
        cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
    cudart.cudaStreamSynchronize(stream)  # fixed: drain queued warm-up work before starting the timer
    trtTimeStart = time()
    for i in range(30):
        cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, DataCopyHtoD" %
          ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    # Time inference
    for i in range(10):  # warm-up
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaStreamSynchronize(stream)  # fixed: drain queued warm-up work
    trtTimeStart = time()
    for i in range(30):
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, Inference" %
          ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    # Time DtoH data copy
    for i in range(10):  # warm-up
        cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0,
                               outputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)
    cudart.cudaStreamSynchronize(stream)  # fixed: drain queued warm-up work
    trtTimeStart = time()
    for i in range(30):
        cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0,
                               outputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, DataCopyDtoH" %
          ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    # Time the whole copy + inference + copy pipeline
    for i in range(10):  # warm-up
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaStreamSynchronize(stream)  # fixed: drain queued warm-up work
    trtTimeStart = time()
    for i in range(30):
        cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)
        cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0,
                               outputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, DataCopy + Inference" %
          ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
def run(nRunTime):
    """Build (or load) a one-convolution refittable engine; when nRunTime is
    non-zero, refit the convolution weights by their registered names, then
    run one inference.

    Relies on module-level globals: trtFile, nIn, cIn, hIn, wIn, cOut, hW, wW,
    weight, bias, data.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        config = builder.create_builder_config()
        config.flags = 1 << int(trt.BuilderFlag.REFIT)  # engine must be refittable
        inputT0 = network.add_input('inputT0', trt.float32,
                                    (nIn, cIn, hIn, wIn))
        # build with placeholder zeros; real weights arrive via refit
        fakeWeight = np.zeros([cOut, cIn, wW, wW], dtype=np.float32)
        fakeBias = np.zeros([cOut], dtype=np.float32)
        convolutionLayer = network.add_convolution_nd(inputT0, cOut, (hW, wW),
                                                      fakeWeight, fakeBias)
        #convolutionLayer.name = 'conv'
        network.set_weights_name(convolutionLayer.kernel, "conv-w")
        network.set_weights_name(convolutionLayer.bias, "conv-b")
        network.mark_output(convolutionLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    if nRunTime == 0:
        print("Do not refit!")
    else:
        print("Refit!")
        refitter = trt.Refitter(engine, logger)
        refitter.set_named_weights("conv-w", weight)
        refitter.set_named_weights("conv-b", bias)
        [missingLayer, weightRole] = refitter.get_missing()
        for layer, role in zip(missingLayer, weightRole):
            print("[", layer, "-", role, "]")
        if not refitter.refit_cuda_engine():
            print("Failed Refitting engine!")
            return

    context = engine.create_execution_context()
    _, stream = cudart.cudaStreamCreate()
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)
    print("data:", data.shape)
    print(data)
    print("outputH0:", outputH0.shape)
    print(outputH0)
    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
def run2(engine):
    # Time "DataCopy + Inference" with TWO streams ping-ponging through two
    # buffer sets: each iteration's inference waits (via an event) on the
    # other stream's previous enqueue, so copies and compute can overlap.
    # Relies on module-level globals: nIn, cIn, hIn, wIn.
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, cIn, hIn, wIn])
    _, stream0 = cudart.cudaStreamCreate()
    _, stream1 = cudart.cudaStreamCreate()
    _, event0 = cudart.cudaEventCreate()
    _, event1 = cudart.cudaEventCreate()

    data = np.random.rand(nIn * cIn * hIn * wIn).astype(np.float32).reshape(
        nIn, cIn, hIn, wIn)
    # byte size of one input / one output binding (element count * itemsize)
    inputSize = trt.volume(context.get_binding_shape(0)) * np.array(
        [0], dtype=trt.nptype(engine.get_binding_dtype(0))).nbytes
    outputSize = trt.volume(context.get_binding_shape(1)) * np.array(
        [0], dtype=trt.nptype(engine.get_binding_dtype(1))).nbytes

    # One pinned-host + device buffer pair per stream.
    # NOTE(review): write-combined pinned memory is also used for the OUTPUT
    # host buffers, which is slow to read back on the CPU — confirm intended.
    _, inputH0 = cudart.cudaHostAlloc(inputSize,
                                      cudart.cudaHostAllocWriteCombined)
    _, inputH1 = cudart.cudaHostAlloc(inputSize,
                                      cudart.cudaHostAllocWriteCombined)
    _, outputH0 = cudart.cudaHostAlloc(outputSize,
                                       cudart.cudaHostAllocWriteCombined)
    _, outputH1 = cudart.cudaHostAlloc(outputSize,
                                       cudart.cudaHostAllocWriteCombined)
    _, inputD0 = cudart.cudaMallocAsync(inputSize, stream0)
    _, inputD1 = cudart.cudaMallocAsync(inputSize, stream1)
    _, outputD0 = cudart.cudaMallocAsync(outputSize, stream0)
    _, outputD1 = cudart.cudaMallocAsync(outputSize, stream1)

    # Total-time measurement
    for i in range(10):  # warm-up on stream0
        context.execute_async_v2([int(inputD0), int(outputD0)], stream0)

    trtTimeStart = time()
    cudart.cudaEventRecord(event1, stream1)  # prime the event iteration 0 waits on
    for i in range(30):
        # select this iteration's buffers / events / stream by parity
        inputH, outputH = [inputH1, outputH1] if i & 1 else [inputH0, outputH0]
        inputD, outputD = [inputD1, outputD1] if i & 1 else [inputD0, outputD0]
        eventBefore, eventAfter = [event0, event1
                                   ] if i & 1 else [event1, event0]
        stream = stream1 if i & 1 else stream0
        cudart.cudaMemcpyAsync(inputD, inputH, inputSize,
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
        # wait until the other stream's previous inference has been enqueued
        cudart.cudaStreamWaitEvent(stream, eventBefore,
                                   cudart.cudaEventWaitDefault)
        context.execute_async_v2([int(inputD), int(outputD)], stream)
        cudart.cudaEventRecord(eventAfter, stream)
        cudart.cudaMemcpyAsync(outputH, outputD, outputSize,
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)
    '''# 奇偶循环拆开写 
    for i in range(30//2):
        cudart.cudaMemcpyAsync(inputD0, inputH0, inputSize, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream0)
        cudart.cudaStreamWaitEvent(stream0,event1,cudart.cudaEventWaitDefault)
        context.execute_async_v2([int(inputD0), int(outputD0)], stream0)
        cudart.cudaEventRecord(event0,stream0)
        cudart.cudaMemcpyAsync(outputH0, outputD0, outputSize, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream0)

        cudart.cudaMemcpyAsync(inputD1, inputH1, inputSize, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream1)
        cudart.cudaStreamWaitEvent(stream1,event0,cudart.cudaEventWaitDefault)
        context.execute_async_v2([int(inputD1), int(outputD1)], stream1)
        cudart.cudaEventRecord(event1,stream1)
        cudart.cudaMemcpyAsync(outputH1, outputD1, outputSize, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream1)
    '''
    # NOTE(review): only event1 is synchronized here, and the host/device
    # buffers, events and streams are never released in this function.
    cudart.cudaEventSynchronize(event1)
    trtTimeEnd = time()
    print("%6.3fms - 2 stream, DataCopy + Inference" %
          ((trtTimeEnd - trtTimeStart) / 30 * 1000))
context1 = engine.create_execution_context() context0.set_optimization_profile_async(0, stream0) context1.set_optimization_profile_async(1, stream1) context0.set_binding_shape(0, [nIn, cIn, hIn, wIn]) context1.set_binding_shape(2, [nIn, cIn, hIn, wIn]) print("Context0 binding all? %s" % (["No", "Yes"][int(context0.all_binding_shapes_specified)])) print("Context1 binding all? %s" % (["No", "Yes"][int(context1.all_binding_shapes_specified)])) for i in range(engine.num_bindings): print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context0.get_binding_shape(i), context1.get_binding_shape(i)) inputH0 = np.ascontiguousarray(data.reshape(-1)) inputH1 = np.ascontiguousarray(data.reshape(-1)) outputH0 = np.empty(context0.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1))) outputH1 = np.empty(context1.get_binding_shape(3), dtype=trt.nptype(engine.get_binding_dtype(3))) _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream0) _, inputD1 = cudart.cudaMallocAsync(inputH1.nbytes, stream1) _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream0) _, outputD1 = cudart.cudaMallocAsync(outputH1.nbytes, stream1) for _ in range(5): cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream0) cudart.cudaMemcpyAsync(inputD1, inputH1.ctypes.data, inputH1.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream1) context0.execute_async_v2([int(inputD0), int(outputD0), int(0), int(0)], stream0) context1.execute_async_v2([int(0), int(0), int(inputD1), int(outputD1)], stream1) cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream0) cudart.cudaMemcpyAsync(outputH1.ctypes.data, outputD1, outputH1.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream1) cudart.cudaStreamSynchronize(stream0) cudart.cudaStreamSynchronize(stream1)
def test_tf_nn_conv2d():
    """Compare tf.nn.conv2d (TF1 compat mode) with an equivalent TensorRT
    convolution network; weights are handed over through an .npz file.

    Relies on module-level globals: nIn, cIn, hIn, wIn, cOut, hW, wW,
    inputData, printArray, check.
    """
    print(
        "\ntf.nn.conv2d ------------------------------------------------------"
    )
    # TensorFlow part ----------------------------------------------------------
    x = tf.compat.v1.placeholder(tf.float32, [None, hIn, wIn, cIn], name='x')
    weight = tf.compat.v1.get_variable(
        'w1',
        shape=[hW, wW, cIn, cOut],
        initializer=tf.truncated_normal_initializer(mean=0, stddev=0.1))
    y = tf.nn.conv2d( \
        x,
        filter=weight,
        strides=None,
        padding='SAME',
        use_cudnn_on_gpu=True,
        data_format='NHWC',
        dilations=[1, 1, 1, 1],
        name='y',
        filters=None
    )
    tfConfig = tf.compat.v1.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.compat.v1.Session(config=tfConfig)
    sess.run(tf.compat.v1.global_variables_initializer())
    outputTF = sess.run(y, feed_dict={x: inputData})

    tfPara = {}  # save the weights so the TensorRT side can load them
    print("Weight:")
    for i in tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES):
        name, value = i.name, sess.run(i)
        print(name, value.shape)
        tfPara[name] = value
    np.savez("para_tf_nn_conv2d.npz", **tfPara)
    sess.close()

    # TensorRT part ------------------------------------------------------------
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    inputT0 = network.add_input('inputT0', trt.float32, (-1, -1, -1, cIn))
    profile.set_shape(inputT0.name, (1, 1, 1, cIn), (nIn, hIn, wIn, cIn),
                      (nIn * 2, hIn * 2, wIn * 2,
                       cIn))  # the range only needs to cover the shapes used later
    config.add_optimization_profile(profile)

    _h1 = network.add_shuffle(inputT0)  # NHWC to NCHW
    _h1.first_transpose = (0, 3, 1, 2)
    # load the saved weight; transpose [hW,wW,cIn,cOut] -> [cOut,cIn,hW,wW]
    weight = np.load('./para_tf_nn_conv2d.npz')['w1:0'].transpose(
        3, 2, 0, 1).reshape(-1)
    _h2 = network.add_convolution_nd(_h1.get_output(0), cOut, [hW, wW], weight,
                                     None)
    # NOTE(review): hard-coded padding; only equals TF 'SAME' for a 5x5
    # kernel — confirm against hW/wW.
    _h2.padding_nd = (2, 2)
    _h3 = network.add_shuffle(_h2.get_output(0))  # NCHW back to NHWC, matching the TF model
    _h3.first_transpose = (0, 2, 3, 1)
    network.mark_output(_h3.get_output(0))
    engineString = builder.build_serialized_network(network, config)
    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, hIn, wIn, cIn])
    _, stream = cudart.cudaStreamCreate()

    inputH0 = np.ascontiguousarray(inputData.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    printArray(inputData, "input")  #print(inputData)
    printArray(outputTF, "TF output")  #print(outputTF)
    printArray(outputH0, "TRT output")  #print(outputH0)
    check(outputTF, outputH0, True)
    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
def run():
    """Build (or load) an Identity-layer engine with a dynamic input shape and
    run it through a captured CUDA graph; after the input shape changes, run
    once more, re-capture the graph, and replay it.

    Relies on module-level global: trtFile.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30
        inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1])
        profile.set_shape(inputTensor.name, (1, 1, 1), (3, 4, 5), (6, 8, 10))
        config.add_optimization_profile(profile)
        identityLayer = network.add_identity(inputTensor)
        network.mark_output(identityLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, [3, 4, 5])
    _, stream = cudart.cudaStreamCreate()
    data = np.arange(3 * 4 * 5, dtype=np.float32).reshape(3, 4, 5)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    # One real inference must run before the CUDA graph can be captured
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    # Capture the CUDA graph and replay it
    cudart.cudaStreamBeginCapture(
        stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    # no stream synchronization inside the capture region
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)
    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)
    print("outputH0Big:", outputH0.shape)
    print(outputH0)

    # After an input-shape change: run once, re-capture the graph, then replay
    context.set_binding_shape(0, [2, 3, 4])
    # NOTE(review): data[:2 * 3 * 4] slices the first axis (3 rows), so all 60
    # values are copied while only 24 are used — presumably
    # data.reshape(-1)[:2 * 3 * 4] was intended; behavior kept as-is.
    inputH0 = np.ascontiguousarray(-data[:2 * 3 * 4].reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    cudart.cudaGraphExecDestroy(graphExe)  # fixed: the first instantiated graph leaked
    cudart.cudaGraphDestroy(graph)
    cudart.cudaStreamBeginCapture(
        stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)
    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)
    print("outputH0Small:", outputH0.shape)
    print(outputH0)

    cudart.cudaGraphExecDestroy(graphExe)  # fixed: release the second graph too
    cudart.cudaGraphDestroy(graph)
    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
def test_tf_nn_linalg_matmul():
    """Compare tf.linalg.matmul (TF1 compat mode) with an equivalent TensorRT
    fully-connected network; weights are handed over through an .npz file.

    Relies on module-level globals: nIn, hIn, wIn, cIn, cOut, inputData,
    printArray, check.
    """
    print("\ntf.nn.linalg.matmul -----------------------------------------------")
    # TensorFlow part ----------------------------------------------------------
    x = tf.compat.v1.placeholder(tf.float32, [None, hIn, wIn, cIn], name='x')
    weight = tf.compat.v1.get_variable(
        'w1',
        shape=[hIn * wIn * cIn, cOut],
        initializer=tf.truncated_normal_initializer(mean=0, stddev=0.1))
    _h1 = tf.reshape(x, [-1, hIn * wIn * cIn])  # flatten per sample before the matmul
    y = tf.linalg.matmul( \
        _h1,
        weight,
        transpose_a=False,
        transpose_b=False,
        adjoint_a=False,
        adjoint_b=False,
        a_is_sparse=False,
        b_is_sparse=False,
        name='y'
    )
    tfConfig = tf.compat.v1.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.compat.v1.Session(config=tfConfig)
    sess.run(tf.compat.v1.global_variables_initializer())
    outputTF = sess.run(y, feed_dict={x: inputData})

    tfPara = {}  # save the weights so the TensorRT side can load them
    print("Weight:")
    for i in tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES):
        name, value = i.name, sess.run(i)
        print(name, value.shape)
        tfPara[name] = value
    np.savez("para_tf_nn_linalg_matmul.npz", **tfPara)
    sess.close()

    # TensorRT part ------------------------------------------------------------
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    inputT0 = network.add_input('inputT0', trt.float32, (-1, hIn, wIn, cIn))
    profile.set_shape(inputT0.name, (1, hIn, wIn, cIn), (nIn, hIn, wIn, cIn),
                      (nIn * 2, hIn, wIn,
                       cIn))  # the range only needs to cover the shapes used later
    config.add_optimization_profile(profile)
    # load the saved weight; transpose [in,out] -> [out,in] for fully_connected
    weight = np.load('./para_tf_nn_linalg_matmul.npz')['w1:0'].transpose(
        1, 0).reshape(-1)
    _h1 = network.add_fully_connected(inputT0, cOut, weight, None)
    _h2 = network.add_shape(_h1.get_output(0))  # drop the trailing (1,1) dims to match the TF model
    _h3 = network.add_slice(_h2.get_output(0), [0], [2], [1])
    _h4 = network.add_shuffle(_h1.get_output(0))
    _h4.set_input(1, _h3.get_output(0))  # reshape target comes from the sliced shape tensor
    network.mark_output(_h4.get_output(0))
    engineString = builder.build_serialized_network(network, config)
    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, hIn, wIn, cIn])
    _, stream = cudart.cudaStreamCreate()

    inputH0 = np.ascontiguousarray(inputData.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    printArray(inputData, "input")  #print(inputData)
    printArray(outputTF, "TF output")  #print(outputTF)
    printArray(outputH0, "TRT output")  #print(outputH0)
    check(outputTF, outputH0, True)
    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)