def refit_engine(self, net):
    """Refit ``self.engine`` with the current weights held by ``net``.

    Each entry of ``self.refit_weight_dict`` maps a TensorRT layer name to a
    description dict whose ``"type"`` is ``"Linear"``, ``"Convolution"`` or
    ``"BatchNorm"`` and whose remaining values are keys into
    ``net.state_dict()`` (plus ``"eps"`` for batch norm).

    Args:
        net: Object exposing ``state_dict()``; the returned tensors are read
            with ``.detach().cpu().numpy()`` (presumably a
            ``torch.nn.Module`` -- confirm against the caller).

    Raises:
        NotImplementedError: If an entry has an unsupported ``"type"``.
        RuntimeError: If the refitter still reports missing weights, or if
            ``refit_cuda_engine()`` reports failure.
    """
    with trt.Refitter(self.engine, self.logger) as refitter:
        state_dict = net.state_dict()
        # Why keep a list of arrays? Per the original author's note, the
        # refitter (TensorRT 5.1.5.0) only consumes the buffers inside
        # refit_cuda_engine(), so every NumPy array handed to set_weights()
        # must stay referenced until that call, otherwise it may be freed
        # once its Python ref count drops to zero.
        keep_alive = []

        def as_numpy(key):
            return state_dict[key].detach().cpu().numpy()

        for layer_name, spec in self.refit_weight_dict.items():
            layer_type = spec["type"]
            if layer_type in ("Linear", "Convolution"):
                kernel = state_dict[spec["weight"]].detach()
                if layer_type == "Convolution":
                    # Only the convolution kernel is cast to float32, as in
                    # the original code.
                    kernel = kernel.float()
                kernel = kernel.cpu().numpy()
                refitter.set_weights(layer_name, trt.WeightsRole.KERNEL,
                                     kernel)
                keep_alive.append(kernel)
                if "bias" in spec:
                    bias = as_numpy(spec["bias"])
                    refitter.set_weights(layer_name, trt.WeightsRole.BIAS,
                                         bias)
                    keep_alive.append(bias)
            elif layer_type == "BatchNorm":
                running_var = as_numpy(spec["running_var"])
                running_mean = as_numpy(spec["running_mean"])
                gamma = as_numpy(spec["weight"])
                beta = as_numpy(spec["bias"])
                # Fold batch norm into y = scale * x + shift.
                std = np.sqrt(running_var + spec["eps"])
                shift = (-running_mean / std) * gamma + beta
                scale = gamma / std
                refitter.set_weights(layer_name, trt.WeightsRole.SCALE, scale)
                refitter.set_weights(layer_name, trt.WeightsRole.SHIFT, shift)
                keep_alive.append(scale)
                keep_alive.append(shift)
            else:
                raise NotImplementedError

        # Get description of missing weights. This should return empty
        # lists in this case.
        missing_layers, _weight_roles = refitter.get_missing()
        if len(missing_layers) != 0:
            raise RuntimeError(
                "Refitter found missing weights. "
                "Call set_weights() for all missing weights")

        # Refit the engine with the new weights. The call must not live
        # inside an ``assert``: under ``python -O`` asserts are stripped and
        # the engine would silently never be refitted.
        if not refitter.refit_cuda_engine():
            raise RuntimeError("refit_cuda_engine() failed")
def main():
    """Refit a BiDAF engine twice and check the predicted answer each time.

    The engine is first refitted with all-ones weights for
    ``Parameter576_B_0`` (the answer is expected to be wrong) and then with
    the weights loaded from ``Parameter576_B_0.npy`` (the answer is expected
    to be ``[b'brown']``).

    Raises:
        RuntimeError: If the refitter reports missing weights, the refit
            fails, or the predicted answer does not match the expectation.
    """
    onnx_file_path = 'bidaf-modified.onnx'
    engine_file_path = "bidaf.trt"

    # input
    context_text = 'A quick brown fox jumps over the lazy dog.'
    query = 'What color is the fox?'
    cw_str, _ = preprocess(context_text)
    # get ravelled data
    cw, cc, qw, qc = get_inputs(context_text, query)

    # Do inference with TensorRT
    refit_weights = np.load("Parameter576_B_0.npy")
    fake_weights = np.ones_like(refit_weights)

    engine = get_engine(onnx_file_path, engine_file_path)
    refitter = trt.Refitter(engine, TRT_LOGGER)
    # Separate name from the input text above; the original reused
    # ``context`` for both.
    exec_context = engine.create_execution_context()

    for weights, answer_correct in [(fake_weights, False),
                                    (refit_weights, True)]:
        print("Refitting engine...")
        # To get a list of all refittable weights' names
        # in the network, use refitter.get_all_weights().

        # Refit named weights via set_named_weights
        refitter.set_named_weights('Parameter576_B_0', weights)

        # Get missing weights names. This should return empty
        # lists in this case.
        missing_weights = refitter.get_missing_weights()
        if len(missing_weights) != 0:
            raise RuntimeError(
                "Refitter found missing weights. Call set_named_weights() or "
                "set_weights() for all missing weights")

        # Refit the engine with the new weights. Kept out of ``assert`` so
        # the refit still runs (and is still checked) under ``python -O``.
        if not refitter.refit_cuda_engine():
            raise RuntimeError("refit_cuda_engine() failed")

        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        print("Doing inference...")
        # Do inference
        # Set host input. The common.do_inference_v2 function will copy the
        # input to the GPU before executing.
        inputs[0].host = cw
        inputs[1].host = cc
        inputs[2].host = qw
        inputs[3].host = qc
        trt_outputs = common.do_inference_v2(exec_context,
                                             bindings=bindings,
                                             inputs=inputs,
                                             outputs=outputs,
                                             stream=stream)

        # ``np.asscalar(a)`` was a thin wrapper around ``a.item()`` and has
        # been removed from NumPy; call ``item()`` directly.
        start = trt_outputs[0].item()
        end = trt_outputs[1].item()
        answer = [w.encode() for w in cw_str[start:end + 1].reshape(-1)]
        if answer_correct != (answer == [b'brown']):
            raise RuntimeError(
                "Unexpected answer after refit: {!r}".format(answer))
    print("Passed")
def main():
    """Train an MNIST model, then refit a TensorRT engine with its weights.

    The engine comes from ``build_engine_with_some_missing_weights``.
    Accuracy is measured before the refit, the ``conv_1`` kernel and bias are
    set from the trained weights, and accuracy is measured again and compared
    with the model's latest test-set accuracy.

    Raises:
        RuntimeError: If the refitter reports missing weights, the refit
            fails, or the post-refit accuracy is below the expected value.
    """
    common.add_help(description="Runs an MNIST network using a PyTorch model")
    # Train the PyTorch model
    mnist_model = model.MnistModel()
    mnist_model.learn()
    weights = mnist_model.get_weights()
    # Do inference with TensorRT.
    with build_engine_with_some_missing_weights(weights) as engine:
        # Build an engine, allocate buffers and create a stream.
        # For more information on buffer allocation, refer to the
        # introductory samples.
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        print("Accuracy Before Engine Refit")
        get_trt_test_accuracy(engine, inputs, outputs, bindings, stream,
                              mnist_model)

        # Refit the engine with the actual trained weights for the conv_1
        # layer.
        with trt.Refitter(engine, TRT_LOGGER) as refitter:
            # To get a list of all refittable layers and associated
            # weightRoles in the network, use refitter.get_all()
            # Set the actual weights for the conv_1 layer. Since it consists
            # of kernel weights and bias weights, set each of them by
            # specifying the WeightsRole.
            refitter.set_weights("conv_1", trt.WeightsRole.KERNEL,
                                 weights['conv1.weight'].numpy())
            refitter.set_weights("conv_1", trt.WeightsRole.BIAS,
                                 weights['conv1.bias'].numpy())
            # Get description of missing weights. This should return empty
            # lists in this case.
            missing_layers, _weight_roles = refitter.get_missing()
            if len(missing_layers) != 0:
                raise RuntimeError(
                    "Refitter found missing weights. "
                    "Call set_weights() for all missing weights")
            # Refit the engine with the new weights. The call is made
            # outside of ``assert`` so it is not stripped by ``python -O``.
            if not refitter.refit_cuda_engine():
                raise RuntimeError("refit_cuda_engine() failed")

        expected_correct_predictions = (
            mnist_model.get_latest_test_set_accuracy())
        print(
            "Accuracy After Engine Refit (expecting {:.1f}% correct predictions)"
            .format(100 * expected_correct_predictions))
        # Run the inference unconditionally, then compare.
        refit_accuracy = get_trt_test_accuracy(engine, inputs, outputs,
                                               bindings, stream, mnist_model)
        if refit_accuracy < expected_correct_predictions:
            raise RuntimeError(
                "Accuracy after refit is below the expected accuracy")
def run():
    """Build or refit a TensorRT engine from ONNX, then run one inference.

    If ``trtFile`` exists, the serialized engine is loaded and refitted with
    the weights parsed from ``onnxFile1``. Otherwise a refittable engine is
    built from ``onnxFile0`` and written to ``trtFile``. In both cases the
    image at ``inputImage`` is then run through the engine and the output is
    printed.

    ``trtFile``, ``onnxFile0``, ``onnxFile1`` and ``inputImage`` are free
    names here and are expected to be module-level globals.

    Load, parse and build failures print a message and call ``exit()``; a
    failed refit prints the missing weights and returns.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    # Evaluate once so the load step and the refit/build step below always
    # agree, even if the file appears or disappears in between.
    has_plan = os.path.isfile(trtFile)
    if has_plan:
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
        # The plan file already exists: parse onnxFile1 as the refit source.
        onnxFile = onnxFile1
    else:
        # No plan file yet: build it from onnxFile0 first.
        onnxFile = onnxFile0

    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()
    config.flags = 1 << int(trt.BuilderFlag.REFIT)
    config.max_workspace_size = 3 << 30
    parser = trt.OnnxParser(network, logger)
    if not os.path.exists(onnxFile):
        print("Failed finding .onnx file!")
        exit()
    print("Succeeded finding .onnx file!")
    with open(onnxFile, 'rb') as model:
        if not parser.parse(model.read()):
            print("Failed parsing .onnx file!")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            exit()
        print("Succeeded parsing .onnx file!")

    if has_plan:
        # Perform the refit.
        refitter = trt.Refitter(engine, logger)
        layerNameList, weightRoleList = refitter.get_all()
        for name, role in zip(layerNameList, weightRoleList):
            print("LayerName:%s,WeightRolw:%s" % (name, role))

        for i in range(network.num_layers):
            layer = network.get_layer(i)
            if layer.name not in layerNameList:
                continue
            # Depending on the actual network, more layer types may need to
            # be handled here.
            if layer.type == trt.LayerType.CONVOLUTION:
                # Re-type the generic layer object so its kernel/bias
                # attributes become accessible.
                layer.__class__ = trt.IConvolutionLayer
                refitter.set_weights(layer.name, trt.WeightsRole.KERNEL,
                                     layer.kernel)
                refitter.set_weights(layer.name, trt.WeightsRole.BIAS,
                                     layer.bias)
            if layer.type == trt.LayerType.FULLY_CONNECTED:
                layer.__class__ = trt.IFullyConnectedLayer
                refitter.set_weights(layer.name, trt.WeightsRole.KERNEL,
                                     layer.kernel)
            if layer.type == trt.LayerType.CONSTANT:
                layer.__class__ = trt.IConstantLayer
                refitter.set_weights(layer.name, trt.WeightsRole.CONSTANT,
                                     layer.weights)

        if not refitter.refit_cuda_engine():
            print("Failed refitting engine, missing weight:")
            missingLayer, weightRole = refitter.get_missing()
            for layer, role in zip(missingLayer, weightRole):
                # Report the missing layer itself, not the stale ``name``
                # left over from the listing loop above.
                print("\tLayerName:%s,WeightRolw:%s" % (layer, role))
            return
        print("Succeeded refitting engine!")
    else:
        # Build the plan file.
        inputTensor = network.get_input(0)
        inputTensor.shape = [1, 1, 28, 28]
        # To debug the parsed network, iterate ``network.get_layer(i)`` for
        # ``i in range(network.num_layers)`` and print each layer's type,
        # name and input/output tensors.
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            exit()
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, [1, 1, 28, 28])
    _, stream = cudart.cudaStreamCreate()
    print("Binding0->", engine.get_binding_shape(0),
          context.get_binding_shape(0), engine.get_binding_dtype(0))
    print("Binding1->", engine.get_binding_shape(1),
          context.get_binding_shape(1), engine.get_binding_dtype(1))

    data = cv2.imread(inputImage, cv2.IMREAD_GRAYSCALE).astype(np.float32)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    # Host -> device copy, inference and device -> host copy are all queued
    # on the same stream, then the stream is synchronized.
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    print("inputH0 :", data.shape)
    #print(data)
    print("outputH0:", outputH0.shape)
    print(outputH0)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
    print("Succeeded running model in TensorRT!")
def run(nRunTime):
    """Load or build a refittable one-convolution engine and run it once.

    If ``trtFile`` exists, the engine is deserialized from it. Otherwise a
    network with a single convolution using all-zero placeholder weights
    (named ``"conv-w"`` / ``"conv-b"``) is built with the REFIT flag and
    saved to ``trtFile``.

    ``trtFile``, ``nIn``, ``cIn``, ``hIn``, ``wIn``, ``cOut``, ``hW``,
    ``wW``, ``weight``, ``bias`` and ``data`` are free names here and are
    expected to be module-level globals.

    Args:
        nRunTime: When ``0`` the engine is run as-is; any other value refits
            ``"conv-w"`` and ``"conv-b"`` with ``weight`` and ``bias`` first.

    Returns:
        None. Returns early, after printing a message, when loading,
        building or refitting the engine fails.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        config = builder.create_builder_config()
        config.flags = 1 << int(trt.BuilderFlag.REFIT)

        inputT0 = network.add_input('inputT0', trt.float32,
                                    (nIn, cIn, hIn, wIn))
        # Placeholder weights. The kernel is (hW, wW), so the placeholder
        # must use hW for the height; the original used wW for both
        # spatial dimensions, which is only right when hW == wW.
        fakeWeight = np.zeros([cOut, cIn, hW, wW], dtype=np.float32)
        fakeBias = np.zeros([cOut], dtype=np.float32)
        convolutionLayer = network.add_convolution_nd(
            inputT0, cOut, (hW, wW), fakeWeight, fakeBias)
        # Name the weights so they can be refitted via set_named_weights().
        network.set_weights_name(convolutionLayer.kernel, "conv-w")
        network.set_weights_name(convolutionLayer.bias, "conv-b")

        network.mark_output(convolutionLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    if nRunTime == 0:
        print("Do not refit!")
    else:
        print("Refit!")
        refitter = trt.Refitter(engine, logger)
        refitter.set_named_weights("conv-w", weight)
        refitter.set_named_weights("conv-b", bias)

        missingLayer, weightRole = refitter.get_missing()
        for layer, role in zip(missingLayer, weightRole):
            print("[", layer, "-", role, "]")

        if not refitter.refit_cuda_engine():
            print("Failed Refitting engine!")
            return

    context = engine.create_execution_context()
    _, stream = cudart.cudaStreamCreate()

    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    # Host -> device copy, inference and device -> host copy are all queued
    # on the same stream, then the stream is synchronized.
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    print("data:", data.shape)
    print(data)
    print("outputH0:", outputH0.shape)
    print(outputH0)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
def main():
    """Train an MNIST model and refit a TensorRT engine with its weights.

    Steps: train the PyTorch model, build an engine via
    ``build_engine_with_some_missing_weights``, report accuracy before the
    refit, set the ``conv_1`` kernel and bias from the trained weights, refit,
    and check that accuracy afterwards reaches the model's latest test-set
    accuracy.

    Raises:
        RuntimeError: If the refitter reports missing weights, the refit
            fails, or the post-refit accuracy is below the expected value.
    """
    # add_help is implemented in common.py; in practice it is a command-line
    # argument parser.
    common.add_help(description="Runs an MNIST network using a PyTorch model")

    # Train the PyTorch model: create it, train it, then extract its weights.
    mnist_model = model.MnistModel()
    mnist_model.learn()
    weights = mnist_model.get_weights()

    # Do inference with TensorRT.
    # build_engine_with_some_missing_weights is implemented in this file.
    with build_engine_with_some_missing_weights(weights) as engine:
        # Build an engine, allocate buffers and create a stream.
        # For more information on buffer allocation, refer to the
        # introductory samples. allocate_buffers is implemented in common.py.
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        print("Accuracy Before Engine Refit")
        # Run inference and compute the accuracy.
        get_trt_test_accuracy(engine, inputs, outputs, bindings, stream,
                              mnist_model)

        # Refit the engine with the actual trained weights for the conv_1
        # layer. Refitter updates the weights inside an engine; see
        # https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Refitter.html
        with trt.Refitter(engine, TRT_LOGGER) as refitter:
            # To get a list of all refittable layers and associated
            # weightRoles in the network, use refitter.get_all()
            # Set the actual weights for the conv_1 layer. Since it consists
            # of kernel weights and bias weights, set each of them by
            # specifying the WeightsRole.
            conv1_kernel = weights['conv1.weight'].numpy()
            conv1_bias = weights['conv1.bias'].numpy()
            refitter.set_weights("conv_1", trt.WeightsRole.KERNEL,
                                 conv1_kernel)
            refitter.set_weights("conv_1", trt.WeightsRole.BIAS, conv1_bias)

            # Get description of missing weights. This should return empty
            # lists in this case.
            missingLayers, weightRoles = refitter.get_missing()
            if len(missingLayers) != 0:
                raise RuntimeError(
                    "Refitter found missing weights. "
                    "Call set_weights() for all missing weights")

            # Refit the engine with the new weights; refit_cuda_engine()
            # returns True on success. It is called outside of ``assert``
            # so the refit is not dropped when running under ``python -O``.
            refit_ok = refitter.refit_cuda_engine()
            if not refit_ok:
                raise RuntimeError("refit_cuda_engine() failed")

        # get_latest_test_set_accuracy is implemented in model.py.
        expected_correct_predictions = (
            mnist_model.get_latest_test_set_accuracy())
        print(
            "Accuracy After Engine Refit (expecting {:.1f}% correct predictions)"
            .format(100 * expected_correct_predictions))
        # Measure the TensorRT inference accuracy unconditionally, then
        # compare it with the expectation.
        accuracy_after_refit = get_trt_test_accuracy(
            engine, inputs, outputs, bindings, stream, mnist_model)
        if accuracy_after_refit < expected_correct_predictions:
            raise RuntimeError(
                "Accuracy after refit is below the expected accuracy")