def __init__(self, model_name='frozen_inference_graph', input_shape=(300, 300), cuda_ctx=None):
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
    import pycuda.autoinit
    self.inputDims = (3, input_shape[0], input_shape[1])
    self.input_shape = input_shape
    model_loc = os.path.join(WORK_DIR, MODEL_DIR, model_name + '.pb')
    uff_loc = os.path.join(WORK_DIR, MODEL_DIR, model_name + '.uff')
    bin_loc = os.path.join(WORK_DIR, MODEL_DIR, model_name + '.bin')
    self.spec = {
        'input_pb': model_loc,
        'tmp_uff': uff_loc,
        'output_bin': bin_loc,
        'num_classes': 91,
        'min_size': 0.2,
        'max_size': 0.95,
        'input_order': [0, 2, 1],  # order of loc_data, conf_data, priorbox_data
    }
    if not os.path.isfile(bin_loc):
        self.convert()
    self.cuda_ctx = cuda_ctx
    if self.cuda_ctx:
        self.cuda_ctx.push()
    self.trt_logger = trt.Logger(trt.Logger.INFO)
    self.engine = self._load_engine()
    self.set_context()
def run(): logger = trt.Logger(trt.Logger.ERROR) trt.init_libnvinfer_plugins(logger, '') ctypes.cdll.LoadLibrary(soFilePath) engine = buildEngine(logger) if engine == None: print("Failed building engine!") return None print("Succeeded building engine!") context = engine.create_execution_context() stream = cuda.Stream() inputH0 = np.ascontiguousarray(np.random.rand(nElement).astype(np.float32).reshape(-1)) inputD0 = cuda.mem_alloc(inputH0.nbytes) inputH1 = np.ascontiguousarray(np.random.rand(nElement, nWidth).astype(np.float32).reshape(-1)) inputD1 = cuda.mem_alloc(inputH1.nbytes) outputH0 = np.empty(engine.get_binding_shape(2), dtype=np.float32) outputD0 = cuda.mem_alloc(outputH0.nbytes) outputH1 = np.empty(engine.get_binding_shape(3), dtype=np.float32) outputD1 = cuda.mem_alloc(outputH1.nbytes) cuda.memcpy_htod_async(inputD0, inputH0, stream) cuda.memcpy_htod_async(inputD1, inputH1, stream) context.execute_async(1, [int(inputD0), int(inputD1), int(outputD0), int(outputD1)], stream.handle) cuda.memcpy_dtoh_async(outputH0, outputD0, stream) cuda.memcpy_dtoh_async(outputH1, outputD1, stream) stream.synchronize() outputCPUH0, outputCPUH1 = sortCPU(inputH0, inputH1.reshape(nElement, nWidth)) print(np.shape(outputH0), np.shape(outputH1)) print("Check result Key:", "True" if np.mean(np.abs(outputH0.reshape(-1) - outputCPUH0.reshape(-1))) < epsilon else "False") print("Check result Value:", "True" if np.mean(np.abs(outputH1.reshape(-1) - outputCPUH1.reshape(-1))) < epsilon else "False") '''
def run(batchSize, shape):
    print("test", batchSize, *shape)
    logger = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger, shape)
    if engine == None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")
    context = engine.create_execution_context()
    stream = cuda.Stream()

    data = np.array(np.random.rand(batchSize, *shape) * 2 - 1, dtype=np.float32)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    outputH0 = np.empty((batchSize, ) + tuple(context.get_binding_shape(1)), dtype=trt.nptype(engine.get_binding_dtype(1)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    context.execute_async(batchSize, [int(inputD0), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    stream.synchronize()

    #print("data:", np.shape(data), data.dtype, np.mean(data), np.var(data), np.max(data), np.min(data))
    #print(data)
    #print("hOut:", np.shape(outputH0), outputH0.dtype, np.mean(outputH0), np.var(outputH0), np.max(outputH0), np.min(outputH0))
    #print(outputH0)
    print("check result:", np.all(np.sign(data) == outputH0), "\n")
def run(): logger = trt.Logger(trt.Logger.ERROR) trt.init_libnvinfer_plugins(logger, '') ctypes.cdll.LoadLibrary(soFilePath) engine = buildEngine(logger) if engine == None: print("Failed building engine!") return None print("Succeeded building engine!") context = engine.create_execution_context() stream = cuda.Stream() data = np.array([ 7, 5, 6, 4, 4, 2, 5, 3, 3, 9, 9, 7, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ]).reshape(2, 3, 4).astype(np.float32) inputH0 = np.ascontiguousarray(data.reshape(-1)) inputD0 = cuda.mem_alloc(inputH0.nbytes) outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1))) outputD0 = cuda.mem_alloc(outputH0.nbytes) cuda.memcpy_htod_async(inputD0, inputH0, stream) context.execute_async(1, [int(inputD0), int(outputD0)], stream.handle) cuda.memcpy_dtoh_async(outputH0, outputD0, stream) stream.synchronize() print("input=\n", data) print("real output=\n", outputH0)
def run(nBatchSize, shape, isSum):
    print("test", nBatchSize, shape, isSum)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger, shape, isSum)
    if engine == None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")
    context = engine.create_execution_context()
    stream = cuda.Stream()

    data = np.random.rand(*[nBatchSize, *shape]).astype(np.float32)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    outputH0 = np.empty((nBatchSize, ) + tuple(context.get_binding_shape(1)), dtype=trt.nptype(engine.get_binding_dtype(1)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    context.execute_async(nBatchSize, [int(inputD0), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    stream.synchronize()

    outputH0CPU = reduceCPU(data, isSum)
    print("Check result:", "True" if np.all(outputH0 == outputH0CPU) else "False")
def load(self, model_path: Union[str, Path], **_) -> Model:
    model_path = Path(model_path)
    LOGGER.debug(f"Loading TensorRT engine from {model_path}")
    engine = self._load_engine(model_path)
    if engine is None:
        LOGGER.debug("Unable to load engine without plugins. Loading plugins.")
        trt.init_libnvinfer_plugins(logger=TRT_LOGGER, namespace="")
        LOGGER.debug(f"Loading TensorRT engine with plugins from {model_path}")
        engine = self._load_engine(model_path)

    if engine is None:
        raise RuntimeError(f"Could not load ICudaEngine from {model_path}")

    inputs = {}
    outputs = {}
    for binding_idx in range(engine.num_bindings):
        name = engine.get_binding_name(binding_idx)
        is_input = engine.binding_is_input(binding_idx)
        dtype = np.dtype(trt.nptype(engine.get_binding_dtype(binding_idx))).name
        shape = engine.get_binding_shape(binding_idx)
        if is_input:
            inputs[name] = TensorSpec(name, dtype, shape)
        else:
            outputs[name] = TensorSpec(name, dtype, shape)

    return Model(engine, None, inputs, outputs)
def __init__(
    self,
    trt_engine_path,
    onnx_model_path,
    trt_engine_datatype=trt.DataType.FLOAT,
    calib_dataset=None,
    batch_size=1,
):
    """Initializes TensorRT objects needed for model inference.

    Args:
        trt_engine_path (str): path where the TensorRT engine should be stored
        onnx_model_path (str): path of the .onnx model
        trt_engine_datatype (trt.DataType): requested precision of the TensorRT engine used for inference
        batch_size (int): batch size for which the engine should be optimized
    """
    # Suppress informational messages and report only warnings and errors
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    self.batch_size = batch_size

    # We first load all custom plugins shipped with TensorRT;
    # some of them will be needed during inference
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # TRT engine placeholder
    self.trt_engine = None

    # Display requested engine settings to stdout
    print(">>> TensorRT inference engine settings:")
    print(">>> Inference precision: {}".format(trt_engine_datatype))
    print(">>> Max batch size: {}".format(batch_size))

    # If the engine is not cached, we need to build it
    if not os.path.exists(trt_engine_path):
        # This function uses the supplied .onnx file together with the ONNX parser
        # to build the TensorRT engine. For more details, check the implementation.
        self.trt_engine = build_engine(onnx_model_path, TRT_LOGGER)
        # Save the engine to file
        save_engine(self.trt_engine, trt_engine_path)

    # If we get here, the file with the engine exists, so we can load it
    if not self.trt_engine:
        print(">>> Loading cached TensorRT engine from {}".format(trt_engine_path))
        self.trt_engine = load_engine(trt_engine_path, TRT_LOGGER)

    # This allocates memory for network inputs/outputs on both CPU and GPU
    self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(self.trt_engine)

    # Execution context is needed for inference
    self.context = self.trt_engine.create_execution_context()

    # Allocate memory for multiple usage [e.g. multiple batch inference]
    input_volume = trt.volume(ModelData.INPUT_SHAPE)
    self.numpy_array = np.zeros((self.trt_engine.max_batch_size, input_volume))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('model', type=str, choices=list(MODEL_SPECS.keys()))
    args = parser.parse_args()

    # initialize
    if trt.__version__[0] < '7':
        ctypes.CDLL(LIB_FILE)
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # compile the model into a TensorRT engine
    model = args.model
    spec = MODEL_SPECS[model]
    dynamic_graph = add_plugin(gs.DynamicGraph(spec['input_pb']), model, spec)
    _ = uff.from_tensorflow(dynamic_graph.as_graph_def(),
                            output_nodes=['NMS'],
                            output_filename=spec['tmp_uff'],
                            text=True,
                            debug_mode=DEBUG_UFF)
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
        builder.max_workspace_size = 1 << 28
        builder.max_batch_size = 1
        builder.fp16_mode = True

        parser.register_input('Input', INPUT_DIMS)
        parser.register_output('MarkOutput_0')
        parser.parse(spec['tmp_uff'], network)
        engine = builder.build_cuda_engine(network)

        buf = engine.serialize()
        with open(spec['output_bin'], 'wb') as f:
            f.write(buf)
def __init__(self):
    # setup tensorrt engine
    trt_logger = trt.Logger(trt.Logger.INFO)
    TRTbin = 'ssd-mobilenet-v2-coco.trt'

    # load plugins
    trt.init_libnvinfer_plugins(trt_logger, '')

    # load engine
    with open(TRTbin, 'rb') as f, trt.Runtime(trt_logger) as runtime:
        self.engine = runtime.deserialize_cuda_engine(f.read())

    # create context
    self.host_inputs = []
    self.host_outputs = []
    self.cuda_inputs = []
    self.cuda_outputs = []
    self.bindings = []
    self.stream = cuda.Stream()
    for binding in self.engine:
        size = trt.volume(self.engine.get_binding_shape(binding)) * \
            self.engine.max_batch_size
        host_mem = cuda.pagelocked_empty(size, np.float32)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)
        self.bindings.append(int(cuda_mem))
        if self.engine.binding_is_input(binding):
            self.host_inputs.append(host_mem)
            self.cuda_inputs.append(cuda_mem)
        else:
            self.host_outputs.append(host_mem)
            self.cuda_outputs.append(cuda_mem)
    self.context = self.engine.create_execution_context()
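For context, a minimal sketch of how the page-locked buffers, bindings, and stream allocated in this constructor are typically consumed at inference time; the method name, the single-batch assumption, and the already-preprocessed input are mine, not taken from the snippet above:

def infer(self, img_preprocessed):
    # assumes img_preprocessed is already a flat float32 array sized to the input binding
    np.copyto(self.host_inputs[0], img_preprocessed.ravel())
    # copy the input to the device, run the engine, copy detections back, then wait for the stream
    cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
    self.context.execute_async(batch_size=1, bindings=self.bindings, stream_handle=self.stream.handle)
    cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
    self.stream.synchronize()
    return self.host_outputs[0]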
def __init__(self, model):
    # Initialize TRT environment
    self.input_shape = (300, 300)
    trt_logger = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(trt_logger, '')
    with open(model, 'rb') as f, trt.Runtime(trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    self.host_inputs = []
    self.cuda_inputs = []
    self.host_outputs = []
    self.cuda_outputs = []
    self.bindings = []
    self.stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        host_mem = cuda.pagelocked_empty(size, np.float32)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)
        self.bindings.append(int(cuda_mem))
        if engine.binding_is_input(binding):
            self.host_inputs.append(host_mem)
            self.cuda_inputs.append(cuda_mem)
        else:
            self.host_outputs.append(host_mem)
            self.cuda_outputs.append(cuda_mem)

    self.context = engine.create_execution_context()
    self.watch = Stopwatch()
def _load_engine(self, engine_file_path):
    # Force init TensorRT plugins
    trt.init_libnvinfer_plugins(None, '')
    with open(engine_file_path, "rb") as f, \
            trt.Runtime(self.trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    return engine
def main():
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # compile the model into a TensorRT engine
    model = 'ssd_mobilenet_v2_coco'
    spec = MODEL_SPECS[model]
    if not os.path.exists(spec['tmp_uff']):
        dynamic_graph = add_plugin(gs.DynamicGraph(spec['input_pb']), spec)
        uff_model = uff.from_tensorflow(dynamic_graph.as_graph_def(),
                                        output_nodes=['NMS'],
                                        output_filename=spec['tmp_uff'],
                                        text=True,
                                        debug_mode=DEBUG_UFF)
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
        builder.max_workspace_size = 1 << 28
        builder.max_batch_size = 1
        builder.fp16_mode = True

        parser.register_input('Input', INPUT_DIMS)
        parser.register_output('MarkOutput_0')
        parser.parse(spec['tmp_uff'], network)
        print("Building TensorRT engine. This may take a few minutes.")
        engine = builder.build_cuda_engine(network)

        buf = engine.serialize()
        with open(spec['output_bin'], 'wb') as f:
            f.write(buf)
        print("Saved engine.")
def __init__(self, engine_path, preprocess_fn=bgr8_to_ssd_input):
    logger = trt.Logger()
    trt.init_libnvinfer_plugins(logger, '')
    load_plugins()
    self.trt_model = TRTModel(engine_path,
                              input_names=[TRT_INPUT_NAME],
                              output_names=[TRT_OUTPUT_NAME, TRT_OUTPUT_NAME + '_1'])
    self.preprocess_fn = preprocess_fn
def create_trt_model_bin():
    ctypes.CDLL(LIB_FLATTEN_PATH)

    # initialize
    trt_logger = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(trt_logger, '')

    # compile model into TensorRT
    if not os.path.isfile(MODEL_TRT_BIN_PATH):
        dynamic_graph = model.add_plugin(gs.DynamicGraph(MODEL_PATH))
        uff_model = uff.from_tensorflow(dynamic_graph.as_graph_def(),
                                        model.output_name,
                                        output_filename='tmp.uff')
        with trt.Builder(trt_logger) as builder, builder.create_network() as network, trt.UffParser() as parser:
            builder.max_workspace_size = 1 << 28
            builder.max_batch_size = 1
            builder.fp16_mode = True

            parser.register_input('Input', model.dims)
            parser.register_output('MarkOutput_0')
            parser.parse('tmp.uff', network)
            engine = builder.build_cuda_engine(network)

            buf = engine.serialize()
            with open(MODEL_TRT_BIN_PATH, 'wb') as f:
                f.write(buf)
def run(inDim, outDatatype):
    print("test", inDim, outDatatype)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger, outDatatype)
    if engine == None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    context.set_binding_shape(0, inDim)
    context.set_binding_shape(1, inDim[:1])
    context.set_binding_shape(2, inDim[:1])
    #print("Bind0->", engine.get_binding_shape(0), context.get_binding_shape(0))
    #print("Bind1->", engine.get_binding_shape(1), context.get_binding_shape(1))
    #print("Bind2->", engine.get_binding_shape(2), context.get_binding_shape(2))
    print("All bind:", context.all_binding_shapes_specified)
    stream = cuda.Stream()

    data0 = np.full(inDim, 1, dtype=np.float32)
    data1 = np.random.randint(1, inDim[2], inDim[:1], dtype=np.int32)
    data2 = np.random.randint(1, inDim[3], inDim[:1], dtype=np.int32)
    inputH0 = np.ascontiguousarray(data0)
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputH1 = np.ascontiguousarray(data1)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    inputH2 = np.ascontiguousarray(data2)
    inputD2 = cuda.mem_alloc(inputH2.nbytes)
    outputH0 = np.empty(context.get_binding_shape(3), dtype=trt.nptype(engine.get_binding_dtype(3)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    cuda.memcpy_htod_async(inputD2, inputH2, stream)
    context.execute_async_v2([int(inputD0), int(inputD1), int(inputD2), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    stream.synchronize()

    outputH0CPU = mask2DCPU(inputH0, inputH1, inputH2, globalMask2DTrueValue, globalMask2DFalseValue)
    #print("InputH0->", inputH0.shape, engine.get_binding_dtype(0))
    #print(inputH0)
    #print("InputH1->", inputH1.shape, engine.get_binding_dtype(1))
    #print(inputH1)
    #print("InputH2->", inputH2.shape, engine.get_binding_dtype(2))
    #print(inputH2)
    #print("OutputH0->", outputH0.shape, engine.get_binding_dtype(3))
    #print(outputH0)
    #print("OutputH0CPU->", outputH0CPU.shape)
    #print(outputH0CPU)
    print("Check result:", "True" if np.all(outputH0 == outputH0CPU) else "False")
def __init__(self,
             topk,
             detection_threshold,
             iou_threshold,
             model_precision,
             batch_dim,
             trt_path=None,
             onnx_export=False):
    super().__init__()
    self.topk = torch.nn.Parameter(torch.tensor(topk, dtype=torch.int32), requires_grad=False)
    self.detection_threshold = torch.nn.Parameter(torch.tensor(detection_threshold), requires_grad=False)
    self.model_dtype = torch.float16 if model_precision == 'fp16' else torch.float32
    self.batch_dim = batch_dim
    self.class_dim = 81
    self.foreground_class_dim = self.class_dim - 1
    self.scale_xy = 0.1
    self.scale_wh = 0.2
    self.scale_xyxywhwh = torch.nn.Parameter(
        torch.tensor([self.scale_xy, self.scale_xy, self.scale_wh, self.scale_wh]),
        requires_grad=False)
    self.scale_wh_delta = torch.nn.Parameter(torch.tensor([-0.5, -0.5, 0.5, 0.5]), requires_grad=False)
    self.iou_threshold = iou_threshold
    self.dboxes_xywh = torch.nn.Parameter(init_dboxes(self.model_dtype).unsqueeze(dim=0), requires_grad=False)
    self.box_dim = torch.nn.Parameter(torch.tensor(self.dboxes_xywh.size(1)), requires_grad=False)
    self.buffer_nchw = torch.nn.Parameter(
        torch.zeros((batch_dim, 3, 300, 300), dtype=self.model_dtype),
        requires_grad=False)
    self.class_indexes = torch.nn.Parameter(
        torch.arange(1, self.class_dim).repeat(self.batch_dim * self.topk),
        requires_grad=False)
    self.image_indexes = torch.nn.Parameter(
        (torch.ones(self.topk * self.foreground_class_dim, dtype=torch.int32) *
         torch.arange(self.batch_dim).unsqueeze(-1)).view(-1),
        requires_grad=False)
    self.onnx_export = onnx_export

    self.trt_engine = None
    if trt_path:
        print('loading TRT engine from', trt_path, '...')
        self.trt_logger = trt.Logger()
        trt.init_libnvinfer_plugins(self.trt_logger, '')
        with open(trt_path, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:
            self.trt_engine = runtime.deserialize_cuda_engine(f.read())
        self.trt_stream = cuda.Stream()
        self.trt_context = self.trt_engine.create_execution_context()
    else:
        self.detector = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub',
                                       'nvidia_ssd',
                                       model_math=model_precision).eval()
def test_deserialize_engine(engine_name=ENGINE_NAME):
    import tensorrt as trt
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
    with open(engine_name, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    print("load engine!")
    return engine
def get_plugin_creator(plugin_name):
    trt.init_libnvinfer_plugins(logger, '')
    plugin_creator_list = trt.get_plugin_registry().plugin_creator_list
    plugin_creator = None
    for c in plugin_creator_list:
        if c.name == plugin_name:
            plugin_creator = c
    return plugin_creator
def get_plugin_creator(plugin_name, logger):
    """Get the TensorRT plugin creator."""
    trt.init_libnvinfer_plugins(logger, '')
    plugin_creator_list = trt.get_plugin_registry().plugin_creator_list
    for c in plugin_creator_list:
        if c.name == plugin_name:
            return c
    return None
def get_plugin_creator(self, plugin_name):
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
    plugin_creator_list = trt.get_plugin_registry().plugin_creator_list
    plugin_creator = None
    for c in plugin_creator_list:
        if c.name == plugin_name:
            plugin_creator = c
    return plugin_creator
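Once one of these helpers returns a creator, it is typically used to instantiate the plugin and wire it into a network. Below is a minimal sketch using the get_plugin_creator(plugin_name, logger) variant above; the plugin name 'LReLU_TRT', its 'negSlope' field, and the layer wiring are illustrative assumptions, not taken from the surrounding code:

import numpy as np
import tensorrt as trt

def add_custom_plugin_layer(network, input_tensor, logger):
    # look up the creator for a registered plugin (name is illustrative)
    creator = get_plugin_creator('LReLU_TRT', logger)
    if creator is None:
        raise RuntimeError('plugin creator not found; was init_libnvinfer_plugins called?')
    # plugin parameters are passed as a PluginFieldCollection
    fc = trt.PluginFieldCollection([
        trt.PluginField('negSlope', np.array([0.1], dtype=np.float32), trt.PluginFieldType.FLOAT32)
    ])
    plugin = creator.create_plugin(name='LReLU_TRT', field_collection=fc)
    # insert the plugin as a layer consuming the given tensor
    layer = network.add_plugin_v2(inputs=[input_tensor], plugin=plugin)
    return layer.get_output(0)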
def export_trt(pb_file, output_dir, num_classes=90, neuralet_adaptive_model=1):
    """
    Exports a TensorFlow pb model to a TensorRT engine.
    Args:
        pb_file: The path of the input pb file
        output_dir: A directory to store the output files
        num_classes: Detector's number of classes
    """
    lib_flatten_concat_file = "exporters/libflattenconcat.so.6"
    # initialize
    if trt.__version__[0] < '7':
        ctypes.CDLL(lib_flatten_concat_file)
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # compile the model into a TensorRT engine
    model = "ssd_mobilenet_v2_coco"

    if not os.path.isfile(pb_file):
        raise FileNotFoundError('model does not exist under: {}'.format(pb_file))

    if not os.path.isdir(output_dir):
        print("the provided output directory {0} does not exist".format(output_dir))
        print("creating output directory {0}".format(output_dir))
        os.makedirs(output_dir, exist_ok=True)

    dynamic_graph = plugin.add_plugin_and_preprocess(gs.DynamicGraph(pb_file),
                                                     model,
                                                     num_classes,
                                                     neuralet_adaptive_model)
    model_file_name = ".".join((pb_file.split("/")[-1]).split(".")[:-1])
    uff_path = os.path.join(output_dir, model_file_name + ".uff")
    _ = uff.from_tensorflow(dynamic_graph.as_graph_def(),
                            output_nodes=['NMS'],
                            output_filename=uff_path,
                            text=True,
                            debug_mode=False)
    input_dims = (3, 300, 300)
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, \
            builder.create_builder_config() as builder_config, trt.UffParser() as parser:
        builder_config.max_workspace_size = 1 << 28
        builder.max_batch_size = 1
        builder_config.set_flag(trt.BuilderFlag.FP16)

        parser.register_input('Input', input_dims)
        parser.register_output('MarkOutput_0')
        parser.parse(uff_path, network)
        engine = builder.build_engine(network, builder_config)

        buf = engine.serialize()
        engine_path = os.path.join(output_dir, model_file_name + ".bin")
        with open(engine_path, 'wb') as f:
            f.write(buf)
        print("your model has been converted to a TRT engine successfully under: {}".format(engine_path))
def __init__(self,
             trt_engine_path,
             uff_model_path,
             trt_engine_datatype=trt.DataType.FLOAT,
             calib_dataset=None,
             batch_size=1):
    """
    Build the TensorRT engine.
    """
    # load all custom plugins shipped with TensorRT
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # initialize the runtime needed for loading a TensorRT engine from file
    self.trt_runtime = trt.Runtime(TRT_LOGGER)

    # TensorRT engine placeholder
    trt_engine = TRTEngine()

    # display requested engine settings to stdout
    print("TensorRT inference engine settings:")
    print(" * Inference precision - {}".format(trt_engine_datatype))
    print(" * Max batch size - {}\n".format(batch_size))

    if not os.path.exists(os.path.dirname(trt_engine_path)):
        os.mkdir(os.path.dirname(trt_engine_path))

    # if the engine is not cached, we need to build it
    if not os.path.exists(trt_engine_path):
        # this function uses the supplied .uff file together with the UffParser
        # to build the TensorRT engine. For more details, check the implementation.
        trt_engine.build(uff_model_path=uff_model_path,
                         trt_logger=TRT_LOGGER,
                         trt_engine_datatype=trt_engine_datatype,
                         calib_dataset=calib_dataset,
                         batch_size=batch_size)
        # save the engine to file
        trt_engine.save(trt_engine_path)
    else:
        print("loading cached TensorRT engine from {}".format(trt_engine_path))
        trt_engine.load(self.trt_runtime, trt_engine_path)

    if trt_engine.engine is None:
        raise Exception('Error: TensorRT engine is not created!')

    # this allocates memory for network inputs/outputs on both CPU and GPU
    self.inputs, self.outputs, self.bindings, self.stream = \
        trt_engine.allocate_buffers()

    # execution context is needed for inference
    self.context = trt_engine.engine.create_execution_context()

    # allocate memory for multiple usage [e.g. multiple batch inference]
    # input_volume = trt.volume(DetectionModel.input_shape)
    # self.ls_image = np.zeros((trt_engine.max_batch_size, input_volume))
    # print(":::: input_volume:", input_volume)
    # print(":::: input_shape:", DetectionModel.input_shape)
    self.trt_engine = trt_engine
def __init__(self,
             trt_deploy_path,
             trt_engine_path,
             trt_model_path,
             trt_engine_datatype=trt.DataType.FLOAT,
             batch_size=1):
    """Initializes TensorRT objects needed for model inference.

    Args:
        trt_engine_path (str): path where the TensorRT engine should be stored
        trt_model_path (str): path of the caffe model
        trt_engine_datatype (trt.DataType): requested precision of the TensorRT engine used for inference
        batch_size (int): batch size for which the engine should be optimized
    """
    # We first load all custom plugins shipped with TensorRT,
    # some of them will be needed during inference
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # Initialize the runtime needed for loading the TensorRT engine from file
    self.trt_runtime = trt.Runtime(TRT_LOGGER)

    # TRT engine placeholder
    self.trt_engine = None
    self.datatype = DATATYPE[trt_engine_datatype]

    # Display requested engine settings to stdout
    print("TensorRT inference engine settings:")
    print(" * Inference precision - {}".format(trt_engine_datatype))
    print(" * Max batch size - {}\n".format(batch_size))

    # If the engine is not cached, we need to build it
    if not os.path.exists(trt_engine_path):
        # For more details, check the implementation
        self.trt_engine = engine_utils.build_engine(
            trt_deploy_path,
            trt_model_path,
            TRT_LOGGER,
            trt_engine_datatype=trt_engine_datatype,
            batch_size=batch_size)
        print("self.trt_engine:", self.trt_engine)
        # Save the engine to file
        engine_utils.save_engine(self.trt_engine, trt_engine_path)

    # If we get here, the file with the engine exists, so we can load it
    if not self.trt_engine:
        print("Loading cached TensorRT engine from {}".format(trt_engine_path))
        self.trt_engine = engine_utils.load_engine(self.trt_runtime, trt_engine_path)

    # This allocates memory for network inputs/outputs on both CPU and GPU
    self.inputs, self.outputs, self.bindings, self.stream = common.allocate_buffers(self.trt_engine)
    # engine_utils.allocate_buffers(self.trt_engine)

    # Execution context is needed for inference
    self.context = self.trt_engine.create_execution_context()

    # Allocate memory for multiple usage [e.g. multiple batch inference]
    input_volume = trt.volume(model_utils.ModelData.INPUT_SHAPE)
    print("input_volume:", input_volume)
    print("self.trt_engine.max_batch_size:", self.trt_engine.max_batch_size)
    self.numpy_array = np.zeros((self.trt_engine.max_batch_size, input_volume), dtype=np.float32)
    self.im_info = np.zeros((self.trt_engine.max_batch_size, 3), dtype=np.float32)
def __init__(self, trt_logger_severity=trt.Logger.INFO):
    """
    :param trt_logger_severity: severity level for the TensorRT logger
    """
    self._trt_logger_severity = trt_logger_severity
    self._TRT_LOGGER = trt.Logger(trt_logger_severity)
    trt.init_libnvinfer_plugins(self._TRT_LOGGER, "")
def load_plugins():
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
    libname = 'libflattenconcat.so'
    basedir = os.path.abspath(os.path.dirname(__file__))
    libpath = os.path.join(basedir, libname)
    if os.path.exists(libpath):
        ctypes.CDLL(libpath)
    else:
        ctypes.CDLL(libname)
def __init__(self, trt_path, gpuID):
    self.cfx = cuda.Device(gpuID).make_context()
    self.stream = cuda.Stream()
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
    runtime = trt.Runtime(TRT_LOGGER)

    with open(trt_path, 'rb') as f:
        buf = f.read()
        self.engine = runtime.deserialize_cuda_engine(buf)
    self.context = self.engine.create_execution_context()
    self.inputs, self.outputs, self.bindings, self.stream = common.allocate_buffers(self.engine)
def run(inDim, inDatatype):
    print("test", inDim, inDatatype)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger, inDatatype, len(inDim))
    if engine == None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    context.set_binding_shape(0, inDim)
    context.set_binding_shape(1, inDim[:1])
    #print("Bind0->", engine.get_binding_shape(0), context.get_binding_shape(0))
    #print("Bind1->", engine.get_binding_shape(1), context.get_binding_shape(1))
    #print("Bind2->", engine.get_binding_shape(2), context.get_binding_shape(2))
    #print("All bind:", context.all_binding_shapes_specified)
    stream = cuda.Stream()

    data0 = np.arange(np.prod(inDim), dtype=inDatatype).reshape(inDim)
    data1 = np.arange(1, inDim[0] + 1, dtype=np.int32)
    data1[data1 > inDim[1]] = inDim[1]
    inputH0 = np.ascontiguousarray(data0)
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputH1 = np.ascontiguousarray(data1)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    outputH0 = np.empty(context.get_binding_shape(2), dtype=trt.nptype(engine.get_binding_dtype(2)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    context.execute_async_v2([int(inputD0), int(inputD1), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    stream.synchronize()

    outputH0CPU = reverseCPU(inputH0, inputH1)
    #print("InputH0->", inputH0.shape, engine.get_binding_dtype(0))
    #print(inputH0)
    #print("InputH1->", inputH1.shape, engine.get_binding_dtype(1))
    #print(inputH1)
    #print("OutputH0->", outputH0.shape, engine.get_binding_dtype(2))
    #print(cleanTrash(outputH0, inputH1))
    #print("OutputH0CPU->", outputH0CPU.shape)
    #print(outputH0CPU)
    print("Check result:", "True" if np.all(cleanTrash(outputH0, inputH1) == outputH0CPU) else "False")
def __init__(
    self,
    verbose: bool = False,
    workspace: int = 4,
    precision: str = "fp32",
    enable_dynamic: bool = False,
    max_batch_size: int = 16,
    calib_input: Optional[str] = None,
    calib_cache: Optional[str] = None,
    calib_num_images: int = 5000,
    calib_batch_size: int = 8,
):
    """
    Args:
        verbose (bool): If enabled, a higher verbosity level will be set on the TensorRT logger. Default: False
        workspace (int): Max memory workspace to allow, in GB. Default: 4
        precision (string): The datatype to use for the engine inference, either 'fp32', 'fp16' or 'int8'. Default: 'fp32'
        enable_dynamic (bool): Whether to enable dynamic shapes. Default: False
        max_batch_size (int): Maximum batch size reserved for dynamic shape inference. Default: 16
        calib_input (string, optional): The path to a directory holding the calibration images. Default: None
        calib_cache (string, optional): The path where to write the calibration cache to, or if it already exists, load it from. Default: None
        calib_num_images (int): The maximum number of images to use for calibration. Default: 5000
        calib_batch_size (int): The batch size to use for the calibration process. Default: 8
    """
    self.logger = trt.Logger(trt.Logger.INFO)
    if verbose:
        self.logger.min_severity = trt.Logger.Severity.VERBOSE

    trt.init_libnvinfer_plugins(self.logger, namespace="")

    self.builder = trt.Builder(self.logger)
    self.config = self.builder.create_builder_config()
    self.config.max_workspace_size = workspace * 1 << 30

    self.batch_size = None
    self.network = None
    self.parser = None

    # Leaving some interfaces and parameters for subsequent use, but we have
    # not yet implemented the following functionality
    self.precision = precision
    self.enable_dynamic = enable_dynamic
    self.max_batch_size = max_batch_size
    self.calib_input = calib_input
    self.calib_cache = calib_cache
    self.calib_num_images = calib_num_images
    self.calib_batch_size = calib_batch_size
def __init__(self, engine_path, preprocess_fn=bgr8_to_ssd_input):
    from .tensorrt_model import TRTModel
    from jnmouse.ssd_tensorrt import parse_boxes, TRT_INPUT_NAME, TRT_OUTPUT_NAME
    logger = trt.Logger()
    trt.init_libnvinfer_plugins(logger, '')
    load_plugins()
    self.trt_model = TRTModel(engine_path)
    ## If you want to specify input and output, use the following instead of the above.
    # self.trt_model = TRTModel(engine_path, input_names=[TRT_INPUT_NAME],
    #                           output_names=[TRT_OUTPUT_NAME, TRT_OUTPUT_NAME + '_1'])
    self.preprocess_fn = preprocess_fn
    self.postprocess_fn = parse_boxes
def run(nGroup, xWidth, yWidth, h, dim_t, datatype):
    print("test [%d,%d/%d,%d],dim_t=%d" % (nGroup, xWidth, yWidth, h, dim_t), datatype)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    weight = np.full([h, dim_t, h], 0.1, dtype=np.float32)
    engine = buildEngine(logger, [nGroup, max(xWidth, yWidth), h], dim_t, weight, datatype)
    if engine == None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    context.set_binding_shape(0, [nGroup, xWidth, h])
    context.set_binding_shape(1, [nGroup, yWidth, h])
    #print("Binding0->", engine.get_binding_shape(0), context.get_binding_shape(0))
    #print("Binding1->", engine.get_binding_shape(1), context.get_binding_shape(1))
    #print("Binding2->", engine.get_binding_shape(2), context.get_binding_shape(2))
    #print("All bind:", context.all_binding_shapes_specified)
    stream = cuda.Stream()

    data0 = np.ones([nGroup, xWidth, h], dtype=datatype)
    data1 = np.ones([nGroup, yWidth, h], dtype=datatype)
    inputH0 = np.ascontiguousarray(data0)
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputH1 = np.ascontiguousarray(data1)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    outputH0 = np.empty(context.get_binding_shape(2), dtype=trt.nptype(engine.get_binding_dtype(2)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    context.execute_async_v2([int(inputD0), int(inputD1), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    stream.synchronize()

    outputH0CPU = MMTCPU(inputH0, inputH1, weight)
    #print("InputH0->", inputH0.shape, engine.get_binding_dtype(0))
    #print(inputH0)
    #print("InputH1->", inputH1.shape, engine.get_binding_dtype(1))
    #print(inputH1)
    #print("OutputH0->", outputH0.shape, engine.get_binding_dtype(2))
    #print(outputH0)
    print("Check result:", "True" if np.all(outputH0 == outputH0CPU) else "False")