Example #1
    def __init__(self,
                 model_name='frozen_inference_graph',
                 input_shape=(300, 300),
                 cuda_ctx=None):

        trt.init_libnvinfer_plugins(TRT_LOGGER, '')
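        # pycuda.autoinit creates and activates a default CUDA context for this process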
        import pycuda.autoinit
        self.inputDims = (3, input_shape[0], input_shape[1])
        self.input_shape = input_shape
        model_loc = os.path.join(WORK_DIR, MODEL_DIR, model_name + '.pb')
        uff_loc = os.path.join(WORK_DIR, MODEL_DIR, model_name + '.uff')
        bin_loc = os.path.join(WORK_DIR, MODEL_DIR, model_name + '.bin')
        self.spec = {
            'input_pb': model_loc,
            'tmp_uff': uff_loc,
            'output_bin': bin_loc,
            'num_classes': 91,
            'min_size': 0.2,
            'max_size': 0.95,
            'input_order': [0, 2, 1],  # order of loc_data, conf_data, priorbox_data
        }
        if not os.path.isfile(bin_loc):
            self.convert()
        self.cuda_ctx = cuda_ctx
        if self.cuda_ctx:
            self.cuda_ctx.push()

        self.trt_logger = trt.Logger(trt.Logger.INFO)
        self.engine = self._load_engine()
        self.set_context()
def run():
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger)
    if engine is None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
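    # A CUDA stream plus host/device buffers for the engine's two inputs and
    # two outputs (output shapes come from bindings 2 and 3)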
    stream = cuda.Stream()

    inputH0 = np.ascontiguousarray(np.random.rand(nElement).astype(np.float32).reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputH1 = np.ascontiguousarray(np.random.rand(nElement, nWidth).astype(np.float32).reshape(-1))
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    outputH0 = np.empty(engine.get_binding_shape(2), dtype=np.float32)
    outputD0 = cuda.mem_alloc(outputH0.nbytes)
    outputH1 = np.empty(engine.get_binding_shape(3), dtype=np.float32)
    outputD1 = cuda.mem_alloc(outputH1.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    context.execute_async(1, [int(inputD0), int(inputD1), int(outputD0), int(outputD1)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    cuda.memcpy_dtoh_async(outputH1, outputD1, stream)
    stream.synchronize()

    outputCPUH0, outputCPUH1 = sortCPU(inputH0, inputH1.reshape(nElement, nWidth))

    print(np.shape(outputH0), np.shape(outputH1))
    print("Check result Key:", "True" if np.mean(np.abs(outputH0.reshape(-1) - outputCPUH0.reshape(-1))) < epsilon else "False")
    print("Check result Value:", "True" if np.mean(np.abs(outputH1.reshape(-1) - outputCPUH1.reshape(-1))) < epsilon else "False")
Example #3
def run(batchSize, shape):
    print("test", batchSize, *shape)
    logger = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)

    engine = buildEngine(logger, shape)
    if engine is None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    stream = cuda.Stream()

    data = np.array(np.random.rand(batchSize, *shape) * 2 - 1, dtype=np.float32)
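    # Flatten to a contiguous host array; the output buffer is sized and typed
    # from output binding 1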
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    outputH0 = np.empty((batchSize, ) + tuple(context.get_binding_shape(1)), dtype=trt.nptype(engine.get_binding_dtype(1)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    context.execute_async(batchSize, [int(inputD0), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    stream.synchronize()

    #print("data:", np.shape(data), data.dtype, np.mean(data), np.var(data), np.max(data), np.min(data))
    #print(data)
    #print("hOut:", np.shape(outputH0), outputH0.dtype, np.mean(outputH0), np.var(outputH0), np.max(outputH0), np.min(outputH0))
    #print(outputH0)
    print("check result:", np.all(np.sign(data) == outputH0), "\n")
def run():
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger)
    if engine is None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    stream = cuda.Stream()
    data = np.array([
        7, 5, 6, 4, 4, 2, 5, 3, 3, 9, 9, 7, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
        12
    ]).reshape(2, 3, 4).astype(np.float32)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    context.execute_async(1, [int(inputD0), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    stream.synchronize()

    print("input=\n", data)
    print("real output=\n", outputH0)
Example #5
def run(nBatchSize, shape, isSum):
    print("test", nBatchSize, shape, isSum)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger, shape, isSum)
    if engine is None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    stream = cuda.Stream()
    data = np.random.rand(nBatchSize, *shape).astype(np.float32)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    outputH0 = np.empty((nBatchSize, ) + tuple(context.get_binding_shape(1)),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    context.execute_async(nBatchSize,
                          [int(inputD0), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    stream.synchronize()
    outputH0CPU = reduceCPU(data, isSum)

    print("Check result:",
          ["True" if np.all(outputH0 == outputH0CPU) else "False"][0])
Example #6
    def load(self, model_path: Union[str, Path], **_) -> Model:
        model_path = Path(model_path)
        LOGGER.debug(f"Loading TensorRT engine from {model_path}")
        engine = self._load_engine(model_path)

        if engine is None:
            LOGGER.debug(
                "Unable to load engine without plugins. Loading plugins.")
            trt.init_libnvinfer_plugins(logger=TRT_LOGGER, namespace="")
            LOGGER.debug(
                f"Loading TensorRT engine with plugins from {model_path}")
            engine = self._load_engine(model_path)

        if engine is None:
            raise RuntimeError(f"Could not load ICudaEngine from {model_path}")

        inputs = {}
        outputs = {}
        for binding_idx in range(engine.num_bindings):
            name = engine.get_binding_name(binding_idx)
            is_input = engine.binding_is_input(binding_idx)
            dtype = np.dtype(trt.nptype(
                engine.get_binding_dtype(binding_idx))).name
            shape = engine.get_binding_shape(binding_idx)
            if is_input:
                inputs[name] = TensorSpec(name, dtype, shape)
            else:
                outputs[name] = TensorSpec(name, dtype, shape)

        return Model(engine, None, inputs, outputs)
Example #7
    def __init__(
        self,
        trt_engine_path,
        onnx_model_path,
        trt_engine_datatype=trt.DataType.FLOAT,
        calib_dataset=None,
        batch_size=1,
    ):
        """Initializes TensorRT objects needed for model inference.

        Args:
            trt_engine_path (str): path where TensorRT engine should be stored
            onnx_model_path (str): path of .onnx model
            trt_engine_datatype (trt.DataType):
                requested precision of TensorRT engine used for inference
            batch_size (int): batch size for which engine
                should be optimized for
        """
        # Suppress informational messages; report only warnings and errors
        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

        self.batch_size = batch_size
        # We first load all custom plugins shipped with TensorRT;
        # some of them will be needed during inference
        trt.init_libnvinfer_plugins(TRT_LOGGER, '')

        # TRT engine placeholder
        self.trt_engine = None

        # Display requested engine settings to stdout
        print(">>> TensorRT inference engine settings:")
        print(">>> Inference precision: {}".format(trt_engine_datatype))
        print(">>> Max batch size: {}".format(batch_size))

        # If engine is not cached, we need to build it
        if not os.path.exists(trt_engine_path):
            # This function uses the supplied .onnx file together with the
            # ONNX parser to build a TensorRT engine.
            # For more details, check the implementation
            self.trt_engine = build_engine(onnx_model_path, TRT_LOGGER)
            # Save the engine to file
            save_engine(self.trt_engine, trt_engine_path)

        # If we get here, the file with engine exists, so we can load it
        if not self.trt_engine:
            print(">>> Loading cached TensorRT engine from {}".format(
                trt_engine_path))
            self.trt_engine = load_engine(trt_engine_path, TRT_LOGGER)

        # This allocates memory for network inputs/outputs on both CPU and GPU
        self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(
            self.trt_engine)

        # Execution context is needed for inference
        self.context = self.trt_engine.create_execution_context()

        # Allocate memory for multiple usage [e.g. multiple batch inference]
        input_volume = trt.volume(ModelData.INPUT_SHAPE)
        self.numpy_array = np.zeros(
            (self.trt_engine.max_batch_size, input_volume))
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('model', type=str, choices=list(MODEL_SPECS.keys()))
    args = parser.parse_args()

    # initialize
    if int(trt.__version__.split('.')[0]) < 7:
        ctypes.CDLL(LIB_FILE)
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # compile the model into TensorRT engine
    model = args.model
    spec = MODEL_SPECS[model]
    dynamic_graph = add_plugin(gs.DynamicGraph(spec['input_pb']), model, spec)
    _ = uff.from_tensorflow(dynamic_graph.as_graph_def(),
                            output_nodes=['NMS'],
                            output_filename=spec['tmp_uff'],
                            text=True,
                            debug_mode=DEBUG_UFF)
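    # Parse the UFF graph and build a serialized FP16 engine
    # (max batch size 1, 256 MB workspace)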
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
    ) as network, trt.UffParser() as parser:
        builder.max_workspace_size = 1 << 28
        builder.max_batch_size = 1
        builder.fp16_mode = True

        parser.register_input('Input', INPUT_DIMS)
        parser.register_output('MarkOutput_0')
        parser.parse(spec['tmp_uff'], network)
        engine = builder.build_cuda_engine(network)

        buf = engine.serialize()
        with open(spec['output_bin'], 'wb') as f:
            f.write(buf)
Example #9
    def __init__(self):

        
        # setup tensorrt engine
        trt_logger = trt.Logger(trt.Logger.INFO)
        TRTbin = 'ssd-mobilenet-v2-coco.trt'

        # load plugins
        trt.init_libnvinfer_plugins(trt_logger, '')

        # load engine
        with open(TRTbin, 'rb') as f, trt.Runtime(trt_logger) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())

        # create context
        self.host_inputs = []
        self.host_outputs = []
        self.cuda_inputs = []
        self.cuda_outputs = []
        self.bindings = []
        self.stream = cuda.Stream()
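        # Allocate one page-locked host buffer and one device buffer per binding;
        # self.bindings keeps the device pointers in binding order for execution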
        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding)) * \
                    self.engine.max_batch_size
            host_mem = cuda.pagelocked_empty(size, np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(cuda_mem))
            if self.engine.binding_is_input(binding):
                self.host_inputs.append(host_mem)
                self.cuda_inputs.append(cuda_mem)
            else:
                self.host_outputs.append(host_mem)
                self.cuda_outputs.append(cuda_mem)
            
        self.context = self.engine.create_execution_context()
Example #10
    def __init__(self, model):

        # Initialize TRT environment
        self.input_shape = (300, 300)
        trt_logger = trt.Logger(trt.Logger.INFO)
        trt.init_libnvinfer_plugins(trt_logger, '')
        with open(model, 'rb') as f, trt.Runtime(trt_logger) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())

        self.host_inputs = []
        self.cuda_inputs = []
        self.host_outputs = []
        self.cuda_outputs = []
        self.bindings = []
        self.stream = cuda.Stream()

        for binding in engine:
            size = trt.volume(
                engine.get_binding_shape(binding)) * engine.max_batch_size
            host_mem = cuda.pagelocked_empty(size, np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(cuda_mem))
            if engine.binding_is_input(binding):
                self.host_inputs.append(host_mem)
                self.cuda_inputs.append(cuda_mem)
            else:
                self.host_outputs.append(host_mem)
                self.cuda_outputs.append(cuda_mem)
        self.context = engine.create_execution_context()

        self.watch = Stopwatch()
Example #11
 def _load_engine(self, engine_file_path):
     # Force init TensorRT plugins
     trt.init_libnvinfer_plugins(None, '')
     with open(engine_file_path, "rb") as f, \
             trt.Runtime(self.trt_logger) as runtime:
         engine = runtime.deserialize_cuda_engine(f.read())
     return engine
Example #12
def main():
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # compile the model into TensorRT engine
    model = 'ssd_mobilenet_v2_coco'
    spec = MODEL_SPECS[model]
    if not os.path.exists(spec['tmp_uff']):
        dynamic_graph = add_plugin(gs.DynamicGraph(spec['input_pb']), spec)
        uff_model = uff.from_tensorflow(dynamic_graph.as_graph_def(),
                                        output_nodes=['NMS'],
                                        output_filename=spec['tmp_uff'],
                                        text=True,
                                        debug_mode=DEBUG_UFF)

    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
    ) as network, trt.UffParser() as parser:
        builder.max_workspace_size = 1 << 28
        builder.max_batch_size = 1
        builder.fp16_mode = True

        parser.register_input('Input', INPUT_DIMS)
        parser.register_output('MarkOutput_0')
        parser.parse(spec['tmp_uff'], network)

        print("Building Tensorrt engine. This may take a few minutes.")

        engine = builder.build_cuda_engine(network)

        buf = engine.serialize()
        with open(spec['output_bin'], 'wb') as f:
            f.write(buf)
            print("Save engine.")
Example #13
 def __init__(self, engine_path, preprocess_fn=bgr8_to_ssd_input):
     logger = trt.Logger()
     trt.init_libnvinfer_plugins(logger, '')
     load_plugins()
     self.trt_model = TRTModel(engine_path, input_names=[TRT_INPUT_NAME],
                               output_names=[TRT_OUTPUT_NAME, TRT_OUTPUT_NAME + '_1'])
     self.preprocess_fn = preprocess_fn
Example #14
def create_trt_model_bin():
    ctypes.CDLL(LIB_FLATTEN_PATH)

    # initialize
    trt_logger = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(trt_logger, '')

    # compile model into TensorRT
    if not os.path.isfile(MODEL_TRT_BIN_PATH):
        dynamic_graph = model.add_plugin(gs.DynamicGraph(MODEL_PATH))
        uff_model = uff.from_tensorflow(dynamic_graph.as_graph_def(), model.output_name, output_filename='tmp.uff')

        with trt.Builder(trt_logger) as builder, builder.create_network() as network, trt.UffParser() as parser:
            builder.max_workspace_size = 1 << 28
            builder.max_batch_size = 1
            builder.fp16_mode = True

            parser.register_input('Input', model.dims)
            parser.register_output('MarkOutput_0')
            parser.parse('tmp.uff', network)
            engine = builder.build_cuda_engine(network)

            buf = engine.serialize()
            with open(MODEL_TRT_BIN_PATH, 'wb') as f:
                f.write(buf)
def run(inDim, outDatatype):
    print("test", inDim, outDatatype)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger, outDatatype)
    if engine is None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
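    # Dynamic shapes: set the runtime shape of every input binding before
    # querying the output binding's shape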
    context.set_binding_shape(0, inDim)
    context.set_binding_shape(1, inDim[:1])
    context.set_binding_shape(2, inDim[:1])
    #print("Bind0->",engine.get_binding_shape(0),context.get_binding_shape(0));
    #print("Bind1->",engine.get_binding_shape(1),context.get_binding_shape(1));
    #print("Bind2->",engine.get_binding_shape(2),context.get_binding_shape(2));
    print("All bind:", context.all_binding_shapes_specified)
    stream = cuda.Stream()

    data0 = np.full(inDim, 1, dtype=np.float32)
    data1 = np.random.randint(1, inDim[2], inDim[:1], dtype=np.int32)
    data2 = np.random.randint(1, inDim[3], inDim[:1], dtype=np.int32)
    inputH0 = np.ascontiguousarray(data0)
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputH1 = np.ascontiguousarray(data1)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    inputH2 = np.ascontiguousarray(data2)
    inputD2 = cuda.mem_alloc(inputH2.nbytes)
    outputH0 = np.empty(context.get_binding_shape(3),
                        dtype=trt.nptype(engine.get_binding_dtype(3)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    cuda.memcpy_htod_async(inputD2, inputH2, stream)
    context.execute_async_v2(
        [int(inputD0), int(inputD1),
         int(inputD2), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)

    stream.synchronize()

    outputH0CPU = mask2DCPU(inputH0, inputH1, inputH2, globalMask2DTrueValue,
                            globalMask2DFalseValue)

    #print("InputH0->",inputH0.shape, engine.get_binding_dtype(0))
    #print(inputH0)
    #print("InputH1->",inputH1.shape, engine.get_binding_dtype(1))
    #print(inputH1)
    #print("InputH2->",inputH2.shape, engine.get_binding_dtype(2))
    #print(inputH2)
    #print("OutputH0->",outputH0.shape, engine.get_binding_dtype(3))
    #print(outputH0)
    #print("OutputH0CPU->",outputH0CPU.shape)
    #print(outputH0CPU)
    print("Check result:",
          ["True" if np.all(outputH0 == outputH0CPU) else "False"][0])
 def __init__(self,
              topk,
              detection_threshold,
              iou_threshold,
              model_precision,
              batch_dim,
              trt_path=None,
              onnx_export=False):
     super().__init__()
     self.topk = torch.nn.Parameter(torch.tensor(topk, dtype=torch.int32),
                                    requires_grad=False)
     self.detection_threshold = torch.nn.Parameter(
         torch.tensor(detection_threshold), requires_grad=False)
     self.model_dtype = torch.float16 if model_precision == 'fp16' else torch.float32
     self.batch_dim = batch_dim
     self.class_dim = 81
     self.foreground_class_dim = self.class_dim - 1
     self.scale_xy = 0.1
     self.scale_wh = 0.2
     self.scale_xyxywhwh = torch.nn.Parameter(torch.tensor(
         [self.scale_xy, self.scale_xy, self.scale_wh, self.scale_wh]),
                                              requires_grad=False)
     self.scale_wh_delta = torch.nn.Parameter(torch.tensor(
         [-0.5, -0.5, 0.5, 0.5]),
                                              requires_grad=False)
     self.iou_threshold = iou_threshold
     self.dboxes_xywh = torch.nn.Parameter(init_dboxes(
         self.model_dtype).unsqueeze(dim=0),
                                           requires_grad=False)
     self.box_dim = torch.nn.Parameter(torch.tensor(
         self.dboxes_xywh.size(1)),
                                       requires_grad=False)
     self.buffer_nchw = torch.nn.Parameter(torch.zeros(
         (batch_dim, 3, 300, 300), dtype=self.model_dtype),
                                           requires_grad=False)
     self.class_indexes = torch.nn.Parameter(torch.arange(
         1, self.class_dim).repeat(self.batch_dim * self.topk),
                                             requires_grad=False)
     self.image_indexes = torch.nn.Parameter(
         (torch.ones(self.topk * self.foreground_class_dim,
                     dtype=torch.int32) *
          torch.arange(self.batch_dim).unsqueeze(-1)).view(-1),
         requires_grad=False)
     self.onnx_export = onnx_export
     self.trt_engine = None
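     # A serialized TensorRT engine takes precedence; otherwise fall back to
     # the reference PyTorch SSD model from NVIDIA's Torch Hub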
     if trt_path:
         print('loading TRT engine from', trt_path, '...')
         self.trt_logger = trt.Logger()
         trt.init_libnvinfer_plugins(self.trt_logger, '')
         with open(trt_path,
                   'rb') as f, trt.Runtime(self.trt_logger) as runtime:
             self.trt_engine = runtime.deserialize_cuda_engine(f.read())
             self.trt_stream = cuda.Stream()
             self.trt_context = self.trt_engine.create_execution_context()
     else:
         self.detector = torch.hub.load(
             'NVIDIA/DeepLearningExamples:torchhub',
             'nvidia_ssd',
             model_math=model_precision).eval()
Example #17
def test_deserialize_engine(engine_name=ENGINE_NAME):
    import tensorrt as trt
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
    with open(engine_name, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    print("load engine!")
    return engine
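The bindings of a freshly deserialized engine can be checked before any buffers are allocated. A minimal sketch built only from the binding-query calls that appear in the other examples on this page (it assumes numpy and tensorrt are importable and the engine file exists):

import numpy as np
import tensorrt as trt

def print_engine_bindings(engine):
    # List every binding with its direction, shape, and numpy dtype
    for i in range(engine.num_bindings):
        kind = "input" if engine.binding_is_input(i) else "output"
        print(kind, engine.get_binding_name(i),
              tuple(engine.get_binding_shape(i)),
              np.dtype(trt.nptype(engine.get_binding_dtype(i))).name)

print_engine_bindings(test_deserialize_engine())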
Example #18
def get_plugin_creator(plugin_name):
    trt.init_libnvinfer_plugins(logger, '')
    plugin_creator_list = trt.get_plugin_registry().plugin_creator_list
    plugin_creator = None
    for c in plugin_creator_list:
        if c.name == plugin_name:
            plugin_creator = c
    return plugin_creator
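A creator obtained this way is typically used to instantiate the plugin and attach it to a network under construction. A minimal sketch, assuming a hypothetical plugin named 'MyPlugin' with a single float field called 'scale' (both names are placeholders, not taken from the examples on this page):

import numpy as np
import tensorrt as trt

def add_my_plugin(network, input_tensor):
    # 'MyPlugin' and its 'scale' field are hypothetical; substitute the real
    # plugin name and fields registered by your plugin library
    creator = get_plugin_creator('MyPlugin')
    if creator is None:
        raise RuntimeError('MyPlugin creator not found in the registry')
    scale = trt.PluginField('scale', np.array([1.0], dtype=np.float32),
                            trt.PluginFieldType.FLOAT32)
    plugin = creator.create_plugin('MyPlugin', trt.PluginFieldCollection([scale]))
    return network.add_plugin_v2(inputs=[input_tensor], plugin=plugin)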
Example #19
def get_plugin_creator(plugin_name, logger):
    """Get the TensorRT plugin creator."""
    trt.init_libnvinfer_plugins(logger, '')
    plugin_creator_list = trt.get_plugin_registry().plugin_creator_list
    for c in plugin_creator_list:
        if c.name == plugin_name:
            return c
    return None
 def get_plugin_creator(self, plugin_name):
     trt.init_libnvinfer_plugins(TRT_LOGGER, '')
     plugin_creator_list = trt.get_plugin_registry().plugin_creator_list
     plugin_creator = None
     for c in plugin_creator_list:
         if c.name == plugin_name:
             plugin_creator = c
     return plugin_creator
def export_trt(pb_file, output_dir, num_classes=90, neuralet_adaptive_model=1):
    """
    Exports the Tensorflow pb models to TensorRT engines.
    Args:
        pb_file: The path of input pb file
        output_dir: A directory to store the output files
        num_classes: Detector's number of classes
    """
    lib_flatten_concat_file = "exporters/libflattenconcat.so.6"
    # initialize
    if int(trt.__version__.split('.')[0]) < 7:
        ctypes.CDLL(lib_flatten_concat_file)
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # compile the model into TensorRT engine
    model = "ssd_mobilenet_v2_coco"

    if not os.path.isfile(pb_file):
        raise FileNotFoundError(
            'model does not exist under: {}'.format(pb_file))

    if not os.path.isdir(output_dir):
        print("the provided output directory : {0} is not exist".format(
            output_dir))
        print("creating output directory : {0}".format(output_dir))
        os.makedirs(output_dir, exist_ok=True)

    dynamic_graph = plugin.add_plugin_and_preprocess(gs.DynamicGraph(pb_file),
                                                     model, num_classes,
                                                     neuralet_adaptive_model)
    model_file_name = ".".join((pb_file.split("/")[-1]).split(".")[:-1])
    uff_path = os.path.join(output_dir, model_file_name + ".uff")
    _ = uff.from_tensorflow(dynamic_graph.as_graph_def(),
                            output_nodes=['NMS'],
                            output_filename=uff_path,
                            text=True,
                            debug_mode=False)
    input_dims = (3, 300, 300)
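    # Parse the UFF file and build an FP16 engine; the workspace limit and
    # precision flag are set on the builder config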
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
    ) as network, builder.create_builder_config(
    ) as builder_config, trt.UffParser() as parser:
        builder_config.max_workspace_size = 1 << 28
        builder.max_batch_size = 1
        builder_config.set_flag(trt.BuilderFlag.FP16)

        parser.register_input('Input', input_dims)
        parser.register_output('MarkOutput_0')
        parser.parse(uff_path, network)
        engine = builder.build_engine(network, builder_config)

        buf = engine.serialize()
        engine_path = os.path.join(output_dir, model_file_name + ".bin")
        with open(engine_path, 'wb') as f:
            f.write(buf)
        print(
            "your model has been successfully converted to a TensorRT engine at: {}"
            .format(engine_path))
Example #22
    def __init__(self,
                 trt_engine_path,
                 uff_model_path,
                 trt_engine_datatype=trt.DataType.FLOAT,
                 calib_dataset=None,
                 batch_size=1):
        """ build TensorRT engine """

        # load all custom plugins shipped with TensorRT
        trt.init_libnvinfer_plugins(TRT_LOGGER, '')

        # initialize runtime needed for loading TensorRT engine from file
        self.trt_runtime = trt.Runtime(TRT_LOGGER)
        # TensorRT engine placeholder
        trt_engine = TRTEngine()

        # display requested engine settings to stdout
        print("TensorRT inference engine settings:")
        print("  * Inference precision - {}".format(trt_engine_datatype))
        print("  * Max batch size - {}\n".format(batch_size))

        if not os.path.exists(os.path.dirname(trt_engine_path)):
            os.mkdir(os.path.dirname(trt_engine_path))

        # if engine is not cached, we need to build it
        if not os.path.exists(trt_engine_path):
            # this function uses the supplied .uff file together with the UffParser to build a TensorRT engine.
            # For more details, check the implementation
            trt_engine.build(uff_model_path=uff_model_path,
                             trt_logger=TRT_LOGGER,
                             trt_engine_datatype=trt_engine_datatype,
                             calib_dataset=calib_dataset,
                             batch_size=batch_size)

            # save the engine to file
            trt_engine.save(trt_engine_path)
        else:
            print("loading cashed TensorRT engine from {}".format(
                trt_engine_path))
            trt_engine.load(self.trt_runtime, trt_engine_path)

        if trt_engine.engine is None:
            raise Exception('Error TensorRT engine is not created!!')

        # this allocates memory for network inputs/outputs on both CPU and GPU
        self.inputs, self.outputs, self.bindings, self.stream = \
            trt_engine.allocate_buffers()

        # execution context is needed for inference
        self.context = trt_engine.engine.create_execution_context()

        # allocate memory for multiple usage [e.g. multiple batch inference]
        # input_volume = trt.volume(DetectionModel.input_shape)
        # self.ls_image = np.zeros((trt_engine.max_batch_size, input_volume))
        # print(":::: input_volume:", input_volume)
        # print(":::: input_shape:", DetectionModel.input_shape)

        self.trt_engine = trt_engine
Example #23
    def __init__(self, trt_deploy_path, trt_engine_path, trt_model_path, trt_engine_datatype=trt.DataType.FLOAT, batch_size=1):
        """Initializes TensorRT objects needed for model inference.

        Args:
            trt_engine_path (str): path where TensorRT engine should be stored
            trt_model_path (str): path of caffe model
            trt_engine_datatype (trt.DataType):
                requested precision of TensorRT engine used for inference
            batch_size (int): batch size for which engine
                should be optimized for
        """

        # We first load all custom plugins shipped with TensorRT,
        # some of them will be needed during inference
        trt.init_libnvinfer_plugins(TRT_LOGGER, '')

        # Initialize runtime needed for loading TensorRT engine from file
        self.trt_runtime = trt.Runtime(TRT_LOGGER)
        # TRT engine placeholder
        self.trt_engine = None

        self.datatype = DATATYPE[trt_engine_datatype]
        # Display requested engine settings to stdout
        print("TensorRT inference engine settings:")
        print("  * Inference precision - {}".format(trt_engine_datatype))
        print("  * Max batch size - {}\n".format(batch_size))

        # If engine is not cached, we need to build it
        if not os.path.exists(trt_engine_path):
            # For more details, check the implementation
            self.trt_engine = engine_utils.build_engine(
                trt_deploy_path, trt_model_path, TRT_LOGGER,
                trt_engine_datatype=trt_engine_datatype,
                batch_size=batch_size)
            print("self.trt_engine:",self.trt_engine)
            # Save the engine to file
            engine_utils.save_engine(self.trt_engine, trt_engine_path)

        # If we get here, the file with engine exists, so we can load it
        if not self.trt_engine:
            print("Loading cached TensorRT engine from {}".format(
                trt_engine_path))
            self.trt_engine = engine_utils.load_engine(
                self.trt_runtime, trt_engine_path)

        # This allocates memory for network inputs/outputs on both CPU and GPU
        self.inputs, self.outputs, self.bindings, self.stream =  common.allocate_buffers(self.trt_engine)
          #  engine_utils.allocate_buffers(self.trt_engine)
          #  common.allocate_buffers(self.trt_engine)
        # Execution context is needed for inference
        self.context = self.trt_engine.create_execution_context()

        # Allocate memory for multiple usage [e.g. multiple batch inference]
        input_volume = trt.volume(model_utils.ModelData.INPUT_SHAPE)
        print("input_volume:",input_volume)
        print("self.trt_engine.max_batch_size:",self.trt_engine.max_batch_size)
        self.numpy_array = np.zeros((self.trt_engine.max_batch_size, input_volume), dtype=np.float32)
        self.im_info = np.zeros((self.trt_engine.max_batch_size, 3), dtype=np.float32)
Example #24
    def __init__(self, trt_logger_severity=trt.Logger.INFO):
        """

        :param trt_logger_severity:
        """

        self._trt_logger_severity = trt_logger_severity
        self._TRT_LOGGER = trt.Logger(trt_logger_severity)
        trt.init_libnvinfer_plugins(self._TRT_LOGGER, "")
Example #25
File: engine.py  Project: swipswaps/watsor
def load_plugins():
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    libname = 'libflattenconcat.so'
    basedir = os.path.abspath(os.path.dirname(__file__))
    libpath = os.path.join(basedir, libname)
    if os.path.exists(libpath):
        ctypes.CDLL(libpath)
    else:
        ctypes.CDLL(libname)
 def __init__(self, trt_path, gpuID):
     self.cfx = cuda.Device(gpuID).make_context()
     self.stream = cuda.Stream()
     TRT_LOGGER = trt.Logger(trt.Logger.INFO)
     trt.init_libnvinfer_plugins(TRT_LOGGER, '')
     runtime = trt.Runtime(TRT_LOGGER)
     with open(trt_path, 'rb') as f:
         buf = f.read()
         self.engine = runtime.deserialize_cuda_engine(buf)
     self.context = self.engine.create_execution_context()
     self.inputs, self.outputs, self.bindings, self.stream = common.allocate_buffers(
         self.engine)
def run(inDim, inDatatype):
    print("test", inDim, inDatatype)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    engine = buildEngine(logger, inDatatype, len(inDim))
    if engine is None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    context.set_binding_shape(0, inDim)
    context.set_binding_shape(1, inDim[:1])
    #print("Bind0->",engine.get_binding_shape(0),context.get_binding_shape(0))
    #print("Bind1->",engine.get_binding_shape(1),context.get_binding_shape(1))
    #print("Bind2->",engine.get_binding_shape(2),context.get_binding_shape(2))
    #print("All bind:",context.all_binding_shapes_specified)
    stream = cuda.Stream()

    data0 = np.arange(np.prod(inDim), dtype=inDatatype).reshape(inDim)
    data1 = np.arange(1, inDim[0] + 1, dtype=np.int32)
    data1[data1 > inDim[1]] = inDim[1]
    inputH0 = np.ascontiguousarray(data0)
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputH1 = np.ascontiguousarray(data1)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    outputH0 = np.empty(context.get_binding_shape(2),
                        dtype=trt.nptype(engine.get_binding_dtype(2)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    context.execute_async_v2(
        [int(inputD0), int(inputD1), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)

    stream.synchronize()

    outputH0CPU = reverseCPU(inputH0, inputH1)

    #print("InputH0->",inputH0.shape, engine.get_binding_dtype(0))
    #print(inputH0)
    #print("InputH1->",inputH1.shape, engine.get_binding_dtype(1))
    #print(inputH1)
    #print("OutputH0->",outputH0.shape, engine.get_binding_dtype(2))
    #print(cleanTrash(outputH0,inputH1))
    #print("OutputH0CPU->",outputH0CPU.shape)
    #print(outputH0CPU)
    print("Check result:", [
        "True" if np.all(
            cleanTrash(outputH0, inputH1) == outputH0CPU) else "False"
    ][0])
Example #28
    def __init__(
        self,
        verbose: bool = False,
        workspace: int = 4,
        precision: str = "fp32",
        enable_dynamic: bool = False,
        max_batch_size: int = 16,
        calib_input: Optional[str] = None,
        calib_cache: Optional[str] = None,
        calib_num_images: int = 5000,
        calib_batch_size: int = 8,
    ):
        """
        Args:
            verbose (bool): If enabled, a higher verbosity level will be set on the TensorRT
                logger. Default: False
            workspace (int): Max memory workspace to allow, in GB. Default: 4
            precision (string): The datatype to use for the engine inference, either 'fp32',
                'fp16' or 'int8'. Default: 'fp32'
            enable_dynamic (bool): Whether to enable dynamic shapes. Default: False
            max_batch_size (int): Maximum batch size reserved for dynamic shape inference.
                Default: 16
            calib_input (string, optional): The path to a directory holding the calibration images.
                Default: None
            calib_cache (string, optional): The path to write the calibration cache to,
                or, if it already exists, to load it from. Default: None
            calib_num_images (int): The maximum number of images to use for calibration. Default: 5000
            calib_batch_size (int): The batch size to use for the calibration process. Default: 8
        """
        self.logger = trt.Logger(trt.Logger.INFO)
        if verbose:
            self.logger.min_severity = trt.Logger.Severity.VERBOSE

        trt.init_libnvinfer_plugins(self.logger, namespace="")

        self.builder = trt.Builder(self.logger)
        self.config = self.builder.create_builder_config()
        self.config.max_workspace_size = workspace * 1 << 30

        self.batch_size = None
        self.network = None
        self.parser = None

        # Leaving some interfaces and parameters for subsequent use, but we have not yet
        # implemented the following functionality
        self.precision = precision
        self.enable_dynamic = enable_dynamic
        self.max_batch_size = max_batch_size
        self.calib_input = calib_input
        self.calib_cache = calib_cache
        self.calib_num_images = calib_num_images
        self.calib_batch_size = calib_batch_size
Example #29
    def __init__(self, engine_path, preprocess_fn=bgr8_to_ssd_input):
        from .tensorrt_model import TRTModel
        from jnmouse.ssd_tensorrt import parse_boxes, TRT_INPUT_NAME, TRT_OUTPUT_NAME

        logger = trt.Logger()
        trt.init_libnvinfer_plugins(logger, '')
        load_plugins()
        self.trt_model = TRTModel(engine_path)
        ## If you want to specify input and output, use the following instead of the above.
        # self.trt_model = TRTModel(engine_path, input_names=[TRT_INPUT_NAME],
        #                           output_names=[TRT_OUTPUT_NAME, TRT_OUTPUT_NAME + '_1'])
        self.preprocess_fn = preprocess_fn
        self.postprocess_fn = parse_boxes
def run(nGroup, xWidth, yWidth, h, dim_t, datatype):
    print("test [%d,%d/%d,%d],dim_t=%d" % (nGroup, xWidth, yWidth, h, dim_t),
          datatype)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    weight = np.full([h, dim_t, h], 0.1, dtype=np.float32)
    engine = buildEngine(logger, [nGroup, max(xWidth, yWidth), h], dim_t,
                         weight, datatype)
        print("Failed building engine!")
        print("Failed building engine!")
        return None
    print("Succeed building engine!")

    context = engine.create_execution_context()
    context.set_binding_shape(0, [nGroup, xWidth, h])
    context.set_binding_shape(1, [nGroup, yWidth, h])
    #print("Binding0->",engine.get_binding_shape(0),context.get_binding_shape(0))
    #print("Binding1->",engine.get_binding_shape(1),context.get_binding_shape(1))
    #print("Binding2->",engine.get_binding_shape(2),context.get_binding_shape(2))
    #print("All bind:",context.all_binding_shapes_specified)
    stream = cuda.Stream()

    data0 = np.ones([nGroup, xWidth, h], dtype=datatype)
    data1 = np.ones([nGroup, yWidth, h], dtype=datatype)
    inputH0 = np.ascontiguousarray(data0)
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputH1 = np.ascontiguousarray(data1)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    outputH0 = np.empty(context.get_binding_shape(2),
                        dtype=trt.nptype(engine.get_binding_dtype(2)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    context.execute_async_v2(
        [int(inputD0), int(inputD1), int(outputD0)], stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)

    stream.synchronize()

    outputH0CPU = MMTCPU(inputH0, inputH1, weight)

    #print("InputH0->",inputH0.shape, engine.get_binding_dtype(0))
    #print(inputH0)
    #print("InputH1->",inputH1.shape, engine.get_binding_dtype(1))
    #print(inputH1)
    #print("OutputH0->",outputH0.shape, engine.get_binding_dtype(2))
    #print(outputH0)
    print("Check result:",
          ["True" if np.all(outputH0 == outputH0CPU) else "False"][0])