Example No. 1
def torch2trt_dynamic(module,
                      inputs,
                      input_names=None,
                      output_names=None,
                      log_level=trt.Logger.ERROR,
                      max_batch_size=1,
                      fp16_mode=False,
                      max_workspace_size=0,
                      opt_shape_param=None,
                      strict_type_constraints=False,
                      keep_network=True,
                      int8_mode=False,
                      int8_calib_dataset=None,
                      int8_calib_algorithm=DEFAULT_CALIBRATION_ALGORITHM):
    if int8_mode and fp16_mode:
        fp16_mode = False
    inputs_in = inputs

    # copy inputs to avoid modifications to source data
    inputs = [tensor.clone() for tensor in inputs]

    logger = trt.Logger(log_level)
    builder = trt.Builder(logger)
    EXPLICIT_BATCH = 1 << int(
        trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(EXPLICIT_BATCH)

    with ShapeConverter(), ConversionContext(network) as ctx:

        if isinstance(inputs, list):
            inputs = tuple(inputs)
        if not isinstance(inputs, tuple):
            inputs = (inputs, )
        ctx.add_inputs(inputs, input_names, opt_shape_param)

        outputs = module(*inputs)

        if not isinstance(outputs, tuple) and not isinstance(outputs, list):
            outputs = (outputs, )
        ctx.mark_outputs(outputs, output_names)

        torch.cuda.empty_cache()

        builder.max_workspace_size = max_workspace_size
        builder.max_batch_size = max_batch_size
        builder.strict_type_constraints = strict_type_constraints

        config = builder.create_builder_config()
        config.max_workspace_size = max_workspace_size
        profile = builder.create_optimization_profile()

        if input_names is None:
            input_names = ['input_%d' % i for i in range(len(inputs))]
        for input_index, input_tensor in enumerate(inputs):
            if opt_shape_param is not None:
                min_shape = tuple(opt_shape_param[input_index][0][:])
                opt_shape = tuple(opt_shape_param[input_index][1][:])
                max_shape = tuple(opt_shape_param[input_index][2][:])
            else:
                opt_shape = tuple(input_tensor.shape)
                min_shape = opt_shape
                max_shape = opt_shape
            profile.set_shape(input_names[input_index], min_shape, opt_shape,
                              max_shape)
        config.add_optimization_profile(profile)

    if fp16_mode:
        builder.fp16_mode = fp16_mode
        config.set_flag(trt.BuilderFlag.FP16)

    if int8_mode:
        # default to use input tensors for calibration
        if int8_calib_dataset is None:
            int8_calib_dataset = TensorBatchDataset(inputs_in)

        config.set_flag(trt.BuilderFlag.INT8)
        config.int8_calibrator = DatasetCalibrator(
            input_names,
            profile,
            inputs_in,
            int8_calib_dataset,
            batch_size=opt_shape[0],
            algorithm=int8_calib_algorithm)
        config.set_calibration_profile(profile)
        builder.int8_mode = int8_mode
        builder.int8_calibrator = config.int8_calibrator

    engine = builder.build_engine(network, config)

    module_trt = TRTModule(engine, ctx.input_names, ctx.output_names)

    if keep_network:
        module_trt.network = network

    return module_trt
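
A minimal usage sketch for the converter above (the torchvision model, input size, and shape ranges are illustrative assumptions, not part of the original):

import torch
import torchvision

model = torchvision.models.resnet18().cuda().eval()
x = torch.ones(1, 3, 224, 224).cuda()
# one [min_shape, opt_shape, max_shape] triple per input, matching the indexing above
opt_shape_param = [[[1, 3, 224, 224], [1, 3, 224, 224], [4, 3, 224, 224]]]
model_trt = torch2trt_dynamic(model, [x], fp16_mode=True,
                              opt_shape_param=opt_shape_param)
y_trt = model_trt(x)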
Example No. 2
    def from_onnx(
            onnx_path: Union[Path, str],
            save_path: Union[Path, str],
            inputs: List[IOShape],
            outputs: List[IOShape],
            int8_calibrator=None,
            create_model_config: bool = True,
            override: bool = False,
    ):
        """Takes an ONNX file and creates a TensorRT engine to run inference with
        From https://github.com/layerism/TensorRT-Inference-Server-Tutorial

        FIXME: known bug: TRT 6.x does not support opset 10, which the ResNet50 ONNX model uses.
        """
        import tensorrt as trt

        onnx_path = Path(onnx_path)
        assert onnx_path.exists()

        save_path = Path(save_path)
        if save_path.with_suffix('.plan').exists():
            if not override:  # file exists, but the override flag is not set
                logger.info('Use cached model')
                return True

        # get arch name
        arch_name = parse_path(save_path)['architecture']

        # trt serving model repository is different from others:
        # `<model-name>/<framework>-tensorrt/<version>/model.plan`
        save_path = save_path.with_suffix('')
        save_path.mkdir(parents=True, exist_ok=True)

        # Save TRT engine
        trt_logger = trt.Logger(trt.Logger.WARNING)
        with trt.Builder(trt_logger) as builder:
            with builder.create_network(
                    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network:
                with trt.OnnxParser(network, trt_logger) as parser:
                    builder.max_workspace_size = GiB(1)  # 1GB
                    builder.max_batch_size = 1
                    if int8_calibrator is not None:
                        builder.int8_mode = True
                        builder.int8_calibrator = int8_calibrator

                    print('Loading ONNX file from path {}...'.format(onnx_path))
                    with open(onnx_path, 'rb') as model:
                        parser.parse(model.read())
                    engine = builder.build_cuda_engine(network)

                    with open(save_path / 'model.plan', 'wb') as f:
                        f.write(engine.serialize())

        # create model configuration file
        if create_model_config:
            TRTConverter.generate_trt_config(
                save_path.parent,
                arch_name=arch_name,
                inputs=inputs,
                outputs=outputs
            )
        return True
Example No. 3
def FFDNet(
    clip: vs.VideoNode,
    sigma: float = 5.0,
    use_cuda_graph: bool = False,
    logger: trt.Logger = trt.Logger(trt.Logger.WARNING)
) -> vs.VideoNode:

    assert clip.format.id == vs.RGBS
    width, height = clip.width, clip.height

    sigma /= 255

    runtime = trt.Runtime(logger)

    with open(f"ffdnet_{width}_{height}.engine", "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())

    execution_context = engine.create_execution_context()
    input_size = execution_context.get_strides(0)[0] * 4
    input_shape = execution_context.get_binding_shape(0)
    sigma_size = execution_context.get_strides(1)[0] * 4
    sigma_shape = execution_context.get_binding_shape(1)
    output_size = execution_context.get_strides(2)[0] * 4
    output_shape = execution_context.get_binding_shape(2)

    h_sigma = checkError(
        cuda.cuMemHostAlloc(sigma_size, cuda.CU_MEMHOSTALLOC_WRITECOMBINED))
    h_sigma = UniqueResource(h_sigma, cuda.cuMemFreeHost, h_sigma)
    h_sigma_pointer = ctypes.cast(ctypes.c_void_p(h_sigma.obj),
                                  ctypes.POINTER(ctypes.c_float))
    h_sigma_array = np.ctypeslib.as_array(h_sigma_pointer,
                                          shape=(sigma_size //
                                                 4, )).reshape(sigma_shape)

    d_sigma = checkError(cuda.cuMemAlloc(sigma_size))
    d_sigma = UniqueResource(d_sigma, cuda.cuMemFree, d_sigma)

    h_input = checkError(
        cuda.cuMemHostAlloc(input_size, cuda.CU_MEMHOSTALLOC_WRITECOMBINED))
    h_input = UniqueResource(h_input, cuda.cuMemFreeHost, h_input)
    h_input_pointer = ctypes.cast(ctypes.c_void_p(h_input.obj),
                                  ctypes.POINTER(ctypes.c_float))
    h_input_array = np.ctypeslib.as_array(h_input_pointer,
                                          shape=(input_size //
                                                 4, )).reshape(input_shape)

    d_input = checkError(cuda.cuMemAlloc(input_size))
    d_input = UniqueResource(d_input, cuda.cuMemFree, d_input)

    d_output = checkError(cuda.cuMemAlloc(output_size))
    d_output = UniqueResource(d_output, cuda.cuMemFree, d_output)

    h_output = checkError(cuda.cuMemAllocHost(output_size))
    h_output = UniqueResource(h_output, cuda.cuMemFreeHost, h_output)
    h_output_pointer = ctypes.cast(ctypes.c_void_p(h_output.obj),
                                   ctypes.POINTER(ctypes.c_float))
    h_output_array = np.ctypeslib.as_array(h_output_pointer,
                                           shape=(output_size //
                                                  4, )).reshape(output_shape)

    stream = checkError(
        cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value))
    stream = UniqueResource(stream, cuda.cuStreamDestroy, stream)

    h_sigma_array[...] = sigma
    checkError(
        cuda.cuMemcpyHtoDAsync(d_sigma.obj, h_sigma.obj, sigma_size,
                               stream.obj))

    def execute():
        checkError(
            cuda.cuMemcpyHtoDAsync(d_input.obj, h_input.obj, input_size,
                                   stream.obj))

        execution_context.execute_async_v2(
            [d_input.obj, d_sigma.obj, d_output.obj], stream_handle=stream.obj)

        checkError(
            cuda.cuMemcpyDtoHAsync(h_output.obj, d_output.obj, output_size,
                                   stream.obj))

    if use_cuda_graph:
        checkError(
            cuda.cuStreamBeginCapture(
                stream.obj,
                cuda.CUstreamCaptureMode.CU_STREAM_CAPTURE_MODE_RELAXED))

        execute()

        graph = checkError(cuda.cuStreamEndCapture(stream.obj))
        graphexec, error_node = checkError(
            cuda.cuGraphInstantiate(graph, logBuffer=b"", bufferSize=0))
        graphexec = UniqueResource(graphexec, cuda.cuGraphExecDestroy,
                                   graphexec)
        checkError(cuda.cuGraphDestroy(graph))

    def inference_core(n, f):
        for i in range(3):
            h_input_array[0, i, :, :] = np.asarray(
                _get_array(f, plane=i, read=True))

        if use_cuda_graph:
            checkError(cuda.cuGraphLaunch(graphexec.obj, stream.obj))
        else:
            execute()

        fout = f.copy()
        fout.get_write_array(0)  # triggers COW
        checkError(cuda.cuStreamSynchronize(stream.obj))

        for i in range(3):
            np.asarray(_get_array(fout, plane=i,
                                  read=False))[...] = h_output_array[0,
                                                                     i, :, :]

        return fout

    return core.std.ModifyFrame(clip, clips=[clip], selector=inference_core)
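
A sketch of calling this filter from a VapourSynth script (the source plugin and file name are hypothetical, and a matching ffdnet_<width>_<height>.engine must already have been built):

import vapoursynth as vs
from vapoursynth import core

clip = core.lsmas.LWLibavSource("input.mkv")  # hypothetical source plugin
clip = core.resize.Bicubic(clip, format=vs.RGBS, matrix_in_s="709")
clip = FFDNet(clip, sigma=5.0, use_cuda_graph=True)
clip.set_output()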
Example No. 4
def run(args):
    onnx_filename = run_onnx_util.onnx_model_file(args.test_dir, args.model_file)
    input_names, output_names = run_onnx_util.onnx_input_output_names(
        onnx_filename)
    test_data_dir = os.path.join(args.test_dir, 'test_data_set_0')
    inputs, outputs = run_onnx_util.load_test_data(
        test_data_dir, input_names, output_names)

    with open(onnx_filename, 'rb') as f:
        onnx_proto = f.read()

    if args.debug:
        logger = tensorrt.Logger(tensorrt.Logger.Severity.INFO)
    else:
        logger = tensorrt.Logger()
    builder = tensorrt.Builder(logger)
    if args.fp16_mode:
        builder.fp16_mode = True
    # TODO(hamaji): Infer batch_size from inputs.
    builder.max_batch_size = args.batch_size
    network = builder.create_network()
    parser = tensorrt.OnnxParser(network, logger)
    if not parser.parse(onnx_proto):
        for i in range(parser.num_errors):
            sys.stderr.write('ONNX import failure: %s\n' % parser.get_error(i))
        raise RuntimeError('ONNX import failed')
    engine = builder.build_cuda_engine(network)
    context = engine.create_execution_context()

    assert len(inputs) + len(outputs) == engine.num_bindings
    for i, (_, input) in enumerate(inputs):
        assert args.batch_size == input.shape[0]
        assert input.shape[1:] == engine.get_binding_shape(i)
    for i, (_, output) in enumerate(outputs):
        assert args.batch_size == output.shape[0]
        i += len(inputs)
        assert output.shape[1:] == engine.get_binding_shape(i)

    inputs = [v for n, v in inputs]
    outputs = [v for n, v in outputs]
    gpu_inputs = to_gpu(inputs)
    gpu_outputs = []
    for output in outputs:
        gpu_outputs.append(cupy.zeros_like(cupy.array(output)))
    bindings = [a.data.ptr for a in gpu_inputs]
    bindings += [a.data.ptr for a in gpu_outputs]

    context.execute(args.batch_size, bindings)

    actual_outputs = to_cpu(gpu_outputs)

    for i, (name, expected, actual) in enumerate(
            zip(output_names, outputs, actual_outputs)):
        np.testing.assert_allclose(expected, actual, rtol=args.rtol,
                                   atol=args.atol, err_msg=name)
        print('%s: OK' % name)
    print('ALL OK')

    def compute():
        context.execute(args.batch_size, bindings)
        cupy.cuda.device.Device().synchronize()

    return run_onnx_util.run_benchmark(compute, args.iterations)
Example No. 5
import tensorrt as trt
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

onnx_file = 'mobilenet.onnx'
trt_file = 'mobilenet.trt'
batch_size = 1
"""Takes an ONNX file and creates a TensorRT engine to run inference with"""
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
        EXPLICIT_BATCH) as network, trt.OnnxParser(network,
                                                   TRT_LOGGER) as parser:
    builder.max_workspace_size = 1 << 28  # 256MiB
    builder.max_batch_size = batch_size
    builder.fp16_mode = True  # fp32_mode -> False
    # Parse model file
    with open(onnx_file, 'rb') as model:
        print('Beginning ONNX file parsing')
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
    print('Completed parsing of ONNX file')
    engine = builder.build_cuda_engine(network)
    print("Completed creating Engine")
    with open(trt_file, "wb") as f:
        f.write(engine.serialize())
Example No. 6
# tensorrt-lib

import os

import tensorrt as trt

from calibrator import Calibrator

# add verbose
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)  # ** engine visualization **
f_layer = open('log.txt', 'w')


# create tensorrt-engine
# fixed and dynamic
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", \
               fp32_mode=False, fp16_mode=False, int4_mode=False, calibration_stream=None, calibration_table_path="",
               save_engine=False, strategy=None):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""
    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        # 1 << NetworkDefinitionCreationFlag.EXPLICIT_BATCH
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:

            # parse onnx model file
            if not os.path.exists(onnx_file_path):
                quit('ONNX file {} not found'.format(onnx_file_path))
            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
Example No. 7
import tensorrt as trt

# Input parameters specific to the trained model
uff_file_name = 'saved_model.uff'  # Name of uff file that defines the trained model
input_node_name = 'input/IteratorGetNext'  # Input node (best to name it with tf.name.scope)
input_node_dims = (1, 1, 4096)  # Input dimensions to trained model

# Input parameter for inference
batch_size = 128  # Batch size to optimize to. This should be used for inference
workspace_size = 1073741824  # 1 GB, for example
use_fp16 = True  # Do you want to use float16 type
output_file_name = 'saved_model.plan'  # Name of output file

# Make the plan file
builder = trt.Builder(trt.Logger(trt.Logger.INFO))
network = builder.create_network()

parser = trt.UffParser()
parser.register_input(input_node_name, input_node_dims)
parser.parse(uff_file_name, network)

builder.max_batch_size = batch_size
builder.max_workspace_size = workspace_size
builder.fp16_mode = use_fp16

engine = builder.build_cuda_engine(network)

with open(output_file_name, 'wb') as f:
    f.write(engine.serialize())
Example No. 8
    def __init__(self,
                 weights='yolov5s.pt',
                 device=torch.device('cpu'),
                 dnn=False,
                 data=None,
                 fp16=False):
        # Usage:
        #   PyTorch:              weights = *.pt
        #   TorchScript:                    *.torchscript
        #   ONNX Runtime:                   *.onnx
        #   ONNX OpenCV DNN:                *.onnx with --dnn
        #   OpenVINO:                       *.xml
        #   CoreML:                         *.mlmodel
        #   TensorRT:                       *.engine
        #   TensorFlow SavedModel:          *_saved_model
        #   TensorFlow GraphDef:            *.pb
        #   TensorFlow Lite:                *.tflite
        #   TensorFlow Edge TPU:            *_edgetpu.tflite
        from models.experimental import attempt_download, attempt_load  # scoped to avoid circular import

        super().__init__()
        w = str(weights[0] if isinstance(weights, list) else weights)
        pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs = self.model_type(
            w)  # get backend
        stride, names = 32, [f'class{i}'
                             for i in range(1000)]  # assign defaults
        w = attempt_download(w)  # download if not local
        fp16 &= (pt or jit or onnx or engine) and device.type != 'cpu'  # FP16
        if data:  # data.yaml path (optional)
            with open(data, errors='ignore') as f:
                names = yaml.safe_load(f)['names']  # class names

        if pt:  # PyTorch
            model = attempt_load(weights if isinstance(weights, list) else w,
                                 map_location=device)
            stride = max(int(model.stride.max()), 32)  # model stride
            names = model.module.names if hasattr(
                model, 'module') else model.names  # get class names
            model.half() if fp16 else model.float()
            self.model = model  # explicitly assign for to(), cpu(), cuda(), half()
        elif jit:  # TorchScript
            LOGGER.info(f'Loading {w} for TorchScript inference...')
            extra_files = {'config.txt': ''}  # model metadata
            model = torch.jit.load(w, _extra_files=extra_files)
            model.half() if fp16 else model.float()
            if extra_files['config.txt']:
                d = json.loads(extra_files['config.txt'])  # extra_files dict
                stride, names = int(d['stride']), d['names']
        elif dnn:  # ONNX OpenCV DNN
            LOGGER.info(f'Loading {w} for ONNX OpenCV DNN inference...')
            check_requirements(('opencv-python>=4.5.4', ))
            net = cv2.dnn.readNetFromONNX(w)
        elif onnx:  # ONNX Runtime
            LOGGER.info(f'Loading {w} for ONNX Runtime inference...')
            cuda = torch.cuda.is_available()
            check_requirements(
                ('onnx', 'onnxruntime-gpu' if cuda else 'onnxruntime'))
            import onnxruntime
            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'
                         ] if cuda else ['CPUExecutionProvider']
            session = onnxruntime.InferenceSession(w, providers=providers)
            meta = session.get_modelmeta().custom_metadata_map  # metadata
            if 'stride' in meta:
                stride, names = int(meta['stride']), eval(meta['names'])
        elif xml:  # OpenVINO
            LOGGER.info(f'Loading {w} for OpenVINO inference...')
            check_requirements(
                ('openvino-dev', )
            )  # requires openvino-dev: https://pypi.org/project/openvino-dev/
            import openvino.inference_engine as ie
            core = ie.IECore()
            if not Path(w).is_file():  # if not *.xml
                w = next(Path(w).glob(
                    '*.xml'))  # get *.xml file from *_openvino_model dir
            network = core.read_network(
                model=w,
                weights=Path(w).with_suffix('.bin'))  # *.xml, *.bin paths
            executable_network = core.load_network(network,
                                                   device_name='CPU',
                                                   num_requests=1)
        elif engine:  # TensorRT
            LOGGER.info(f'Loading {w} for TensorRT inference...')
            import tensorrt as trt  # https://developer.nvidia.com/nvidia-tensorrt-download
            check_version(trt.__version__, '7.0.0',
                          hard=True)  # require tensorrt>=7.0.0
            Binding = namedtuple('Binding',
                                 ('name', 'dtype', 'shape', 'data', 'ptr'))
            logger = trt.Logger(trt.Logger.INFO)
            with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
                model = runtime.deserialize_cuda_engine(f.read())
            bindings = OrderedDict()
            fp16 = False  # default updated below
            for index in range(model.num_bindings):
                name = model.get_binding_name(index)
                dtype = trt.nptype(model.get_binding_dtype(index))
                shape = tuple(model.get_binding_shape(index))
                data = torch.from_numpy(np.empty(
                    shape, dtype=np.dtype(dtype))).to(device)
                bindings[name] = Binding(name, dtype, shape, data,
                                         int(data.data_ptr()))
                if model.binding_is_input(index) and dtype == np.float16:
                    fp16 = True
            binding_addrs = OrderedDict(
                (n, d.ptr) for n, d in bindings.items())
            context = model.create_execution_context()
            batch_size = bindings['images'].shape[0]
        elif coreml:  # CoreML
            LOGGER.info(f'Loading {w} for CoreML inference...')
            import coremltools as ct
            model = ct.models.MLModel(w)
        else:  # TensorFlow (SavedModel, GraphDef, Lite, Edge TPU)
            if saved_model:  # SavedModel
                LOGGER.info(
                    f'Loading {w} for TensorFlow SavedModel inference...')
                import tensorflow as tf
                keras = False  # assume TF1 saved_model
                model = tf.keras.models.load_model(
                    w) if keras else tf.saved_model.load(w)
            elif pb:  # GraphDef https://www.tensorflow.org/guide/migrate#a_graphpb_or_graphpbtxt
                LOGGER.info(
                    f'Loading {w} for TensorFlow GraphDef inference...')
                import tensorflow as tf

                def wrap_frozen_graph(gd, inputs, outputs):
                    x = tf.compat.v1.wrap_function(
                        lambda: tf.compat.v1.import_graph_def(gd, name=""),
                        [])  # wrapped
                    ge = x.graph.as_graph_element
                    return x.prune(tf.nest.map_structure(ge, inputs),
                                   tf.nest.map_structure(ge, outputs))

                gd = tf.Graph().as_graph_def()  # graph_def
                with open(w, 'rb') as f:
                    gd.ParseFromString(f.read())
                frozen_func = wrap_frozen_graph(gd,
                                                inputs="x:0",
                                                outputs="Identity:0")
            elif tflite or edgetpu:  # https://www.tensorflow.org/lite/guide/python#install_tensorflow_lite_for_python
                try:  # https://coral.ai/docs/edgetpu/tflite-python/#update-existing-tf-lite-code-for-the-edge-tpu
                    from tflite_runtime.interpreter import Interpreter, load_delegate
                except ImportError:
                    import tensorflow as tf
                    Interpreter, load_delegate = tf.lite.Interpreter, tf.lite.experimental.load_delegate
                if edgetpu:  # Edge TPU https://coral.ai/software/#edgetpu-runtime
                    LOGGER.info(
                        f'Loading {w} for TensorFlow Lite Edge TPU inference...'
                    )
                    delegate = {
                        'Linux': 'libedgetpu.so.1',
                        'Darwin': 'libedgetpu.1.dylib',
                        'Windows': 'edgetpu.dll'
                    }[platform.system()]
                    interpreter = Interpreter(
                        model_path=w,
                        experimental_delegates=[load_delegate(delegate)])
                else:  # Lite
                    LOGGER.info(
                        f'Loading {w} for TensorFlow Lite inference...')
                    interpreter = Interpreter(
                        model_path=w)  # load TFLite model
                interpreter.allocate_tensors()  # allocate
                input_details = interpreter.get_input_details()  # inputs
                output_details = interpreter.get_output_details()  # outputs
            elif tfjs:
                raise Exception(
                    'ERROR: YOLOv5 TF.js inference is not supported')
        self.__dict__.update(locals())  # assign all variables to self
Example No. 9
# limitations under the License.
#

import ctypes
import numpy as np
from cuda import cudart
import tensorrt as trt

np.random.seed(97)
nIn, cIn, hIn, wIn = 4, 3, 128, 128
data = np.random.rand(nIn * cIn * hIn * wIn).astype(np.float32).reshape(nIn, cIn, hIn, wIn)

np.set_printoptions(precision=8, linewidth=200, suppress=True)
cudart.cudaDeviceSynchronize()

logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
profile0 = builder.create_optimization_profile()
profile1 = builder.create_optimization_profile()
config = builder.create_builder_config()
config.max_workspace_size = 1 << 30

inputT0 = network.add_input('inputT0', trt.float32, [-1, cIn, hIn, wIn])
layer = network.add_unary(inputT0, trt.UnaryOperation.NEG)
network.mark_output(layer.get_output(0))

profile0.set_shape(inputT0.name, (1, cIn, hIn, wIn), (nIn, cIn, hIn, wIn), (nIn * 2, cIn, hIn, wIn))
profile1.set_shape(inputT0.name, (1, cIn, hIn, wIn), (nIn, cIn, hIn, wIn), (nIn * 2, cIn, hIn, wIn))
config.add_optimization_profile(profile0)
config.add_optimization_profile(profile1)
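
A possible continuation of this two-profile snippet (a sketch; the build call and profile selection below are not part of the original excerpt):

engineString = builder.build_serialized_network(network, config)
engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
context = engine.create_execution_context()

# each profile duplicates the bindings, so profile 1's input binding index is
# offset by engine.num_bindings // engine.num_optimization_profiles
context.active_optimization_profile = 1
context.set_binding_shape(engine.num_bindings // engine.num_optimization_profiles,
                          (nIn, cIn, hIn, wIn))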
Example No. 10
def torch2trt(module,
              inputs,
              input_names=None,
              output_names=None,
              log_level=trt.Logger.ERROR,
              max_batch_size=1,
              fp16_mode=False,
              max_workspace_size=1 << 25,
              strict_type_constraints=False,
              keep_network=True,
              int8_mode=False,
              int8_calib_dataset=None,
              int8_calib_algorithm=DEFAULT_CALIBRATION_ALGORITHM,
              int8_calib_batch_size=1,
              use_onnx=False,
              **kwargs):

    # capture arguments to provide to context
    kwargs.update(locals())
    kwargs.pop('kwargs')

    inputs_in = inputs

    # copy inputs to avoid modifications to source data
    inputs = [tensor.clone() for tensor in inputs]  # only run single entry

    logger = trt.Logger(log_level)
    builder = trt.Builder(logger)

    if isinstance(inputs, list):
        inputs = tuple(inputs)
    if not isinstance(inputs, tuple):
        inputs = (inputs, )

    # run once to get num outputs
    outputs = module(*inputs)
    if not isinstance(outputs, tuple) and not isinstance(outputs, list):
        outputs = (outputs, )

    if input_names is None:
        input_names = default_input_names(len(inputs))
    if output_names is None:
        output_names = default_output_names(len(outputs))

    if use_onnx:

        f = io.BytesIO()
        torch.onnx.export(module,
                          inputs,
                          f,
                          input_names=input_names,
                          output_names=output_names)
        f.seek(0)
        onnx_bytes = f.read()
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        parser = trt.OnnxParser(network, logger)
        parser.parse(onnx_bytes)

    else:
        network = builder.create_network()
        with ConversionContext(network, torch2trt_kwargs=kwargs) as ctx:

            ctx.add_inputs(inputs, input_names)

            outputs = module(*inputs)

            if not isinstance(outputs, tuple) and not isinstance(
                    outputs, list):
                outputs = (outputs, )
            ctx.mark_outputs(outputs, output_names)

    builder.max_workspace_size = max_workspace_size
    builder.fp16_mode = fp16_mode
    builder.max_batch_size = max_batch_size
    builder.strict_type_constraints = strict_type_constraints

    if int8_mode:

        # default to use input tensors for calibration
        if int8_calib_dataset is None:
            int8_calib_dataset = TensorBatchDataset(inputs_in)

        builder.int8_mode = True

        # @TODO(jwelsh):  Should we set batch_size=max_batch_size?  Need to investigate memory consumption
        builder.int8_calibrator = DatasetCalibrator(
            inputs,
            int8_calib_dataset,
            batch_size=int8_calib_batch_size,
            algorithm=int8_calib_algorithm)

    engine = builder.build_cuda_engine(network)

    module_trt = TRTModule(engine, input_names, output_names)

    if keep_network:
        module_trt.network = network

    return module_trt
Example No. 11
dataPath = os.path.dirname(os.path.realpath(__file__)) + "/../../00-MNISTData/"
sys.path.append(dataPath)

np.random.seed(97)
nTrainbatchSize = 128
pbFile = "./model-NCHW.pb"
caffePrototxtFile = "./model.prototxt"
caffeModelFile = "./model.caffemodel"
trtFile = "./model-NCHW.plan"
inputImage = dataPath + '8.png'

np.set_printoptions(precision=4, linewidth=200, suppress=True)
cudart.cudaDeviceSynchronize()

# Load the Caffe model in TensorRT and build the engine -----------------------------------------
logger = trt.Logger(trt.Logger.VERBOSE)
if os.path.isfile(trtFile):
    with open(trtFile, 'rb') as f:
        engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
    if engine is None:
        print("Failed loading engine!")
        exit()
    print("Succeeded loading engine!")
else:
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()
    config.max_workspace_size = 3 << 30
    parser = trt.CaffeParser()
    with open(caffePrototxtFile, 'rb') as f0, open(caffeModelFile, 'rb') as f1:
        net = parser.parse_buffer(f0.read(), f1.read(), network, trt.float32)
Example No. 12
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


anchors = []
fpn_fm_shape = [
    math.ceil(cfg.img_size / stride) for stride in (8, 16, 32, 64, 128)
]
for i, size in enumerate(fpn_fm_shape):
    anchors += make_anchors(cfg, size, size, cfg.scales[i])

# prepare engine
with open(cfg.weight,
          'rb') as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()

    for binding in engine:
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
Example No. 13
def load_engine(engine_path):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)  # INFO
    with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())
import pycuda.driver as cuda
import pycuda.autoinit
import utilities as ut

USE_FP16 = True
target_dtype = np.float16 if USE_FP16 else np.float32

# input_batch = ut.random_image(100).numpy()
img_path = "data/test7.jpg"
# input_batch = ut.npz_loader(img_path).numpy()
input_batch = ut.load_image(img_path).unsqueeze(0).numpy()
print(input_batch.shape)
input_batch = np.ascontiguousarray(input_batch, dtype=target_dtype)
print(input_batch)
f = open("inference_models/edsr.trt", "rb")
runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# need to set input and output precisions to FP16 to fully enable it
output = np.empty([1, 3, 480, 480], dtype=target_dtype)

# allocate device memory
d_input = cuda.mem_alloc(1 * input_batch.nbytes)
d_output = cuda.mem_alloc(1 * output.nbytes)

bindings = [int(d_input), int(d_output)]

stream = cuda.Stream()
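
A possible inference step for this snippet (a sketch that mirrors the async copy/execute/copy pattern used in the other examples on this page; the helper name is hypothetical):

def predict(batch):
    # host -> device copy, engine execution, device -> host copy, then wait
    cuda.memcpy_htod_async(d_input, batch, stream)
    context.execute_async_v2(bindings, stream.handle)
    cuda.memcpy_dtoh_async(output, d_output, stream)
    stream.synchronize()
    return output

upscaled = predict(input_batch)
print(upscaled.shape)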

Example No. 15
def export_engine(model,
                  im,
                  file,
                  train,
                  half,
                  simplify,
                  workspace=4,
                  verbose=False,
                  prefix=colorstr('TensorRT:')):
    # YOLOv5 TensorRT export https://developer.nvidia.com/tensorrt
    try:
        assert im.device.type != 'cpu', 'export running on CPU but must be on GPU, i.e. `python export.py --device 0`'
        try:
            import tensorrt as trt
        except Exception:
            if platform.system() == 'Linux':
                check_requirements(
                    ('nvidia-tensorrt', ),
                    cmds=('-U --index-url https://pypi.ngc.nvidia.com', ))
            import tensorrt as trt

        if trt.__version__[
                0] == '7':  # TensorRT 7 handling https://github.com/ultralytics/yolov5/issues/6012
            grid = model.model[-1].anchor_grid
            model.model[-1].anchor_grid = [a[..., :1, :1, :] for a in grid]
            export_onnx(model, im, file, 12, train, False,
                        simplify)  # opset 12
            model.model[-1].anchor_grid = grid
        else:  # TensorRT >= 8
            check_version(trt.__version__, '8.0.0',
                          hard=True)  # require tensorrt>=8.0.0
            export_onnx(model, im, file, 13, train, False,
                        simplify)  # opset 13
        onnx = file.with_suffix('.onnx')

        LOGGER.info(
            f'\n{prefix} starting export with TensorRT {trt.__version__}...')
        assert onnx.exists(), f'failed to export ONNX file: {onnx}'
        f = file.with_suffix('.engine')  # TensorRT engine file
        logger = trt.Logger(trt.Logger.INFO)
        if verbose:
            logger.min_severity = trt.Logger.Severity.VERBOSE

        builder = trt.Builder(logger)
        config = builder.create_builder_config()
        config.max_workspace_size = workspace * 1 << 30
        # config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace << 30)  # fix TRT 8.4 deprecation notice

        flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        network = builder.create_network(flag)
        parser = trt.OnnxParser(network, logger)
        if not parser.parse_from_file(str(onnx)):
            raise RuntimeError(f'failed to load ONNX file: {onnx}')

        inputs = [network.get_input(i) for i in range(network.num_inputs)]
        outputs = [network.get_output(i) for i in range(network.num_outputs)]
        LOGGER.info(f'{prefix} Network Description:')
        for inp in inputs:
            LOGGER.info(
                f'{prefix}\tinput "{inp.name}" with shape {inp.shape} and dtype {inp.dtype}'
            )
        for out in outputs:
            LOGGER.info(
                f'{prefix}\toutput "{out.name}" with shape {out.shape} and dtype {out.dtype}'
            )

        LOGGER.info(
            f'{prefix} building FP{16 if builder.platform_has_fast_fp16 and half else 32} engine in {f}'
        )
        if builder.platform_has_fast_fp16 and half:
            config.set_flag(trt.BuilderFlag.FP16)
        with builder.build_engine(network, config) as engine, open(f,
                                                                   'wb') as t:
            t.write(engine.serialize())
        LOGGER.info(
            f'{prefix} export success, saved as {f} ({file_size(f):.1f} MB)')
        return f
    except Exception as e:
        LOGGER.info(f'\n{prefix} export failure: {e}')
Example No. 16
class InferenceBackend:
    # initialize TensorRT
    TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    def __init__(self, model, batch_size):
        self.model = model
        self.batch_size = batch_size

        # load plugin if the model requires one
        if self.model.PLUGIN_PATH is not None:
            try:
                ctypes.cdll.LoadLibrary(self.model.PLUGIN_PATH)
            except OSError as err:
                raise RuntimeError('Plugin not found') from err

        # load trt engine or build one if not found
        if not self.model.ENGINE_PATH.exists():
            self.engine = self.model.build_engine(InferenceBackend.TRT_LOGGER,
                                                  self.batch_size)
        else:
            runtime = trt.Runtime(InferenceBackend.TRT_LOGGER)
            with open(self.model.ENGINE_PATH, 'rb') as engine_file:
                buf = engine_file.read()
                self.engine = runtime.deserialize_cuda_engine(buf)
        if self.engine is None:
            raise RuntimeError('Unable to load the engine file')
        if self.engine.has_implicit_batch_dimension:
            assert self.batch_size <= self.engine.max_batch_size

        # allocate buffers
        self.bindings = []
        self.outputs = []
        for binding in self.engine:
            shape = self.engine.get_binding_shape(binding)
            size = trt.volume(shape)
            if self.engine.has_implicit_batch_dimension:
                size *= self.batch_size
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            # allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # append the device buffer to device bindings
            self.bindings.append(int(device_mem))
            if self.engine.binding_is_input(binding):
                if not self.engine.has_implicit_batch_dimension:
                    assert self.batch_size == shape[0]
                self.input = HostDeviceMem(host_mem, device_mem)
            else:
                self.outputs.append(HostDeviceMem(host_mem, device_mem))
        self.context = self.engine.create_execution_context()
        self.stream = cuda.Stream()

    def infer(self):
        self.infer_async()
        return self.synchronize()

    def infer_async(self):
        cuda.memcpy_htod_async(self.input.device, self.input.host, self.stream)
        if self.engine.has_implicit_batch_dimension:
            self.context.execute_async(batch_size=self.batch_size,
                                       bindings=self.bindings,
                                       stream_handle=self.stream.handle)
        else:
            self.context.execute_async_v2(bindings=self.bindings,
                                          stream_handle=self.stream.handle)
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)

    def synchronize(self):
        self.stream.synchronize()
        return [out.host for out in self.outputs]
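
A hedged usage sketch (the model object, its engine/plugin paths, and the preprocessed frame are assumptions about the surrounding project):

backend = InferenceBackend(model, batch_size=1)
np.copyto(backend.input.host, frame.ravel())  # hypothetical preprocessed frame
det_outputs = backend.infer()                 # list of host output buffers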
Example No. 17
import os
import tensorrt as trt
import logging
import uff
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import sys

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S")
logger = logging.getLogger(__name__)
TRT_LOGGER = trt.Logger(trt.Logger.ERROR)  # global trt logger setting


class TensorrtBuilder:
    @staticmethod
    def _item_to_list(item):
        if not isinstance(item, list):
            if item:
                item = [item]
        return item

    @staticmethod
    def _GiB(val):
        return val * 1 << 30

    @staticmethod
Example No. 18
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k,v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k:v})
    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})

    measurements_all = {"pre_processing": [],
                        "tacotron2_encoder_time": [],
                        "tacotron2_decoder_time": [],
                        "tacotron2_postnet_time": [],
                        "tacotron2_latency": [],
                        "waveglow_latency": [],
                        "latency": [],
                        "type_conversion": [],
                        "data_transfer": [],
                        "storage": [],
                        "tacotron2_items_per_sec": [],
                        "waveglow_items_per_sec": [],
                        "num_mels_per_audio": [],
                        "throughput": []}

    print("args:", args, unknown_args)

    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    if args.waveglow_ckpt != "":
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser,
                                             args.waveglow_ckpt,
                                             fp16_run=args.fp16,
                                             cpu_run=False,
                                             forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need WaveGlow PyTorch checkpoint
        # anymore - deleting
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()


    texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
    texts = [texts[0][:args.input_length]]
    texts = texts*args.batch_size

    warmup_iters = 3

    for iter in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing"):
            sequences_padded, input_lengths = prepare_input_sequence(texts)
            sequences_padded = sequences_padded.to(torch.int32)
            input_lengths = input_lengths.to(torch.int32)

        with torch.no_grad():
            with MeasureTime(measurements, "latency"):
                with MeasureTime(measurements, "tacotron2_latency"):
                    mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
                                                           encoder_context, decoder_context, postnet_context,
                                                           sequences_padded, input_lengths, measurements, args.fp16, True)

                with MeasureTime(measurements, "waveglow_latency"):
                    audios = infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, args.fp16)
                    
        num_mels = mel.size(0)*mel.size(2)
        num_samples = audios.size(0)*audios.size(1)

        with MeasureTime(measurements, "type_conversion"):
            audios = audios.float()

        with MeasureTime(measurements, "data_transfer"):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage"):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_"+str(i)+".wav"
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i]*args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency']
        measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples/measurements['latency']

        if iter >= warmup_iters:
            for k,v in measurements.items():
                if k in measurements_all.keys():
                    measurements_all[k].append(v)
                    DLLogger.log(step=(iter-warmup_iters), data={k: v})

    DLLogger.flush()

    print_stats(measurements_all)
Example No. 19

import os
import numpy as np
from time import time
from cuda import cudart
import tensorrt as trt

trtFile = "./model.plan"
nIn, cIn, hIn, wIn = 1, 1, 28, 28
np.random.seed(97)
data = np.random.rand(nIn, cIn, hIn, wIn).astype(np.float32) * 2 - 1

np.set_printoptions(precision=3, linewidth=200, suppress=True)
cudart.cudaDeviceSynchronize()

logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
profile = builder.create_optimization_profile()
config = builder.create_builder_config()
config.max_workspace_size = 6 << 30
config.set_tactic_sources(1 << int(trt.TacticSource.CUBLAS)
                          | 1 << int(trt.TacticSource.CUBLAS_LT)
                          | 1 << int(trt.TacticSource.CUDNN))
#config.set_tactic_sources(1 << int(trt.TacticSource.CUBLAS) | 1 << int(trt.TacticSource.CUBLAS_LT))

inputTensor = network.add_input('inputT0', trt.float32, [-1, 1, 28, 28])
profile.set_shape(inputTensor.name, [1, cIn, hIn, wIn], [nIn, cIn, hIn, wIn],
                  [nIn * 2, cIn, hIn, wIn])
config.add_optimization_profile(profile)
Example No. 20
def torch2trt(module,
              inputs,
              input_names=None,
              output_names=None,
              log_level=trt.Logger.ERROR,
              max_batch_size=1,
              fp16_mode=False,
              max_workspace_size=0,
              strict_type_constraints=False,
              keep_network=True,
              int8_mode=False,
              int8_calib_dataset=None,
              int8_calib_algorithm=DEFAULT_CALIBRATION_ALGORITHM):

    inputs_in = inputs

    # copy inputs to avoid modifications to source data
    inputs = [tensor.clone()[0:1]
              for tensor in inputs]  # only run single entry

    logger = trt.Logger(log_level)
    builder = trt.Builder(logger)
    network = builder.create_network()

    with ConversionContext(network) as ctx:

        if isinstance(inputs, list):
            inputs = tuple(inputs)
        if not isinstance(inputs, tuple):
            inputs = (inputs, )
        ctx.add_inputs(inputs, input_names)

        outputs = module(*inputs)

        if not isinstance(outputs, tuple) and not isinstance(outputs, list):
            outputs = (outputs, )
        ctx.mark_outputs(outputs, output_names)

    builder.max_workspace_size = max_workspace_size
    builder.fp16_mode = fp16_mode
    builder.max_batch_size = max_batch_size
    builder.strict_type_constraints = strict_type_constraints

    if int8_mode:

        # default to use input tensors for calibration
        if int8_calib_dataset is None:
            int8_calib_dataset = TensorBatchDataset(inputs_in)

        builder.int8_mode = True

        # @TODO(jwelsh):  Should we set batch_size=max_batch_size?  Need to investigate memory consumption
        builder.int8_calibrator = DatasetCalibrator(
            inputs,
            int8_calib_dataset,
            batch_size=1,
            algorithm=int8_calib_algorithm)

    engine = builder.build_cuda_engine(network)

    module_trt = TRTModule(engine, ctx.input_names, ctx.output_names)

    if keep_network:
        module_trt.network = network

    return module_trt
Example No. 21
def timing_engine(engine_file_path,
                  batch_size,
                  num_input_channels,
                  height,
                  width,
                  timing_loops=100):
    logger = tensorrt.Logger(tensorrt.Logger.ERROR)

    with open(engine_file_path,
              'rb') as fin, tensorrt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(fin.read())

    assert engine is not None, 'deserialize engine failed!'
    assert batch_size <= engine.max_batch_size

    print('Engine info:')
    print('\tmax batch size: ', engine.max_batch_size)
    print('\tmax workspace_size: ', engine.max_workspace_size)
    print('\tdevice memory_size: ', engine.device_memory_size)

    inputs, outputs, bindings, stream = allocate_buffers(engine, batch_size)

    input_data = numpy.random.rand(batch_size, num_input_channels, height,
                                   width).astype(dtype=numpy.float32,
                                                 order='C')
    inputs[0].host = input_data

    print('Start timing......')

    with engine.create_execution_context() as context:

        # warm up
        for i in range(10):
            [
                cuda.memcpy_htod_async(inp.device, inp.host, stream)
                for inp in inputs
            ]
            context.execute_async(batch_size=batch_size,
                                  bindings=bindings,
                                  stream_handle=stream.handle)
            [
                cuda.memcpy_dtoh_async(out.host, out.device, stream)
                for out in outputs
            ]
            stream.synchronize()

        time_start = time.time()
        for i in range(timing_loops):
            [
                cuda.memcpy_htod_async(inp.device, inp.host, stream)
                for inp in inputs
            ]
            context.execute_async(batch_size=batch_size,
                                  bindings=bindings,
                                  stream_handle=stream.handle)
            [
                cuda.memcpy_dtoh_async(out.host, out.device, stream)
                for out in outputs
            ]
            stream.synchronize()
        time_end = time.time()

        print(
            'Total time elapsed: %.04f ms.\n%.04f ms for each image (%.02f FPS)\n%.04f ms for each batch'
            % ((time_end - time_start) * 1000,
               (time_end - time_start) * 1000 / batch_size / timing_loops,
               batch_size * timing_loops / (time_end - time_start),
               (time_end - time_start) * 1000 / timing_loops))
Example No. 22
def loadEngine2TensorRT(filepath):
    G_LOGGER = trt.Logger(trt.Logger.WARNING)
    # deserialize the engine
    with open(filepath, "rb") as f, trt.Runtime(G_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
        return engine
Example No. 23
def benchmark(
    width: int,
    height: int,
    iter: int = 5,
    use_cuda_graph: bool = False,
    logger: trt.Logger = trt.Logger(trt.Logger.VERBOSE)
) -> None:

    cuda_context = init_cuda()

    runtime = trt.Runtime(logger)

    with open(f"ffdnet_{width}_{height}.engine", "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())

    execution_context = engine.create_execution_context()

    _bindings = _get_bindings(execution_context, engine.num_bindings)
    bindings = [binding.obj for binding in _bindings]

    stream = checkError(
        cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value))
    stream = UniqueResource(stream, cuda.cuStreamDestroy, stream)

    start = checkError(
        cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT.value))
    start = UniqueResource(start, cuda.cuEventDestroy, start)

    end = checkError(
        cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT.value))
    end = UniqueResource(end, cuda.cuEventDestroy, end)

    def execute():
        execution_context.execute_async_v2(bindings, stream_handle=stream.obj)

    if use_cuda_graph:
        checkError(
            cuda.cuStreamBeginCapture(
                stream.obj,
                cuda.CUstreamCaptureMode.CU_STREAM_CAPTURE_MODE_RELAXED))

        execute()

        graph = checkError(cuda.cuStreamEndCapture(stream.obj))
        graphexec, error_node = checkError(
            cuda.cuGraphInstantiate(graph, logBuffer=b"", bufferSize=0))
        graphexec = UniqueResource(graphexec, cuda.cuGraphExecDestroy,
                                   graphexec)
        checkError(
            cuda.cuGraphDebugDotPrint(
                graph, b"ffdnet.dot", cuda.CUgraphDebugDot_flags.
                CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE.value))
        checkError(cuda.cuGraphDestroy(graph))

    for _ in range(iter):
        checkError(cuda.cuEventRecord(start.obj, stream.obj))

        if use_cuda_graph:
            checkError(cuda.cuGraphLaunch(graphexec.obj, stream.obj))
        else:
            execute()

        checkError(cuda.cuEventRecord(end.obj, stream.obj))
        checkError(cuda.cuEventSynchronize(end.obj))

        duration = checkError(cuda.cuEventElapsedTime(start.obj, end.obj))

        print(f"duration: {duration} ms")
Example No. 24

def run(shape, scalar):
    testCase = "<shape=%s,scalar=%f>" % (shape, scalar)
    trtFile = "./model-Dim%s.plan" % str(len(shape))
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engineStr = f.read()
            engine = trt.Runtime(logger).deserialize_cuda_engine(engineStr)
        if engine is None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30

        inputT0 = network.add_input('inputT0', trt.float32,
                                    [-1 for i in shape])
        profile.set_shape(inputT0.name, [1 for i in shape], [8 for i in shape],
                          [32 for i in shape])
        config.add_optimization_profile(profile)

        pluginLayer = network.add_plugin_v2([inputT0],
                                            getAddScalarPlugin(scalar))
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, shape)
    #print("Binding all? %s"%(["No","Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    #for i in range(engine.num_bindings):
    #    print("Bind[%2d]:i[%d]->"%(i,i) if engine.binding_is_input(i) else "Bind[%2d]:o[%d]->"%(i,i-nInput),
    #            engine.get_binding_dtype(i),engine.get_binding_shape(i),context.get_binding_shape(i),engine.get_binding_name(i))

    bufferH = []
    bufferH.append(np.arange(np.prod(shape), dtype=np.float32).reshape(shape))
    for i in range(nOutput):
        bufferH.append(
            np.empty(context.get_binding_shape(nInput + i),
                     dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    for i in range(nInput):
        cudart.cudaMemcpy(
            bufferD[i],
            np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data,
            bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)

    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i],
                          bufferH[nInput + i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    outputCPU = addScalarCPU(bufferH[:nInput], scalar)
    '''
    for i in range(nInput):
        printArrayInfo(bufferH[i])
    for i in range(nOutput):
        printArrayInfo(bufferH[nInput+i])
    for i in range(nOutput):
        printArrayInfo(outputCPU[i])
    '''
    check(bufferH[nInput:][0], outputCPU[0], True)

    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
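The test routine above has no driver in this fragment. A minimal sketch of how run() might be invoked to exercise the AddScalar plugin at several input ranks follows; the shapes and scalar value are illustrative, not taken from the original script.

if __name__ == '__main__':
    # Hypothetical driver: each input rank builds (or reuses) its own ./model-DimN.plan engine.
    run([32], 1.0)        # 1-D input, builds model-Dim1.plan on the first call
    run([32], 1.0)        # same rank again, reuses the cached engine
    run([16, 16], 1.0)    # 2-D input
    run([8, 8, 8], 1.0)   # 3-D input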
Exemplo n.º 25
0
import os
import sys
import glob

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
#from PIL import ImageDraw

from data_processing import PreprocessYOLO, PostprocessYOLO
from utils import read_truths_args, multi_bbox_ious

import common
import time
t2 = time.time()
TRT_LOGGER = trt.Logger()

try:
    data_dir = os.environ['TESTDATADIR']
except KeyError:
    data_dir = '/tmp/dataset-nctu/clothes/clothes_test'


def get_engine(engine_file_path="clothes.trt"):
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())


input_HW = (416, 416)
output_shapes = [(1, 255, 13, 13), (1, 255, 26, 26), (1, 255, 52, 52)]
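The fragment above only deserializes the engine and defines the YOLO input resolution and output grid shapes; the inference call itself is not shown. A sketch of how it is typically wired together, assuming the allocate_buffers/do_inference_v2 helpers from NVIDIA's TensorRT sample common.py and the preprocessing classes imported above:

def detect_once(image_path):
    # Sketch only -- the helper names below are assumed to come from the TensorRT sample common.py.
    preprocessor = PreprocessYOLO(input_HW)
    image_raw, image = preprocessor.process(image_path)
    with get_engine() as engine, engine.create_execution_context() as context:
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        inputs[0].host = image
        trt_outputs = common.do_inference_v2(context, bindings=bindings,
                                             inputs=inputs, outputs=outputs,
                                             stream=stream)
    # Reshape the flat device outputs back into the three YOLO grids.
    return [out.reshape(shape) for out, shape in zip(trt_outputs, output_shapes)]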
# tensorrt-lib

import os
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from calibrator import Calibrator
from torch.autograd import Variable
import torch
import numpy as np
import time
# add verbose
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)  # ** engine visualization **


# create a TensorRT engine (fixed or dynamic shape)
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, int8_mode=False, calibration_stream=None,
               calibration_table_path="", save_engine=False):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""
    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(1) as network,\
                trt.OnnxParser(network, TRT_LOGGER) as parser:

            # parse onnx model file
            if not os.path.exists(onnx_file_path):
                quit('ONNX file {} not found'.format(onnx_file_path))
            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
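            # NOTE: the example is truncated here on the source page.  The usual
            # continuation of this load-or-build pattern (a sketch using the same
            # TensorRT 7-era API as the rest of this page, NOT the original code;
            # the Calibrator arguments below are assumptions) would be:
                if not parser.parse(model.read()):
                    print('ERROR: failed to parse the ONNX file')
                    for error in range(parser.num_errors):
                        print(parser.get_error(error))
                    return None
            builder.max_batch_size = max_batch_size
            config = builder.create_builder_config()
            config.max_workspace_size = 1 << 30
            if fp16_mode:
                config.set_flag(trt.BuilderFlag.FP16)
            if int8_mode:
                config.set_flag(trt.BuilderFlag.INT8)
                config.int8_calibrator = Calibrator(calibration_stream, calibration_table_path)
            engine = builder.build_engine(network, config)
            if save_engine and engine is not None:
                with open(engine_file_path, 'wb') as f:
                    f.write(engine.serialize())
            return engine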
Exemplo n.º 27
0
import tensorrt as trt
import uff
from tensorrt import UffParser

G_LOGGER = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(G_LOGGER, '')

model_file = './mask_rcnn_nucleus_0080.uff'

output_nodes = ['mrcnn_detection', "mrcnn_mask/Sigmoid"]

trt_output_nodes = output_nodes

INPUT_NODE = "input_image"
INPUT_SIZE = [3, 1024, 1024]

with trt.Builder(G_LOGGER) as builder, builder.create_network() as network, \
        trt.UffParser() as parser:
    parser.register_input(INPUT_NODE, INPUT_SIZE)
    parser.register_output(output_nodes[0])
    parser.register_output(output_nodes[1])
    parser.parse(model_file, network)

    builder.max_batch_size = 1
    builder.max_workspace_size = 1 << 28  # 256MiB

    engine = builder.build_cuda_engine(network)
    for binding in engine:
        print(engine.get_binding_shape(binding))
    with open("nucleus.engine", "wb") as f:
        f.write(engine.serialize())
Exemplo n.º 28
0
def build_engine(onnx_path,
                 cfg_file_path,
                 model_name,
                 category_num,
                 do_int8,
                 dla_core,
                 verbose=False):
    """Build a TensorRT engine from ONNX using the older API."""
    net_w, net_h = get_input_wh(cfg_file_path)

    print('Loading the ONNX file...')
    onnx_data = load_onnx(onnx_path)
    if onnx_data is None:
        return None

    TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger()
    EXPLICIT_BATCH = [] if trt.__version__[0] < '7' else \
        [1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)]
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
            *EXPLICIT_BATCH) as network, trt.OnnxParser(network,
                                                        TRT_LOGGER) as parser:
        if do_int8 and not builder.platform_has_fast_int8:
            raise RuntimeError('INT8 not supported on this platform')
        if not parser.parse(onnx_data):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
        network = set_net_batch(network, MAX_BATCH_SIZE)

        print('Adding yolo_layer plugins...')
        network = add_yolo_plugins(network, cfg_file_path, model_name,
                                   category_num, TRT_LOGGER)

        print('Building an engine.  This would take a while...')
        print('(Use "--verbose" or "-v" to enable verbose logging.)')
        if trt.__version__[0] < '7':  # older API: build_cuda_engine()
            if dla_core >= 0:
                raise RuntimeError('DLA core not supported by old API')
            builder.max_batch_size = MAX_BATCH_SIZE
            builder.max_workspace_size = 1 << 30
            builder.fp16_mode = True  # alternative: builder.platform_has_fast_fp16
            if do_int8:
                from calibrator import YOLOEntropyCalibrator
                builder.int8_mode = True
                builder.int8_calibrator = YOLOEntropyCalibrator(
                    'calib_images', (net_h, net_w),
                    'calib_%s.bin' % model_name)
            engine = builder.build_cuda_engine(network)
        else:  # new API: build_engine() with builder config
            builder.max_batch_size = MAX_BATCH_SIZE
            config = builder.create_builder_config()
            config.max_workspace_size = 1 << 30
            config.set_flag(trt.BuilderFlag.GPU_FALLBACK)
            config.set_flag(trt.BuilderFlag.FP16)
            profile = builder.create_optimization_profile()
            profile.set_shape(
                '000_net',  # input tensor name
                (MAX_BATCH_SIZE, 3, net_h, net_w),  # min shape
                (MAX_BATCH_SIZE, 3, net_h, net_w),  # opt shape
                (MAX_BATCH_SIZE, 3, net_h, net_w))  # max shape
            config.add_optimization_profile(profile)
            if do_int8:
                from calibrator import YOLOEntropyCalibrator
                config.set_flag(trt.BuilderFlag.INT8)
                config.int8_calibrator = YOLOEntropyCalibrator(
                    'calib_images', (net_h, net_w),
                    'calib_%s.bin' % model_name)
                config.set_calibration_profile(profile)
            if dla_core >= 0:
                config.default_device_type = trt.DeviceType.DLA
                config.DLA_core = dla_core
                config.set_flag(trt.BuilderFlag.STRICT_TYPES)
                print('Using DLA core %d.' % dla_core)
            engine = builder.build_engine(network, config)

        if engine is not None:
            print('Completed creating engine.')
        return engine
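build_engine() above only returns the engine object; persisting it is left to the caller. A minimal sketch of such a caller (the helper name and .trt path are illustrative, not from the original repository):

def build_and_save_engine(onnx_path, cfg_file_path, model_name, category_num,
                          do_int8=False, dla_core=-1, verbose=False):
    """Hypothetical wrapper: build the engine above and serialize it to disk."""
    engine = build_engine(onnx_path, cfg_file_path, model_name, category_num,
                          do_int8, dla_core, verbose)
    if engine is None:
        raise SystemExit('ERROR: failed to build the engine')
    engine_path = '%s.trt' % model_name
    with open(engine_path, 'wb') as f:
        f.write(engine.serialize())
    print('Serialized the engine to %s' % engine_path)
    return engine_path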
import torch
import tensorrt as trt
from vgg16_397923af_trt import populate_network


def build_engine(weights):
    # EXPLICIT_BATCH makes the batch dimension explicit, so inputs are shaped N x C x H x W.
    flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(flag) as network:
        # Populate the network using weights from the PyTorch model.
        populate_network(network, weights)
        config = builder.create_builder_config()
        config.max_workspace_size = 4 << 30  # 4 GiB
        return builder.build_engine(network, config)


vgg16_path = './vgg16-397923af.pth'
vgg16_weights = torch.load(vgg16_path, map_location='cpu')
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

# Build the engine with TensorRT and save it to disk
with build_engine(vgg16_weights) as engine:
    # Serialize the engine once and write it out as an .engine file.
    serialized_engine = engine.serialize()
    output_engine = 'vgg16-397923af_fp32.engine'
    print("===> Save %s\n" % output_engine)
    with open(output_engine, "wb") as f:
        f.write(serialized_engine)
Exemplo n.º 30
0
def run():
    logger = trt.Logger(trt.Logger.ERROR)                                       # create the Logger; available levels: VERBOSE, INFO, WARNING, ERROR, INTERNAL_ERROR
    if os.path.isfile(trtFile):                                                 # if a .plan file already exists, load it directly
        with open(trtFile, 'rb') as f:
            engineString = f.read()
        if engineString is None:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:                                                                       # no .plan file found, build the engine from scratch
        builder = trt.Builder(logger)                                           # network meta-information: Builder / Network / BuilderConfig / Profile
        builder.max_batch_size = 3
        builder.max_workspace_size = 1 << 30
        network = builder.create_network()

        inputTensor = network.add_input('inputT0', trt.float32, [4, 5])         # declare the input tensor

        identityLayer = network.add_identity(inputTensor)                       # identity layer
        network.mark_output(identityLayer.get_output(0))                        # mark the output tensor

        engine = builder.build_cuda_engine(network)
        if engine is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:                                          # save the serialized network as a .plan file
            f.write(engine.serialize())
            print("Succeeded saving .plan file!")

    context = engine.create_execution_context()                                 # create the execution context (similar to a process on the GPU)
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])  # query the engine's binding information
    nOutput = engine.num_bindings - nInput
    for i in range(nInput):
        print("Bind[%2d]:i[%2d]->" % (i, i), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    for i in range(nInput,nInput+nOutput):
        print("Bind[%2d]:o[%2d]->" % (i, i - nInput), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))

    data = np.arange(3 * 4 * 5, dtype=np.float32).reshape(3, 4, 5)              # prepare input data and host/device memory
    bufferH = []
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(np.empty((3, ) + tuple(context.get_binding_shape(i)), dtype=trt.nptype(engine.get_binding_dtype(i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    for i in range(nInput):                                                     # copy the input data from host to device
        cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute(3, bufferD)                                                 # run inference (implicit batch, batch size 3)

    for i in range(nInput, nInput + nOutput):                                   # copy the results from device back to host
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    for i in range(nInput + nOutput):
        print(engine.get_binding_name(i))
        print(bufferH[i].reshape((3, ) + tuple(context.get_binding_shape(i))))

    for b in bufferD:                                                           # free the device memory
        cudart.cudaFree(b)
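A driver for run() usually removes any stale .plan file first so that both the build path and the deserialize path are exercised. A sketch, assuming trtFile is the module-level path referenced above:

if __name__ == '__main__':
    # Hypothetical driver: the first call builds and saves the engine, the second loads it.
    if os.path.isfile(trtFile):
        os.remove(trtFile)
    run()  # build from scratch and write the .plan file
    run()  # deserialize the .plan file written by the previous call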