Example #1
    def test_loader_explicit_precision(self):
        builder, network, parser = network_from_onnx_path(
            ONNX_MODELS["identity"].path, explicit_precision=True)
        with builder, network, parser:
            assert not network.has_implicit_batch_dimension
            # The explicit precision flag was deprecated in TensorRT 8.0, so only check it on older versions.
            if mod.version(trt.__version__) < mod.version("8.0"):
                assert network.has_explicit_precision
Example #2
    def onnx_to_trt(self, output_fpath: str, input_fpath: str,
                    network_metadata: NetworkMetadata):
        """
        Converts an ONNX file to a TRT engine.
        Since TensorRT already supplies converter functions and scripts,
        a default implementation is provided here.

        Args:
            output_fpath (str): File location to write the generated TRT engine to.
            input_fpath (str): File location of the input ONNX file to convert.
            network_metadata (NetworkMetadata): Network metadata of the network being converted.

        Returns:
            TRTEngineFile: Newly generated engine.
        """
        result = self.trt_engine_class(output_fpath, network_metadata)
        self.trt_inference_config = CreateConfig(
            fp16=network_metadata.precision.fp16,
            max_workspace_size=result.DEFAULT_TRT_WORKSPACE_MB * 1024 * 1024,
            profiles=result.get_dynamic_shape_profiles(),
            strict_types=result.use_strict_types())

        g_logger_verbosity = (PG_LOGGER.EXTRA_VERBOSE if G_LOGGER.level
                              == G_LOGGER.DEBUG else PG_LOGGER.WARNING)
        with PG_LOGGER.verbosity(g_logger_verbosity):
            network_definition = result.get_network_definition(
                network_from_onnx_path(input_fpath))

            trt_engine = engine_from_network(network_definition,
                                             config=self.trt_inference_config)
            save_engine(trt_engine, output_fpath)

        return result
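
For reference, the same ONNX-to-TRT conversion can also be expressed with the Polygraphy functional API alone. The sketch below is only illustrative: the function name and file paths are placeholders, and it omits the dynamic shape profiles and strict-types handling used above.

from polygraphy.backend.trt import (CreateConfig, engine_from_network,
                                    network_from_onnx_path, save_engine)

def convert_onnx_to_trt(input_fpath: str, output_fpath: str, fp16: bool = False):
    # Parse the ONNX model, build an engine with the requested precision, and serialize it to disk.
    config = CreateConfig(fp16=fp16)
    engine = engine_from_network(network_from_onnx_path(input_fpath), config=config)
    save_engine(engine, output_fpath)
    return engine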
Example #3
def main():
    # In Polygraphy, loaders and runners take ownership of objects if they are provided
    # via the return values of callables. For example, we don't need to worry about object
    # lifetimes when we use lazy loaders.
    #
    # Since we are immediately evaluating, we take ownership of objects, and are responsible for freeing them.
    builder, network, parser = network_from_onnx_path("identity.onnx")

    # Extend the network with an identity layer.
    prev_output = network.get_output(0)
    network.unmark_output(prev_output)
    output = network.add_identity(prev_output).get_output(0)
    output.name = "output"
    network.mark_output(output)

    # Create a TensorRT IBuilderConfig so that we can build the engine with FP16 enabled.
    config = create_config(builder, network, fp16=True)

    # We can free everything we constructed above once we're done building the engine.
    # NOTE: In TensorRT 8.0 and newer, we do *not* need to use a context manager here.
    with builder, network, parser, config:
        engine = engine_from_network((builder, network), config)

    # NOTE: In TensorRT 8.0 and newer, we do *not* need to use a context manager to free `engine`.
    with engine, TrtRunner(engine) as runner:
        inp_data = np.ones((1, 1, 2, 2), dtype=np.float32)

        # NOTE: The runner owns the output buffers and is free to reuse them between `infer()` calls.
        # Thus, if you want to store results from multiple inferences, you should use `copy.deepcopy()`.
        outputs = runner.infer(feed_dict={"x": inp_data})

        assert np.array_equal(outputs["output"], inp_data)  # It's an identity model!

        print("Inference succeeded!")
Example #4
    def test_load_serialized_engine(self, engine_loader_args):
        with tempfile.NamedTemporaryFile() as f, engine_bytes_from_network(
                network_from_onnx_path(
                    ONNX_MODELS["identity"].path)) as engine_bytes:
            f.write(engine_bytes)
            f.flush()

            engine_loader_args.parse_args([f.name, "--model-type=engine"])
            with engine_loader_args.load_serialized_engine() as engine:
                assert isinstance(engine, trt.ICudaEngine)
                assert len(engine) == 2
                assert engine[0] == "x"
                assert engine[1] == "y"
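
Outside of the CLI argument-group machinery exercised by this test, a serialized engine on disk can also be deserialized directly through the functional API. A minimal sketch, assuming an engine was previously saved to the placeholder path "identity.engine":

from polygraphy.backend.common import bytes_from_path
from polygraphy.backend.trt import engine_from_bytes

# Read the serialized engine from disk and deserialize it into a TensorRT engine.
engine = engine_from_bytes(bytes_from_path("identity.engine"))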
Example #5
    def test_loader(self):
        builder, network, parser = network_from_onnx_path(
            ONNX_MODELS["identity"].path)
        with builder, network, parser:
            assert not network.has_implicit_batch_dimension
            assert not network.has_explicit_precision
Example #6
def main():
    # A Profile maps each input tensor to a range of shapes.
    # The `add()` method can be used to add shapes for a single input.
    #
    # TIP: To save lines, calls to `add` can be chained:
    #     profile.add("input0", ...).add("input1", ...)
    #
    #   Of course, you may alternatively write this as:
    #     profile.add("input0", ...)
    #     profile.add("input1", ...)
    #
    profiles = [
        # The low-latency case. For best performance, min == opt == max.
        Profile().add("X", min=(1, 3, 28, 28), opt=(1, 3, 28, 28), max=(1, 3, 28, 28)),
        # The dynamic batching case. We use `4` for the opt batch size since that's our most common case.
        Profile().add("X", min=(1, 3, 28, 28), opt=(4, 3, 28, 28), max=(32, 3, 28, 28)),
        # The offline case. For best performance, min == opt == max.
        Profile().add("X", min=(128, 3, 28, 28), opt=(128, 3, 28, 28), max=(128, 3, 28, 28)),
    ]

    # See examples/api/06_immediate_eval_api for details on immediately evaluated functional loaders like `engine_from_network`.
    # Note that we can freely inter-mix lazy and immediately-evaluated loaders.
    engine = engine_from_network(
        network_from_onnx_path("dynamic_identity.onnx"), config=CreateConfig(profiles=profiles)
    )

    # We'll save the engine so that we can inspect it with `inspect model`.
    # This should make it easy to see how the engine bindings are laid out.
    save_engine(engine, "dynamic_identity.engine")
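    # For example, from the command line (assuming the `polygraphy` CLI is installed):
    #   polygraphy inspect model dynamic_identity.engine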

    # We'll create, but not activate, three separate runners, each with a separate context.
    #
    # TIP: By providing a context directly, as opposed to via a lazy loader,
    # we can ensure that the runner will *not* take ownership of it.
    #
    low_latency = TrtRunner(engine.create_execution_context())

    # NOTE: The following two lines will cause TensorRT to display errors since profile 0
    # is already in use by the first execution context. We'll suppress them using G_LOGGER.verbosity().
    #
    with G_LOGGER.verbosity(G_LOGGER.CRITICAL):
        dynamic_batching = TrtRunner(engine.create_execution_context())
        offline = TrtRunner(engine.create_execution_context())
        # NOTE: We could update the profile index here (e.g. `context.active_optimization_profile = 2`),
        # but instead, we'll use TrtRunner's `set_profile()` API when we later activate the runner.

    # Finally, we can activate the runners as we need them.
    #
    # NOTE: Since the context and engine are already created, the runner will only need to
    # allocate input and output buffers during activation.

    input_img = np.ones((1, 3, 28, 28), dtype=np.float32)  # An input "image"

    with low_latency:
        outputs = low_latency.infer({"X": input_img})
        assert np.array_equal(outputs["Y"], input_img)  # It's an identity model!

        print("Low latency runner succeeded!")

        # While we're serving requests online, we might decide that we need dynamic batching
        # for a moment.
        #
        # NOTE: We're assuming that activating runners will be cheap here, so we can bring up
        # the dynamic batching runner just-in-time.
        #
        # TIP: If activating the runner is not cheap (e.g. input/output buffers are large),
        # it might be better to keep the runner active the whole time.
        #
        with dynamic_batching:
            # NOTE: The very first time we activate this runner, we need to set
            # the profile index (it's 0 by default). We need to do this *only once*.
            # Alternatively, we could have set the profile index in the context directly (see above).
            #
            dynamic_batching.set_profile(1)  # Use the second profile, which is intended for dynamic batching.

            # We'll create fake batches by repeating our fake input image.
            small_input_batch = np.repeat(input_img, 4, axis=0)  # Shape: (4, 3, 28, 28)
            outputs = dynamic_batching.infer({"X": small_input_batch})
            assert np.array_equal(outputs["Y"], small_input_batch)

    # If we need dynamic batching again later, we can activate the runner once more.
    #
    # NOTE: This time, we do *not* need to set the profile.
    #
    with dynamic_batching:
        # NOTE: We can use any shape that's in the range of the profile without
        # additional setup - Polygraphy handles the details behind the scenes!
        #
        large_input_batch = np.repeat(input_img, 16, axis=0)  # Shape: (16, 3, 28, 28)
        outputs = dynamic_batching.infer({"X": large_input_batch})
        assert np.array_equal(outputs["Y"], large_input_batch)

        print("Dynamic batching runner succeeded!")

    with offline:
        # NOTE: We must set the profile to something other than 0 or 1 since both of those
        # are now in use by the `low_latency` and `dynamic_batching` runners respectively.
        #
        offline.set_profile(2)  # Use the third profile, which is intended for the offline case.

        large_offline_batch = np.repeat(input_img, 128, axis=0)  # Shape: (128, 3, 28, 28)
        outputs = offline.infer({"X": large_offline_batch})
        assert np.array_equal(outputs["Y"], large_offline_batch)

        print("Offline runner succeeded!")