Example #1
    def __call__(self):
        """
        Builds a TensorRT engine.

        Returns:
            trt.ICudaEngine: The engine that was created.
        """
        # If network is a callable, then we own its return value
        ret, owning = misc.try_call(self._network)
        builder, network, parser = misc.unpack_args(ret, num=3)

        with contextlib.ExitStack() as stack:
            provided = "Builder and Network" if parser is None else "Builder, Network, and Parser"
            if owning:
                stack.enter_context(builder)
                stack.enter_context(network)
                if parser is not None:
                    stack.enter_context(parser)
            else:
                G_LOGGER.verbose("{:} were provided directly instead of via a Callable. This loader will not assume ownership. "
                               "Please ensure that they are freed.".format(provided))

            network_log_mode = "full" if G_LOGGER.severity <= G_LOGGER.ULTRA_VERBOSE else "attrs"
            G_LOGGER.super_verbose(lambda: ("Displaying TensorRT Network:\n" + trt_util.str_from_network(network, mode=network_log_mode)))

            config, _ = misc.try_call(self._config, builder, network)
            G_LOGGER.info("Building engine with configuration: {:}".format(trt_util.str_from_config(config)))
            engine = builder.build_engine(network, config)
            if not engine:
                G_LOGGER.critical("Invalid Engine. Please ensure the engine was built correctly")

            if hasattr(config.int8_calibrator, "free"):
                config.int8_calibrator.free()

            return engine
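
The `misc.try_call` helper above implements an ownership convention: if the argument is a callable, the loader invokes it and takes ownership of the result; otherwise the object is used as-is and the original owner remains responsible for freeing it. Below is a minimal, self-contained sketch of that convention (not the actual `polygraphy.util.misc` implementation):

def try_call(obj, *args, **kwargs):
    """Returns (result, owning): calls obj if it is callable, otherwise passes it through."""
    if callable(obj):
        return obj(*args, **kwargs), True   # Created here, so the caller of try_call owns it.
    return obj, False                       # Provided directly; the original owner keeps responsibility.


# Both usage styles are accepted by loaders that follow this convention:
make_config = lambda: {"max_workspace_size": 1 << 30}
cfg, owning = try_call(make_config)        # -> ({'max_workspace_size': ...}, True)
cfg2, owning2 = try_call({"fp16": True})   # -> ({'fp16': True}, False)
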
Example #2
 def _check_type(self, elem):
     if not isinstance(elem, self.elem_type):
         G_LOGGER.critical(
             "Unsupported element type type in {:}. Element: {:} is type: {:} but type: {:} was expected"
             .format(
                 type(self).__name__, repr(elem),
                 type(elem).__name__, self.elem_type.__name__))
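
For context, here is a sketch of how a `_check_type` hook like the one above is typically wired into a typed container. `TypedList` and its constructor are assumptions for illustration (not Polygraphy classes), and the sketch raises `TypeError` instead of calling `G_LOGGER.critical`:

class TypedList(list):
    """A list that only accepts elements of a single type."""
    def __init__(self, elem_type, iterable=()):
        self.elem_type = elem_type
        super().__init__()
        for elem in iterable:
            self.append(elem)

    def _check_type(self, elem):
        if not isinstance(elem, self.elem_type):
            raise TypeError("TypedList expects {}, got {} ({!r})".format(
                self.elem_type.__name__, type(elem).__name__, elem))

    def append(self, elem):
        self._check_type(elem)  # Validate before storing.
        super().append(elem)

nums = TypedList(int, [1, 2, 3])
nums.append(4)        # OK
# nums.append("5")    # Would raise TypeError
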
Example #3
    def add_onnx_loader(self, script, disable_outputs=None, suffix=None):
        if self.model_args.model_type == "onnx":
            script.add_import(imports=["OnnxFromPath"],
                              frm="polygraphy.backend.onnx")
            loader_str = Script.invoke("OnnxFromPath",
                                       self.model_args.model_file)
            loader_name = script.add_loader(loader_str,
                                            "load_onnx",
                                            suffix=suffix)
        else:
            if self.tf2onnx_loader_args is None:
                G_LOGGER.critical(
                    "Could not load: {:}. Is it an ONNX model?".format(
                        self.model_args.model_file))
            loader_name = self.tf2onnx_loader_args.add_to_script(script)

        modify_onnx_str = self._get_modify_onnx_str(
            script, loader_name, disable_outputs=disable_outputs)
        if modify_onnx_str is not None:
            loader_name = script.add_loader(modify_onnx_str, "modify_onnx")

        SAVE_ONNX = "SaveOnnx"
        save_onnx_str = Script.invoke(SAVE_ONNX,
                                      loader_name,
                                      path=self.save_onnx)
        if save_onnx_str != Script.invoke(SAVE_ONNX, loader_name):
            script.add_import(imports=[SAVE_ONNX],
                              frm="polygraphy.backend.onnx")
            loader_name = script.add_loader(save_onnx_str, "save_onnx")

        return loader_name
Example #4
    def infer(self, feed_dict):
        start = time.time()
        [
            self.input_buffers[name].device.copy_from(buffer, self.stream)
            for name, buffer in feed_dict.items()
        ]
        # We will not run with smaller batch sizes than whatever the builder chose.
        bindings = [
            buf.device.address() for buf in self.input_buffers.values()
        ] + [buf.device.address() for buf in self.output_buffers.values()]
        status = self.context.execute_async(
            batch_size=self.context.engine.max_batch_size,
            bindings=bindings,
            stream_handle=self.stream.address())
        if not status:
            G_LOGGER.critical(
                "Model execution failed. Please see the log messages above for details"
            )

        for out in self.output_buffers.values():
            out.host = out.device.copy_to(out.host, self.stream)

        self.stream.synchronize()
        end = time.time()

        out_dict = OrderedDict()
        for (name, out) in self.output_buffers.items():
            out_dict[name] = out.host
        self.inference_time = end - start
        return out_dict
Example #5
    def __call__(self):
        uff_model, input_names, input_shapes, output_names = self.uff_loader()

        builder = trt.Builder(TRT_LOGGER)
        network = builder.create_network()
        parser = trt.UffParser()
        # Input names should come from the converter, as a preprocessing script may have been applied to the frozen model.
        for name, shape in zip(input_names, input_shapes):
            # Default order is NCHW, only set to NHWC if we're reasonably certain that it is.
            input_order = self.uff_order
            if not self.uff_order:
                input_order = trt.UffInputOrder.NCHW
                if FormatManager.determine_format(shape) == DataFormat.NHWC:
                    input_order = trt.UffInputOrder.NHWC
            shape = shape[1:]
            G_LOGGER.verbose(
                "Registering UFF input: {:} with shape: {:} and input order: {:}"
                .format(name, shape, input_order))
            parser.register_input(name, shape, input_order)

        if output_names and output_names != constants.MARK_ALL:
            for name in output_names:
                G_LOGGER.verbose("Registering UFF output: " + str(name))
                parser.register_output(name)

        G_LOGGER.info(
            "Parsing UFF model with inputs: {:} and outputs: {:}".format(
                input_names, output_names))
        success = parser.parse_buffer(uff_model, network)
        if not success:
            G_LOGGER.critical("Could not parse UFF correctly")
        return builder, network, parser, input_shapes[0][0]
Example #6
    def infer_impl(self, feed_dict):
        start_binding, _ = self.set_shapes_from_feed_dict(feed_dict)

        start = time.time()

        for name, buffer in feed_dict.items():
            self.device_buffers[name].copy_from(buffer, self.stream)

        # Need to offset bindings in case the active profile is not 0.
        status = self.context.execute_async_v2(
            bindings=[0] * start_binding +
            [buf.address() for buf in self.device_buffers.values()],
            stream_handle=self.stream.address())
        if not status:
            G_LOGGER.critical(
                "Model execution failed. Please see the log messages above for details"
            )

        for name, buffer in self.host_output_buffers.items():
            self.host_output_buffers[name] = self.device_buffers[name].copy_to(
                buffer, self.stream)

        self.stream.synchronize()

        end = time.time()
        self.inference_time = end - start

        return self.host_output_buffers
Example #7
    def set_shapes_from_feed_dict(self, feed_dict):
        """
        Sets context shapes according to the provided feed_dict, then resizes
        buffers as needed.

        Args:
            feed_dict (OrderedDict[str, numpy.ndarray]): A mapping of input tensor names to corresponding input NumPy arrays.

        Returns:
            Tuple[int, int]: The start and end binding indices of the modified bindings.
        """
        def is_dynamic_shape_input(binding):
            try:
                self.context.engine.get_profile_shape_input(0, binding)
                return True
            except RuntimeError:
                return False

        start_binding, end_binding = trt_util.get_active_profile_bindings(
            self.context)
        for name, inp in feed_dict.items():
            binding = start_binding + self.context.engine[name]
            shape = inp.shape
            # Only set shapes if required.
            # get_shape/get_binding_shape will return what a shape input/data input is currently set to.
            if is_dynamic_shape_input(binding):  # For input shape tensors
                G_LOGGER.verbose(
                    "Setting shape binding: {:} (index: {:}) to: {:}".format(
                        name, binding, inp))
                if tuple(self.context.get_shape(binding)) != tuple(inp):
                    self.context.set_shape_input(binding, inp)

            elif misc.is_shape_dynamic(
                    self.context.engine.get_binding_shape(binding)):
                G_LOGGER.verbose(
                    "Setting binding: {:} (index: {:}) to shape: {:}".format(
                        name, binding, shape))
                if tuple(self.context.get_binding_shape(binding)) != tuple(
                        shape):
                    self.context.set_binding_shape(binding, shape)

        if not self.context.all_binding_shapes_specified:
            G_LOGGER.critical(
                "Some input shapes were not specified.\nNote: Network inputs are: {:}"
                .format(self.get_input_metadata()))
        if not self.context.all_shape_inputs_specified:
            G_LOGGER.critical(
                "Some shape inputs were not specified.\nNote: Network inputs are: {:}"
                .format(self.get_input_metadata()))

        # Resize device buffers - host buffers will be automatically resized by copy_to
        for binding in range(start_binding, end_binding):
            name = self.context.engine[
                binding -
                start_binding]  # Use profile 0 binding names for all buffers.
            shape = tuple(self.context.get_binding_shape(binding))
            self.device_buffers[name].resize(shape)

        return start_binding, end_binding
Example #8
 def parse_dtype(dtype):
     if dtype is not None:
         if dtype not in misc.NP_TYPE_FROM_STR:
             G_LOGGER.critical(
                 "Could not understand data type: {:}. Please use one of: {:} or `auto`"
                 .format(dtype, list(misc.NP_TYPE_FROM_STR.keys())))
         dtype = misc.NP_TYPE_FROM_STR[dtype]
     return dtype
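
A self-contained sketch of the string-to-NumPy-dtype table that `misc.NP_TYPE_FROM_STR` represents; the exact keys in Polygraphy may differ, and the sketch raises `ValueError` in place of `G_LOGGER.critical`:

import numpy as np

NP_TYPE_FROM_STR = {
    "float32": np.float32,
    "float16": np.float16,
    "int32": np.int32,
    "int8": np.int8,
    "bool": np.bool_,
}

def parse_dtype(dtype):
    """Maps a dtype string to a NumPy type; None (i.e. `auto`) passes through."""
    if dtype is not None:
        if dtype not in NP_TYPE_FROM_STR:
            raise ValueError("Could not understand data type: {}. Use one of: {} or `auto`".format(
                dtype, list(NP_TYPE_FROM_STR)))
        dtype = NP_TYPE_FROM_STR[dtype]
    return dtype

assert parse_dtype("float32") is np.float32
assert parse_dtype(None) is None
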
Example #9
def parse_profile_shapes(default_shapes, min_args, opt_args, max_args):
    """
    Parses TensorRT profile options from command-line arguments.

    Args:
        default_shapes (TensorMetadata): The inference input shapes.

    Returns:
            List[Tuple[OrderedDict[str, Shape]]]:
                    A list of profiles, with each profile comprised of three dictionaries
                    (min, opt, max) mapping input names to shapes.
    """
    def get_shapes(lst, idx):
        nonlocal default_shapes
        default_shapes = copy.copy(default_shapes)
        if idx < len(lst):
            default_shapes.update(parse_meta(lst[idx], includes_dtype=False))

        # Don't care about dtype, and need to override dynamic dimensions
        shapes = {
            name: misc.override_dynamic_shape(shape)
            for name, (_, shape) in default_shapes.items()
        }

        for name, (_, shape) in default_shapes.items():
            if tuple(shapes[name]) != tuple(shape):
                G_LOGGER.warning(
                    "Input tensor: {:} | For TensorRT profile, overriding shape: {:} to: {:}"
                    .format(name, shape, shapes[name]),
                    mode=LogMode.ONCE)

        return shapes

    num_profiles = max(len(min_args), len(opt_args), len(max_args))

    # For cases where input shapes are provided, we have to generate a profile
    if not num_profiles and default_shapes:
        num_profiles = 1

    profiles = []
    for idx in range(num_profiles):
        min_shapes = get_shapes(min_args, idx)
        opt_shapes = get_shapes(opt_args, idx)
        max_shapes = get_shapes(max_args, idx)
        if sorted(min_shapes.keys()) != sorted(opt_shapes.keys()):
            G_LOGGER.critical(
                "Mismatch in input names between minimum shapes ({:}) and optimum shapes "
                "({:})".format(list(min_shapes.keys()),
                               list(opt_shapes.keys())))
        elif sorted(opt_shapes.keys()) != sorted(max_shapes.keys()):
            G_LOGGER.critical(
                "Mismatch in input names between optimum shapes ({:}) and maximum shapes "
                "({:})".format(list(opt_shapes.keys()),
                               list(max_shapes.keys())))

        profiles.append((min_shapes, opt_shapes, max_shapes))
    return profiles
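
To make the documented return type concrete, here is an illustrative (hypothetical) value of `profiles` for a single profile over one input named `x`; each profile is a `(min, opt, max)` tuple of name-to-shape mappings:

from collections import OrderedDict

profiles = [
    (
        OrderedDict(x=(1, 3, 224, 224)),   # min shapes
        OrderedDict(x=(4, 3, 224, 224)),   # opt shapes
        OrderedDict(x=(8, 3, 224, 224)),   # max shapes
    ),
]
min_shapes, opt_shapes, max_shapes = profiles[0]
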
Example #10
    def __getitem__(self, key):
        if isinstance(key, int):
            return super().__getitem__(key)

        for name, iteration_results in self:
            if name == key:
                return iteration_results

        G_LOGGER.critical(
            "Runner: {:} does not exist in this RunResults instance. Note: Available runners: {:}"
            .format(key, list(self.keys())))
Example #11
                    def get_tol(tol_dict):
                        if isinstance(tol_dict, numbers.Number):
                            return tol_dict

                        if out0_name in tol_dict:
                            return tol_dict[out0_name]
                        elif "" in tol_dict:
                            return tol_dict[""]

                        G_LOGGER.critical("Could not find a tolerance for output: '{:}' in the provided tolerance map: {:}.\n"
                                          "Note: Use a key of `""` in the map to specify a default tolerance.".format(out0_name, tol_dict))
Example #12
 def pop_meta(name):
     nonlocal tensor_meta_arg
     tensor_meta_arg, _, val = tensor_meta_arg.rpartition(SEP)
     if not tensor_meta_arg:
         G_LOGGER.critical(
             "Could not parse {:} from argument: {:}. Is it separated by a comma "
             "(,) from the tensor name?".format(name,
                                                orig_tensor_meta_arg))
     if val.lower() == "auto":
         val = None
     return val
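
A runnable sketch of the `rpartition`-based parsing shown above: fields are peeled off from the right, so the tensor name keeps any leading text intact. The `name,shape,dtype` layout and the simplified error handling are assumptions for illustration:

SEP = ","

def parse_tensor_meta_arg(tensor_meta_arg):
    """Parses 'name,shape,dtype' right to left, mapping 'auto' to None."""
    def pop_meta():
        nonlocal tensor_meta_arg
        tensor_meta_arg, _, val = tensor_meta_arg.rpartition(SEP)
        return None if val.lower() == "auto" else val

    dtype = pop_meta()
    shape = pop_meta()
    return tensor_meta_arg, shape, dtype  # Whatever remains is the tensor name.

print(parse_tensor_meta_arg("x,1x3x224x224,float32"))  # -> ('x', '1x3x224x224', 'float32')
print(parse_tensor_meta_arg("x,1x3x224x224,auto"))     # -> ('x', '1x3x224x224', None)
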
Example #13
    def __getitem__(self, key):
        """
        Retrieves the shapes registered for a given input name.

        Returns:
            ShapeTuple:
                    A named tuple including ``min``, ``opt``, and ``max`` members for the shapes
                    corresponding to the input.
        """
        if key not in self:
            G_LOGGER.critical("Binding: {:} does not have shapes set in this profile".format(key))
        return super().__getitem__(key)
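
A minimal sketch of the `Profile`/`ShapeTuple` pattern this docstring describes; the `add` helper and the use of `KeyError` instead of `G_LOGGER.critical` are assumptions for illustration:

from collections import OrderedDict, namedtuple

ShapeTuple = namedtuple("ShapeTuple", ["min", "opt", "max"])

class Profile(OrderedDict):
    def add(self, name, min, opt, max):
        self[name] = ShapeTuple(min, opt, max)
        return self

    def __getitem__(self, key):
        if key not in self:  # __contains__ does not recurse into __getitem__
            raise KeyError("Binding: {:} does not have shapes set in this profile".format(key))
        return super().__getitem__(key)

profile = Profile().add("x", min=(1, 3, 224, 224), opt=(4, 3, 224, 224), max=(8, 3, 224, 224))
print(profile["x"].opt)  # -> (4, 3, 224, 224)
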
Example #14
 def _check_types(self, key, val):
     if not isinstance(key, self.key_type):
         G_LOGGER.critical(
             "Unsupported key type in {:}. Key: {:} is type `{:}` but {:} expects type `{:}`"
             .format(self, repr(key),
                     type(key).__name__,
                     type(self).__name__, self.key_type.__name__))
     if not isinstance(val, self.value_type):
         G_LOGGER.critical(
             "Unsupported value type in {:}. Value: {:} for key: {:} is type `{:}` but {:} expects type `{:}`"
             .format(self, repr(val), repr(key),
                     type(val).__name__,
                     type(self).__name__, self.value_type.__name__))
Example #15
    def __call__(self):
        """
        Deserializes an engine from a buffer.

        Returns:
            trt.ICudaEngine: The deserialized engine.
        """
        buffer, _ = misc.try_call(self._serialized_engine)

        trt.init_libnvinfer_plugins(trt_util.TRT_LOGGER, "")
        with trt.Runtime(trt_util.TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(buffer)
            if not engine:
                G_LOGGER.critical("Could not load engine")
        return engine
Example #16
 def get_static_shape(name, shape):
     static_shape = shape
     if misc.is_shape_dynamic(shape):
         static_shape = misc.override_dynamic_shape(shape)
         if static_shape != shape and name not in self.user_input_metadata:
             if not misc.is_valid_shape_override(static_shape, shape):
                 G_LOGGER.critical(
                     "Input tensor: {:24} | Cannot override original shape: {:} to {:}"
                     .format(name, shape, static_shape))
             G_LOGGER.warning(
                 "Input tensor: {:24} | Adjusted shape: {:} to: {:}. If this is incorrect, please set input_metadata "
                 "or provide a custom data loader.".format(
                     name, shape, static_shape),
                 mode=LogMode.ONCE)
     return static_shape
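
A minimal sketch of what "overriding a dynamic shape" means in the helpers used above: dimensions that are unknown (None or negative) are replaced with a concrete value so buffers can be allocated. The default of 1 is an assumption for illustration, not necessarily what `misc.override_dynamic_shape` chooses:

def is_shape_dynamic(shape):
    return any(dim is None or dim < 0 for dim in shape)

def override_dynamic_shape(shape, default=1):
    return tuple(default if (dim is None or dim < 0) else dim for dim in shape)

assert is_shape_dynamic((-1, 3, 224, 224))
assert override_dynamic_shape((-1, 3, 224, 224)) == (1, 3, 224, 224)
assert not is_shape_dynamic((8, 3, 224, 224))
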
Example #17
    def activate_impl(self):
        def make_buffers(engine):
            """
            Creates empty host and device buffers for the specified engine.
            Always uses binding names from Profile 0.
            """
            device_buffers = OrderedDict()
            host_output_buffers = OrderedDict()

            for idx in range(trt_util.get_bindings_per_profile(engine)):
                binding = engine[idx]
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                device_buffers[binding] = cuda.DeviceBuffer(dtype=dtype)
                if not engine.binding_is_input(binding):
                    host_output_buffers[binding] = np.empty(shape=tuple(),
                                                            dtype=dtype)
            G_LOGGER.extra_verbose(
                "Created device buffers: {:}".format(device_buffers))
            return device_buffers, host_output_buffers

        engine_or_context, owning = misc.try_call(self._engine_or_context)

        self.engine, self.owns_engine = None, False
        self.context, self.owns_context = None, False

        if isinstance(engine_or_context, trt.ICudaEngine):
            self.engine = engine_or_context
            self.owns_engine = owning
            self.context = self.engine.create_execution_context()
            if not self.context:
                G_LOGGER.critical(
                    "Invalid Context. See error log for details.")
        elif isinstance(engine_or_context, trt.IExecutionContext):
            self.context = engine_or_context
            self.owns_context = owning
        else:
            G_LOGGER.critical(
                "Invalid Engine or Context. Please ensure the engine was built correctly. See error log for details."
            )

        if not owning:
            G_LOGGER.verbose(
                "Object was provided directly instead of via a Callable. This runner will not assume ownership. "
                "Please ensure it is freed.")

        self.device_buffers, self.host_output_buffers = make_buffers(
            self.context.engine)
        self.stream = cuda.Stream()
Example #18
    def activate_impl(self):
        # If engine is a callable, then we own the engine
        self.engine, self.owning = misc.try_call(self._engine)

        if not self.engine:
            G_LOGGER.critical(
                "Invalid Engine. Please ensure the engine was built correctly")

        if not self.owning:
            G_LOGGER.verbose(
                "Engine was provided directly instead of via a Callable. This runner will not assume ownership. "
                "Please ensure the engine is freed.")

        self.buffers = Buffers.from_engine(self.engine)
        self.stream = cuda.Stream()

        self.context = self.engine.create_execution_context()
Example #19
    def __init__(self, deploy, model, outputs, batch_size=None, dtype=None):
        self.deploy = deploy
        self.model = model
        if not self.model:
            G_LOGGER.warning(
                "No model file provided for Caffe model, random weights will be used. To avoid this, "
                "please set the model paramater, or --model")

        if not outputs:
            G_LOGGER.critical(
                "Please set Caffe model outputs using the outputs parameter, or --trt-outputs. "
                "Note: To determine possible outputs, try running: tail -n50 {:}"
                .format(deploy))

        self.outputs = outputs
        self.dtype = misc.default_value(dtype, trt.float32)
        self.batch_size = misc.default_value(batch_size, 1)
Example #20
    def __call__(self):
        """
        Creates an empty TensorRT network.

        Returns:
            (trt.Builder, trt.INetworkDefinition): The builder and empty network.
        """
        builder = trt.Builder(trt_util.TRT_LOGGER)
        network_flags = 0
        if self.explicit_batch:
            network_flags |= 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        if self.explicit_precision:
            network_flags |= 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_PRECISION)
        network = builder.create_network(flags=network_flags)
        if network is None:
            G_LOGGER.critical("Invalid network. See logging output above for details.")
        return builder, network
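
The flag handling above is plain bit arithmetic; this standalone sketch reproduces the same pattern without TensorRT installed. The numeric enum values are stand-ins, not the real `trt.NetworkDefinitionCreationFlag` values:

EXPLICIT_BATCH = 0      # stand-in for int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
EXPLICIT_PRECISION = 1  # stand-in for int(trt.NetworkDefinitionCreationFlag.EXPLICIT_PRECISION)

def network_flags(explicit_batch=False, explicit_precision=False):
    flags = 0
    if explicit_batch:
        flags |= 1 << EXPLICIT_BATCH
    if explicit_precision:
        flags |= 1 << EXPLICIT_PRECISION
    return flags

assert network_flags(explicit_batch=True) == 0b01
assert network_flags(explicit_batch=True, explicit_precision=True) == 0b11
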
Example #21
        def determine_model_type():
            if tools_util.get(args, "model_type") is not None:
                return args.model_type.lower()

            if tools_util.get(args, "model_file") is None:
                return None

            def use_ext(ext_mapping):
                file_ext = os.path.splitext(args.model_file)[-1]
                if file_ext in ext_mapping:
                    return ext_mapping[file_ext]

            runners = misc.default_value(tools_util.get(args, "runners"), [])
            if tools_util.get(args, "ckpt") or os.path.isdir(args.model_file):
                return "ckpt"
            elif "tf" in runners or "trt_legacy" in runners:
                if args.caffe_model:
                    return "caffe"
                ext_mapping = {
                    ".hdf5": "keras",
                    ".uff": "uff",
                    ".prototxt": "caffe",
                    ".onnx": "onnx",
                    ".engine": "engine",
                    ".plan": "engine"
                }
                return use_ext(ext_mapping) or "frozen"
            else:
                # When no framework is provided, some extensions can be ambiguous
                ext_mapping = {
                    ".hdf5": "keras",
                    ".graphdef": "frozen",
                    ".onnx": "onnx",
                    ".uff": "uff",
                    ".engine": "engine",
                    ".plan": "engine"
                }
                model_type = use_ext(ext_mapping)
                if model_type:
                    return model_type

            G_LOGGER.critical(
                "Could not automatically determine model type for: {:}\n"
                "Please explicitly specify the type with the --model-type option"
                .format(args.model_file))
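
A standalone sketch of the extension-based fallback used above; the mapping is copied from the example, and `os.path.splitext` keeps the leading dot, which is why the keys include it:

import os

EXT_MAPPING = {
    ".hdf5": "keras",
    ".graphdef": "frozen",
    ".onnx": "onnx",
    ".uff": "uff",
    ".engine": "engine",
    ".plan": "engine",
}

def model_type_from_path(model_file):
    return EXT_MAPPING.get(os.path.splitext(model_file)[-1])

assert model_type_from_path("model.onnx") == "onnx"
assert model_type_from_path("model.plan") == "engine"
assert model_type_from_path("model.xyz") is None  # Caller must then specify --model-type explicitly.
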
Example #22
def parse_profile_shapes(shapes, min_args, opt_args, max_args):
    def get_shapes(lst, idx):
        default_shapes = copy.copy(shapes)
        if idx < len(lst):
            default_shapes.update(parse_meta(lst[idx], includes_dtype=False))
        # Don't care about dtype, and need to override dynamic dimensions
        default_shapes = {
            name: misc.override_dynamic_shape(shape)
            for name, (_, shape) in default_shapes.items()
        }

        for name, (_, shape) in shapes.items():
            if tuple(default_shapes[name]) != tuple(shape):
                G_LOGGER.warning(
                    "Input tensor: {:} | For TensorRT profile, overriding shape: {:} to: {:}"
                    .format(name, shape, default_shapes[name]),
                    mode=LogMode.ONCE)

        return default_shapes

    num_profiles = max(len(min_args), len(opt_args), len(max_args))

    # For cases where input shapes are provided, we have to generate a profile
    if not num_profiles and shapes:
        num_profiles = 1

    profiles = []
    for idx in range(num_profiles):
        min_shapes = get_shapes(min_args, idx)
        opt_shapes = get_shapes(opt_args, idx)
        max_shapes = get_shapes(max_args, idx)
        if sorted(min_shapes.keys()) != sorted(opt_shapes.keys()):
            G_LOGGER.critical(
                "Mismatch in input names between minimum shapes ({:}) and optimum shapes "
                "({:})".format(list(min_shapes.keys()),
                               list(opt_shapes.keys())))
        elif sorted(opt_shapes.keys()) != sorted(max_shapes.keys()):
            G_LOGGER.critical(
                "Mismatch in input names between optimum shapes ({:}) and maximum shapes "
                "({:})".format(list(opt_shapes.keys()),
                               list(max_shapes.keys())))

        profiles.append((min_shapes, opt_shapes, max_shapes))
    return profiles
Example #23
    def __call__(self):
        """
        Loads a TensorFlow model from a checkpoint.

        Returns:
            Tuple[tf.Graph, Sequence[str]]: The TensorFlow graph, and the names of its outputs.
        """
        # If `name` is not provided, this expects that the directory contains a `checkpoint` file with the contents:
        #
        # model_checkpoint_path: "model"
        # all_model_checkpoint_paths: "model"
        #
        # where "model" is the checkpoint name
        if self.name is None:
            G_LOGGER.verbose(
                "Checkpoint name was not explicitly provided, searching for `checkpoint` file"
            )
            checkpoint = tf.train.get_checkpoint_state(self.dir)
            if checkpoint is None:
                ckpt_file_contents = '\nmodel_checkpoint_path: "model"\nall_model_checkpoint_paths: "model"\n'
                G_LOGGER.critical(
                    "Checkpoint directory: {:} does not contain a `checkpoint` file, and the checkpoint name was"
                    "not provided. Please either create a checkpoint file with the contents:\n{:}"
                    "\nWhere `model` is the name of the checkpoint, or explicitly provide the name with"
                    "--ckpt, not including file extensions".format(
                        self.dir, ckpt_file_contents))
            input_checkpoint = checkpoint.model_checkpoint_path
        else:
            input_checkpoint = os.path.join(self.dir, self.name)

        meta_file = input_checkpoint + '.meta'
        with tf.Graph().as_default() as graph, tf.compat.v1.Session(
                graph=graph).as_default() as sess:
            saver = tf.compat.v1.train.import_meta_graph(meta_file,
                                                         clear_devices=True)
            saver.restore(sess, input_checkpoint)
            return graph, tf_util.get_graph_output_names(graph)
Example #24
    def infer(self, feed_dict):
        def is_dynamic_shape_input(binding):
            try:
                self.engine.get_profile_shape_input(0, binding)
                return True
            except RuntimeError:
                return False

        start_binding, end_binding = trt_util.get_active_profile_bindings(
            self.engine, self.context)
        for name, inp in feed_dict.items():
            binding = start_binding + self.engine[name]
            shape = inp.shape
            # Only set shapes if required.
            # get_shape/get_binding_shape will return what a shape input/data input is currently set to.
            if is_dynamic_shape_input(binding):
                G_LOGGER.verbose(
                    "Setting shape binding: {:} (index: {:}) to: {:}".format(
                        name, binding, inp))
                if tuple(self.context.get_shape(binding)) != tuple(inp):
                    self.context.set_shape_input(binding, inp)

            elif misc.is_shape_dynamic(self.engine.get_binding_shape(binding)):
                G_LOGGER.verbose(
                    "Setting binding: {:} (index: {:}) to shape: {:}".format(
                        name, binding, shape))
                if tuple(self.context.get_binding_shape(binding)) != tuple(
                        shape):
                    self.context.set_binding_shape(binding, shape)

        if not self.context.all_binding_shapes_specified:
            G_LOGGER.critical(
                "Some input shapes were not specified.\nNote: Network inputs are: {:}"
                .format(self.get_input_metadata()))
        if not self.context.all_shape_inputs_specified:
            G_LOGGER.critical(
                "Some shape inputs were not specified.\nNote: Network inputs are: {:}"
                .format(self.get_input_metadata()))

        # Inference
        # Need to resize output buffers
        self.buffers.resize(self.engine,
                            self.context,
                            start_binding=start_binding,
                            end_binding=end_binding)

        start = time.time()
        self.buffers.copy_inputs(feed_dict, self.stream)
        # Need to offset bindings in case the active profile is not 0.
        status = self.context.execute_async_v2(
            bindings=[0] * start_binding + self.buffers.bindings(),
            stream_handle=self.stream.address())
        if not status:
            G_LOGGER.critical(
                "Model execution failed. Please see the log messages above for details"
            )

        self.buffers.copy_outputs(self.stream)
        self.stream.synchronize()
        end = time.time()

        self.inference_time = end - start
        return self.buffers.outputs
Example #25
 def check(self, status):
     if status != 0:
         G_LOGGER.critical(
             "CUDA Error: {:}. To figure out what this means, refer to "
             "https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038"
             .format(status))
Example #26
 def __init__(self):
     self.handle = ctypes.CDLL("libcudart.so")
     if not self.handle:
         G_LOGGER.critical(
             "Could not load the CUDA runtime library. Is it on your loader path?"
         )
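
Two notes on the wrapper above, illustrated in the hedged sketch below: `ctypes.CDLL` raises `OSError` when the library cannot be found rather than returning a falsy handle, so a try/except is the reliable check; and once loaded, runtime calls return a `cudaError_t` status that should go through a check like the one in the previous example. `cudaRuntimeGetVersion(int*)` is a public CUDA runtime symbol; everything else here is an assumption for illustration:

import ctypes

try:
    handle = ctypes.CDLL("libcudart.so")   # Raises OSError if not on the loader path.
except OSError:
    handle = None

def check(status):
    if status != 0:
        raise RuntimeError("CUDA Error: {:}".format(status))

if handle is not None:
    version = ctypes.c_int(0)
    check(handle.cudaRuntimeGetVersion(ctypes.byref(version)))
    print("CUDA runtime version:", version.value)
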
Example #27
    def activate_impl(self):
        """
        Vars:
            engine (trt.ICudaEngine):
                    The engine tracked by this runner. The TrtLegacyRunner OWNS the engine it
                    manages, and therefore is responsible for its destruction. Do not free the engine outside of the
                    runner, or it will result in a double free.
            context (trt.IExecutionContext): The context used for inference.
            input_buffers (Dict[str, TrtLegacyRunner.HostDeviceMem]):
                    A mapping of binding names to HostDeviceMem objects for input buffers.
            output_buffers (Dict[str, TrtLegacyRunner.HostDeviceMem]):
                    A mapping of binding names to HostDeviceMem objects for output buffers.
            bindings (List[int]): A list of device pointers for engine bindings.
            stream (cuda.Stream): The CUDA stream that this runner will use for inference.
        """

        # Only initialize GPU after this runner is activated.
        # Allocates all buffers required for an engine, i.e. host/device input_buffers/output_buffers.
        def allocate_buffers(engine):
            input_buffers = OrderedDict()
            output_buffers = OrderedDict()
            bindings = []
            stream = cuda.Stream()
            G_LOGGER.verbose("Using batch size: " +
                             str(engine.max_batch_size) +
                             " during buffer allocation")
            for binding in engine:
                shape = (engine.max_batch_size, ) + tuple(
                    engine.get_binding_shape(binding))
                dtype = engine.get_binding_dtype(binding)

                device_mem = cuda.DeviceBuffer(shape=shape,
                                               dtype=trt.nptype(dtype))
                G_LOGGER.extra_verbose("Tensor: "
                                       "{:40} | Allocated: {:}".format(
                                           binding, device_mem))

                if engine.binding_is_input(binding):
                    input_buffers[binding] = TrtLegacyRunner.HostDeviceMem(
                        None, device_mem)
                else:
                    host_mem = np.empty(shape=shape, dtype=trt.nptype(dtype))
                    output_buffers[binding] = TrtLegacyRunner.HostDeviceMem(
                        host_mem, device_mem)
            return input_buffers, output_buffers, stream

        # Always try reading the engine first, or, failing that, build it.
        if self.load_engine:
            with open(self.load_engine,
                      "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
                G_LOGGER.info("Reading engine from {:}".format(
                    self.load_engine))
                self.engine = runtime.deserialize_cuda_engine(f.read())
        else:
            trt.init_libnvinfer_plugins(TRT_LOGGER, "")
            builder, network, parser, model_batch_size = self.network_loader()
            with builder, network, parser:
                builder.max_batch_size = int(self.max_batch_size
                                             or model_batch_size or 1)

                config = builder.create_builder_config()
                config.max_workspace_size = int(self.max_workspace_size)

                if not self.tf32:
                    with contextlib.suppress(AttributeError):
                        config.clear_flag(trt.BuilderFlag.TF32)
                if self.fp16:
                    config.flags = 1 << int(trt.BuilderFlag.FP16)

                if not network:
                    G_LOGGER.critical("Invalid network")
                G_LOGGER.super_verbose(lambda: trt_util.str_from_network(
                    network) or "Finished logging network")

                if self.layerwise:
                    # In layerwise mode, every layer becomes an output.
                    G_LOGGER.info(
                        "Running in layerwise mode. Marking {:} layers as outputs"
                        .format(network.num_layers))
                    for layer in network:
                        for index in range(layer.num_outputs):
                            out = layer.get_output(index)
                            if not out.is_network_output:
                                network.mark_output(out)

                G_LOGGER.info(
                    "Building engine: max workspace size={:} bytes, max batch size={:}, fp16={:}, "
                    "tf32={:}".format(builder.max_workspace_size,
                                      builder.max_batch_size, self.fp16,
                                      self.tf32))
                self.engine = builder.build_engine(network, config)

        if not self.engine:
            G_LOGGER.critical(
                "Invalid Engine. Please ensure the engine was built correctly")

        if self.engine_path:
            with open(self.engine_path, "wb") as f:
                G_LOGGER.info("Writing engine to {:}".format(self.engine_path))
                f.write(self.engine.serialize())

        self.context = self.engine.create_execution_context()
        self.input_buffers, self.output_buffers, self.stream = allocate_buffers(
            self.engine)
Example #28
    def __getitem__(self, iteration):
        """
        Load the specified iteration from the cache if present, or generate using the data loader.

        Args:
            iteration (int): The iteration whose data to retrieve.
        """
        if iteration > len(self.cache):
            raise IndexError()

        # Attempts to match existing input buffers to the requested input_metadata
        def coerce_cached_input(index, name, dtype, shape):
            cached_feed_dict = self.cache[iteration]
            cached_name = misc.find_in_dict(name, cached_feed_dict, index)
            assert cached_name is not None

            if cached_name != name:
                G_LOGGER.warning(
                    "Input tensor: {:24} | Cached buffer name ({:}) does not match input name ({:})."
                    .format(name, cached_name, name))

            buffer = cached_feed_dict[cached_name]

            if dtype != buffer.dtype:
                G_LOGGER.warning(
                    "Input tensor: {:24} | Cached buffer dtype ({:}) does not match input dtype ({:}), attempting cast. "
                    .format(name, buffer.dtype,
                            np.dtype(dtype).name))
                buffer = buffer.astype(dtype)

            if not misc.is_valid_shape_override(buffer.shape, shape):
                G_LOGGER.warning(
                    "Input tensor: {:24} | Cached buffer shape ({:}) does not match input shape ({:}), attempting reshape. "
                    .format(name, buffer.shape, shape))
                buffer = misc.try_match_shape(buffer, shape)

            assert buffer.dtype == dtype and misc.is_valid_shape_override(
                buffer.shape, shape)
            return buffer

        feed_dict = OrderedDict()

        # Reload from data loader if needed
        data_loader_feed_dict = None

        for index, (name, (dtype,
                           shape)) in enumerate(self.input_metadata.items()):
            try:
                buffer = coerce_cached_input(index, name, dtype, shape)
            except AssertionError:
                G_LOGGER.warning(
                    "Could not reuse input: {:} across runners. Attempting to reload "
                    "inputs from the data loader. Note that this will only work if the data loader "
                    "supports random access.".format(name))
                try:
                    if data_loader_feed_dict is None:
                        data_loader_feed_dict = self.data_loader[iteration]
                    buffer = data_loader_feed_dict[name]
                except:
                    G_LOGGER.critical(
                        "Could not reload inputs from data loader. Are the runners running the same model? "
                        "If not, please rewrite the data loader to support random access."
                    )
            feed_dict[name] = buffer

        return feed_dict
Example #29
    def run(runners,
            data_loader=None,
            warm_up=None,
            use_subprocess=None,
            subprocess_timeout=None,
            subprocess_polling_interval=None,
            save_inputs_path=None):
        """
        Runs the supplied runners sequentially.

        Args:
            runners (List[BaseRunner]):
                    A list of runners to run.
            data_loader (Generator -> OrderedDict[str, numpy.ndarray]):
                    A generator or iterable that yields a dictionary that maps input names to input numpy buffers.
                    In the simplest case, this can be a `List[Dict[str, numpy.ndarray]]` .

                    In case you don't know details about the inputs ahead of time, you can access the
                    `input_metadata` property in your data loader, which will be set to a `TensorMetadata`
                    instance by this function.
                    Note that this does not work for generators or lists.

                    The number of iterations run by this function is controlled by the number of items supplied
                    by the data loader.

                    Defaults to an instance of `DataLoader`.
            warm_up (int):
                    The number of warm up runs to perform for each runner before timing.
                    Defaults to 0.
            use_subprocess (bool):
                    Whether each runner should be run in a subprocess. This allows each runner to have exclusive
                    access to the GPU. When using a subprocess, runners and loaders will never be modified.
            subprocess_timeout (int):
                    The timeout before a subprocess is killed automatically. This is useful for handling processes
                    that never terminate. A value of None disables the timeout. Defaults to None.
            subprocess_polling_interval (int):
                    The polling interval, in seconds, for checking whether a subprocess has completed or crashed.
                    In rare cases, omitting this parameter when subprocesses are enabled may cause this function
                    to hang indefinitely if the subprocess crashes.
                    A value of 0 disables polling. Defaults to 30 seconds.
            save_inputs_path (str):
                    [EXPERIMENTAL] Path at which to save inputs used during inference. This will include all inputs generated by
                    the provided data_loader, and will be saved as a pickled List[Dict[str, numpy.ndarray]].

        Returns:
            RunResults:
                    A mapping of runner names to the results of their inference.
                    The ordering of `runners` is preserved in this mapping.
        """
        warm_up = misc.default_value(warm_up, 0)
        data_loader = misc.default_value(data_loader, DataLoader())
        use_subprocess = misc.default_value(use_subprocess, False)
        subprocess_polling_interval = misc.default_value(
            subprocess_polling_interval, 30)
        loader_cache = DataLoaderCache(data_loader,
                                       save_inputs_path=save_inputs_path)

        def execute_runner(runner, loader_cache):
            with runner as active_runner:
                input_metadata = active_runner.get_input_metadata()
                G_LOGGER.info("Runner: {:40} | Input Metadata: {:}".format(
                    active_runner.name, input_metadata),
                              mode=LogMode.ONCE)
                # DataLoaderCache will ensure that the feed_dict does not contain any extra entries
                # based on the provided input_metadata.
                loader_cache.set_input_metadata(input_metadata)

                if warm_up:
                    G_LOGGER.start(
                        "Runner: {:40} | Running {:} warm-up runs".format(
                            active_runner.name, warm_up))
                    try:
                        feed_dict = loader_cache[0]
                    except IndexError:
                        G_LOGGER.warning(
                            "{:} warm-up runs were requested, but data loader did not supply any data. "
                            "Skipping warm-up runs".format(warm_up))
                    else:
                        G_LOGGER.ultra_verbose(
                            "Warm-up Input Buffers:\n{:}".format(
                                misc.indent_block(feed_dict)))
                        # First do a few warm-up runs, and don't time them.
                        for i in range(warm_up):
                            active_runner.infer(feed_dict=feed_dict)

                # Then, actual iterations.
                index = 0
                iteration_results = []
                output_metadata = TensorMetadata()

                for index, feed_dict in enumerate(loader_cache):
                    G_LOGGER.extra_verbose(
                        lambda: "Runner: {:40} | Feeding inputs:\n{:}".format(
                            active_runner.name, misc.indent_block(feed_dict)))
                    outputs = active_runner.infer(feed_dict=feed_dict)

                    runtime = active_runner.last_inference_time()
                    # Without a deep copy here, outputs will always reference the output of the last run
                    iteration_results.append(
                        IterationResult(outputs=copy.deepcopy(outputs),
                                        runtime=runtime,
                                        runner_name=active_runner.name))

                    if index == 0:
                        for name, out in outputs.items():
                            output_metadata.add(name, out.dtype, out.shape)

                    G_LOGGER.info(
                        "Runner: {:40} | Output Metadata: {:}".format(
                            active_runner.name, output_metadata),
                        mode=LogMode.ONCE)
                    G_LOGGER.extra_verbose(
                        lambda:
                        "Runner: {:40} | Inference Time: {:.3f} ms | Received outputs:\n{:}"
                        .format(active_runner.name, runtime * 1000.0,
                                misc.indent_block(outputs)))

                G_LOGGER.finish(
                    "Runner: {:40} | Completed {:} iterations.".format(
                        active_runner.name, index + 1))
                return iteration_results

        # Wraps execute_runner to use a queue.
        def execute_runner_with_queue(runner_queue, runner, loader_cache):
            iteration_results = None
            try:
                iteration_results = execute_runner(runner, loader_cache)
            except:
                # Cannot send the exception back, as it is not necessarily pickleable
                import traceback
                G_LOGGER.error(traceback.format_exc())
            misc.try_send_on_queue(runner_queue, iteration_results)
            # After finishing, send the updated loader_cache back.
            misc.try_send_on_queue(runner_queue, loader_cache)

        # Do all inferences in one loop, then comparisons at a later stage.
        # We run each runner in a separate process so that we can provide exclusive GPU access for each runner.
        run_results = RunResults()
        for runner in runners:
            G_LOGGER.start(
                "Runner: {:40} | Activating and starting inference".format(
                    runner.name))
            if use_subprocess:
                runner_queue = Queue()
                process = Process(target=execute_runner_with_queue,
                                  args=(runner_queue, runner, loader_cache))
                process.start()

                # If a subprocess hangs in a certain way, then process.join could block forever. Hence,
                # we need to keep polling the process to make sure it really is alive.
                iteration_results = None
                while process.is_alive() and iteration_results is None:
                    try:
                        iteration_results = misc.try_receive_on_queue(
                            runner_queue,
                            timeout=subprocess_polling_interval / 2)
                        # Receive updated loader cache, or fall back if it could not be sent.
                        loader_cache = misc.try_receive_on_queue(
                            runner_queue,
                            timeout=subprocess_polling_interval / 2)
                    except queue.Empty:
                        G_LOGGER.extra_verbose(
                            "Polled subprocess - still running")

                try:
                    assert iteration_results is not None
                    run_results.append((runner.name, iteration_results))
                    process.join(subprocess_timeout)
                except:
                    G_LOGGER.critical(
                        "Runner: {:40} | Terminated prematurely. Check the exception logged above. "
                        "If there is no exception logged above, make sure not to use the --use-subprocess "
                        "flag or set use_subprocess=False in Comparator.run()."
                        .format(runner.name))
                finally:
                    process.terminate()

                if loader_cache is None:
                    G_LOGGER.critical(
                        "Could not send data loader cache to runner subprocess. Please try disabling subprocesses "
                        "by removing the --use-subprocess flag, or setting use_subprocess=False in Comparator.run()"
                    )
            else:
                run_results.append(
                    (runner.name, execute_runner(runner, loader_cache)))

        G_LOGGER.verbose("Successfully ran: {:}".format(
            [r.name for r in runners]))
        return run_results
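
As noted in the docstring, the simplest `data_loader` is just a list of feed_dicts. The sketch below builds one for a hypothetical model with a single input named `x`; the input name, shape, and iteration count are assumptions for illustration:

import numpy as np

data_loader = [
    {"x": np.random.rand(1, 3, 224, 224).astype(np.float32)}
    for _ in range(5)   # Five inference iterations.
]

# run_results = run(runners, data_loader=data_loader, warm_up=2)
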
Example #30
    def run(self, args):
        if not self.arg_groups[OnnxSaveArgs].path and not args.min_good:
            G_LOGGER.critical(
                "--output (where to write the reduced model) and/or "
                "--min-good (where to write a reduced model that passes) must be provided!"
            )

        model = self.arg_groups[OnnxLoaderArgs].load_onnx()
        num_orig_nodes = len(model.graph.node)

        # When --model-input-shapes are set, we need to override the shapes in the model, and then run
        # shape inference to figure out the new shapes of intermediate tensors.
        user_input_metadata = self.arg_groups[ModelArgs].input_shapes
        if user_input_metadata:
            model = gs.export_onnx(
                tools_util.override_input_shapes(onnx_backend.gs_from_onnx(model), user_input_metadata)
            )
            if self.arg_groups[OnnxShapeInferenceArgs].do_shape_inference:
                model = onnx_backend.infer_shapes(model)

        # Lower Constant nodes into Constant tensors
        # If we don't do this, the outputs of Constant nodes may be incorrectly marked
        #   as variable inputs. Further, fallback shape inference does not apply to Constant nodes.
        GRAPH = onnx_util.lower_constant_nodes(onnx_backend.gs_from_onnx(model))

        _layerwise_outputs = None
        _layerwise_meta = None
        # Get metadata inferred by fallback shape inference. If fallback shape inference was
        # never run, then this function runs it.
        def layerwise(model, include_data=False):
            nonlocal _layerwise_outputs, _layerwise_meta
            if _layerwise_outputs is None or _layerwise_meta is None:
                G_LOGGER.info(
                    "Running inference with ONNX-Runtime to determine metadata for intermediate tensors.\n"
                    "This will cause intermediate models to have static shapes."
                )
                _layerwise_outputs, _layerwise_meta = self.arg_groups[OnnxShapeInferenceArgs].fallback_inference(model)
            return _layerwise_outputs if include_data else _layerwise_meta

        if self.arg_groups[OnnxShapeInferenceArgs].force_fallback:
            G_LOGGER.info("Freezing shapes in the model according to values determined by fallback shape inference")
            onnx_util.set_shapes_from_layerwise_meta(GRAPH, layerwise(model))

        def fix_graph(graph, model):
            """
            Fix the graph so it is valid ONNX.
            """

            def fix_tensor_metadata(tensors, fix_shape=True):
                for tensor in tensors:
                    if not tensor.shape and fix_shape:
                        tensor.shape = layerwise(model)[tensor.name].shape
                    if not tensor.dtype:
                        tensor.dtype = layerwise(model)[tensor.name].dtype

            fix_tensor_metadata(graph.inputs)
            fix_tensor_metadata(graph.outputs, fix_shape=False)

            # If we're marking inputs, there may be cases where some other inputs are required - for
            # example, if the model is branchy. If, after cleanup(), there are any Variable tensors in
            # the graph without inputs, we'll replace them with constants and fold them away.
            tensor_map = graph.tensors()
            needs_const_fold = False
            for tensor in tensor_map.values():
                if isinstance(tensor, gs.Variable) and not tensor.inputs and tensor not in graph.inputs:
                    needs_const_fold = True
                    G_LOGGER.info("Freezing model input: {:}".format(tensor))
                    tensor.to_constant(layerwise(model, include_data=True)[tensor.name])

            if needs_const_fold:
                G_LOGGER.info("Folding constants to remove extraneous subgraphs")
                graph.fold_constants().cleanup()

            return graph

        def mark_io(graph, attr, tensors, filter_const=True):
            if filter_const:
                tensors = [t for t in tensors if not isinstance(t, gs.Constant)]

            if not tensors:
                G_LOGGER.warning(
                    "No non-constant tensors are available to mark. "
                    "Try folding constants in the model with `polygraphy surgeon sanitize --fold-constants`"
                )

            setattr(graph, attr, tensors)
            G_LOGGER.info("Marking model {attr}: {:}".format(getattr(graph, attr), attr=attr))
            return graph

        def names_from_tensors(tensors):
            return [t.name for t in tensors]

        def lookup_tensors(graph, names):
            tensor_map = graph.tensors()
            return [tensor_map[name] for name in names]

        # Bisect using the given marker, and modifying the given graph attribute.
        # attr should be one of ["inputs", "outputs"].
        # filter_const indicates whether to filter out constant tensors before updating graph I/O.
        def bisect_io(graph, model, marker, attr, filter_const=True):
            G_LOGGER.start("Reducing model {:}".format(attr))
            iter_graph = graph

            while not marker.stop():
                G_LOGGER.start(
                    "RUNNING | Iteration {:} | Approximately {:} iteration(s) remaining".format(
                        marker.iteration + 1, marker.remaining()
                    )
                )
                iter_graph = graph.copy()  # This is a very light-weight copy of the entire graph.

                with G_LOGGER.indent():
                    io_list = list(getattr(iter_graph.nodes[marker.node_index], attr))
                    mark_io(iter_graph, attr, io_list, filter_const)
                    iter_graph.cleanup()
                    self.arg_groups[OnnxSaveArgs].save_onnx(
                        gs.export_onnx(fix_graph(iter_graph, model)), self.arg_groups[ArtifactSorterArgs].iter_artifact
                    )

                num_nodes = len(iter_graph.nodes)
                success = self.arg_groups[ArtifactSorterArgs].sort_artifacts(
                    marker.iteration + 1, suffix="_reduce_{:}_{:}_nodes".format(attr, num_nodes)
                )
                marker.step(success, num_nodes)

            marker.finish()
            G_LOGGER.finish("Finished reducing model {attr}".format(attr=attr))

            # Find minimal good/bad inputs/outputs, falling back to existing graph inputs/outputs.
            def get_io(index):
                if index is None:
                    return names_from_tensors(getattr(graph, attr))
                return names_from_tensors(list(getattr(graph.nodes[index], attr)))

            return get_io(marker.best_bad_node_index), get_io(marker.best_good_node_index)

        # We reduce the model in 2 phases:
        #   1. Find the earliest output nodes that cause a failure.
        #   2. Find the latest input nodes that cause a failure.

        MarkerType = BisectMarker if args.mode == "bisect" else LinearMarker

        bad_graph = GRAPH.copy()

        good_graph = None
        if args.min_good:
            good_graph = GRAPH.copy()

        # == Phase 1 ==

        if args.reduce_outputs:
            out_marker = MarkerType(len(bad_graph.nodes))
            bad_outputs, good_outputs = bisect_io(bad_graph, model, out_marker, attr="outputs", filter_const=False)
            bad_graph = mark_io(bad_graph, "outputs", lookup_tensors(bad_graph, bad_outputs)).cleanup()
            if good_graph is not None:
                good_graph = mark_io(
                    good_graph, "outputs", lookup_tensors(good_graph, good_outputs)
                )  # Defer cleanup where possible.
            # Export the model with the reduced outputs so that reducing inputs is faster.
            model = gs.export_onnx(fix_graph(bad_graph, model))

        # == Phase 2 ==

        if args.reduce_inputs:
            in_marker = MarkerType(len(bad_graph.nodes), invert=True)
            bad_inputs, good_inputs = bisect_io(bad_graph, model, in_marker, attr="inputs")
            bad_graph = mark_io(bad_graph, "inputs", lookup_tensors(bad_graph, bad_inputs)).cleanup()
            if good_graph is not None:
                good_graph = mark_io(
                    good_graph, "inputs", lookup_tensors(good_graph, good_inputs)
                )  # Defer cleanup where possible.

        # == Write Bad Model ==

        reduced_model = gs.export_onnx(fix_graph(bad_graph, model))

        if self.arg_groups[OnnxSaveArgs].path:
            num_reduced_nodes = len(reduced_model.graph.node)

            if (
                float(num_reduced_nodes) / float(num_orig_nodes) >= 0.25
                and num_reduced_nodes > 1
                and args.mode == "bisect"
            ):
                G_LOGGER.warning(
                    "It looks like this model could potentially be reduced further.\n"
                    "You may want to reduce {:} again using --mode=linear. ".format(self.arg_groups[OnnxSaveArgs].path)
                )

            G_LOGGER.info("Minimum Bad Model:\n{:}\n\n".format(onnx_util.str_from_onnx(reduced_model, mode="none")))
            self.arg_groups[OnnxSaveArgs].save_onnx(reduced_model)

        # == Write Good Model ==

        if good_graph is not None:
            min_good_model = gs.export_onnx(fix_graph(good_graph.cleanup(), model))
            if min_good_model == reduced_model:
                G_LOGGER.warning(
                    "Could not find a minimal model close in size to the reduced model that does not cause a failure."
                )
            else:
                G_LOGGER.info(
                    "Minimum Good Model:\n{:}\n\n".format(onnx_util.str_from_onnx(min_good_model, mode="none"))
                )
                self.arg_groups[OnnxSaveArgs].save_onnx(min_good_model, args.min_good)