def test_device_buffer_order_matches_bindings(self):
    model = ONNX_MODELS["reducable"]
    engine = engine_from_network(NetworkFromOnnxBytes(model.loader))
    with engine, TrtRunner(engine) as runner:
        dev_buf_order = list(runner.device_buffers.keys())
        for binding, dev_buf_name in zip(engine, dev_buf_order):
            assert binding == dev_buf_name
def onnx_to_trt(self, output_fpath: str, input_fpath: str, network_metadata: NetworkMetadata):
    """
    Converts an ONNX file to a TRT engine.
    Since TensorRT already supplies converter functions and scripts, a default implementation is provided.

    Args:
        output_fpath (str): File location for the generated TRT engine.
        input_fpath (str): File location of the input ONNX file.
        network_metadata (NetworkMetadata): Network metadata of the network being converted.

    Returns:
        TRTEngineFile: Newly generated engine.
    """
    result = self.trt_engine_class(output_fpath, network_metadata)
    self.trt_inference_config = CreateConfig(
        fp16=network_metadata.precision.fp16,
        max_workspace_size=result.DEFAULT_TRT_WORKSPACE_MB * 1024 * 1024,
        profiles=result.get_dynamic_shape_profiles(),
        strict_types=result.use_strict_types(),
    )

    g_logger_verbosity = (
        PG_LOGGER.EXTRA_VERBOSE if G_LOGGER.level == G_LOGGER.DEBUG else PG_LOGGER.WARNING
    )
    with PG_LOGGER.verbosity(g_logger_verbosity):
        network_definition = result.get_network_definition(network_from_onnx_path(input_fpath))
        trt_engine = engine_from_network(network_definition, config=self.trt_inference_config)
        save_engine(trt_engine, output_fpath)

    return result
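# For reference, a minimal standalone sketch of the same ONNX -> TRT flow wired together by the
# method above, without the surrounding class. The file names and the fixed fp16 flag are
# illustrative assumptions; the real method derives precision, profiles, and workspace size
# from NetworkMetadata and the engine-file class.
from polygraphy.backend.trt import CreateConfig, engine_from_network, network_from_onnx_path, save_engine


def build_engine_sketch(onnx_path="model.onnx", engine_path="model.engine"):
    # Build a TensorRT engine from an ONNX file and serialize it to disk.
    config = CreateConfig(fp16=True)  # Assumed precision for illustration only.
    engine = engine_from_network(network_from_onnx_path(onnx_path), config=config)
    save_engine(engine, engine_path)
    return engine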
def test_context(self):
    model = ONNX_MODELS["identity"]
    engine = engine_from_network(NetworkFromOnnxBytes(model.loader))
    with engine, TrtRunner(engine.create_execution_context) as runner:
        model.check_runner(runner)
        assert not runner.owns_engine
        assert runner.owns_context
def test_calibrator_device_buffers_multiinput(self, multi_input_builder_network, mode):
    def generate_dev_data(num_batches):
        with cuda.DeviceArray(shape=(1,), dtype=np.float32) as x:
            for _ in range(num_batches):
                x.copy_from(np.ones((1,), dtype=np.float32))
                xdata = {
                    "array": x,
                    "view": cuda.DeviceView(x.ptr, x.shape, x.dtype),
                    "pointer": x.ptr,
                }[mode]
                yield {"X0": xdata, "Y0": np.zeros((1,), dtype=np.float32)}

    builder, network = multi_input_builder_network
    NUM_BATCHES = 2

    calibrator = Calibrator(generate_dev_data(NUM_BATCHES))
    create_config = CreateConfig(int8=True, calibrator=calibrator)

    with engine_from_network((builder, network), create_config):
        assert calibrator.num_batches == NUM_BATCHES
    self.check_calibrator_cleanup(calibrator)
def main():
    # In Polygraphy, loaders and runners take ownership of objects if they are provided
    # via the return values of callables. For example, we don't need to worry about object
    # lifetimes when we use lazy loaders.
    #
    # Since we are immediately evaluating, we take ownership of objects, and are responsible for freeing them.
    builder, network, parser = network_from_onnx_path("identity.onnx")

    # Extend the network with an identity layer.
    prev_output = network.get_output(0)
    network.unmark_output(prev_output)
    output = network.add_identity(prev_output).get_output(0)
    output.name = "output"
    network.mark_output(output)

    # Create a TensorRT IBuilderConfig so that we can build the engine with FP16 enabled.
    config = create_config(builder, network, fp16=True)

    # We can free everything we constructed above once we're done building the engine.
    # NOTE: In TensorRT 8.0, we do *not* need to use a context manager here.
    with builder, network, parser, config:
        engine = engine_from_network((builder, network), config)

    # NOTE: In TensorRT 8.0, we do *not* need to use a context manager to free `engine`.
    with engine, TrtRunner(engine) as runner:
        inp_data = np.ones((1, 1, 2, 2), dtype=np.float32)

        # NOTE: The runner owns the output buffers and is free to reuse them between `infer()` calls.
        # Thus, if you want to store results from multiple inferences, you should use `copy.deepcopy()`.
        outputs = runner.infer(feed_dict={"x": inp_data})

        assert np.array_equal(outputs["output"], inp_data)  # It's an identity model!

        print("Inference succeeded!")
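# For comparison, a minimal sketch of the equivalent lazy-loader flow (without the extra identity
# layer added above), in which Polygraphy owns the TensorRT objects and frees them for us.
# The model file name "identity.onnx" and the input name "x" match the example above; everything
# else here is illustrative, not part of the original example.
import numpy as np
from polygraphy.backend.trt import CreateConfig, EngineFromNetwork, NetworkFromOnnxPath, TrtRunner


def lazy_loader_sketch():
    # Nothing is built until the runner is activated; no manual lifetime management is needed.
    build_engine = EngineFromNetwork(NetworkFromOnnxPath("identity.onnx"), config=CreateConfig(fp16=True))
    with TrtRunner(build_engine) as runner:
        inp_data = np.ones((1, 1, 2, 2), dtype=np.float32)
        outputs = runner.infer(feed_dict={"x": inp_data})

        # There is exactly one output; its name depends on the model, so look it up dynamically.
        [out_name] = list(outputs.keys())
        assert np.array_equal(outputs[out_name], inp_data)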
def test_calibrator_with_path_name_cache(self, identity_builder_network):
    builder, network = identity_builder_network
    data = [{"x": np.ones((1, 1, 2, 2), dtype=np.float32)}]

    with tempfile.NamedTemporaryFile() as cache:
        calibrator = Calibrator(data, cache=cache.name)
        create_config = CreateConfig(int8=True, calibrator=calibrator)
        with engine_from_network((builder, network), create_config):
            check_file_non_empty(cache.name)
    self.check_calibrator_cleanup(calibrator)
def test_calibrator_generator_data(self, identity_builder_network):
    builder, network = identity_builder_network
    NUM_BATCHES = 2

    calibrator = Calibrator(generate_data(NUM_BATCHES))
    create_config = CreateConfig(int8=True, calibrator=calibrator)

    with engine_from_network((builder, network), create_config):
        assert calibrator.num_batches == NUM_BATCHES
    self.check_calibrator_cleanup(calibrator)
def test_multithreaded_runners_from_engine(self):
    model = ONNX_MODELS["identity"]
    engine = engine_from_network(NetworkFromOnnxBytes(model.loader))

    with engine, TrtRunner(engine) as runner0, TrtRunner(engine) as runner1:
        t1 = threading.Thread(target=model.check_runner, args=(runner0,))
        t2 = threading.Thread(target=model.check_runner, args=(runner1,))
        t1.start()
        t2.start()
        t1.join()
        t2.join()
def test_calibrator_caches_without_explicit_cache(self, identity_builder_network):
    builder, network = identity_builder_network
    data = [{"x": np.ones((1, 1, 2, 2), dtype=np.float32)}]

    calibrator = Calibrator(data)

    # First, populate the cache
    create_config = CreateConfig(int8=True, calibrator=calibrator)
    with engine_from_network((builder, network), create_config):
        pass

    # Check that the internal cache is populated
    assert calibrator.read_calibration_cache()
    self.check_calibrator_cleanup(calibrator)
def test_calibrator_basic(self, identity_builder_network, BaseClass):
    if mod.version(trt.__version__) < mod.version("7.0") and BaseClass == trt.IInt8LegacyCalibrator:
        pytest.skip("Bug in TRT 6 causes NaNs with legacy calibrator")

    builder, network = identity_builder_network
    NUM_BATCHES = 2

    data = [{"x": np.ones((1, 1, 2, 2), dtype=np.float32)}] * NUM_BATCHES
    calibrator = Calibrator(data, BaseClass=BaseClass)

    create_config = CreateConfig(int8=True, calibrator=calibrator)
    with engine_from_network((builder, network), create_config):
        assert calibrator.num_batches == NUM_BATCHES
    self.check_calibrator_cleanup(calibrator)
def test_calibrator_invalid_input_fails(self, identity_builder_network, names):
    builder, network = identity_builder_network

    data = [{name: np.ones((1, 1, 2, 2), dtype=np.float32) for name in names}]
    calibrator = Calibrator(data)

    create_config = CreateConfig(int8=True, calibrator=calibrator)
    with pytest.raises(PolygraphyException):
        with engine_from_network((builder, network), create_config):
            pass
def test_calibrator_rechecks_cache_on_reset(self, identity_builder_network):
    builder, network = identity_builder_network
    data = [{"x": np.ones((1, 1, 2, 2), dtype=np.float32)}]

    with tempfile.NamedTemporaryFile(mode="wb+") as cache:
        calibrator = Calibrator(data, cache=cache.name)

        # First, populate the cache
        create_config = CreateConfig(int8=True, calibrator=calibrator)
        with engine_from_network((builder, network), create_config):
            pass

        # Ensure that now the calibrator will read from the cache when reset
        calibrator.reset()
        assert not calibrator.has_cached_scales
        assert len(calibrator.read_calibration_cache()) == get_file_size(cache.name)

    self.check_calibrator_cleanup(calibrator)
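# A brief sketch of how the cache behavior exercised above is typically used outside of tests:
# pointing successive INT8 builds at the same cache file so that later builds read the cached
# scales instead of re-running calibration. The cache path "identity_calib.cache" is an
# illustrative assumption; `builder_network` and `data` stand in for the test fixtures above.
from polygraphy.backend.trt import Calibrator, CreateConfig, engine_from_network


def build_int8_with_cache(builder_network, data):
    builder, network = builder_network
    calibrator = Calibrator(data, cache="identity_calib.cache")
    # On the first build this writes the cache; on later builds with the same cache file,
    # the cached scales are read back and the calibration data loader is not exhausted again.
    return engine_from_network((builder, network), CreateConfig(int8=True, calibrator=calibrator))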
def test_infer_overhead(self, copy_inputs, copy_outputs):
    inp = np.ones(shape=(1, 2, 1024, 1024), dtype=np.float32)
    dev_inp = cuda.DeviceArray(shape=inp.shape, dtype=inp.dtype).copy_from(inp)

    out = np.zeros(shape=(1, 2, 1024, 1024), dtype=np.float32)  # Using identity model!
    dev_out = cuda.DeviceArray(shape=out.shape, dtype=out.dtype)

    stream = cuda.Stream()

    model = ONNX_MODELS["dynamic_identity"]
    profiles = [
        Profile().add("X", (1, 2, 1024, 1024), (1, 2, 1024, 1024), (1, 2, 1024, 1024)),
    ]
    inp_name = list(model.input_metadata.keys())[0]

    with engine_from_network(
        network_from_onnx_bytes(model.loader), CreateConfig(profiles=profiles)
    ) as engine, engine.create_execution_context() as context, TrtRunner(context) as runner, dev_inp, dev_out:
        # Inference outside the TrtRunner
        def infer():
            if copy_inputs:
                dev_inp.copy_from(inp, stream=stream)
            context.execute_async_v2(bindings=[dev_inp.ptr, dev_out.ptr], stream_handle=stream.ptr)
            if copy_outputs:
                dev_out.copy_to(out, stream=stream)
            stream.synchronize()

        native_time = time_func(infer)

        feed_dict = {inp_name: (inp if copy_inputs else dev_inp)}
        runner_time = time_func(
            lambda: runner.infer(feed_dict, check_inputs=False, copy_outputs_to_host=copy_outputs)
        )

    # The overhead should be less than 0.5ms, or the runtime should be within 5%
    print("Absolute difference: {:.5g}".format(runner_time - native_time))
    print("Relative difference: {:.5g}".format(runner_time / native_time))
    assert (runner_time - native_time) < 0.5e-3 or runner_time <= (native_time * 1.05)
def test_serialize_engine(self, identity_network):
    with engine_from_network(identity_network) as engine:
        serialized_engine = bytes_from_engine(engine)
        assert isinstance(serialized_engine, bytes)
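# A small companion sketch: the bytes produced by `bytes_from_engine` can be turned back into a
# usable engine with `engine_from_bytes`, which is how a serialized engine is typically reloaded.
# The round-trip helper below is illustrative and not part of the test above.
from polygraphy.backend.trt import bytes_from_engine, engine_from_bytes


def roundtrip_engine(engine):
    # Serialize the engine to bytes, then deserialize it back into a new engine object.
    serialized = bytes_from_engine(engine)
    return engine_from_bytes(serialized)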
def test_shape_output(self):
    model = ONNX_MODELS["reshape"]
    engine = engine_from_network(NetworkFromOnnxBytes(model.loader))
    with engine, TrtRunner(engine.create_execution_context) as runner:
        model.check_runner(runner)
def main():
    # A Profile maps each input tensor to a range of shapes.
    #
    # TIP: To save lines, calls to `add` can be chained:
    #     profile.add("input0", ...).add("input1", ...)
    #
    # Of course, you may alternatively write this as:
    #     profile.add("input0", ...)
    #     profile.add("input1", ...)
    #
    profiles = [
        # The low-latency case. For best performance, min == opt == max.
        Profile().add("X", min=(1, 3, 28, 28), opt=(1, 3, 28, 28), max=(1, 3, 28, 28)),
        # The dynamic batching case. We use `4` for the opt batch size since that's our most common case.
        Profile().add("X", min=(1, 3, 28, 28), opt=(4, 3, 28, 28), max=(32, 3, 28, 28)),
        # The offline case. For best performance, min == opt == max.
        Profile().add("X", min=(128, 3, 28, 28), opt=(128, 3, 28, 28), max=(128, 3, 28, 28)),
    ]

    # See examples/api/06_immediate_eval_api for details on immediately evaluated functional loaders like `engine_from_network`.
    engine = engine_from_network(
        NetworkFromOnnxPath("dynamic_identity.onnx"), config=CreateConfig(profiles=profiles)
    )

    # We'll save the engine so that we can inspect it with `inspect model`.
    # This should make it easy to see how the engine bindings are laid out.
    save_engine(engine, "dynamic_identity.engine")

    # We'll create, but not activate, three separate runners, each with a separate context.
    #
    # TIP: By providing a context directly, as opposed to via a lazy loader,
    # we can ensure that the runner will *not* take ownership of it.
    #
    low_latency = TrtRunner(engine.create_execution_context())

    # NOTE: The following two lines will cause TensorRT to display errors since profile 0
    # is already in use by the first execution context. We'll suppress them using G_LOGGER.verbosity().
    #
    with G_LOGGER.verbosity(G_LOGGER.CRITICAL):
        dynamic_batching = TrtRunner(engine.create_execution_context())
        offline = TrtRunner(engine.create_execution_context())
        # NOTE: We could update the profile index here (e.g. `context.active_optimization_profile = 2`),
        # but instead, we'll use TrtRunner's `set_profile()` API when we later activate the runner.

    # Finally, we can activate the runners as we need them.
    #
    # NOTE: Since the context and engine are already created, the runner will only need to
    # allocate input and output buffers during activation.
    input_img = np.ones((1, 3, 28, 28), dtype=np.float32)  # An input "image"

    with low_latency:
        outputs = low_latency.infer({"X": input_img})
        assert np.array_equal(outputs["Y"], input_img)  # It's an identity model!

        print("Low latency runner succeeded!")

    # While we're serving requests online, we might decide that we need dynamic batching
    # for a moment.
    #
    # NOTE: We're assuming that activating runners will be cheap here, so we can bring up
    # the dynamic batching runner just-in-time.
    #
    # TIP: If activating the runner is not cheap (e.g. input/output buffers are large),
    # it might be better to keep the runner active the whole time.
    #
    with dynamic_batching:
        # NOTE: The very first time we activate this runner, we need to set
        # the profile index (it's 0 by default). We need to do this *only once*.
        # Alternatively, we could have set the profile index in the context directly (see above).
        #
        dynamic_batching.set_profile(1)  # Use the second profile, which is intended for dynamic batching.

        # We'll create fake batches by repeating our fake input image.
        small_input_batch = np.repeat(input_img, 4, axis=0)  # Shape: (4, 3, 28, 28)
        outputs = dynamic_batching.infer({"X": small_input_batch})
        assert np.array_equal(outputs["Y"], small_input_batch)

    # If we need dynamic batching again later, we can activate the runner once more.
    #
    # NOTE: This time, we do *not* need to set the profile.
    #
    with dynamic_batching:
        # NOTE: We can use any shape that's in the range of the profile without
        # additional setup - Polygraphy handles the details behind the scenes!
        #
        large_input_batch = np.repeat(input_img, 16, axis=0)  # Shape: (16, 3, 28, 28)
        outputs = dynamic_batching.infer({"X": large_input_batch})
        assert np.array_equal(outputs["Y"], large_input_batch)

        print("Dynamic batching runner succeeded!")

    with offline:
        # NOTE: We must set the profile to something other than 0 or 1 since both of those
        # are now in use by the `low_latency` and `dynamic_batching` runners respectively.
        #
        offline.set_profile(2)  # Use the third profile, which is intended for the offline case.

        large_offline_batch = np.repeat(input_img, 128, axis=0)  # Shape: (128, 3, 28, 28)
        outputs = offline.infer({"X": large_offline_batch})
        assert np.array_equal(outputs["Y"], large_offline_batch)

        print("Offline runner succeeded!")
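# A follow-up sketch showing how the engine saved above ("dynamic_identity.engine") could be
# reloaded later and run without rebuilding. `BytesFromPath` and `EngineFromBytes` are standard
# Polygraphy loaders; the reload step itself is illustrative and not part of the example above.
import numpy as np
from polygraphy.backend.common import BytesFromPath
from polygraphy.backend.trt import EngineFromBytes, TrtRunner


def run_saved_engine():
    # Lazily load the serialized engine from disk and deserialize it when the runner activates.
    load_engine = EngineFromBytes(BytesFromPath("dynamic_identity.engine"))
    with TrtRunner(load_engine) as runner:
        input_img = np.ones((1, 3, 28, 28), dtype=np.float32)
        outputs = runner.infer({"X": input_img})
        assert np.array_equal(outputs["Y"], input_img)  # Still an identity model!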