def infer(self, feed_dict, output):
    for name in self.engine:
        if name in feed_dict:
            in_out = [feed_dict[name]]
        elif isinstance(output, tuple):
            in_out = [output[i].detach().cpu().numpy() for i in range(len(output))]
        else:
            in_out = [output.detach().cpu().numpy()]
        binding = self.engine[name]
        # Only set shapes if required
        for i in range(len(in_out)):
            shape = in_out[i].shape
            if self.engine.is_shape_binding(binding) and is_shape_dynamic(self.context.get_shape(binding)):
                logging.debug(
                    "Setting shape binding: {:} (index: {:}) to: {:}".format(name, binding, in_out[i]))
                self.context.set_shape_input(binding, in_out[i])
            elif is_shape_dynamic(self.context.get_binding_shape(binding)):
                logging.debug(
                    "Setting binding: {:} (index: {:}) to shape: {:}".format(name, binding, shape))
                self.context.set_binding_shape(binding, shape)

    # Check that all dynamic binding shapes and shape inputs have been resolved.
    if not self.context.all_binding_shapes_specified:
        logging.critical(
            "Some input shapes were not specified.\nNote: Inputs are: {:}".format(self.get_input_metadata()))
    if not self.context.all_shape_inputs_specified:
        logging.critical(
            "Some shape inputs were not specified.\nNote: Inputs are: {:}".format(self.get_input_metadata()))

    bindings_per_profile = self.engine.num_bindings // self.engine.num_optimization_profiles
    start_binding = self.context.active_optimization_profile * bindings_per_profile
    end_binding = start_binding + bindings_per_profile

    # Resize buffers so they are the appropriate size.
    for binding in range(start_binding, end_binding):
        shape = tuple(self.context.get_binding_shape(binding))
        self.buffers.resize(self.engine[binding], shape)

    bindings = self.buffers.get_bindings()

    start = time.perf_counter()
    self.buffers.copy_inputs(feed_dict, self.stream)
    self.context.execute_async_v2(bindings=bindings, stream_handle=self.stream.handle)
    self.buffers.copy_outputs(self.stream)
    self.stream.synchronize()
    end = time.perf_counter()
    self.inference_time = end - start

    return self.buffers.get_outputs()
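# Illustrative call pattern for infer(), mirroring how the TRT test route below uses it
# (the runner name, input name, and shape are placeholders, not definitions from this file):
# feed_dict maps engine input names to NumPy arrays, while `output` is consulted only as a
# reference for dynamic bindings and shape inputs; timing is exposed via `inference_time`.
#
#     feed_dict = {"input": np.zeros((1, 3, 224, 224), dtype=np.float32)}  # hypothetical input
#     out_dict = active_runner.infer(feed_dict=feed_dict, output=outputs_fwd)
#     logging.info("Inference took {:} s, outputs: {:}".format(
#         active_runner.inference_time, list(out_dict.keys())))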
def __call__(self):
    class DummyContextManager(object):
        def __enter__(self):
            return None

        def __exit__(self, exc_type, exc_value, traceback):
            return None

    network_parser = self.network_loader()
    try:
        network, parser = network_parser
        assert isinstance(network, trt.INetworkDefinition)
    except (ValueError, AssertionError):
        network = network_parser
        parser = DummyContextManager()

    with trt.Builder(TRT_LOGGER) as builder, network, parser:
        if self.preprocess_network:
            logging.debug("Applying network preprocessing: {:}".format(self.preprocess_network))
            self.preprocess_network(network)

        if self.layerwise:
            TensorRTRunnerV2.mark_layerwise(network)

        if logging.getEffectiveLevel() <= logging.DEBUG:
            TensorRTRunnerV2.log_network(network)

        config = builder.create_builder_config()
        profile = TensorRTRunnerV2.build_profile(builder, network, self.profile_shapes)
        config.add_optimization_profile(profile)
        config.max_workspace_size = int(self.max_workspace_size)

        if self.fp16_mode:
            config.flags = 1 << int(trt.BuilderFlag.FP16)
        if self.int8_mode:
            config.flags = config.flags | (1 << int(trt.BuilderFlag.INT8))
            if not network.has_explicit_precision:
                if not self.calibrator:
                    logging.critical(
                        "Network does not have explicit precision. A calibrator must be provided in order to use int8 mode."
                    )
                self.calibrator.set_input_metadata(get_input_metadata_from_profile(profile, network))
                config.int8_calibrator = self.calibrator

        logging.debug("Using builder configuration flags: {:}".format(config.flags))
        logging.info(
            "Building engine: max workspace size={:} bytes, fp16={:}, int8={:}, layerwise={:}".format(
                self.max_workspace_size, self.fp16_mode, self.int8_mode, self.layerwise))
        engine = builder.build_engine(network, config)
        self.written_engine_path = write_timestamped(
            contents=lambda: engine.serialize(), dir=self.write_engine, name="tensorrt_runner_v2.engine")
        return engine
def __call__(self):
    network = TensorRTRunnerV2.create_network(explicit_precision=self.explicit_precision)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    success = parser.parse(self.onnx_loader().SerializeToString())
    if not success:
        for index in range(parser.num_errors):
            logging.error(parser.get_error(index))
        logging.critical("Could not parse ONNX correctly")
    return network, parser
def create_network(explicit_batch=True, explicit_precision=False):
    with trt.Builder(TRT_LOGGER) as builder:
        network_flags = 0
        if explicit_batch:
            network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        if explicit_precision:
            network_flags = network_flags | (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_PRECISION))
        network = builder.create_network(flags=network_flags)
        if network is None:
            logging.critical("Invalid network")
        return network
def __init__(self, model_loader=None, plugins=None, name=None):
    """
    Creates a runner that manages a single TensorRT engine.

    Args:
        model_loader (Callable() -> trt.ICudaEngine): A callable that can supply a TensorRT engine.

    Optional Args:
        plugins (List[str]): A list of paths to plugin libraries to load before inference.
        name (str): The human-readable name to use for this runner.
    """
    set_trt_logging_level(logging.getEffectiveLevel())

    def load_plugins():
        import ctypes

        for plugin in plugins:
            path = os.path.abspath(plugin)
            logging.info("Loading plugin library: {:}".format(path))
            ctypes.CDLL(path)

    # Load any user-supplied plugin libraries. This must happen before everything else, including engine deserialization.
    if plugins:
        load_plugins()

    # Choose a unique name for this runner.
    super().__init__(default_value(name, "trt-v2-runner-{:}".format(TensorRTRunnerV2.total_runners)))
    TensorRTRunnerV2.total_runners += 1
    logging.debug("Creating {:}".format(self.name))

    self.model_loader = model_loader
    self.engine = self.model_loader()
    if not self.engine:
        logging.critical("Invalid Engine. Please ensure the engine was built correctly.")

    self.buffers = Buffers.from_engine(self.engine)
    self.stream = cuda.Stream()
    self.context = self.engine.create_execution_context()
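# A hedged construction sketch showing how the loaders and the runner fit together; it
# mirrors the chain used in __test_export_route below ("model.onnx", the workspace size,
# and profile_shapes are illustrative values; profile_shapes is an OrderedDict of
# [min, opt, max] shape lists per input, as expected by build_profile):
#
#     onnx_loader = OnnxFileLoader("model.onnx")
#     network_loader = OnnxNetworkLoader(onnx_loader, explicit_precision=False)
#     model_loader = BuildEngineLoader(
#         network_loader,
#         max_workspace_size=1 << 30,
#         fp16_mode=False,
#         int8_mode=False,
#         profile_shapes=profile_shapes,
#         write_engine=None,
#         calibrator=None,
#         layerwise=False,
#     )
#     with TensorRTRunnerV2(model_loader=model_loader) as active_runner:
#         logging.debug("Runner Inputs: {:}".format(active_runner.get_input_metadata()))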
def __test_export_route(self, module, out_name, mode, input_example=None):
    # Select the correct extension based on the output format.
    ext = {
        DF.ONNX: ".onnx",
        DF.TRTONNX: ".trt.onnx",
        DF.PYTORCH: ".pt",
        DF.TORCHSCRIPT: ".ts",
    }.get(mode, ".onnx")
    out = Path(f"{out_name}{ext}")
    out_name = str(out)
    if out.exists():
        os.remove(out)

    module.eval()
    outputs_fwd = (
        module.forward(*tuple(input_example.values()))
        if isinstance(input_example, OrderedDict)
        else (
            module.forward(*input_example)
            if isinstance(input_example, tuple)
            else module.forward(input_example)
            if input_example is not None
            else None
        )
    )

    deploy_input_example = (
        tuple(input_example.values()) if isinstance(input_example, OrderedDict) else input_example
    )
    self.nf.deployment_export(
        module=module,
        output=out_name,
        input_example=deploy_input_example,
        d_format=mode,
        output_example=outputs_fwd,
    )

    tol = 5.0e-3
    assert out.exists()

    if mode == DF.TRTONNX:
        data_loader = DefaultDataLoader()
        loader_cache = DataLoaderCache(data_loader)
        profile_shapes = OrderedDict()
        names = list(module.input_ports) + list(module.output_ports)
        names = list(
            filter(
                lambda x: x not in (module._disabled_deployment_input_ports | module._disabled_deployment_output_ports),
                names,
            )
        )
        if isinstance(input_example, tuple):
            si = [tuple(input_example[i].shape) for i in range(len(input_example))]
        elif isinstance(input_example, OrderedDict):
            si = [tuple(input_example.values())[i].shape for i in range(len(input_example))]
        else:
            si = [tuple(input_example.shape)]
        if isinstance(outputs_fwd, tuple):
            fi = [tuple(outputs_fwd[i].shape) for i in range(len(outputs_fwd))]
        else:
            fi = [tuple(outputs_fwd.shape)]
        si = si + fi
        i = 0
        for name in names:
            profile_shapes[name] = [si[i]] * 3
            i = i + 1

        onnx_loader = OnnxFileLoader(out_name)
        network_loader = OnnxNetworkLoader(onnx_loader, explicit_precision=False)
        model_loader = BuildEngineLoader(
            network_loader,
            max_workspace_size=1 << 30,
            fp16_mode=False,
            int8_mode=False,
            profile_shapes=profile_shapes,
            write_engine=None,
            calibrator=None,
            layerwise=False,
        )

        with TensorRTRunnerV2(model_loader=model_loader) as active_runner:
            input_metadata = active_runner.get_input_metadata()
            if input_metadata is None:
                logging.critical("For {:}, get_input_metadata() returned None!".format(active_runner.name))
            logging.debug("Runner Inputs: {:}".format(input_metadata))
            feed_dict = loader_cache.load(
                iteration=0, input_metadata=input_metadata, input_example=input_example
            )
            inputs = dict()
            input_names = list(input_metadata.keys())
            for i in range(len(input_names)):
                input_name = input_names[i]
                if input_name in module._disabled_deployment_input_ports:
                    continue
                inputs[input_name] = (
                    input_example[input_name].cpu().numpy()
                    if isinstance(input_example, OrderedDict)
                    else (
                        input_example[i].cpu().numpy()
                        if isinstance(input_example, tuple)
                        else input_example.cpu().numpy()
                    )
                )

            out_dict = active_runner.infer(feed_dict=feed_dict, output=outputs_fwd)

            # Take the first output buffer as the TRT-side result.
            for ov in out_dict.values():
                outputs_scr = torch.from_numpy(ov).cuda()
                break

            outputs = []
            outputs.append(copy.deepcopy(out_dict))
            logging.debug(
                "Received outputs: {:}".format(
                    ["{:}: {:}".format(name, out.shape) for name, out in out_dict.items()]
                )
            )
            logging.info("Output Buffers: {:}".format(outputs))

        # Re-run the module on exactly the buffers that were fed to TensorRT
        # (int32 buffers are converted to int64 tensors for the PyTorch forward pass).
        inpex = []
        for ie in feed_dict.values():  # loader_cache.cache[0].values():
            if ie.dtype.type is np.int32:
                inpex.append(torch.from_numpy(ie).long().cuda())
            else:
                inpex.append(torch.from_numpy(ie).cuda())
            if len(inpex) == len(input_example):
                break
        inpex = tuple(inpex)
        outputs_fwd = module.forward(*inpex)

    elif mode == DF.ONNX:
        # Must recompute because *module* might be different now.
        outputs_fwd = (
            module.forward(*tuple(input_example.values()))
            if isinstance(input_example, OrderedDict)
            else (
                module.forward(*input_example)
                if isinstance(input_example, tuple)
                else module.forward(input_example)
            )
        )
        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
        ort_session = ort.InferenceSession(out_name, sess_options, ['CUDAExecutionProvider'])
        print('Execution Providers: ', ort_session.get_providers())
        inputs = dict()
        input_names = list(module.input_ports)
        ort_inputs = ort_session.get_inputs()
        for i in range(len(input_names)):
            input_name = input_names[i]
            if input_name in module._disabled_deployment_input_ports:
                continue
            inputs[input_name] = (
                input_example[input_name].cpu().numpy()
                if isinstance(input_example, OrderedDict)
                else (
                    input_example[i].cpu().numpy()
                    if isinstance(input_example, tuple)
                    else input_example.cpu().numpy()
                )
            )
        outputs_scr = ort_session.run(None, inputs)
        outputs_scr = torch.from_numpy(outputs_scr[0]).cuda()
    elif mode == DF.TORCHSCRIPT:
        scr = torch.jit.load(out_name)
        if isinstance(module, nemo.backends.pytorch.tutorials.TaylorNet):
            input_example = torch.randn(4, 1).cuda()
            outputs_fwd = module.forward(input_example)
        # Run the loaded TorchScript module so the comparison is against the exported artifact.
        outputs_scr = (
            scr.forward(*tuple(input_example.values()))
            if isinstance(input_example, OrderedDict)
            else (
                scr.forward(*input_example)
                if isinstance(input_example, tuple)
                else scr.forward(input_example)
            )
        )
    elif mode == DF.PYTORCH:
        module.load_state_dict(torch.load(out_name))
        module.eval()
        outputs_scr = (
            module.forward(*tuple(input_example.values()))
            if isinstance(input_example, OrderedDict)
            else (
                module.forward(*input_example)
                if isinstance(input_example, tuple)
                else module.forward(input_example)
            )
        )

    outputs_scr = outputs_scr[0] if isinstance(outputs_scr, (tuple, list)) else outputs_scr
    outputs_fwd = outputs_fwd[0] if isinstance(outputs_fwd, (tuple, list)) else outputs_fwd

    assert (outputs_scr - outputs_fwd).norm(p=2) < tol

    if out.exists():
        os.remove(out)
def build_profile(builder, network, profile_shapes, default_shape_value=DEFAULT_SHAPE_VALUE):
    def override_shape(shape):
        return tuple([DEFAULT_SHAPE_VALUE if is_dimension_dynamic(dim) else dim for dim in shape])

    def get_profile_shape(name):
        if name not in profile_shapes:
            return None
        shapes = profile_shapes[name]
        if not isinstance(shapes, list) or len(shapes) != 3:
            logging.critical(
                "Profile values must be a list containing exactly 3 shapes (tuples or Dims), but received "
                "shapes: {:} for input: {:}.\nNote: profile was: {:}.\nNote: Network inputs were: {:}".format(
                    shapes, name, profile_shapes, TensorRTRunnerV2.get_network_inputs(network)))
        return shapes

    profile = builder.create_optimization_profile()
    for idx in range(network.num_inputs):
        inp = network.get_input(idx)

        if inp.is_shape_tensor:
            shapes = get_profile_shape(inp.name)
            if not shapes:
                rank = inp.shape[0]
                shapes = [(DEFAULT_SHAPE_VALUE,) * rank] * 3
                logging.warning(
                    "Setting shape input to {:}. If this is incorrect, for shape input: {:}, please provide "
                    "tuples for min, opt, and max shapes containing {:} elements".format(shapes[0], inp.name, rank),
                    mode=logging_mode.ONCE,
                )
            min, opt, max = shapes
            # Shape-tensor inputs take min/opt/max *values* rather than dimensions.
            profile.set_shape_input(inp.name, min, opt, max)
            inp.shape = opt
            logging.info(
                "Setting shape input: {:} values to min: {:}, opt: {:}, max: {:}".format(inp.name, min, opt, max))
        else:
            shapes = get_profile_shape(inp.name)
            if not shapes:
                shapes = [override_shape(inp.shape)] * 3
                logging.warning(
                    "Overriding input shape {:} to {:}. If this is incorrect, for input tensor: {:}, please "
                    "provide tuples for min, opt, and max shapes containing values: {:} with dynamic dimensions "
                    "replaced.".format(inp.shape, shapes[0], inp.name, inp.shape),
                    mode=logging_mode.ONCE,
                )
            min, opt, max = shapes
            profile.set_shape(inp.name, min, opt, max)
            inp.shape = opt
            logging.info(
                "Setting input: {:} shape to min: {:}, opt: {:}, max: {:}".format(inp.name, min, opt, max))

    if not profile:
        logging.critical(
            "Profile is not valid, please provide profile data. Note: profile was: {:}".format(profile_shapes))
    return profile
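# Expected profile_shapes layout, as enforced by get_profile_shape: each entry maps an input
# name to a list of exactly three shapes, [min, opt, max]. A minimal sketch (the input name
# and dimensions are illustrative; the test route above simply repeats one static shape
# three times, and builder/network/config come from the BuildEngineLoader context):
#
#     profile_shapes = OrderedDict()
#     profile_shapes["input"] = [(1, 3, 64, 64), (4, 3, 128, 128), (8, 3, 256, 256)]
#     profile = TensorRTRunnerV2.build_profile(builder, network, profile_shapes)
#     config.add_optimization_profile(profile)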
def __call__(self, index, input_metadata, input_example=None):
    logging.debug("Updating seed to: {:}".format(self.seed + index))
    rng = np.random.RandomState(self.seed + index)

    buffers = OrderedDict()
    i = 0
    for name, (dtype, shape) in input_metadata.items():
        if input_example is not None and (not isinstance(input_example, tuple) or i < len(input_example)):
            if isinstance(input_example, tuple):
                static_shape = input_example[i].shape
            elif isinstance(input_example, OrderedDict):
                static_shape = tuple(input_example.values())[i].shape
            else:
                static_shape = [tuple(input_example.shape)]
        elif is_shape_dynamic(shape):
            if name in self.default_shapes:
                static_shape = self.default_shapes[name]
            else:
                static_shape = [self.default_shape_value if is_dimension_dynamic(elem) else elem for elem in shape]
            if static_shape != shape:
                if not is_valid_shape_override(static_shape, shape):
                    logging.critical(
                        "Cannot override original shape: {:}, for input: {:} to {:}".format(shape, name, static_shape))
                logging.warning(
                    "Input: {:}: Adjusted dynamic shape: {:} to: {:}".format(name, shape, static_shape),
                    mode=logging_mode.ONCE,
                )
        else:
            if name in self.default_shapes:
                logging.warning(
                    "Will not override static shape: {:}, for input: {:}".format(shape, name),
                    mode=logging_mode.ONCE,
                )
            static_shape = shape

        if input_example is not None and (not isinstance(input_example, tuple) or i < len(input_example)):
            if isinstance(input_example, OrderedDict):
                buffers[name] = list(input_example.values())[i].cpu()
            else:
                buffers[name] = input_example[i].cpu() if isinstance(input_example, tuple) else input_example.cpu()
        elif np.issubdtype(dtype, np.integer):
            buffers[name] = rng.randint(low=self.int_min, high=self.int_max, size=static_shape, dtype=dtype)
        elif np.issubdtype(dtype, np.bool_):
            buffers[name] = rng.randint(low=0, high=2, size=static_shape).astype(dtype)
        else:
            buffers[name] = (
                rng.random_sample(size=static_shape) * (self.float_max - self.float_min) + self.float_min
            ).astype(dtype)

        # To handle scalars: the generators above return a Python float if shape is ().
        buffers[name] = np.array(buffers[name])

        # If the shape is 1D and has a length equal to the rank of the provided default shape, it is
        # likely to be a TRT shape tensor, and so should be overridden such that its value (not shape)
        # is the default shape.
        is_shape_tensor = (
            (not is_shape_dynamic(shape))
            and (name in self.default_shapes)
            and (len(shape) == 1)
            and (shape[0] == len(self.default_shapes[name]))
        )
        if is_shape_tensor:
            buffers[name] = np.array(self.default_shapes[name], dtype=dtype)
            logging.warning(
                "Assuming {:} is a shape tensor. Setting to: {:}".format(name, buffers[name]),
                mode=logging_mode.ONCE,
            )
        i = i + 1

    return buffers
def load(self, iteration, input_metadata, input_example=None):
    """
    Load the specified iteration from the cache if present, or generate using the data loader.

    Args:
        iteration (int): The iteration whose data to retrieve.
        input_metadata (OrderedDict[str, Tuple[np.dtype, Tuple[int]]]): Input Metadata, including shape and
            type information. The loader may attempt to match input_metadata when data in the cache does not
            exactly match a new set of input_metadata.
    """
    if iteration not in self.cache:
        logging.debug(
            "Iteration {:} not found in cache, generating new buffers for all inputs".format(iteration))
        self.cache[iteration] = self.data_loader(iteration, input_metadata, input_example)
        if self.cache[iteration] is None:
            logging.critical(
                "Received no data from data_loader(iteration, input_metadata) for input_metadata: {:}".format(
                    input_metadata))
    else:
        logging.info("Found iteration {:} in cache".format(iteration))

    feed_dict = OrderedDict()
    for index, (name, (dtype, shape)) in enumerate(input_metadata.items()):
        cached_name = find_in_dict(name, self.cache[iteration], index)
        if cached_name is None:
            logging.warning("Could not find input: {:} in cache, regenerating buffers".format(name))
            self.cache[iteration] = self.data_loader(iteration, input_metadata, input_example)
            cached_name = name
        buffer = self.cache[iteration][cached_name]

        if dtype != buffer.dtype:
            logging.warning(
                "Cached buffer data type does not match data type for input: {:}. Note: Cached type: {:}, "
                "input type: {:}. Attempting to cast".format(name, buffer.dtype, dtype))
            buffer = buffer.astype(dtype)

        if not is_valid_shape_override(buffer.shape, shape):
            logging.warning(
                "Cached buffer shape does not match shape for input. Note: Cached shape: {:}, "
                "input shape: {:}.".format(buffer.shape, shape))
            # Try to permute the shape to match
            try:
                perm = FormatManager.permutation(
                    FormatManager.deduce_format(buffer.shape), FormatManager.deduce_format(shape))
                new_shape = FormatManager.convert(tuple(buffer.shape), FormatManager.deduce_format(shape))
                logging.warning(
                    "Attempting to permute shape: {:} using permutation {:}. New shape: {:}".format(
                        buffer.shape, perm, new_shape))
                buffer = np.transpose(buffer, perm)
            except NotImplementedError as err:
                # If the FormatManager does not recognize the format, skip permutation.
                logging.info("Skipping permutation due to {:}".format(err))
            except KeyError as err:
                # If the FormatManager cannot generate the permutation for the format combination, skip permutation.
                logging.info("Skipping permutation due to {:}".format(err))

        feed_dict[name] = buffer
    return feed_dict
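# A short usage sketch following the TRT test route above: the cache generates buffers via the
# data loader on the first request for an iteration and reuses them afterwards, casting and
# permuting as needed to match the runner's input metadata.
#
#     data_loader = DefaultDataLoader()
#     loader_cache = DataLoaderCache(data_loader)
#     feed_dict = loader_cache.load(
#         iteration=0, input_metadata=active_runner.get_input_metadata(), input_example=None)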