def testGetProviders(self):
    self.assertTrue('CPUExecutionProvider' in onnxrt.get_available_providers())
    # get_all_providers() returns the default EP order from highest to lowest.
    # CPUExecutionProvider should always be last.
    self.assertTrue('CPUExecutionProvider' == onnxrt.get_all_providers()[-1])
    sess = onnxrt.InferenceSession(get_name("mul_1.onnx"))
    self.assertTrue('CPUExecutionProvider' in sess.get_providers())
def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # A few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1

    # Load the model as a graph and prepare the CPU backend
    return InferenceSession(model_path, options, providers=[provider])
def create_onnx_session(self, onnx_model_path, provider='CPUExecutionProvider'):
    """
    Creates an ONNX inference session from the provided onnx_model_path.
    """
    from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers

    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # A few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 0
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(onnx_model_path, options, providers=[provider])
    session.disable_fallback()

    # if 'OMP_NUM_THREADS' not in os.environ or 'OMP_WAIT_POLICY' not in os.environ:
    #     warnings.warn('''We recommend adding the following at top of script for CPU inference:
    #
    #     from psutil import cpu_count
    #
    #     # Constants from the performance optimization available in onnxruntime
    #     # It needs to be done before importing onnxruntime
    #     os.environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
    #     os.environ["OMP_WAIT_POLICY"] = 'ACTIVE'
    #     ''')

    return session
def __init__(
    self,
    tag: t.Union[str, Tag],
    backend: str,
    gpu_device_id: int,
    disable_copy_in_default_stream: bool,
    providers: t.Optional["_ProviderType"],
    session_options: t.Optional["ort.SessionOptions"],  # type: ignore
    name: t.Optional[str] = None,
):
    super().__init__(tag, name=name)
    self._backend = backend

    if backend not in SUPPORTED_ONNX_BACKEND:
        raise BentoMLException(
            f"'{backend}' runtime is currently not supported for ONNXModel"
        )
    if providers is not None:
        if not all(i in ort.get_all_providers() for i in flatten_list(providers)):
            raise BentoMLException(
                f"'{providers}' cannot be parsed by `onnxruntime`"
            )
    else:
        providers = self._get_default_providers(
            gpu_device_id,
            disable_copy_in_default_stream,
        )
    self._providers = providers
    self._session_options = session_options
def load(
    tag: t.Union[str, Tag],
    backend: t.Optional[str] = "onnxruntime",
    providers: t.Optional[t.Union["_ProviderType", "_GPUProviderType"]] = None,
    session_options: t.Optional["ort.SessionOptions"] = None,  # type: ignore
    model_store: "ModelStore" = Provide[BentoMLContainer.model_store],
) -> "ort.InferenceSession":
    """
    Load a model from BentoML local modelstore with given name.

    Args:
        tag (:code:`Union[str, Tag]`):
            Tag of a saved model in BentoML local modelstore.
        backend (:code:`str`, `optional`, default to :code:`onnxruntime`):
            Different backend runtime supported by ONNX. Currently only
            :obj:`onnxruntime` and :obj:`onnxruntime-gpu` are accepted.
        providers (`List[Union[str, Tuple[str, Dict[str, Any]]]]`, `optional`, default to :code:`None`):
            Different providers provided by users. By default BentoML will use
            :func:`onnxruntime.get_available_providers` when loading a model.
        session_options (`onnxruntime.SessionOptions`, `optional`, default to :code:`None`):
            SessionOptions per use case. If not specified, then default to :code:`None`.
        model_store (:mod:`~bentoml._internal.models.store.ModelStore`, default to :mod:`BentoMLContainer.model_store`):
            BentoML modelstore, provided by DI Container.

    Returns:
        :obj:`onnxruntime.InferenceSession`: an instance of ONNX model from BentoML modelstore.

    Examples:

    .. code-block:: python

        import bentoml

        model = bentoml.onnx.load(tag)
    """  # noqa
    model = model_store.get(tag)
    if model.info.module not in (MODULE_NAME, __name__):
        raise BentoMLException(
            f"Model {tag} was saved with module {model.info.module}, failed loading with {MODULE_NAME}."
        )
    model_file = model.path_of(f"{SAVE_NAMESPACE}{ONNX_EXT}")
    if backend not in SUPPORTED_ONNX_BACKEND:
        raise BentoMLException(
            f"'{backend}' runtime is currently not supported for ONNXModel"
        )
    if providers:
        if not all(i in ort.get_all_providers() for i in flatten_list(providers)):
            raise BentoMLException(f"'{providers}' cannot be parsed by `onnxruntime`")
    else:
        providers = ort.get_available_providers()

    return ort.InferenceSession(
        model_file,
        sess_options=session_options,
        providers=providers,
    )
def create_onnx_session(onnx_model_path):
    provider = 'CPUExecutionProvider'
    from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers

    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    options = SessionOptions()
    options.intra_op_num_threads = 0
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    session = InferenceSession(onnx_model_path, options, providers=[provider])
    session.disable_fallback()
    return session
def create_model_for_provider(
        model_path: str,
        provider: str = 'CPUExecutionProvider') -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # A few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = int(os.environ.get('NUM_THREADS', 4))
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
def create_model_for_provider(self):
    assert self.provider in get_all_providers(), f"provider {self.provider} not found, {get_all_providers()}"

    # A few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(self.model_path, options, providers=[self.provider])
    session.disable_fallback()
    return session
def test_bind_input_types(self):
    opset = onnx_opset_version()
    devices = [(C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0),
                ['CPUExecutionProvider'])]
    if "CUDAExecutionProvider" in onnxrt.get_all_providers():
        devices.append((C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0),
                        ['CUDAExecutionProvider']))

    for device, provider in devices:
        for dtype in [np.float32, np.float64, np.int32, np.uint32,
                      np.int64, np.uint64, np.int16, np.uint16,
                      np.int8, np.uint8, np.float16, np.bool_]:
            with self.subTest(dtype=dtype, device=str(device)):

                x = np.arange(8).reshape((-1, 2)).astype(dtype)
                proto_dtype = NP_TYPE_TO_TENSOR_TYPE[x.dtype]

                X = helper.make_tensor_value_info('X', proto_dtype, [None, x.shape[1]])
                Y = helper.make_tensor_value_info('Y', proto_dtype, [None, x.shape[1]])

                # inference
                node_add = helper.make_node('Identity', ['X'], ['Y'])

                # graph
                graph_def = helper.make_graph([node_add], 'lr', [X], [Y], [])
                model_def = helper.make_model(
                    graph_def, producer_name='dummy', ir_version=7,
                    producer_version="0",
                    opset_imports=[helper.make_operatorsetid('', opset)])

                sess = onnxrt.InferenceSession(model_def.SerializeToString(),
                                               providers=provider)

                bind = SessionIOBinding(sess._sess)
                ort_value = C_OrtValue.ortvalue_from_numpy(x, device)
                bind.bind_ortvalue_input('X', ort_value)
                bind.bind_output('Y', device)
                sess._sess.run_with_iobinding(bind, None)
                ortvalue = bind.get_outputs()[0]
                y = ortvalue.numpy()
                assert_almost_equal(x, y)

                bind = SessionIOBinding(sess._sess)
                bind.bind_input('X', device, dtype, x.shape, ort_value.data_ptr())
                bind.bind_output('Y', device)
                sess._sess.run_with_iobinding(bind, None)
                ortvalue = bind.get_outputs()[0]
                y = ortvalue.numpy()
                assert_almost_equal(x, y)
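# For comparison, a minimal sketch of the same input/output binding pattern using the
# public `InferenceSession.io_binding()` API instead of the internal `SessionIOBinding`
# wrapper used by the test above. The Identity model is rebuilt inline so the snippet is
# self-contained; the opset/ir_version values are illustrative assumptions.
import numpy as np
import onnxruntime as onnxrt
from onnx import helper, TensorProto

X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [None, 2])
Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [None, 2])
graph = helper.make_graph([helper.make_node('Identity', ['X'], ['Y'])], 'lr', [X], [Y])
model = helper.make_model(graph, ir_version=7,
                          opset_imports=[helper.make_operatorsetid('', 13)])

sess = onnxrt.InferenceSession(model.SerializeToString(),
                               providers=['CPUExecutionProvider'])
x = np.arange(8, dtype=np.float32).reshape((-1, 2))

io_binding = sess.io_binding()
io_binding.bind_cpu_input('X', x)        # bind a numpy array resident on CPU
io_binding.bind_output('Y')              # let ORT allocate the output buffer
sess.run_with_iobinding(io_binding)
y = io_binding.copy_outputs_to_cpu()[0]  # fetch the result back as numpy
assert np.array_equal(x, y)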
def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
    """
    A note on ExecutionProvider: ONNX Runtime uses providers to represent different
    execution backends, such as the CUDA provider. As of ONNX Runtime v1.0, seven
    providers are supported, including CPU, CUDA, TensorRT and MKL.

    :param model_path:
    :param provider:
    :return:
    """
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # A few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
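# A hedged usage sketch of the helper above. The model file name, batch shape and
# dtype are illustrative placeholders and depend on the exported model; the input
# name is read back from the session rather than assumed.
import numpy as np

session = create_model_for_provider("model.onnx", "CPUExecutionProvider")

dummy_input = np.ones((1, 8), dtype=np.int64)  # placeholder shape/dtype
feeds = {session.get_inputs()[0].name: dummy_input}
outputs = session.run(None, feeds)  # None -> return every model output
print([o.shape for o in outputs])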
def create_model_for_provider(model_path: str, provider: str,
                              optimization_level: str) -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # A few properties that might have an impact on performance (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1
    if optimization_level in GRAPH_OPTIMIZATIONS:
        options.graph_optimization_level = GRAPH_OPTIMIZATIONS[optimization_level]
    else:
        raise KeyError(
            f"Unknown optimization level {optimization_level} "
            f"(available optimization levels are all/disable_all/basic/extended)"
        )

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
import argparse
import sys

import onnxruntime

parser = argparse.ArgumentParser()
parser.add_argument('-m', type=str, required=False, default='resnet18-v2-7.onnx', help='ONNX model file name')
parser.add_argument('-d', type=str, required=False, default='CPU_FP32', help='OpenVINO device name')
args = parser.parse_args()
print('model: ', args.m)
print('device: ', args.d)

available_devices = ['CPU_FP32', 'GPU_FP32', 'GPU_FP16', 'MYRIAD_FP16', 'VAD-M_FP16', 'VAD-F_FP16']
if args.d not in available_devices:
    print('Device must be one of the following: ', available_devices)
    sys.exit(0)

print(onnxruntime.get_all_providers())
print(onnxruntime.get_device())

label = open('synset_words.txt').readlines()

# Available device names: CPU_FP32, GPU_FP32, GPU_FP16, MYRIAD_FP16, VAD-M_FP16, VAD-F_FP32
# VAD == Vision Accelerator Design == HDDL
options = onnxruntime.SessionOptions()
options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
onnxruntime.capi._pybind_state.set_openvino_device(args.d)
sess = onnxruntime.InferenceSession(args.m, options)

input_name = sess.get_inputs()[0].name
print("Input name :", input_name)
input_shape = sess.get_inputs()[0].shape
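# Note: `onnxruntime.capi._pybind_state.set_openvino_device` is an internal API. In
# more recent onnxruntime-openvino builds the device is usually selected through the
# `providers`/`provider_options` arguments instead. A hedged alternative sketch,
# reusing `args` and `options` from the script above; the 'device_type' key follows
# the OpenVINO execution provider documentation and should be verified against the
# installed version.
sess = onnxruntime.InferenceSession(
    args.m,
    options,
    providers=['OpenVINOExecutionProvider'],
    provider_options=[{'device_type': args.d}],
)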
def testGetProviders(self):
    self.assertTrue('CPUExecutionProvider' in onnxrt.get_available_providers())
    self.assertTrue('CPUExecutionProvider' in onnxrt.get_all_providers())
    sess = onnxrt.InferenceSession(self.get_name("mul_1.onnx"))
    self.assertTrue('CPUExecutionProvider' in sess.get_providers())
def framework_info() -> FrameworkInfo:
    """
    Detect the information for the onnx/onnxruntime framework such as package versions,
    availability for core actions such as training and inference,
    sparsification support, and inference provider support.

    :return: The framework info for onnx/onnxruntime
    :rtype: FrameworkInfo
    """
    all_providers = []
    available_providers = []
    if check_onnxruntime_install(raise_on_error=False):
        from onnxruntime import get_all_providers, get_available_providers

        available_providers = get_available_providers()
        all_providers = get_all_providers()

    cpu_provider = FrameworkInferenceProviderInfo(
        name="cpu",
        description="Base CPU provider within ONNXRuntime",
        device="cpu",
        supported_sparsification=SparsificationInfo(),  # TODO: fill in when available
        available=(
            check_onnx_install(raise_on_error=False)
            and check_onnxruntime_install(raise_on_error=False)
            and "CPUExecutionProvider" in available_providers
        ),
        properties={},
        warnings=[],
    )
    gpu_provider = FrameworkInferenceProviderInfo(
        name="cuda",
        description="Base GPU CUDA provider within ONNXRuntime",
        device="gpu",
        supported_sparsification=SparsificationInfo(),  # TODO: fill in when available
        available=(
            check_onnx_install(raise_on_error=False)
            and check_onnxruntime_install(raise_on_error=False)
            and "CUDAExecutionProvider" in available_providers
        ),
        properties={},
        warnings=[],
    )

    return FrameworkInfo(
        framework=Framework.onnx,
        package_versions={
            "onnx": get_version(package_name="onnx", raise_on_error=False),
            "onnxruntime": (
                get_version(package_name="onnxruntime", raise_on_error=False)
            ),
            "sparsezoo": get_version(
                package_name="sparsezoo",
                raise_on_error=False,
                alternate_package_names=["sparsezoo-nightly"],
            ),
            "sparseml": get_version(
                package_name="sparseml",
                raise_on_error=False,
                alternate_package_names=["sparseml-nightly"],
            ),
        },
        sparsification=sparsification_info(),
        inference_providers=[cpu_provider, gpu_provider],
        properties={
            "available_providers": available_providers,
            "all_providers": all_providers,
        },
        training_available=False,
        sparsification_available=True,
        exporting_onnx_available=True,
        inference_available=True,
    )
def latency(model, law='normal', size=1, number=10, repeat=10, max_time=0,
            runtime="onnxruntime", device='cpu', profiling=None):
    """
    Measures the latency of a model (python API).

    :param model: ONNX graph
    :param law: random law used to generate fake inputs
    :param size: batch size, it replaces the first dimension
        of every input if it is left unknown
    :param number: number of calls to measure
    :param repeat: number of times to repeat the experiment
    :param max_time: if it is > 0, the measure runs for that period of time
    :param runtime: available runtime
    :param device: device, `cpu`, `cuda:0`
    :param profiling: if not None, profiles the execution of every node;
        the result can be sorted by name or type, so the value for this
        parameter should be in `(None, 'name', 'type')`
    :return: dictionary, or a tuple (dictionary, dataframe) if profiling is enabled

    .. cmdref::
        :title: Measures model latency
        :cmd: -m mlprodict latency --help
        :lid: l-cmd-latency

        The command generates random inputs and calls the model many times on
        these inputs. It returns the processing time for one iteration.

        Example::

            python -m mlprodict latency --model "model.onnx"
    """
    if isinstance(model, str) and not os.path.exists(model):
        raise FileNotFoundError(  # pragma: no cover
            "Unable to find model %r." % model)
    if profiling not in (None, '', 'name', 'type'):
        raise ValueError("Unexpected value for profiling: %r." % profiling)
    size = int(size)
    number = int(number)
    repeat = int(repeat)
    if max_time in (None, 0, ""):
        max_time = None
    else:
        max_time = float(max_time)
        if max_time <= 0:
            max_time = None

    if law != "normal":
        raise ValueError("Only law='normal' is supported, not %r." % law)

    if device in ('cpu', 'CPUExecutionProvider'):
        providers = ['CPUExecutionProvider']
    elif device in ('cuda:0', 'CUDAExecutionProvider'):
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or provider "
                "%r." % device)
        providers = ['CUDAExecutionProvider']
    elif ',' in device:
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or provider "
                "%r." % device)
        providers = device.split(',')
        allp = set(get_all_providers())
        for p in providers:
            if p not in allp:
                raise ValueError(
                    "One device or provider %r is not supported among %r."
                    "" % (p, allp))
    else:
        raise ValueError(  # pragma: no cover
            "Device %r not supported." % device)

    if runtime == "onnxruntime":
        if profiling in ('name', 'type'):
            so = SessionOptions()
            so.enable_profiling = True
            sess = InferenceSession(model, sess_options=so)
        else:
            sess = InferenceSession(model)
        fct = lambda feeds: sess.run(None, feeds)
        inputs = sess.get_inputs()
    else:
        if profiling in ('name', 'type'):
            runtime_options = {"enable_profiling": True}
            if runtime != 'onnxruntime1':
                raise NotImplementedError(  # pragma: no cover
                    "Profiling is not implemented for runtime=%r." % runtime)
        else:
            runtime_options = None
        oinf = OnnxInference(model, runtime=runtime,
                             runtime_options=runtime_options)
        fct = lambda feeds: oinf.run(feeds)
        inputs = oinf.obj.graph.input

    feeds = random_feed(inputs, size)
    res = measure_time(lambda: fct(feeds), number=number, repeat=repeat,
                       context={}, max_time=max_time, div_by_number=True)
    for k, v in feeds.items():
        res["shape(%s)" % k] = "x".join(map(str, v.shape))
    if profiling in ('name', 'type'):
        if runtime == 'onnxruntime':
            profile_name = sess.end_profiling()
            with open(profile_name, 'r', encoding='utf-8') as f:
                js = json.load(f)
            js = OnnxWholeSession.process_profiling(js)
            df = DataFrame(js)
        else:
            df = oinf.get_profiling(as_df=True)
        if profiling == 'name':
            gr = df[['dur', "args_op_name", "name"]].groupby(
                ["args_op_name", "name"]).sum().sort_values('dur')
        else:
            gr = df[['dur', "args_op_name"]].groupby(
                "args_op_name").sum().sort_values('dur')
        return res, gr
    return res
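# A short usage sketch of the latency API above, mirroring the command-line example
# from its docstring. The model path is a placeholder; with profiling enabled the
# function returns both the timing dictionary and the per-operator dataframe.
res, prof = latency("model.onnx", device="cpu", number=20, repeat=20,
                    runtime="onnxruntime", profiling="type")
print(res)           # timing statistics plus input shapes
print(prof.head())   # cumulative duration grouped by operator type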