# Shared imports for the snippets below.
import logging
import os

import psutil
from onnxruntime import (ExecutionMode, GraphOptimizationLevel, InferenceSession,
                         SessionOptions, get_all_providers)
from packaging import version

logger = logging.getLogger(__name__)


def create_onnxruntime_session(onnx_model_path,
                               use_gpu,
                               enable_all_optimization=True,
                               num_threads=-1,
                               verbose=False):
    session = None
    try:
        from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, \
            __version__ as onnxruntime_version

        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")
        elif (not use_gpu) and (version.parse(onnxruntime_version) < version.parse('1.3.0')):
            # Set intra_op_num_threads = 1 to enable OpenMP for onnxruntime 1.2.0 (CPU).
            # onnxruntime-gpu is not built with OpenMP, so it is better to use the
            # default (0) or cpu_count instead.
            sess_options.intra_op_num_threads = 1

        if verbose:
            sess_options.log_severity_level = 0

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        execution_providers = ['CPUExecutionProvider'] if not use_gpu else [
            'CUDAExecutionProvider', 'CPUExecutionProvider'
        ]
        session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
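# Minimal usage sketch for create_onnxruntime_session above. "model.onnx" and
# "input_ids" are placeholders (assumptions, not part of the original code);
# substitute the path and input names your model actually declares.
import numpy as np

session = create_onnxruntime_session("model.onnx", use_gpu=False, num_threads=4)
if session is not None:
    outputs = session.run(None, {"input_ids": np.ones((1, 128), dtype=np.int64)})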
def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # A few properties that might have an impact on performance (provided by MS).
    options = SessionOptions()
    options.intra_op_num_threads = 1

    # Load the model as a graph and prepare the CPU backend.
    return InferenceSession(model_path, options, providers=[provider])
def create_onnxruntime_session(onnx_model_path, use_gpu, verbose):
    session = None
    try:
        from onnxruntime import SessionOptions, InferenceSession

        sess_options = SessionOptions()

        if not use_gpu:
            sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
            logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")

        if verbose:
            sess_options.log_severity_level = 0

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        execution_providers = ['CPUExecutionProvider'] if not use_gpu else [
            'CUDAExecutionProvider', 'CPUExecutionProvider'
        ]
        session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
def create_onnx_session(self, onnx_model_path, provider='CPUExecutionProvider'):
    """Create an ONNX inference session from the provided onnx_model_path."""
    from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers

    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # A few properties that might have an impact on performance (provided by MS).
    options = SessionOptions()
    options.intra_op_num_threads = 0
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend.
    session = InferenceSession(onnx_model_path, options, providers=[provider])
    session.disable_fallback()

    # if 'OMP_NUM_THREADS' not in os.environ or 'OMP_WAIT_POLICY' not in os.environ:
    #     warnings.warn('''We recommend adding the following at top of script for CPU inference:
    #     from psutil import cpu_count
    #     # Constants from the performance optimization available in onnxruntime.
    #     # It needs to be done before importing onnxruntime.
    #     os.environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
    #     os.environ["OMP_WAIT_POLICY"] = 'ACTIVE'
    #     ''')

    return session
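# The commented-out warning above recommends configuring OpenMP before
# onnxruntime is imported. A minimal sketch of that setup, assuming a fresh
# script in which onnxruntime has not been imported yet:
import os
from psutil import cpu_count

os.environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
os.environ["OMP_WAIT_POLICY"] = "ACTIVE"

import onnxruntime  # imported only after the environment is configured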
def create_ort_session(onnx_model_path, use_gpu=True):
    from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel

    sess_options = SessionOptions()
    sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
    sess_options.intra_op_num_threads = 2
    sess_options.log_severity_level = 2

    execution_providers = ['CPUExecutionProvider'] if not use_gpu else [
        'CUDAExecutionProvider', 'CPUExecutionProvider'
    ]
    return InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
def create_onnx_session(onnx_model_path):
    provider = 'CPUExecutionProvider'
    from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers

    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    options = SessionOptions()
    options.intra_op_num_threads = 0
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    session = InferenceSession(onnx_model_path, options, providers=[provider])
    session.disable_fallback()
    return session
def create_model_for_provider(model_path: str,
                              provider: str = 'CPUExecutionProvider') -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # A few properties that might have an impact on performance (provided by MS).
    options = SessionOptions()
    options.intra_op_num_threads = int(os.environ.get('NUM_THREADS', 4))
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend.
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
def create_model_for_provider(self):
    assert self.provider in get_all_providers(), f"provider {self.provider} not found, {get_all_providers()}"

    # A few properties that might have an impact on performance (provided by MS).
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend.
    session = InferenceSession(self.model_path, options, providers=[self.provider])
    session.disable_fallback()
    return session
# The following method belongs to a model-wrapper class (it relies on
# self.options, self.model, self.tokenizer, self.config, self.save_model and
# self._device). Its conversion helpers come from transformers and onnxruntime:
import tempfile
from pathlib import Path

from onnxruntime.quantization import quantize_dynamic
from transformers.convert_graph_to_onnx import convert, generate_identified_filename


def convert_to_onnx(self, onnx_output_dir=None, set_onnx_arg=True):
    """Convert the model to ONNX format and save to onnx_output_dir.

    Args:
        onnx_output_dir (str, optional): If specified, the ONNX model will be saved
            to onnx_output_dir (else args.output_dir will be used). Defaults to None.
        set_onnx_arg (bool, optional): Updates the model args to set onnx=True.
            Defaults to True.
    """  # noqa
    if not onnx_output_dir:
        onnx_output_dir = os.path.join(self.options.output_dir, self.options.model_type,
                                       self.options.model_name, "onnx")
    os.makedirs(onnx_output_dir, exist_ok=True)

    if not os.listdir(onnx_output_dir):
        onnx_model_name = os.path.join(onnx_output_dir, "onnx_model.onnx")

        # Export from a temporary copy of the saved PyTorch model.
        with tempfile.TemporaryDirectory() as temp_dir:
            basedir = os.path.basename(temp_dir)
            temp_dir = os.path.join(self.options.output_dir, basedir)
            self.save_model(output_dir=temp_dir, model=self.model)

            convert(
                framework="pt",
                model=temp_dir,
                tokenizer=self.tokenizer,
                output=Path(onnx_model_name),
                pipeline_name="ner",
                opset=11,
            )

        self.tokenizer.save_pretrained(onnx_output_dir)
        self.config.save_pretrained(onnx_output_dir)

    onnx_options = SessionOptions()
    use_cuda = self._device.type != 'cpu'
    onnx_execution_provider = "CUDAExecutionProvider" if use_cuda else "CPUExecutionProvider"
    onnx_options.intra_op_num_threads = 1
    onnx_options.execution_mode = ExecutionMode.ORT_SEQUENTIAL

    onnx_model_path = os.path.join(onnx_output_dir, "onnx_model.onnx")
    if self.options.dynamic_quantize:
        # Append "-quantized" at the end of the model's name.
        quantized_model_path = generate_identified_filename(Path(onnx_model_path), "-quantized")
        quantize_dynamic(Path(onnx_model_path), quantized_model_path)
        onnx_model_path = quantized_model_path.as_posix()

    return InferenceSession(onnx_model_path, onnx_options, providers=[onnx_execution_provider])
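# Standalone sketch of the dynamic-quantization step used above, assuming
# "onnx_model.onnx" is a placeholder path to an already-exported model:
from pathlib import Path

from onnxruntime.quantization import quantize_dynamic
from transformers.convert_graph_to_onnx import generate_identified_filename

model_path = Path("onnx_model.onnx")
quantized_path = generate_identified_filename(model_path, "-quantized")  # onnx_model-quantized.onnx
quantize_dynamic(model_path, quantized_path)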
def create_onnxruntime_session(onnx_model_path,
                               use_gpu,
                               enable_all_optimization=True,
                               num_threads=-1,
                               enable_profiling=False,
                               verbose=False,
                               use_dml=False):
    session = None
    try:
        from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel

        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")

        if verbose:
            sess_options.log_severity_level = 0
        else:
            sess_options.log_severity_level = 4

        logger.debug(f"Create session for onnx model: {onnx_model_path}")

        if use_gpu:
            if use_dml:
                execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
            else:
                execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        else:
            execution_providers = ['CPUExecutionProvider']

        session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
def create_model_for_provider(model_path: str, provider: str) -> InferenceSession: """ 这里解释一下ExecutionProvider,ONNXRuntime用Provider表示不同的运行设备比如CUDAProvider等。 目前ONNX Runtime v1.0支持了包括CPU,CUDA,TensorRT,MKL等七种Providers。 :param model_path: :param provider: :return: """ assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}" # Few properties that might have an impact on performances (provided by MS) options = SessionOptions() options.intra_op_num_threads = 1 options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL # Load the model as a graph and prepare the CPU backend session = InferenceSession(model_path, options, providers=[provider]) session.disable_fallback() return session
def create_model_for_provider(model_path: str, provider: str,
                              optimization_level: str) -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # A few properties that might have an impact on performance (provided by MS).
    options = SessionOptions()
    options.intra_op_num_threads = 1

    if optimization_level in GRAPH_OPTIMIZATIONS:
        options.graph_optimization_level = GRAPH_OPTIMIZATIONS[optimization_level]
    else:
        raise KeyError(
            f"Unknown Optimization Level {optimization_level} "
            "(available optimization levels are all/disable_all/basic/extended)"
        )

    # Load the model as a graph and prepare the CPU backend.
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
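# create_model_for_provider above assumes a module-level GRAPH_OPTIMIZATIONS
# mapping that is not shown. A plausible sketch, matching the level names
# listed in its error message (all/disable_all/basic/extended):
GRAPH_OPTIMIZATIONS = {
    "all": GraphOptimizationLevel.ORT_ENABLE_ALL,
    "disable_all": GraphOptimizationLevel.ORT_DISABLE_ALL,
    "basic": GraphOptimizationLevel.ORT_ENABLE_BASIC,
    "extended": GraphOptimizationLevel.ORT_ENABLE_EXTENDED,
}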
import argparse
import time

import numpy as np
from transformers import BertTokenizerFast


def get_args_from_command_line():
    # Argument names reconstructed from their use in the __main__ block below.
    parser = argparse.ArgumentParser()
    parser.add_argument("--pretrained_model", type=str, required=True)
    parser.add_argument("--text", type=str, required=True)
    parser.add_argument("--onnx_model_path", type=str, required=True)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = get_args_from_command_line()

    start = time.time()
    tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_model)
    tokens = tokenizer.encode_plus(args.text)
    tokens = {name: np.atleast_2d(value) for name, value in tokens.items()}
    end_tokenizer = time.time()
    print("Tokenization time: ", end_tokenizer - start)

    start_session = time.time()
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
    # options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    session = InferenceSession(args.onnx_model_path, options)
    end_session = time.time()
    print("Setting up session time: ", end_session - start_session)

    start_predict = time.time()
    output, = session.run(None, tokens)
    end_predict = time.time()
    print("Predict time: ", end_predict - start_predict)
    print("Overall time: ", end_predict - start)
    print(output)
def create_onnxruntime_session(
    onnx_model_path,
    use_gpu,
    provider=None,
    enable_all_optimization=True,
    num_threads=-1,
    enable_profiling=False,
    verbose=False,
    provider_options={},  # maps an execution provider name to its options
):
    session = None
    try:
        from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")

        if verbose:
            sess_options.log_severity_level = 0
        else:
            sess_options.log_severity_level = 4

        logger.debug(f"Create session for onnx model: {onnx_model_path}")

        if use_gpu:
            if provider == "dml":
                providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
            elif provider == "rocm":
                providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
            elif provider == "migraphx":
                providers = ["MIGraphXExecutionProvider", "ROCMExecutionProvider", "CPUExecutionProvider"]
            elif provider == "cuda":
                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
            elif provider == "tensorrt":
                providers = ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]
            else:
                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]

        if provider_options:
            providers = [(name, provider_options[name]) if name in provider_options else name
                         for name in providers]

        session = InferenceSession(onnx_model_path, sess_options, providers=providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
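# Usage sketch for the provider_options mapping above. "model.onnx" is a
# placeholder path; device_id is a real CUDAExecutionProvider option that
# selects which GPU the provider runs on.
session = create_onnxruntime_session(
    "model.onnx",
    use_gpu=True,
    provider="cuda",
    provider_options={"CUDAExecutionProvider": {"device_id": 0}},
)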
def get_onnx_runtime_sessions(
    model_paths,
    default: bool = True,
    opt_level: int = 99,
    parallel_exe_mode: bool = True,
    n_threads: int = 4,
    provider=['CPUExecutionProvider'],
):
    '''Optimizes the model.

    Args:
        model_paths (tuple) : paths of the input ONNX encoder, decoder and
            initial decoder models.
        default (bool) : if True, ort chooses the best settings for your hardware
            (you can test out different settings for better results).
        opt_level (int) : sess_options.GraphOptimizationLevel parameter; 1 uses
            ORT_ENABLE_BASIC, 2 ORT_ENABLE_EXTENDED and 99 ORT_ENABLE_ALL.
            Defaults to 99.
        parallel_exe_mode (bool) : sets the execution mode. Defaults to parallel.
        n_threads (int) : number of threads used to parallelize execution within
            nodes. Defaults to 4; 0 lets onnxruntime choose.
        provider : execution providers list.

    Returns:
        encoder_sess : encoder onnx InferenceSession
        decoder_sess : decoder onnx InferenceSession
        decoder_sess_init : initial decoder onnx InferenceSession
    '''
    path_to_encoder, path_to_decoder, path_to_initial_decoder = model_paths

    if default:
        encoder_sess = InferenceSession(str(path_to_encoder))
        decoder_sess = InferenceSession(str(path_to_decoder))
        decoder_sess_init = InferenceSession(str(path_to_initial_decoder))
    else:
        # A few properties that might have an impact on performance.
        options = SessionOptions()

        if opt_level == 1:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
        elif opt_level == 2:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_EXTENDED
        else:
            assert opt_level == 99
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        # Set this to parallel for better performance on some hardware.
        if parallel_exe_mode:
            options.execution_mode = ExecutionMode.ORT_PARALLEL
        else:
            options.execution_mode = ExecutionMode.ORT_SEQUENTIAL

        options.intra_op_num_threads = n_threads
        # options.inter_op_num_threads = 10
        # options.enable_profiling = True

        encoder_sess = InferenceSession(str(path_to_encoder), options, providers=provider)
        decoder_sess = InferenceSession(str(path_to_decoder), options, providers=provider)
        decoder_sess_init = InferenceSession(str(path_to_initial_decoder), options, providers=provider)

    return encoder_sess, decoder_sess, decoder_sess_init
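# Usage sketch for get_onnx_runtime_sessions above; the three paths are
# placeholders for exported encoder/decoder ONNX files:
model_paths = ("t5-encoder.onnx", "t5-decoder.onnx", "t5-init-decoder.onnx")
encoder_sess, decoder_sess, decoder_sess_init = get_onnx_runtime_sessions(
    model_paths, default=False, opt_level=99, parallel_exe_mode=True, n_threads=4
)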