# Imports assumed by this method (not shown in the snippet): onnxruntime as ort,
# SessionOptions and ExecutionMode from onnxruntime, and AutoTokenizer from transformers.
def __init__(self, model_path, tokenizer_path, tag_predict_model):
    # Sequential-execution ONNX Runtime session for the exported model.
    options = SessionOptions()
    options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
    self.model = ort.InferenceSession(model_path, options)
    # The tokenizer is loaded separately so it can come from its own checkpoint.
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    self.tag_model = tag_predict_model
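# Hedged follow-on sketch: a hypothetical predict method showing how the session
# and tokenizer built in __init__ might be driven. The method name and the
# argmax decoding are assumptions, not taken from the original.
def predict(self, text):
    encoded = self.tokenizer(text, return_tensors="np")  # NumPy tensors, batch dim included
    expected = {i.name for i in self.model.get_inputs()}
    # Drop tokenizer outputs (e.g. token_type_ids) that the exported graph does not take.
    ort_inputs = {k: v for k, v in encoded.items() if k in expected}
    logits = self.model.run(None, ort_inputs)[0]
    return logits.argmax(axis=-1)  # per-token class ids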
# Imports assumed by this method (not shown): os, tempfile, Path from pathlib,
# convert (and likely generate_identified_filename) from transformers.convert_graph_to_onnx,
# quantize_dynamic from onnxruntime.quantization, and SessionOptions / ExecutionMode /
# InferenceSession from onnxruntime.
def convert_to_onnx(self, onnx_output_dir=None, set_onnx_arg=True):
    """Convert the model to ONNX format and save to output_dir.

    Args:
        onnx_output_dir (str, optional): If specified, the ONNX model will be
            saved there (else args.output_dir will be used). Defaults to None.
        set_onnx_arg (bool, optional): Updates the model args to set onnx=True.
            Defaults to True.
    """  # noqa
    if not onnx_output_dir:
        onnx_output_dir = os.path.join(
            self.options.output_dir,
            self.options.model_type,
            self.options.model_name,
            "onnx",
        )
    os.makedirs(onnx_output_dir, exist_ok=True)

    # Only export if the target directory is still empty.
    if not os.listdir(onnx_output_dir):
        onnx_model_name = os.path.join(onnx_output_dir, "onnx_model.onnx")
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save a regular checkpoint first; the converter loads the model from disk.
            basedir = os.path.basename(temp_dir)
            temp_dir = os.path.join(self.options.output_dir, basedir)
            self.save_model(output_dir=temp_dir, model=self.model)

            convert(
                framework="pt",
                model=temp_dir,
                tokenizer=self.tokenizer,
                output=Path(onnx_model_name),
                pipeline_name="ner",
                opset=11,
            )
        self.tokenizer.save_pretrained(onnx_output_dir)
        self.config.save_pretrained(onnx_output_dir)

    onnx_options = SessionOptions()
    use_cuda = self._device.type != "cpu"
    onnx_execution_provider = "CUDAExecutionProvider" if use_cuda else "CPUExecutionProvider"
    onnx_options.intra_op_num_threads = 1
    onnx_options.execution_mode = ExecutionMode.ORT_SEQUENTIAL

    onnx_model_path = os.path.join(onnx_output_dir, "onnx_model.onnx")
    if self.options.dynamic_quantize:
        # Append "-quantized" to the model file name and quantize the weights.
        quantized_model_path = generate_identified_filename(Path(onnx_model_path), "-quantized")
        quantize_dynamic(Path(onnx_model_path), quantized_model_path)
        onnx_model_path = quantized_model_path.as_posix()

    return InferenceSession(onnx_model_path, onnx_options, providers=[onnx_execution_provider])
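# Usage sketch, assuming `model` is an instance of the (unshown) surrounding class
# with options.output_dir and options.dynamic_quantize configured; the output path
# here is a placeholder.
onnx_session = model.convert_to_onnx(onnx_output_dir="outputs/onnx")
print([inp.name for inp in onnx_session.get_inputs()])  # input names the NER graph expects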
    # (tail of get_args_from_command_line; imports assumed above this fragment:
    # time, numpy as np, BertTokenizerFast from transformers, and SessionOptions /
    # ExecutionMode / InferenceSession / GraphOptimizationLevel from onnxruntime)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = get_args_from_command_line()

    # Tokenize the input text and add a batch dimension for onnxruntime.
    start = time.time()
    tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_model)
    tokens = tokenizer.encode_plus(args.text)
    tokens = {name: np.atleast_2d(value) for name, value in tokens.items()}
    end_tokenizer = time.time()
    print("Tokenization time: ", end_tokenizer - start)

    # Build a single-threaded, sequential-execution session.
    start_session = time.time()
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
    # options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    session = InferenceSession(args.onnx_model_path, options)
    end_session = time.time()
    print("Session setup time: ", end_session - start_session)

    # Run the model once and report timings.
    start_predict = time.time()
    output, = session.run(None, tokens)
    end_predict = time.time()
    print("Predict time: ", end_predict - start_predict)
    print("Overall time: ", end_predict - start)
    print(output)
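# Hedged variant of the commented-out graph_optimization_level line above: enable
# full graph optimization and persist the optimized graph so later sessions skip
# that cost. The file paths are placeholders.
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

opt_options = SessionOptions()
opt_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
opt_options.optimized_model_filepath = "model.opt.onnx"  # serialized optimized graph
optimized_session = InferenceSession("model.onnx", opt_options)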
def get_onnx_runtime_sessions(
    model_paths,
    default: bool = True,
    opt_level: int = 99,
    parallel_exe_mode: bool = True,
    n_threads: int = 4,
    provider=[
        "CPUExecutionProvider",
    ],
) -> tuple:
    '''
    Optimizes the model

    Args:
        model_paths : tuple of (path_to_encoder, path_to_decoder,
                      path_to_initial_decoder) onnx model paths.
        opt_level (int) : sess_options.GraphOptimizationLevel param; 1 uses
                          ORT_ENABLE_BASIC, 2 ORT_ENABLE_EXTENDED, and 99
                          ORT_ENABLE_ALL. Defaults to 99.
        parallel_exe_mode (bool) : Sets the execution mode. Default is parallel.
        n_threads (int) : Number of threads used to parallelize execution within
                          nodes. Defaults to 4 (0 lets onnxruntime choose).
        provider : execution providers list.
        default : if True, ort chooses its default settings for your hardware
                  (you can test different settings for better results).

    Returns:
        encoder_sess : encoder onnx InferenceSession
        decoder_sess : decoder onnx InferenceSession
        decoder_sess_init : initial decoder onnx InferenceSession
    '''
    path_to_encoder, path_to_decoder, path_to_initial_decoder = model_paths

    if default:
        encoder_sess = InferenceSession(str(path_to_encoder))
        decoder_sess = InferenceSession(str(path_to_decoder))
        decoder_sess_init = InferenceSession(str(path_to_initial_decoder))
    else:
        # A few properties that can have an impact on performance
        options = SessionOptions()
        if opt_level == 1:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
        elif opt_level == 2:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_EXTENDED
        else:
            assert opt_level == 99
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        # Parallel execution can help when the graph has independent branches
        if parallel_exe_mode:
            options.execution_mode = ExecutionMode.ORT_PARALLEL
        else:
            options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
        options.intra_op_num_threads = n_threads
        # options.inter_op_num_threads = 10
        # options.enable_profiling = True

        encoder_sess = InferenceSession(str(path_to_encoder), options, providers=provider)
        decoder_sess = InferenceSession(str(path_to_decoder), options, providers=provider)
        decoder_sess_init = InferenceSession(str(path_to_initial_decoder), options, providers=provider)

    return encoder_sess, decoder_sess, decoder_sess_init
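# Usage sketch: wiring a seq2seq export (separate encoder / decoder / init-decoder
# graphs) into sessions; the file names are placeholders.
model_paths = ("encoder.onnx", "decoder.onnx", "init_decoder.onnx")
encoder_sess, decoder_sess, decoder_sess_init = get_onnx_runtime_sessions(
    model_paths,
    default=False,           # take the tuned-SessionOptions branch
    opt_level=99,            # GraphOptimizationLevel.ORT_ENABLE_ALL
    parallel_exe_mode=True,  # ExecutionMode.ORT_PARALLEL
    n_threads=4,
)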