import torch


def _build_onnxrt_session(model):
    # using https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers
    dummy_input = {'input_ids': torch.ones(1, 128, dtype=torch.int64),
                   'attention_mask': torch.ones(1, 128, dtype=torch.int64),
                   'token_type_ids': torch.ones(1, 128, dtype=torch.int64)}
    symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
    onnx_model_path = "/tmp/temp_turbo_onnx.model"
    onnx_opt_model_path = "/tmp/temp_turbo_onnx_opt.model"
    quantized_model_path = "/tmp/temp_turbo_onnx_q.model"

    # (1) export to onnx fp32 model
    with open(onnx_model_path, 'wb') as f:
        torch.onnx.export(model,
                          (dummy_input['input_ids'],
                           dummy_input['attention_mask'],
                           dummy_input['token_type_ids']),
                          f,
                          input_names=['input_ids', 'attention_mask', 'token_type_ids'],
                          output_names=['output'],
                          opset_version=11,
                          dynamic_axes={'input_ids': symbolic_names,
                                        'attention_mask': symbolic_names,
                                        'token_type_ids': symbolic_names})

    # (2) optimize the fp32 model
    from onnxruntime_tools import optimizer
    from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions
    opt_options = BertOptimizationOptions('bert')
    opt_options.enable_embed_layer_norm = False
    opt_model = optimizer.optimize_model(
        onnx_model_path,
        'bert',
        num_heads=model.config.num_attention_heads,
        hidden_size=model.config.hidden_size,
        optimization_options=opt_options)
    opt_model.save_model_to_file(onnx_opt_model_path)

    # (3) quantize the optimized model (old-style onnxruntime quantization API)
    from onnxruntime.quantization import quantize, QuantizationMode
    import onnx
    import onnxruntime
    opt_model = onnx.load(onnx_opt_model_path)
    quantized_onnx_model = quantize(opt_model,
                                    quantization_mode=QuantizationMode.IntegerOps,
                                    symmetric_weight=True,
                                    force_fusions=True)
    onnx.save(quantized_onnx_model, quantized_model_path)

    # (4) build the inference session from the quantized model
    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    return onnxruntime.InferenceSession(quantized_model_path, sess_options)
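
# Example usage (a minimal sketch, not from the original source): build a session
# from a Hugging Face PyTorch BERT model and run one batch through it. The model
# name 'bert-base-uncased' and the sample sentence are illustrative assumptions.
if __name__ == '__main__':
    import numpy as np
    from transformers import BertTokenizer, BertForSequenceClassification

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    model.eval()

    session = _build_onnxrt_session(model)

    # ONNX Runtime expects numpy arrays keyed by the exported input names.
    enc = tokenizer('hello world', return_tensors='np',
                    padding='max_length', max_length=128)
    ort_inputs = {'input_ids': enc['input_ids'].astype(np.int64),
                  'attention_mask': enc['attention_mask'].astype(np.int64),
                  'token_type_ids': enc['token_type_ids'].astype(np.int64)}
    logits = session.run(None, ort_inputs)[0]
    print(logits.shape)  # (1, num_labels)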
def export_model_to_onnx(self, fpath, quantize=False, target_opset=None, verbose=1):
    """
    Export model to ONNX format.
    Args:
      fpath(str): String representing full path to model file where ONNX model will be saved.
                  Example: '/tmp/my_model.onnx'
      quantize(bool): If True, a total of three model files will be created using
                      transformers.convert_graph_to_onnx:
                      1) ONNX model (created directly using keras2onnx)
                      2) an optimized ONNX model (created by transformers library)
                      3) a quantized version of optimized ONNX model (created by transformers library)
                      All files will be created in the parent folder of fpath.
                      Example: If fpath='/tmp/model.onnx', then both /tmp/model-optimized.onnx and
                      /tmp/model-optimized-quantized.onnx will also be created.
      verbose(bool): verbosity
    Returns:
      str: string representing fpath.  If quantize=True, returned fpath will be different
           than supplied fpath.
    """
    try:
        import onnxruntime, onnxruntime_tools, onnx, keras2onnx
    except ImportError:
        raise Exception('This method requires ONNX libraries to be installed: ' +
                        'pip install -q --upgrade onnxruntime==1.5.1 onnxruntime-tools onnx keras2onnx')
    from pathlib import Path
    if type(self.preproc).__name__ == 'BERTPreprocessor':
        raise Exception('currently_unsupported: BERT models created with text_classifier("bert",...) ' +
                        'are not supported (i.e., keras_bert models). ' +
                        'Only BERT models created with Transformer(...) are supported.')

    if verbose:
        print('converting to ONNX format ... this may take a few moments...')
    if U.is_huggingface(model=self.model):
        tokenizer = self.preproc.get_tokenizer()
        maxlen = self.preproc.maxlen
        input_dict = tokenizer('Name', return_tensors='tf',
                               padding='max_length', max_length=maxlen)

        if version.parse(tf.__version__) < version.parse('2.2'):
            raise Exception('export_model_to_onnx requires tensorflow>=2.2')
        # self.model._set_inputs(input_spec, training=False)  # for tf < 2.2
        self.model._saved_model_inputs_spec = None  # for tf > 2.2
        self.model._set_save_spec(input_dict)       # for tf > 2.2
        self.model._get_save_spec()

    onnx_model = keras2onnx.convert_keras(self.model, self.model.name, target_opset=target_opset)
    keras2onnx.save_model(onnx_model, fpath)
    return_fpath = fpath

    if quantize:
        from transformers.convert_graph_to_onnx import optimize, quantize
        # opt_path = optimize(Path(fpath))

        if U.is_huggingface(model=self.model) and \
           type(self.model).__name__ in ['TFDistilBertForSequenceClassification',
                                         'TFBertForSequenceClassification']:
            try:
                from onnxruntime_tools import optimizer
                from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions
                # disable embedding layer norm optimization for better model size reduction
                opt_options = BertOptimizationOptions('bert')
                opt_options.enable_embed_layer_norm = False
                opt_model = optimizer.optimize_model(
                    fpath,
                    'bert',  # bert_keras causes error with transformers
                    num_heads=12,
                    hidden_size=768,
                    optimization_options=opt_options)
                opt_model.save_model_to_file(fpath)
            except Exception:
                warnings.warn('Could not run BERT-specific optimizations')
        quantize_path = quantize(Path(fpath))
        return_fpath = quantize_path.as_posix()
    if verbose:
        print('done.')
    return return_fpath
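
# Example usage (a sketch assuming a ktrain text Predictor built from a Hugging Face
# Transformer model; the helper name `_demo_onnx_export`, the variable `predictor`,
# and the sample text are illustrative, not part of the original source).
# export_model_to_onnx is a Predictor method, so it is called on the predictor itself.
def _demo_onnx_export(predictor):
    import numpy as np
    import onnxruntime

    # Export and quantize; the returned path points at the quantized model.
    onnx_path = predictor.export_model_to_onnx('/tmp/my_model.onnx', quantize=True)

    # Run the exported model directly with ONNX Runtime.
    session = onnxruntime.InferenceSession(onnx_path)
    tokenizer = predictor.preproc.get_tokenizer()
    enc = tokenizer('I loved this movie.', padding='max_length',
                    max_length=predictor.preproc.maxlen)
    # Feed only the inputs the exported graph actually declares.
    graph_inputs = [i.name for i in session.get_inputs()]
    ort_inputs = {k: np.array([v], dtype=np.int64)
                  for k, v in enc.items() if k in graph_inputs}
    print(session.run(None, ort_inputs)[0])  # raw output logits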
# # An optional step unless
# # you want to get a model with mixed precision for perf acceleration on newer GPU
# # or you are working with Tensorflow(tf.keras) models or pytorch models other than bert
# !pip install onnxruntime-tools
# from onnxruntime_tools import optimizer

# # Mixed precision conversion for bert-base-cased model converted from Pytorch
# optimized_model = optimizer.optimize_model("bert-base-cased.onnx", model_type='bert', num_heads=12, hidden_size=768)
# optimized_model.convert_model_float32_to_float16()
# optimized_model.save_model_to_file("bert-base-cased.onnx")

# # optimizations for bert-base-cased model converted from Tensorflow(tf.keras)
# optimized_model = optimizer.optimize_model("bert-base-cased.onnx", model_type='bert_keras', num_heads=12, hidden_size=768)
# optimized_model.save_model_to_file("bert-base-cased.onnx")

# optimize transformer-based models with onnxruntime-tools
from onnxruntime_tools import optimizer
from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions

# disable embedding layer norm optimization for better model size reduction
opt_options = BertOptimizationOptions('bert')
opt_options.enable_embed_layer_norm = False

opt_model = optimizer.optimize_model(
    'onnx/bert-base-cased.onnx',
    'bert',
    num_heads=12,
    hidden_size=768,
    optimization_options=opt_options)
opt_model.save_model_to_file('onnx/bert.opt.onnx')
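
# Quick sanity check (a sketch, not from the original source): run the original and
# optimized graphs on the same batch and confirm the outputs still agree. The
# tokenizer name and the input names ('input_ids', 'attention_mask',
# 'token_type_ids') are assumptions about how the model was exported.
import numpy as np
import onnxruntime
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
enc = tokenizer('a sanity-check sentence', padding='max_length', max_length=128)
feeds = {'input_ids': np.array([enc['input_ids']], dtype=np.int64),
         'attention_mask': np.array([enc['attention_mask']], dtype=np.int64),
         'token_type_ids': np.array([enc['token_type_ids']], dtype=np.int64)}

baseline = onnxruntime.InferenceSession('onnx/bert-base-cased.onnx').run(None, feeds)[0]
optimized = onnxruntime.InferenceSession('onnx/bert.opt.onnx').run(None, feeds)[0]
print('outputs match:', np.allclose(baseline, optimized, atol=1e-4))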