import torch


def _build_onnxrt_session(model):
    # based on https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers
    dummy_input = {'input_ids':      torch.ones(1, 128, dtype=torch.int64),
                   'attention_mask': torch.ones(1, 128, dtype=torch.int64),
                   'token_type_ids': torch.ones(1, 128, dtype=torch.int64)}
    symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
    onnx_model_path = "/tmp/temp_turbo_onnx.model"
    onnx_opt_model_path = "/tmp/temp_turbo_onnx_opt.model"
    quantized_model_path = "/tmp/temp_turbo_onnx_q.model"
    # (1) export the model to an ONNX fp32 graph with dynamic batch and sequence axes
    with open(onnx_model_path, 'wb') as f:
        torch.onnx.export(model,
                          (dummy_input['input_ids'],
                           dummy_input['attention_mask'],
                           dummy_input['token_type_ids']),
                          f,
                          input_names=['input_ids', 'attention_mask', 'token_type_ids'],
                          output_names=['output'],
                          opset_version=11,
                          dynamic_axes={'input_ids': symbolic_names,
                                        'attention_mask': symbolic_names,
                                        'token_type_ids': symbolic_names})
    # (2) apply BERT-specific graph optimizations to the fp32 model
    from onnxruntime_tools import optimizer
    from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions
    opt_options = BertOptimizationOptions('bert')
    # disable embedding layer norm fusion for better model size reduction
    opt_options.enable_embed_layer_norm = False
    opt_model = optimizer.optimize_model(
        onnx_model_path,
        'bert',
        num_heads=model.config.num_attention_heads,
        hidden_size=model.config.hidden_size,
        optimization_options=opt_options)
    opt_model.save_model_to_file(onnx_opt_model_path)
    # (3) quantize the optimized model to int8 (integer-ops dynamic quantization)
    from onnxruntime.quantization import quantize, QuantizationMode
    import onnx
    import onnxruntime
    opt_model = onnx.load(onnx_opt_model_path)
    quantized_onnx_model = quantize(opt_model,
                                    quantization_mode=QuantizationMode.IntegerOps,
                                    symmetric_weight=True,
                                    force_fusions=True)
    onnx.save(quantized_onnx_model, quantized_model_path)
    # (4) build an onnxruntime inference session over the quantized model
    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    return onnxruntime.InferenceSession(quantized_model_path, sess_options)
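
A minimal usage sketch for the session built above (an assumption-laden example: `model` is taken to be a Hugging Face BERT-style model and `tokenizer` its matching tokenizer; neither appears in the snippet itself):

import numpy as np

session = _build_onnxrt_session(model)
# encode a sample sentence as int64 numpy arrays matching the exported input names
encoded = tokenizer('hello world', return_tensors='np',
                    padding='max_length', max_length=128)
feed = {'input_ids': encoded['input_ids'].astype(np.int64),
        'attention_mask': encoded['attention_mask'].astype(np.int64),
        'token_type_ids': encoded['token_type_ids'].astype(np.int64)}
logits = session.run(None, feed)[0]  # first output of the quantized model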
Example #2
    def export_model_to_onnx(self,
                             fpath,
                             quantize=False,
                             target_opset=None,
                             verbose=1):
        """
        Export model to onnx
        Args:
          fpath(str): String representing full path to model file where ONNX model will be saved.
                      Example: '/tmp/my_model.onnx'
          quantize(str): If True, will create a total of three model files will be created using transformers.convert_graph_to_onnx: 
                         1) ONNX model  (created directly using keras2onnx
                         2) an optimized ONNX model (created by transformers library)
                         3) a quantized version of optimized ONNX model (created by transformers library)
                         All files will be created in the parent folder of fpath:
                         Example: 
                           If fpath='/tmp/model.onnx', then both /tmp/model-optimized.onnx and
                           /tmp/model-optimized-quantized.onnx will also be created.
          verbose(bool): verbosity
        Returns:
          str: string representing fpath.  If quantize=True, returned fpath will be different than supplied fpath
        """
        try:
            import onnxruntime, onnxruntime_tools, onnx, keras2onnx
        except ImportError:
            raise Exception('This method requires ONNX libraries to be installed: '+\
                            'pip install -q --upgrade onnxruntime==1.5.1 onnxruntime-tools onnx keras2onnx')
        from pathlib import Path
        if type(self.preproc).__name__ == 'BERTPreprocessor':
            raise Exception('currently_unsupported:  BERT models created with text_classifier("bert",...) are not supported (i.e., keras_bert models). ' +\
                            'Only BERT models created with Transformer(...) are supported.')

        if verbose:
            print('converting to ONNX format ... this may take a few moments...')
        if U.is_huggingface(model=self.model):
            tokenizer = self.preproc.get_tokenizer()
            maxlen = self.preproc.maxlen
            input_dict = tokenizer('Name',
                                   return_tensors='tf',
                                   padding='max_length',
                                   max_length=maxlen)

            if version.parse(tf.__version__) < version.parse('2.2'):
                raise Exception(
                    'export_model_to_onnx requires tensorflow>=2.2')
                #self.model._set_inputs(input_spec, training=False) # for tf < 2.2
            self.model._saved_model_inputs_spec = None  # for tf >= 2.2
            self.model._set_save_spec(input_dict)  # for tf >= 2.2
            self.model._get_save_spec()

        onnx_model = keras2onnx.convert_keras(self.model,
                                              self.model.name,
                                              target_opset=target_opset)
        keras2onnx.save_model(onnx_model, fpath)
        return_fpath = fpath

        if quantize:
            # note: this import deliberately shadows the boolean `quantize` argument,
            # which has already been evaluated by the `if` above
            from transformers.convert_graph_to_onnx import optimize, quantize
            #opt_path = optimize(Path(fpath))

            if U.is_huggingface(model=self.model) and\
               type(self.model).__name__ in ['TFDistilBertForSequenceClassification', 'TFBertForSequenceClassification']:
                try:
                    from onnxruntime_tools import optimizer
                    from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions
                    # disable embedding layer norm optimization for better model size reduction
                    opt_options = BertOptimizationOptions('bert')
                    opt_options.enable_embed_layer_norm = False
                    opt_model = optimizer.optimize_model(
                        fpath,
                        'bert',  # bert_keras causes error with transformers
                        num_heads=12,
                        hidden_size=768,
                        optimization_options=opt_options)
                    opt_model.save_model_to_file(fpath)
                except Exception:
                    warnings.warn('Could not run BERT-specific optimizations')
            quantize_path = quantize(Path(fpath))
            return_fpath = quantize_path.as_posix()
        if verbose: print('done.')
        return return_fpath
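
A hedged usage sketch for the method above, assuming `p` is a ktrain Predictor object exposing export_model_to_onnx (the variable name and paths are illustrative):

import onnxruntime

# export and quantize; the returned path points at the quantized model file
onnx_path = p.export_model_to_onnx('/tmp/model.onnx', quantize=True)
sess = onnxruntime.InferenceSession(onnx_path)
print([inp.name for inp in sess.get_inputs()])  # inspect the expected input names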
Example #3
# # An optional step, unless
# # you want a model with mixed precision for perf acceleration on newer GPUs
# # or you are working with TensorFlow (tf.keras) models or PyTorch models other than BERT

# !pip install onnxruntime-tools
# from onnxruntime_tools import optimizer

# # Mixed precision conversion for the bert-base-cased model converted from PyTorch
# optimized_model = optimizer.optimize_model("bert-base-cased.onnx", model_type='bert', num_heads=12, hidden_size=768)
# optimized_model.convert_model_float32_to_float16()
# optimized_model.save_model_to_file("bert-base-cased.onnx")

# # optimizations for the bert-base-cased model converted from TensorFlow (tf.keras)
# optimized_model = optimizer.optimize_model("bert-base-cased.onnx", model_type='bert_keras', num_heads=12, hidden_size=768)
# optimized_model.save_model_to_file("bert-base-cased.onnx")

# optimize transformer-based models with onnxruntime-tools
from onnxruntime_tools import optimizer
from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions

# disable embedding layer norm optimization for better model size reduction
opt_options = BertOptimizationOptions('bert')
opt_options.enable_embed_layer_norm = False

opt_model = optimizer.optimize_model('onnx/bert-base-cased.onnx',
                                     'bert',
                                     num_heads=12,
                                     hidden_size=768,
                                     optimization_options=opt_options)
opt_model.save_model_to_file('onnx/bert.opt.onnx')
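
To sanity-check the result, the optimized model can be loaded back into an onnxruntime session (a minimal sketch using the path from the save call above):

import onnxruntime

sess = onnxruntime.InferenceSession('onnx/bert.opt.onnx')
print([(inp.name, inp.shape) for inp in sess.get_inputs()])  # dynamic axes appear as symbolic names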