Example #1
    def convert(self, model, saved_path, model_name):
        """
        Convert bert model (transformers) to onnx optimized model.
        :param model: Trained model from transformers.
        :param saved_path: The path to save onnx model.
        :param model_name: Choose a model name to save.
        :returns
            optimized_model: Optimized onnx model.
            optimized_model_saved_path: optimized model saved path.
        """
        if not os.path.exists(saved_path):
            os.makedirs(saved_path)
        unoptimized_model_saved_path = os.path.join(
            saved_path, '{}.onnx'.format(model_name))
        optimized_model_saved_path = os.path.join(
            saved_path, '{}_optimized.onnx'.format(model_name))
        self.sample_inputs = self.tokenizer.encode_plus(
            "This is a sample input", return_tensors='tf')
        # Step 1: Convert the original transformers model to an unoptimized ONNX model.
        model.predict(self.sample_inputs.data)  # run once so the Keras graph is built before conversion
        unoptimized_model = convert_keras(model,
                                          model.name,
                                          target_opset=self.target_opset)
        save_model(unoptimized_model, unoptimized_model_saved_path)

        # Step 2: Optimize the model converted from TensorFlow (tf.keras).
        optimized_model = optimizer.optimize_model(
            unoptimized_model_saved_path,
            model_type='bert_keras',
            num_heads=self.num_heads,
            hidden_size=self.hidden_size)
        optimized_model.save_model_to_file(optimized_model_saved_path)

        return optimized_model, optimized_model_saved_path
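The convert() method above relies on several instance attributes (self.tokenizer, self.num_heads, self.hidden_size, self.target_opset) and on convert_keras/save_model/optimizer imported elsewhere. A minimal sketch of the surrounding class and the imports it assumes (the class name is hypothetical):

import os

from keras2onnx import convert_keras, save_model
from onnxruntime_tools import optimizer


class BertOnnxConverter:
    """Hypothetical wrapper that the convert() method above belongs to."""

    def __init__(self, tokenizer, num_heads=12, hidden_size=768, target_opset=12):
        self.tokenizer = tokenizer        # transformers tokenizer used for the sample input
        self.num_heads = num_heads        # attention heads, forwarded to optimizer.optimize_model
        self.hidden_size = hidden_size    # hidden size, forwarded to optimizer.optimize_model
        self.target_opset = target_opset  # ONNX opset used by keras2onnx.convert_keras

    # def convert(self, model, saved_path, model_name): ...  (method shown above)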
Example #2
import keras2onnx
from onnxruntime_tools import optimizer

def tf_keras_convert_to_onnx(models, paths, config):
    """
    将keras模型转换为onnx
    :param models:
    :param paths:
    :param config:
    :return:
    """
    onnxNerBert = keras2onnx.convert_keras(models,
                                           models.name,
                                           target_opset=12)
    keras2onnx.save_model(onnxNerBert, paths)
    optimized_model = optimizer.optimize_model(
        paths,
        model_type='bert_keras',
        num_heads=config.num_attention_heads,
        hidden_size=config.hidden_size)
    optimized_model.use_dynamic_axes()
    optimized_model.save_model_to_file(paths)
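A hypothetical call for the function above, assuming a TF BERT token-classification model from transformers; the checkpoint name and output path are placeholders, and the Keras model is run once first so keras2onnx has a built graph to trace:

from transformers import BertConfig, BertTokenizer, TFBertForTokenClassification

config = BertConfig.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
ner_model = TFBertForTokenClassification.from_pretrained('bert-base-cased')

# Run one prediction so the tf.keras graph is built before conversion.
ner_model.predict(tokenizer.encode_plus('a sample input', return_tensors='tf').data)
tf_keras_convert_to_onnx(ner_model, 'onnx/ner_bert.onnx', config)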
Example #3
def export():
    shutil.rmtree("onnx", ignore_errors=True)  # remove any previous export
    model = Classification.from_pretrained("model")
    model.base_model.save_pretrained("./bertBase")
    convert(
        framework="pt",
        model="bertBase",  # CHANGED: refer to custom model
        tokenizer=get_tokenizer(),  # <-- CHANGED: add tokenizer
        output=Path("onnx/bert-base-cased.onnx"),
        opset=12,
    )

    # Mixed precision conversion for the bert-base-cased model converted from PyTorch
    optimized_model = optimizer.optimize_model(
        "onnx/bert-base-cased.onnx",  # CHANGED: original `bert-base-cased.onnx` didn't point to right directory
        model_type="bert",
        num_heads=12,
        hidden_size=768,
    )
    optimized_model.convert_model_float32_to_float16()
    optimized_model.save_model_to_file("onnx/bert-base-cased.onnx")
Example #4
import os

import torch
from torch.onnx import export
from onnxruntime_tools import optimizer
from transformers import PreTrainedModel
from transformers.convert_graph_to_onnx import ensure_valid_input

# NOTE: infer_shapes is assumed to be a helper returning
# (input_names, output_names, dynamic_axes, tokens) for the given model/tokenizer.


def convert_to_onnx(model: PreTrainedModel, tokenizer, output_path, opset: int = 12):
    onnx_output_path = os.path.join(output_path,
                                    'checkpoint_without_optimize.onnx')
    onnx_optimized_output_path = os.path.join(output_path,
                                              'checkpoint_with_optimize.onnx')
    onnx_optimized_fp16_output_path = os.path.join(
        output_path, 'checkpoint_with_optimize_fp16.onnx')

    model.eval()
    with torch.no_grad():
        input_names, output_names, dynamic_axes, tokens = infer_shapes(
            model, tokenizer)
        ordered_input_names, model_args = ensure_valid_input(
            model, tokens, input_names)
        print(f"Model input names: {ordered_input_names}.")
        export(model,
               model_args,
               onnx_output_path,
               input_names=ordered_input_names,
               output_names=output_names,
               dynamic_axes=dynamic_axes,
               verbose=True,
               opset_version=opset)
        print(
            f"Finished output checkpoint_without_optimize.onnx to {output_path}."
        )

    optimized_model = optimizer.optimize_model(onnx_output_path,
                                               model_type='bert',
                                               num_heads=12,
                                               hidden_size=768,
                                               use_gpu=True)
    optimized_model.save_model_to_file(onnx_optimized_output_path)
    print(f"Finished output checkpoint_with_optimize.onnx to {output_path}.")
    optimized_model.convert_model_float32_to_float16()
    optimized_model.save_model_to_file(onnx_optimized_fp16_output_path)
    print(
        f"Finished output checkpoint_with_optimize_fp16.onnx to {output_path}."
    )
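A simple sanity check on the three files written by the function above ('./onnx_export' stands in for the output_path argument): the optimized graph should be no larger than the raw export, and the fp16 copy should be roughly half the fp32 size:

import os

for name in ("checkpoint_without_optimize.onnx",
             "checkpoint_with_optimize.onnx",
             "checkpoint_with_optimize_fp16.onnx"):
    size_mb = os.path.getsize(os.path.join("./onnx_export", name)) / 1e6
    print(f"{name}: {size_mb:.1f} MB")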
Example #5
import torch

def _build_onnxrt_session(model):
    # using https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers
    dummy_input = {'input_ids':      torch.ones(1,128, dtype=torch.int64),
                   'attention_mask': torch.ones(1,128, dtype=torch.int64),
                   'token_type_ids': torch.ones(1,128, dtype=torch.int64)}
    symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
    onnx_model_path = "/tmp/temp_turbo_onnx.model"
    onnx_opt_model_path = "/tmp/temp_turbo_onnx_opt.model"
    quantized_model_path = "/tmp/temp_turbo_onnx_q.model"
    # (1) export to onnx fp32 model
    with open(onnx_model_path, 'wb') as f:
        torch.onnx.export(model, (dummy_input['input_ids'], dummy_input['attention_mask'], dummy_input['token_type_ids']),
                          f, input_names=['input_ids', 'attention_mask', 'token_type_ids'], output_names=['output'],
                          opset_version=11,
                          dynamic_axes={'input_ids': symbolic_names, 'attention_mask': symbolic_names, 'token_type_ids': symbolic_names})
    # (2) optimize the fp32 model
    from onnxruntime_tools import optimizer
    from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions
    opt_options = BertOptimizationOptions('bert')
    opt_options.enable_embed_layer_norm = False
    opt_model = optimizer.optimize_model(
        onnx_model_path,
        'bert', 
        num_heads=model.config.num_attention_heads,
        hidden_size=model.config.hidden_size,
        optimization_options=opt_options)
    opt_model.save_model_to_file(onnx_opt_model_path)
    # (3) quantize the model
    from onnxruntime.quantization import quantize, QuantizationMode
    import onnx
    import onnxruntime
    import onnxruntime.backend
    opt_model = onnx.load(onnx_opt_model_path)
    quantized_onnx_model = quantize(opt_model, quantization_mode=QuantizationMode.IntegerOps, symmetric_weight=True, force_fusions=True)
    onnx.save(quantized_onnx_model, quantized_model_path)
    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    return onnxruntime.InferenceSession(quantized_model_path, sess_options)
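A hypothetical way to exercise the quantized session built above: pass in a transformers BERT model, then feed int64 numpy tensors shaped like the dummy input (batch 1, sequence length 128):

import numpy as np
from transformers import BertModel

session = _build_onnxrt_session(BertModel.from_pretrained("bert-base-uncased"))
feed = {name: np.ones((1, 128), dtype=np.int64)
        for name in ("input_ids", "attention_mask", "token_type_ids")}
last_hidden_state = session.run(None, feed)[0]
print(last_hidden_state.shape)  # (1, 128, hidden_size)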
Example #6
def convert_to_onnx(model_path, model_name, output_path):
    output_path = pathlib.Path(output_path)
    convert(framework="pt", model=model_path, tokenizer=model_name, output=output_path, opset=11)
    if model_name in ('gpt2',): # gpt2-medium not supported yet
        optimized_model = optimizer.optimize_model(output_path, model_type=model_name)
        optimized_model.save_model_to_file(output_path)
Example #7
    def export_model_to_onnx(self,
                             fpath,
                             quantize=False,
                             target_opset=None,
                             verbose=1):
        """
        Export model to onnx
        Args:
          fpath(str): String representing full path to model file where ONNX model will be saved.
                      Example: '/tmp/my_model.onnx'
          quantize(str): If True, will create a total of three model files will be created using transformers.convert_graph_to_onnx: 
                         1) ONNX model  (created directly using keras2onnx
                         2) an optimized ONNX model (created by transformers library)
                         3) a quantized version of optimized ONNX model (created by transformers library)
                         All files will be created in the parent folder of fpath:
                         Example: 
                           If fpath='/tmp/model.onnx', then both /tmp/model-optimized.onnx and
                           /tmp/model-optimized-quantized.onnx will also be created.
          verbose(bool): verbosity
        Returns:
          str: string representing fpath.  If quantize=True, returned fpath will be different than supplied fpath
        """
        try:
            import onnxruntime, onnxruntime_tools, onnx, keras2onnx
        except ImportError:
            raise Exception('This method requires ONNX libraries to be installed: '+\
                            'pip install -q --upgrade onnxruntime==1.5.1 onnxruntime-tools onnx keras2onnx')
        from pathlib import Path
        if type(self.preproc).__name__ == 'BERTPreprocessor':
            raise Exception('currently_unsupported:  BERT models created with text_classifier("bert",...) are not supported (i.e., keras_bert models). ' +\
                            'Only BERT models created with Transformer(...) are supported.')

        if verbose:
            print(
                'converting to ONNX format ... this may take a few moments...')
        if U.is_huggingface(model=self.model):
            tokenizer = self.preproc.get_tokenizer()
            maxlen = self.preproc.maxlen
            input_dict = tokenizer('Name',
                                   return_tensors='tf',
                                   padding='max_length',
                                   max_length=maxlen)

            if version.parse(tf.__version__) < version.parse('2.2'):
                raise Exception(
                    'export_model_to_onnx requires tensorflow>=2.2')
                #self.model._set_inputs(input_spec, training=False) # for tf < 2.2
            self.model._saved_model_inputs_spec = None  # for tf > 2.2
            self.model._set_save_spec(input_dict)  # for tf > 2.2
            self.model._get_save_spec()

        onnx_model = keras2onnx.convert_keras(self.model,
                                              self.model.name,
                                              target_opset=target_opset)
        keras2onnx.save_model(onnx_model, fpath)
        return_fpath = fpath

        if quantize:
            from transformers.convert_graph_to_onnx import optimize, quantize
            #opt_path = optimize(Path(fpath))

            if U.is_huggingface(model=self.model) and\
               type(self.model).__name__ in ['TFDistilBertForSequenceClassification', 'TFBertForSequenceClassification']:
                try:
                    from onnxruntime_tools import optimizer
                    from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions
                    # disable embedding layer norm optimization for better model size reduction
                    opt_options = BertOptimizationOptions('bert')
                    opt_options.enable_embed_layer_norm = False
                    opt_model = optimizer.optimize_model(
                        fpath,
                        'bert',  # bert_keras causes error with transformers
                        num_heads=12,
                        hidden_size=768,
                        optimization_options=opt_options)
                    opt_model.save_model_to_file(fpath)
                except Exception:
                    warnings.warn('Could not run BERT-specific optimizations')
            quantize_path = quantize(Path(fpath))
            return_fpath = quantize_path.as_posix()
        if verbose: print('done.')
        return return_fpath
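A hypothetical follow-up, assuming p is the ktrain Predictor this method is defined on: export with quantize=True, then run the returned file with ONNX Runtime. The graph input names come from keras2onnx, so the feed is built from whatever the session reports, with a cast because the TF graph typically expects int32 ids:

import numpy as np
import onnxruntime

onnx_path = p.export_model_to_onnx('/tmp/model.onnx', quantize=True)
tokenizer = p.preproc.get_tokenizer()
enc = tokenizer('ONNX Runtime makes this fast', return_tensors='np',
                padding='max_length', max_length=p.preproc.maxlen)

session = onnxruntime.InferenceSession(onnx_path)
feed = {}
for inp in session.get_inputs():
    key = inp.name.split(':')[0]                     # strip any ':0' suffix from Keras input names
    if key in enc:
        dtype = np.int32 if 'int32' in inp.type else np.int64
        feed[inp.name] = np.asarray(enc[key], dtype=dtype)
print(session.run(None, feed))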

Example #8
if __name__ == '__main__':
    args = get_args_from_command_line()
    #text = "tick tock tick"
    #convert_bert_to_onnx('tick tock', args.model_dir, args.onnx_model_path)
    #remove_initializer_from_input(args.onnx_model_path, args.onnx_model_path)
    convert(framework="pt",
            model=args.model_dir,
            tokenizer="DeepPavlov/bert-base-cased-conversational",
            output=args.onnx_model_path,
            opset=11)

    # ONNX optimization
    optimized_model = optimizer.optimize_model(args.onnx_model_path,
                                               model_type='bert',
                                               num_heads=12,
                                               hidden_size=768)
    optimized_onnx_model_path = os.path.join(
        os.path.dirname(args.onnx_model_path), 'bert_optimized.onnx')
    optimized_model.save_model_to_file(optimized_onnx_model_path)
    print('Optimized model saved at :', optimized_onnx_model_path)
    # ONNX quantization
    model = onnx.load(optimized_onnx_model_path)
    quantized_model = quantize(model,
                               quantization_mode=QuantizationMode.IntegerOps,
                               static=False)
    optimized_quantized_onnx_model_path = os.path.join(
        os.path.dirname(optimized_onnx_model_path),
        'bert_optimized_quantized.onnx')
    onnx.save(quantized_model, optimized_quantized_onnx_model_path)
    print('Quantized&optimized model saved at :',
Example #9
        )  # delete the existing onnx folder and its contents, if any; the conversion raises an exception if the output already exists
    except:
        print('no existing folder, creating one')
        os.makedirs(onnx_path)

    print('>> converting..')
    convert(framework="pt",
            model=model_path,
            tokenizer=args.model_name,
            output=onnx_path + '/converted.onnx',
            opset=11)

    print('>> optimizing..')
    # ONNX optimization
    optimized_model = optimizer.optimize_model(onnx_path + '/converted.onnx',
                                               model_type=args.model_type,
                                               num_heads=12,
                                               hidden_size=768)

    optimized_onnx_model_path = os.path.join(onnx_path, 'bert_optimized.onnx')
    optimized_model.save_model_to_file(optimized_onnx_model_path)
    print('Optimized model saved at :', optimized_onnx_model_path)

    print('>> quantizing..')
    model = onnx.load(onnx_path + '/converted.onnx')
    quantized_model = quantize(model=model,
                               quantization_mode=QuantizationMode.IntegerOps,
                               force_fusions=True,
                               symmetric_weight=True)
    optimized_quantized_onnx_model_path = os.path.join(
        os.path.dirname(optimized_onnx_model_path),
        'ONNX_model_optimized_quantized.onnx')
Example #10
    def convert_to_onnx(cls,
                        model_name,
                        output_path,
                        task_type,
                        convert_to_float16=False,
                        quantize=False,
                        opset_version=11):
        """
        Convert a PyTorch model from transformers hub to an ONNX Model.

        :param model_name: transformers model name
        :type model_name: str
        :param output_path: output Path to write the converted to
        :type output_path: Path
        :param task_type: Type of task for the model. Available options: "embeddings", "question_answering",
                          "text_classification", "ner".
        :param convert_to_float16: By default, the model uses float32 precision. With half precision (float16), inference
                                should be faster on Nvidia GPUs with Tensor Cores, such as T4 or V100. On older GPUs, float32
                                might be more performant.
        :type convert_to_float16: bool
        :param quantize: convert floating point numbers to integers
        :type quantize: bool
        :param opset_version: ONNX opset version
        :type opset_version: int
        :return:
        """
        language_model_class = LanguageModel.get_language_model_class(
            model_name)
        if language_model_class not in ["Bert", "Roberta", "XLMRoberta"]:
            raise Exception(
                "The current ONNX conversion only supports 'BERT', 'RoBERTa', and 'XLMRoberta' models."
            )

        task_type_to_pipeline_map = {
            "question_answering": "question-answering",
            "embeddings": "feature-extraction",
            "ner": "ner"
        }

        convert(pipeline_name=task_type_to_pipeline_map[task_type],
                framework="pt",
                model=model_name,
                output=output_path / "model.onnx",
                opset=opset_version,
                use_external_format=(language_model_class == "XLMRoberta"))

        # save processor & model config files that are needed when loading the model with the FARM Inferencer
        processor = Processor.convert_from_transformers(
            tokenizer_name_or_path=model_name,
            task_type=task_type,
            max_seq_len=256,
            doc_stride=128,
            use_fast=True)
        processor.save(output_path)
        model = AdaptiveModel.convert_from_transformers(model_name,
                                                        device="cpu",
                                                        task_type=task_type)
        model.save(output_path)
        os.remove(
            output_path / "language_model.bin"
        )  # remove the actual PyTorch model (only the configs are required)

        onnx_model_config = {
            "task_type": task_type,
            "onnx_opset_version": opset_version,
            "language_model_class": language_model_class,
            "language": model.language_model.language
        }
        with open(output_path / "onnx_model_config.json", "w") as f:
            json.dump(onnx_model_config, f)

        if convert_to_float16:
            from onnxruntime_tools import optimizer
            config = AutoConfig.from_pretrained(model_name)
            optimized_model = optimizer.optimize_model(
                input=str(output_path / "model.onnx"),
                model_type='bert',
                num_heads=config.num_attention_heads,
                hidden_size=config.hidden_size)
            optimized_model.convert_model_float32_to_float16()
            optimized_model.save_model_to_file(str(output_path / "model.onnx"))

        if quantize:
            quantize_model(output_path / "model.onnx")
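quantize_model() is not shown in this snippet; a minimal stand-in using dynamic quantization from onnxruntime (the project's actual helper may differ):

from pathlib import Path

from onnxruntime.quantization import QuantType, quantize_dynamic


def quantize_model(onnx_model_path: Path):
    quantized_path = onnx_model_path.with_name(onnx_model_path.stem + "_quantized.onnx")
    quantize_dynamic(str(onnx_model_path), str(quantized_path), weight_type=QuantType.QInt8)
    return quantized_path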
Example #11
for ep in ep_list:
    dev = ep_dev_map.get(ep)
    dynamic = {
        "non_dynamic": export_model_path,
        "dynamic": export_model_path_dynamic
    }
    for k, v in dynamic.items():
        print(k)
        # This will save the optimized graph to the directory specified in optimized_model_filepath
        sess_options.optimized_model_filepath = os.path.join(
            output_dir, "optimized_model_{}.onnx".format(dev))

        # Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.
        # Note that this will increase session creation time so enable it for debugging only.
        optimized_model = optimizer.optimize_model(v,
                                                   model_type='bert',
                                                   num_heads=12,
                                                   hidden_size=768)
        optimized_model.save_model_to_file(
            sess_options.optimized_model_filepath)

        # Please change the value according to best setting in Performance Test Tool result.
        sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)

        session = onnxruntime.InferenceSession(
            sess_options.optimized_model_filepath, sess_options)
        session.set_providers([ep])
        latency = []
        for i in range(total_samples):
            data = dataset[i]
            # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance.
            ort_inputs = {
Example #12
# # An optional step unless
# # you want to get a model with mixed precision for perf acceleration on newer GPUs
# # or you are working with TensorFlow (tf.keras) models or PyTorch models other than BERT

# !pip install onnxruntime-tools
# from onnxruntime_tools import optimizer

# # Mixed precision conversion for bert-base-cased model converted from PyTorch
# optimized_model = optimizer.optimize_model("bert-base-cased.onnx", model_type='bert', num_heads=12, hidden_size=768)
# optimized_model.convert_model_float32_to_float16()
# optimized_model.save_model_to_file("bert-base-cased.onnx")

# # optimizations for bert-base-cased model converted from TensorFlow (tf.keras)
# optimized_model = optimizer.optimize_model("bert-base-cased.onnx", model_type='bert_keras', num_heads=12, hidden_size=768)
# optimized_model.save_model_to_file("bert-base-cased.onnx")

# optimize transformer-based models with onnxruntime-tools
from onnxruntime_tools import optimizer
from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions

# disable embedding layer norm optimization for better model size reduction
opt_options = BertOptimizationOptions('bert')
opt_options.enable_embed_layer_norm = False

opt_model = optimizer.optimize_model('onnx/bert-base-cased.onnx',
                                     'bert',
                                     num_heads=12,
                                     hidden_size=768,
                                     optimization_options=opt_options)
opt_model.save_model_to_file('onnx/bert.opt.onnx')
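A quick sanity check of the optimized graph written above; the tokenizer name assumes the same bert-base-cased checkpoint exported earlier in this example, and the input names follow the transformers ONNX exporter:

import onnxruntime
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
inputs = dict(tokenizer('Optimized BERT with onnxruntime-tools', return_tensors='np'))

session = onnxruntime.InferenceSession('onnx/bert.opt.onnx')
outputs = session.run(None, inputs)
print([o.shape for o in outputs])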