Example #1
    def __init__(self, model_path, tokenizer_path, tag_predict_model):
        # Sequential execution runs graph nodes one at a time, which tends
        # to suit the mostly-linear graphs of transformer models
        options = SessionOptions()
        options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
        self.model = ort.InferenceSession(model_path, options)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

        self.tag_model = tag_predict_model
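
The constructor above wires an ONNX Runtime session and a Hugging Face tokenizer together. Below is a minimal standalone sketch of the same pattern; the model path and tokenizer name are placeholders, and the input names must match whatever the graph was exported with:

import onnxruntime as ort
from onnxruntime import ExecutionMode, SessionOptions
from transformers import AutoTokenizer

options = SessionOptions()
options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
session = ort.InferenceSession("model.onnx", options,           # placeholder path
                               providers=["CPUExecutionProvider"])
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")    # placeholder name

# Feed the tokenizer output straight to the session; the dict keys must
# match the exported graph's input names (input_ids, attention_mask, ...).
inputs = tokenizer("ONNX Runtime keeps inference lightweight.", return_tensors="np")
outputs = session.run(None, dict(inputs))
print(outputs[0].shape)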
Example #2
    def convert_to_onnx(self, onnx_output_dir=None, set_onnx_arg=True):
        """Convert the model to ONNX format and save to output_dir
        Args:
            onnx_output_dir (str, optional): If specified, ONNX model will be saved to output_dir (else args.output_dir will be used). Defaults to None.
            set_onnx_arg (bool, optional): Updates the model args to set onnx=True. Defaults to True.
        """  # noqa
        if not onnx_output_dir:
            onnx_output_dir = os.path.join(self.options.output_dir,
                                           self.options.model_type,
                                           self.options.model_name, "onnx")
        os.makedirs(onnx_output_dir, exist_ok=True)

        # Skip the export if an ONNX model already exists in the directory
        if not os.listdir(onnx_output_dir):
            onnx_model_name = os.path.join(onnx_output_dir, "onnx_model.onnx")
            with tempfile.TemporaryDirectory() as temp_dir:
                # Reuse the temporary directory's unique name, but place the
                # intermediate checkpoint under output_dir instead of /tmp
                basedir = os.path.basename(temp_dir)
                temp_dir = os.path.join(self.options.output_dir, basedir)
                self.save_model(output_dir=temp_dir, model=self.model)

                convert(
                    framework="pt",
                    model=temp_dir,
                    tokenizer=self.tokenizer,
                    output=Path(onnx_model_name),
                    pipeline_name="ner",
                    opset=11,
                )
            self.tokenizer.save_pretrained(onnx_output_dir)
            self.config.save_pretrained(onnx_output_dir)

        onnx_options = SessionOptions()
        use_cuda = self._device.type != 'cpu'
        onnx_execution_provider = "CUDAExecutionProvider" if use_cuda else "CPUExecutionProvider"
        onnx_options.intra_op_num_threads = 1
        onnx_options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
        onnx_model_path = os.path.join(onnx_output_dir, "onnx_model.onnx")
        if self.options.dynamic_quantize:
            # Append "-quantized" to the model's file name
            quantized_model_path = generate_identified_filename(
                Path(onnx_model_path), "-quantized")
            quantize_dynamic(Path(onnx_model_path), quantized_model_path)
            onnx_model_path = quantized_model_path.as_posix()

        return InferenceSession(onnx_model_path,
                                onnx_options,
                                providers=[onnx_execution_provider])
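
The dynamic_quantize branch above leans on onnxruntime's post-training dynamic quantization. A minimal sketch of that step in isolation, assuming an already-exported onnx_model.onnx (the path is a placeholder):

from pathlib import Path

from onnxruntime import InferenceSession, SessionOptions
from onnxruntime.quantization import quantize_dynamic

model_path = Path("onnx_model.onnx")  # placeholder path
quantized_path = model_path.with_name(model_path.stem + "-quantized.onnx")

# Weights of supported ops are stored as int8; activations are quantized
# on the fly at inference time, so no calibration data is needed.
quantize_dynamic(model_path, quantized_path)

session = InferenceSession(str(quantized_path), SessionOptions(),
                           providers=["CPUExecutionProvider"])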
Example #3
    args = parser.parse_args()
    return args

if __name__ == "__main__":
    args = get_args_from_command_line()

    start = time.time()
    tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_model)
    tokens = tokenizer.encode_plus(args.text)
    tokens = {name: np.atleast_2d(value) for name, value in tokens.items()}
    end_tokenizer = time.time()
    print("Tokenization time: ", end_tokenizer - start)

    start_session = time.time()
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
    #options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    session = InferenceSession(args.onnx_model_path, options)
    end_session = time.time()
    print("Setting up session time: ", end_session - start_session)

    start_predict = time.time()
    output, = session.run(None, tokens)
    end_predict = time.time()
    print("Predict time: ", end_predict - start_predict)

    print("Overall time: ", end_predict - start)
    print(output)
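
The script above leaves graph_optimization_level commented out. A sketch of switching it on, and of persisting the optimized graph so later sessions can skip the optimization pass; both file paths are placeholders:

from onnxruntime import (ExecutionMode, GraphOptimizationLevel,
                         InferenceSession, SessionOptions)

options = SessionOptions()
options.intra_op_num_threads = 1
options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
# Serialize the optimized graph; loading it later skips this pass.
options.optimized_model_filepath = "model-optimized.onnx"  # placeholder

session = InferenceSession("model.onnx", options,          # placeholder
                           providers=["CPUExecutionProvider"])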
Example #4
from onnxruntime import (ExecutionMode, GraphOptimizationLevel,
                         InferenceSession, SessionOptions)


def get_onnx_runtime_sessions(
    model_paths,
    default: bool = True,
    opt_level: int = 99,
    parallel_exe_mode: bool = True,
    n_threads: int = 4,
    provider=[
        'CPUExecutionProvider',
    ],
) -> tuple:
    '''
    Creates onnx InferenceSessions for the encoder, decoder and initial
    decoder models.

    Args:
        model_paths : tuple of (path_to_encoder, path_to_decoder,
                      path_to_initial_decoder) onnx model files.
        default (bool) : if True, ort chooses the best settings for your
                         hardware (you can test other settings for better
                         results).
        opt_level (int) : graph optimization level; 1 uses 'ORT_ENABLE_BASIC',
                          2 'ORT_ENABLE_EXTENDED' and 99 'ORT_ENABLE_ALL'.
                          Default is 99.
        parallel_exe_mode (bool) : sets the execution mode. Default is
                                   parallel.
        n_threads (int) : number of threads used to parallelize execution
                          within nodes. Default is 4.
        provider : execution providers list.

    Returns:
        encoder_sess : encoder onnx InferenceSession
        decoder_sess : decoder onnx InferenceSession
        decoder_sess_init : initial decoder onnx InferenceSession
    '''
    path_to_encoder, path_to_decoder, path_to_initial_decoder = model_paths

    if default:
        encoder_sess = InferenceSession(str(path_to_encoder))
        decoder_sess = InferenceSession(str(path_to_decoder))
        decoder_sess_init = InferenceSession(str(path_to_initial_decoder))
    else:
        # A few settings that can have an impact on performance
        options = SessionOptions()

        if opt_level == 1:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
        elif opt_level == 2:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_EXTENDED
        else:
            assert opt_level == 99
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        # Parallel execution can pay off when the graph contains branches
        # that are independent of each other
        if parallel_exe_mode:
            options.execution_mode = ExecutionMode.ORT_PARALLEL
        else:
            options.execution_mode = ExecutionMode.ORT_SEQUENTIAL

        options.intra_op_num_threads = n_threads
        # options.inter_op_num_threads = 10

        # options.enable_profiling = True

        encoder_sess = InferenceSession(str(path_to_encoder),
                                        options,
                                        providers=provider)
        decoder_sess = InferenceSession(str(path_to_decoder),
                                        options,
                                        providers=provider)
        decoder_sess_init = InferenceSession(str(path_to_initial_decoder),
                                             options,
                                             providers=provider)

    return encoder_sess, decoder_sess, decoder_sess_init
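
A usage sketch for the function above, assuming a seq2seq (e.g. T5-style) model exported as three onnx files; all paths are placeholders:

model_paths = ("encoder.onnx",        # placeholder paths
               "decoder.onnx",
               "decoder-init.onnx")

# Let onnxruntime pick its defaults:
enc, dec, dec_init = get_onnx_runtime_sessions(model_paths)

# Or tune the session options explicitly:
enc, dec, dec_init = get_onnx_runtime_sessions(
    model_paths,
    default=False,
    opt_level=99,              # ORT_ENABLE_ALL
    parallel_exe_mode=False,   # sequential often suits mostly-linear graphs
    n_threads=4,
)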