Example No. 1
def create_onnxruntime_session(onnx_model_path, use_gpu, enable_all_optimization=True, num_threads=-1, verbose=False):
    session = None
    try:
        from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")
        elif (not use_gpu) and (version.parse(onnxruntime_version) < version.parse('1.3.0')):
            # Set intra_op_num_threads = 1 to enable OpenMP for onnxruntime 1.2.0 (cpu)
            # onnxruntime-gpu is not built with openmp so it is better to use default (0) or cpu_count instead.
            sess_options.intra_op_num_threads = 1

        if verbose:
            sess_options.log_severity_level = 0

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        execution_providers = ['CPUExecutionProvider'] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
        session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
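
A minimal usage sketch for the helper above, assuming a hypothetical model.onnx on disk; the dummy input is built from the session's reported input shape, substituting 1 for any dynamic dimension:

import numpy as np

session = create_onnxruntime_session("model.onnx", use_gpu=False, num_threads=4)  # placeholder path
if session is not None:
    meta = session.get_inputs()[0]
    # Replace dynamic (non-integer) dimensions with 1 and assume a float32 input tensor.
    shape = [d if isinstance(d, int) else 1 for d in meta.shape]
    outputs = session.run(None, {meta.name: np.zeros(shape, dtype=np.float32)})
    print([o.shape for o in outputs])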
Example No. 2
from onnxruntime import InferenceSession, SessionOptions, get_all_providers


def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"
    # Few properties that might have an impact on performances (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1
    # Load the model as a graph and prepare the CPU backend
    return InferenceSession(model_path, options, providers=[provider])
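
For reference, a quick way to check which providers the installed onnxruntime build exposes before calling the helper above; the model filename is a placeholder:

from onnxruntime import get_all_providers, get_available_providers

print(get_all_providers())        # every provider name onnxruntime knows about
print(get_available_providers())  # providers usable with this build and hardware

session = create_model_for_provider("model.onnx", "CPUExecutionProvider")  # placeholder path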
Example No. 3
def create_onnxruntime_session(onnx_model_path, use_gpu, verbose):
    session = None
    try:
        from onnxruntime import SessionOptions, InferenceSession
        sess_options = SessionOptions()
        if not use_gpu:
            sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
            logger.debug(
                f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}"
            )

        if verbose:
            sess_options.log_severity_level = 0

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        execution_providers = ['CPUExecutionProvider'] if not use_gpu else [
            'CUDAExecutionProvider', 'CPUExecutionProvider'
        ]
        session = InferenceSession(onnx_model_path,
                                   sess_options,
                                   providers=execution_providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
Example No. 4
    def create_onnx_session(self,
                            onnx_model_path,
                            provider='CPUExecutionProvider'):
        """
        Creates ONNX inference session from provided onnx_model_path
        """

        from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers
        assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

        # Few properties that might have an impact on performances (provided by MS)
        options = SessionOptions()
        options.intra_op_num_threads = 0
        options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        # Load the model as a graph and prepare the CPU backend
        session = InferenceSession(onnx_model_path,
                                   options,
                                   providers=[provider])
        session.disable_fallback()

        #if 'OMP_NUM_THREADS' not in os.environ or 'OMP_WAIT_POLICY' not in os.environ:
        #warnings.warn('''We recommend adding the following at top of script for CPU inference:

        #from psutil import cpu_count
        ##Constants from the performance optimization available in onnxruntime
        ##It needs to be done before importing onnxruntime
        #os.environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
        #os.environ["OMP_WAIT_POLICY"] = 'ACTIVE'
        #''')
        return session
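
The commented-out block above refers to OpenMP environment variables that older OpenMP-enabled CPU builds of onnxruntime honored; a sketch of that setup, which has to happen before onnxruntime is imported:

import os
from psutil import cpu_count

# Constants from the performance optimization guide; set before importing onnxruntime.
os.environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
os.environ["OMP_WAIT_POLICY"] = "ACTIVE"

import onnxruntime  # imported only after the environment is configured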
Example No. 5
def create_ort_session(onnx_model_path, use_gpu=True):
    from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
    sess_options = SessionOptions()
    sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
    sess_options.intra_op_num_threads = 2
    sess_options.log_severity_level = 2
    execution_providers = ['CPUExecutionProvider'] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
    return InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
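
This helper disables runtime graph optimization (ORT_DISABLE_ALL), which makes most sense when the model has already been optimized offline; a sketch of producing such a pre-optimized model with onnxruntime itself, using placeholder file names:

from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

sess_options = SessionOptions()
sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
# Writing the optimized graph to disk lets later sessions skip optimization at load time.
sess_options.optimized_model_filepath = "model_optimized.onnx"  # placeholder output path
InferenceSession("model.onnx", sess_options, providers=["CPUExecutionProvider"])  # placeholder input path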
Example No. 6
def create_onnx_session(onnx_model_path):
    provider = 'CPUExecutionProvider'
    from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"
    options = SessionOptions()
    options.intra_op_num_threads = 0
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    session = InferenceSession(onnx_model_path, options, providers=[provider])
    session.disable_fallback()
    return session
Example No. 7
import os

from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers


def create_model_for_provider(model_path: str, provider: str = 'CPUExecutionProvider') -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"
    # Few properties that might have an impact on performances (provided by MS)
    options = SessionOptions()
    # Thread count comes from the NUM_THREADS environment variable, defaulting to 4.
    options.intra_op_num_threads = int(os.environ.get('NUM_THREADS', 4))
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
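
A usage sketch for the variant above, which reads the intra-op thread count from the NUM_THREADS environment variable (falling back to 4); the model path is a placeholder:

import os

os.environ["NUM_THREADS"] = "8"  # picked up inside create_model_for_provider
session = create_model_for_provider("model.onnx")  # placeholder path, CPU provider by default
print(session.get_providers())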
Example No. 8
    def create_model_for_provider(self):

        assert self.provider in get_all_providers(), f"provider {self.provider} not found, {get_all_providers()}"

        # Few properties that might have an impact on performances (provided by MS)
        options = SessionOptions()
        options.intra_op_num_threads = 1
        options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        # Load the model as a graph and prepare the CPU backend
        session = InferenceSession(self.model_path, options, providers=[self.provider])
        session.disable_fallback()

        return session
Example No. 9
    def convert_to_onnx(self, onnx_output_dir=None, set_onnx_arg=True):
        """Convert the model to ONNX format and save to output_dir
        Args:
            onnx_output_dir (str, optional): If specified, ONNX model will be saved to output_dir (else args.output_dir will be used). Defaults to None.
            set_onnx_arg (bool, optional): Updates the model args to set onnx=True. Defaults to True.
        """  # noqa
        if not onnx_output_dir:
            onnx_output_dir = os.path.join(self.options.output_dir,
                                           self.options.model_type,
                                           self.options.model_name, "onnx")
        os.makedirs(onnx_output_dir, exist_ok=True)

        if not os.listdir(onnx_output_dir):
            onnx_model_name = os.path.join(onnx_output_dir, "onnx_model.onnx")
            with tempfile.TemporaryDirectory() as temp_dir:
                basedir = os.path.basename(temp_dir)
                temp_dir = os.path.join(self.options.output_dir, basedir)
                self.save_model(output_dir=temp_dir, model=self.model)

                convert(
                    framework="pt",
                    model=temp_dir,
                    tokenizer=self.tokenizer,
                    output=Path(onnx_model_name),
                    pipeline_name="ner",
                    opset=11,
                )
            self.tokenizer.save_pretrained(onnx_output_dir)
            self.config.save_pretrained(onnx_output_dir)

        onnx_options = SessionOptions()
        use_cuda = self._device.type != 'cpu'
        onnx_execution_provider = "CUDAExecutionProvider" if use_cuda else "CPUExecutionProvider"
        onnx_options.intra_op_num_threads = 1
        onnx_options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
        onnx_model_path = os.path.join(onnx_output_dir, "onnx_model.onnx")
        if self.options.dynamic_quantize:
            # Append "-quantized" at the end of the model's name
            quantized_model_path = generate_identified_filename(
                Path(onnx_model_path), "-quantized")
            quantize_dynamic(Path(onnx_model_path), quantized_model_path)
            onnx_model_path = quantized_model_path.as_posix()

        return InferenceSession(onnx_model_path,
                                onnx_options,
                                providers=[onnx_execution_provider])
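
The dynamic-quantization step used inside convert_to_onnx can also be run on its own; a minimal sketch with placeholder paths, assuming onnxruntime's quantization package is available:

from pathlib import Path

from onnxruntime.quantization import quantize_dynamic

# Placeholder paths; dynamic quantization converts the weights to int8 and writes a new model file.
quantize_dynamic(Path("onnx_model.onnx"), Path("onnx_model-quantized.onnx"))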
Example No. 10
def create_onnxruntime_session(onnx_model_path,
                               use_gpu,
                               enable_all_optimization=True,
                               num_threads=-1,
                               enable_profiling=False,
                               verbose=False,
                               use_dml=False):
    session = None
    try:
        from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")

        if verbose:
            sess_options.log_severity_level = 0
        else:
            sess_options.log_severity_level = 4

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        if use_gpu:
            if use_dml:
                execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
            else:
                execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        else:
            execution_providers = ['CPUExecutionProvider']
        session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
Example No. 11
def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
    """
    这里解释一下ExecutionProvider,ONNXRuntime用Provider表示不同的运行设备比如CUDAProvider等。
    目前ONNX Runtime v1.0支持了包括CPU,CUDA,TensorRT,MKL等七种Providers。
    :param model_path:
    :param provider:
    :return:
    """
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # Few properties that might have an impact on performances (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()

    return session
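
Once the session exists, its input and output signatures can be inspected directly, which helps when building the feed dictionary; the model path below is a placeholder:

session = create_model_for_provider("model.onnx", "CPUExecutionProvider")  # placeholder path

for node in session.get_inputs():
    print("input :", node.name, node.shape, node.type)
for node in session.get_outputs():
    print("output:", node.name, node.shape, node.type)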
Example No. 12
def create_model_for_provider(model_path: str, provider: str,
                              optimization_level: str) -> InferenceSession:

    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # Few properties that might have an impact on performances (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1
    if optimization_level in GRAPH_OPTIMIZATIONS:
        options.graph_optimization_level = GRAPH_OPTIMIZATIONS[optimization_level]
    else:
        raise KeyError(
            f"Unknown Optimization Level {optimization_level} (Available optimization levels are all/disable_all/basic/extended)"
        )

    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()

    return session
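
This snippet depends on a module-level GRAPH_OPTIMIZATIONS mapping that is not shown; a plausible definition, inferred from the levels named in the error message:

from onnxruntime import GraphOptimizationLevel

# Assumed mapping; only the four level names mentioned in the KeyError message are covered.
GRAPH_OPTIMIZATIONS = {
    "all": GraphOptimizationLevel.ORT_ENABLE_ALL,
    "disable_all": GraphOptimizationLevel.ORT_DISABLE_ALL,
    "basic": GraphOptimizationLevel.ORT_ENABLE_BASIC,
    "extended": GraphOptimizationLevel.ORT_ENABLE_EXTENDED,
}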
Example No. 13
    args = parser.parse_args()
    return args

if __name__ == "__main__":
    args = get_args_from_command_line()

    start = time.time()
    tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_model)
    tokens = tokenizer.encode_plus(args.text)
    tokens = {name: np.atleast_2d(value) for name, value in tokens.items()}
    end_tokenizer = time.time()
    print("Tokenization time: ", end_tokenizer - start)

    start_session = time.time()
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
    #options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    session = InferenceSession(args.onnx_model_path, options)
    end_session = time.time()
    print("Setting up session time: ", end_session - start_session)

    start_predict = time.time()
    output, = session.run(None, tokens)
    end_predict = time.time()
    print("Predict time: ", end_predict - start_predict)

    print("Overall time: ", end_predict - start)
    print(output)
Example No. 14
def create_onnxruntime_session(
        onnx_model_path,
        use_gpu,
        provider=None,
        enable_all_optimization=True,
        num_threads=-1,
        enable_profiling=False,
        verbose=False,
        provider_options={},  # map execution provider name to its option
):
    session = None
    try:
        from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(
                f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}"
            )

        if verbose:
            sess_options.log_severity_level = 0
        else:
            sess_options.log_severity_level = 4

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        if use_gpu:
            if provider == "dml":
                providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
            elif provider == "rocm":
                providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
            elif provider == "migraphx":
                providers = [
                    "MIGraphXExecutionProvider",
                    "ROCMExecutionProvider",
                    "CPUExecutionProvider",
                ]
            elif provider == "cuda":
                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
            elif provider == "tensorrt":
                providers = [
                    "TensorrtExecutionProvider",
                    "CUDAExecutionProvider",
                    "CPUExecutionProvider",
                ]
            else:
                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]

        if provider_options:
            providers = [
                (name, provider_options[name]) if name in provider_options else name
                for name in providers
            ]

        session = InferenceSession(onnx_model_path,
                                   sess_options,
                                   providers=providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
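
A usage sketch for the provider_options argument above; the model path is a placeholder, and device_id is one of the options accepted by the CUDA execution provider:

session = create_onnxruntime_session(
    "model.onnx",  # placeholder path
    use_gpu=True,
    provider="cuda",
    provider_options={"CUDAExecutionProvider": {"device_id": 0}},
)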
Example No. 15
def get_onnx_runtime_sessions(
    model_paths,
    default: bool = True,
    opt_level: int = 99,
    parallel_exe_mode: bool = True,
    n_threads: int = 4,
    provider=[
        'CPUExecutionProvider',
    ],
) -> InferenceSession:
    '''
    Optimizes the models and builds their InferenceSessions.

    Args:
        model_paths : tuple of (path_to_encoder, path_to_decoder, path_to_initial_decoder),
                      the paths of the input onnx encoder, decoder and initial decoder models.
        default (bool) : set this to True and ort will choose the best settings for your hardware
                         (you can test out different settings for better results).
        opt_level (int) : sess_options.GraphOptimizationLevel param; 1 uses 'ORT_ENABLE_BASIC',
                          2 'ORT_ENABLE_EXTENDED' and 99 'ORT_ENABLE_ALL'. Default is 99.
        parallel_exe_mode (bool) : sets the execution mode. Default is parallel.
        n_threads (int) : number of threads used to parallelize the execution within nodes. Default is 4.
        provider : execution providers list.

    Returns:
        encoder_sess : encoder onnx InferenceSession
        decoder_sess : decoder onnx InferenceSession
        decoder_sess_init : initial decoder onnx InferenceSession
    '''
    path_to_encoder, path_to_decoder, path_to_initial_decoder = model_paths

    if default:

        encoder_sess = InferenceSession(str(path_to_encoder))

        decoder_sess = InferenceSession(str(path_to_decoder))

        decoder_sess_init = InferenceSession(str(path_to_initial_decoder))

    else:

        # Few properties that might have an impact on performances
        options = SessionOptions()

        if opt_level == 1:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
        elif opt_level == 2:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_EXTENDED
        else:
            assert opt_level == 99
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        # set this true for better performance
        if parallel_exe_mode:
            options.execution_mode = ExecutionMode.ORT_PARALLEL
        else:
            options.execution_mode = ExecutionMode.ORT_SEQUENTIAL

        options.intra_op_num_threads = n_threads
        # options.inter_op_num_threads = 10

        # options.enable_profiling = True

        encoder_sess = InferenceSession(str(path_to_encoder),
                                        options,
                                        providers=provider)

        decoder_sess = InferenceSession(str(path_to_decoder),
                                        options,
                                        providers=provider)

        decoder_sess_init = InferenceSession(str(path_to_initial_decoder),
                                             options,
                                             providers=provider)

    return encoder_sess, decoder_sess, decoder_sess_init
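
A usage sketch for the helper above; the three model paths are placeholders for an exported encoder, decoder and initial decoder (e.g. from a seq2seq model):

model_paths = ("encoder.onnx", "decoder.onnx", "init_decoder.onnx")  # placeholder paths
encoder_sess, decoder_sess, decoder_sess_init = get_onnx_runtime_sessions(
    model_paths,
    default=False,        # take the explicit SessionOptions branch
    opt_level=99,
    parallel_exe_mode=True,
    n_threads=4,
)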