Example #1
def create_onnxruntime_session(onnx_model_path,
                               use_gpu,
                               enable_all_optimization=True,
                               num_threads=-1,
                               enable_profiling=False,
                               verbose=False,
                               use_dml=False):
    session = None
    try:
        from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")

        if verbose:
            sess_options.log_severity_level = 0
        else:
            sess_options.log_severity_level = 4

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        if use_gpu:
            if use_dml:
                execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
            else:
                execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        else:
            execution_providers = ['CPUExecutionProvider']
        session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
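A minimal usage sketch for the helper above. The model path, thread count, and the module-level logger are illustrative assumptions, not part of the original snippet.

import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

# Hypothetical model path for illustration only.
session = create_onnxruntime_session("model.onnx", use_gpu=False, num_threads=4)
if session is not None:
    print([i.name for i in session.get_inputs()])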
Example #2
def create_onnxruntime_session(onnx_model_path,
                               use_gpu,
                               enable_all_optimization=True,
                               num_threads=-1,
                               enable_profiling=False,
                               verbose=False):
    session = None
    try:
        from packaging import version
        from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")
        elif (not use_gpu) and (version.parse(onnxruntime_version) < version.parse('1.3.0')):
            # Set intra_op_num_threads = 1 to enable OpenMP for onnxruntime 1.2.0 (cpu)
            # onnxruntime-gpu is not built with openmp so it is better to use default (0) or cpu_count instead.
            sess_options.intra_op_num_threads = 1

        if verbose:
            sess_options.log_severity_level = 0
        else:
            sess_options.log_severity_level = 4

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        execution_providers = (['CUDAExecutionProvider', 'CPUExecutionProvider']
                               if use_gpu else ['CPUExecutionProvider'])
        session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
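This variant relies on packaging's version parser to detect old OpenMP-era builds. A small standalone sketch of that check (assuming packaging and onnxruntime are installed):

from packaging import version
import onnxruntime

# CPU packages older than 1.3.0 were built with OpenMP, which is why the
# snippet above pins intra_op_num_threads to 1 for them.
print(version.parse(onnxruntime.__version__) < version.parse('1.3.0'))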
Example #3
def run_onnx_runtime(case_name, onnx_model, data, expected, model_files, rtol=1.e-3, atol=1.e-6,
                     compare_perf=False, enable_profiling=False):
    if not os.path.exists(tmp_path):
        os.mkdir(tmp_path)
    temp_model_file = os.path.join(tmp_path, 'temp_' + case_name + '.onnx')
    onnx.save_model(onnx_model, temp_model_file)
    try:
        import onnxruntime
        if enable_profiling:
            from onnxruntime import SessionOptions
            sess_options = SessionOptions()
            sess_options.enable_profiling = True
            sess = onnxruntime.InferenceSession(temp_model_file, sess_options)
        else:
            sess = onnxruntime.InferenceSession(temp_model_file)
    except ImportError:
        mock_keras2onnx.common.k2o_logger().warning("Cannot import ONNXRuntime!")
        return True

    if isinstance(data, dict):
        feed_input = data
    else:
        data = data if isinstance(data, list) else [data]
        input_names = sess.get_inputs()
        # To keep the test code simple, we require that input names in Keras test cases
        # are in alphabetical order. This always holds unless something deliberately breaks it.
        feed = zip(sorted(i_.name for i_ in input_names), data)
        feed_input = dict(feed)
    actual = sess.run(None, feed_input)
    if compare_perf:
        count = 10
        time_start = time.time()
        for i in range(count):
            sess.run(None, feed_input)
        time_end = time.time()
        print('avg ort time = ' + str((time_end - time_start) / count))

    if enable_profiling:
        profile_file = sess.end_profiling()
        profile_records = load_profile_json(profile_file)
        lines = parse_profile_results(profile_records)
        print("Results:")
        print("-" * 64)
        for line in lines:
            print(line)

    if expected is None:
        return

    if isinstance(expected, tuple):
        expected = list(expected)
    elif not isinstance(expected, list):
        expected = [expected]

    res = all(np.allclose(expected[n_], actual[n_], rtol=rtol, atol=atol) for n_ in range(len(expected)))

    if res and temp_model_file not in model_files:  # still keep the failed case files for the diagnosis.
        model_files.append(temp_model_file)

    if not res:
        for n_ in range(len(expected)):
            expected_list = expected[n_].flatten()
            actual_list = actual[n_].flatten()
            print_mismatches(case_name, n_, expected_list, actual_list, rtol, atol)

    return res
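A standalone sketch of the alphabetical feed construction used above; the input names and data values are made up for illustration.

# Inputs are matched to data purely by sorted name, exactly as in the helper above.
input_names = ["input_b", "input_a"]   # hypothetical names, e.g. from sess.get_inputs()
data = ["array_for_a", "array_for_b"]  # data supplied in alphabetical order of input name
feed_input = dict(zip(sorted(input_names), data))
print(feed_input)  # {'input_a': 'array_for_a', 'input_b': 'array_for_b'}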
Example #4
def benchmark_op(repeat=10,
                 number=10,
                 name="Slice",
                 shape_slice_fct=None,
                 save=None,
                 opset=14,
                 repeat_profile=1500,
                 verbose=1):
    if verbose:
        print("[benchmark_op] start repeat=%d number=%d repeat_profile=%d"
              " opset=%d." % (repeat, number, repeat_profile, opset))
    res = []
    for dim in tqdm([
            8, 16, 32, 64, 100, 128, 200, 256, 400, 512, 600, 784, 800, 1000,
            1024, 1200
    ]):
        shape, slices = shape_slice_fct(dim)
        onx, ort_fct, npy_fct, ort_fct_gpu = build_ort_op(save=save,
                                                          op_version=opset,
                                                          slices=slices)

        n_arrays = 20
        if dim >= 512:
            n_arrays = 10
        xs = [
            numpy.random.rand(*shape).astype(numpy.float32)
            for _ in range(n_arrays)
        ]
        info = dict(shape=shape)

        ctx = dict(xs=xs, loop_fct=loop_fct)

        # numpy
        ctx['fct'] = npy_fct
        obs = measure_time(lambda: loop_fct(npy_fct, xs),
                           div_by_number=True,
                           context=ctx,
                           repeat=repeat,
                           number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        # onnxruntime
        ctx['fct'] = ort_fct
        obs = measure_time(lambda: loop_fct(ort_fct, xs),
                           div_by_number=True,
                           context=ctx,
                           repeat=repeat,
                           number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        if ort_fct_gpu is not None:

            # onnxruntime
            dev = get_ort_device('cuda:0')
            ctx['xs'] = [C_OrtValue.ortvalue_from_numpy(x, dev) for x in xs]
            ctx['fct'] = ort_fct_gpu
            obs = measure_time(lambda: loop_fct(ort_fct_gpu, ctx['xs']),
                               div_by_number=True,
                               context=ctx,
                               repeat=repeat,
                               number=number)
            obs['dim'] = dim
            obs['fct'] = 'ort_gpu'
            obs['shape'] = ",".join(map(str, shape))
            obs['slices'] = str(slices)
            obs.update(info)
            res.append(obs)

    # profiling CPU
    if verbose:
        print("[benchmark_op] done.")
        print("[benchmark_op] profile CPU.")
    so = SessionOptions()
    so.enable_profiling = True
    sess = InferenceSession(onx.SerializeToString(),
                            so,
                            providers=["CPUExecutionProvider"])
    for i in range(0, repeat_profile):
        sess.run(
            None,
            {'X': xs[-1]},
        )
    prof = sess.end_profiling()
    with open(prof, "r") as f:
        js = json.load(f)
    dfprof = DataFrame(OnnxWholeSession.process_profiling(js))
    dfprof['shape'] = ",".join(map(str, shape))
    dfprof['slices'] = str(slices)
    if verbose:
        print("[benchmark_op] done.")

    # profiling GPU
    if ort_fct_gpu is not None:
        if verbose:
            print("[benchmark_op] profile GPU.")
        so = SessionOptions()
        so.enable_profiling = True
        sess = InferenceSession(onx.SerializeToString(),
                                so,
                                providers=["CUDAExecutionProvider"])
        io_binding = sess.io_binding()._iobinding
        device = get_ort_device('cpu')

        for i in range(0, repeat_profile):
            x = ctx['xs'][-1]
            io_binding.bind_input('X', device, numpy.float32, x.shape(),
                                  x.data_ptr())
            io_binding.bind_output('Y', device)
            sess._sess.run_with_iobinding(io_binding, None)

        prof = sess.end_profiling()
        with open(prof, "r") as f:
            js = json.load(f)
        dfprofgpu = DataFrame(OnnxWholeSession.process_profiling(js))
        dfprofgpu['shape'] = ",".join(map(str, shape))
        dfprofgpu['slices'] = str(slices)
        if verbose:
            print("[benchmark_op] profile done.")
    else:
        dfprofgpu = None

    # Dataframes
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    piv = df.pivot(index='shape', columns='fct', values='average')

    rs = piv.copy()
    for c in ['numpy', 'ort', 'ort_gpu']:
        if c in rs.columns:
            rs[f"numpy/{c}"] = rs['numpy'] / rs[c]
    rs = rs[[c for c in rs.columns if "/numpy" not in c]].copy()

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True,
             logy=True,
             ax=ax[0],
             title=f"{name} benchmark\n{shape_name!r} lower better")
    ax[0].legend(prop={"size": 9})
    rs.plot(
        logx=True,
        logy=True,
        ax=ax[1],
        title=f"{name} Speedup, baseline=numpy\n{shape_name!r} higher better")
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return dfprof, dfprofgpu, df, rs, ax
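benchmark_op expects shape_slice_fct to map a dimension to a (shape, slices) pair. The sketch below is a hypothetical example of such a callable; the exact contract expected by build_ort_op is not shown here.

# Hypothetical callable: square input, slicing the first half of each axis.
def my_shape_slice_fct(dim):
    shape = (dim, dim)
    slices = [slice(0, dim // 2), slice(0, dim // 2)]
    return shape, slices

# dfprof, dfprofgpu, df, rs, ax = benchmark_op(name="Slice", shape_slice_fct=my_shape_slice_fct)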
Example #5
                             0)
    provider = 'CPUExecutionProvider'

print(f"provider = {provider!r}")

####################################
# We load the graph.

with open(filename, 'rb') as f:
    onx = onnx.load(f)

###############################
# Create the session.

so = SessionOptions()
so.enable_profiling = True
so.optimized_model_filepath = os.path.split(filename)[-1] + ".optimized.onnx"
sess = InferenceSession(onx.SerializeToString(), so, providers=[provider])
bind = SessionIOBinding(sess._sess)

print("graph_optimization_level:", so.graph_optimization_level)

#####################################
# Creates random data
feed = random_feed(sess, batch)

#####################################
# Move the data to CPU or GPU.
feed_ort_value = OrderedDict(
    (name, (C_OrtValue.ortvalue_from_numpy(v, ort_device), v.dtype))
    for name, v in feed.items())
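The fragment stops after preparing feed_ort_value. A sketch of how those values could be bound and run, following the same io_binding pattern as Example #4; the output name 'Y' is an assumption.

# Continue from the fragment above: bind each prepared OrtValue by name, then run.
for name, (ort_value, dtype) in feed_ort_value.items():
    bind.bind_input(name, ort_device, dtype, ort_value.shape(), ort_value.data_ptr())
bind.bind_output('Y', ort_device)  # hypothetical output name
sess._sess.run_with_iobinding(bind, None)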
Example #6
def create_onnxruntime_session(
        onnx_model_path,
        use_gpu,
        provider=None,
        enable_all_optimization=True,
        num_threads=-1,
        enable_profiling=False,
        verbose=False,
        provider_options={},  # map execution provider name to its option
):
    session = None
    try:
        from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            logger.debug(
                f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}"
            )

        if verbose:
            sess_options.log_severity_level = 0
        else:
            sess_options.log_severity_level = 4

        logger.debug(f"Create session for onnx model: {onnx_model_path}")
        if use_gpu:
            if provider == "dml":
                providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
            elif provider == "rocm":
                providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
            elif provider == "migraphx":
                providers = [
                    "MIGraphXExecutionProvider",
                    "ROCMExecutionProvider",
                    "CPUExecutionProvider",
                ]
            elif provider == "cuda":
                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
            elif provider == "tensorrt":
                providers = [
                    "TensorrtExecutionProvider",
                    "CUDAExecutionProvider",
                    "CPUExecutionProvider",
                ]
            else:
                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]

        if provider_options:
            providers = [
                (name,
                 provider_options[name]) if name in provider_options else name
                for name in providers
            ]

        session = InferenceSession(onnx_model_path,
                                   sess_options,
                                   providers=providers)
    except Exception:
        logger.error("Exception", exc_info=True)

    return session
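A usage sketch for the provider_options mapping; the model path and device id are illustrative assumptions.

# Attach CUDA-specific options to its provider entry; other entries stay plain strings.
session = create_onnxruntime_session(
    "model.onnx",  # hypothetical model path
    use_gpu=True,
    provider="cuda",
    provider_options={"CUDAExecutionProvider": {"device_id": 0}},
)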
Example #7
def latency(model,
            law='normal',
            size=1,
            number=10,
            repeat=10,
            max_time=0,
            runtime="onnxruntime",
            device='cpu',
            profiling=None):
    """
    Measures the latency of a model (python API).

    :param model: ONNX graph
    :param law: random law used to generate fake inputs
    :param size: batch size, it replaces the first dimension
        of every input if it is left unknown
    :param number: number of calls to measure
    :param repeat: number of times to repeat the experiment
    :param max_time: if greater than 0, the measurement runs as many
        times as possible within that period of time
    :param runtime: available runtime
    :param device: device, `cpu`, `cuda:0`
    :param profiling: if not None, profiles the execution of every
        node; the results can be aggregated by name or type, so the
        value for this parameter should be in `(None, 'name', 'type')`
    :return: dictionary, or a tuple (dictionary, dataframe)
        if profiling is enabled

    .. cmdref::
        :title: Measures model latency
        :cmd: -m mlprodict latency --help
        :lid: l-cmd-latency

        The command generates random inputs and calls the model many
        times on these inputs. It returns the processing time for one
        iteration.

        Example::

            python -m mlprodict latency --model "model.onnx"
    """
    if isinstance(model, str) and not os.path.exists(model):
        raise FileNotFoundError(  # pragma: no cover
            "Unable to find model %r." % model)
    if profiling not in (None, '', 'name', 'type'):
        raise ValueError("Unexpected value for profiling: %r." % profiling)
    size = int(size)
    number = int(number)
    repeat = int(repeat)
    if max_time in (None, 0, ""):
        max_time = None
    else:
        max_time = float(max_time)
        if max_time <= 0:
            max_time = None

    if law != "normal":
        raise ValueError("Only law='normal' is supported, not %r." % law)

    if device in ('cpu', 'CPUExecutionProvider'):
        providers = ['CPUExecutionProvider']
    elif device in ('cuda:0', 'CUDAExecutionProvider'):
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or provider "
                "%r." % device)
        providers = ['CUDAExecutionProvider']
    elif ',' in device:
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or provider "
                "%r." % device)
        providers = device.split(',')
        allp = set(get_all_providers())
        for p in providers:
            if p not in allp:
                raise ValueError(
                    "One device or provider %r is not supported among %r."
                    "" % (p, allp))
    else:
        raise ValueError(  # pragma: no cover
            "Device %r not supported." % device)

    if runtime == "onnxruntime":
        if profiling in ('name', 'type'):
            so = SessionOptions()
            so.enable_profiling = True
            sess = InferenceSession(model, sess_options=so, providers=providers)
        else:
            sess = InferenceSession(model, providers=providers)
        fct = lambda feeds: sess.run(None, feeds)
        inputs = sess.get_inputs()
    else:
        if profiling in ('name', 'type'):
            runtime_options = {"enable_profiling": True}
            if runtime != 'onnxruntime1':
                raise NotImplementedError(  # pragma: no cover
                    "Profiling is not implemented for runtime=%r." % runtime)
        else:
            runtime_options = None
        oinf = OnnxInference(model,
                             runtime=runtime,
                             runtime_options=runtime_options)
        fct = lambda feeds: oinf.run(feeds)
        inputs = oinf.obj.graph.input

    feeds = random_feed(inputs, size)
    res = measure_time(lambda: fct(feeds),
                       number=number,
                       repeat=repeat,
                       context={},
                       max_time=max_time,
                       div_by_number=True)
    for k, v in feeds.items():
        res["shape(%s)" % k] = "x".join(map(str, v.shape))
    if profiling in ('name', 'type'):
        if runtime == 'onnxruntime':
            profile_name = sess.end_profiling()
            with open(profile_name, 'r', encoding='utf-8') as f:
                js = json.load(f)
            js = OnnxWholeSession.process_profiling(js)
            df = DataFrame(js)
        else:
            df = oinf.get_profiling(as_df=True)
        if profiling == 'name':
            gr = df[['dur', "args_op_name", "name"]].groupby(
                ["args_op_name", "name"]).sum().sort_values('dur')
        else:
            gr = df[['dur', "args_op_name"]].groupby(
                "args_op_name").sum().sort_values('dur')
        return res, gr

    return res
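A minimal call sketch; the model file name is a placeholder.

# Per-node profiling aggregated by operator type returns (metrics, dataframe).
res, gr = latency("model.onnx", profiling='type', device='cpu')
print(res)
print(gr)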