def create_onnxruntime_session(onnx_model_path,
                               use_gpu,
                               enable_all_optimization=True,
                               num_threads=-1,
                               enable_profiling=False,
                               verbose=False,
                               use_dml=False):
    """Create an ONNX Runtime inference session.

    Args:
        onnx_model_path: path of the ONNX model to load.
        use_gpu: when True, prefer a GPU execution provider (DML or CUDA)
            with CPU as fallback; otherwise CPU only.
        enable_all_optimization: use ORT_ENABLE_ALL graph optimizations,
            otherwise ORT_ENABLE_BASIC.
        num_threads: intra-op thread count; values <= 0 keep the
            onnxruntime default.
        enable_profiling: enable onnxruntime's built-in profiler.
        verbose: when True, set the most verbose log severity (0);
            otherwise only fatal messages (4).
        use_dml: with use_gpu, select DmlExecutionProvider instead of
            CUDAExecutionProvider.

    Returns:
        An InferenceSession, or None if creation failed (the exception
        is logged, not raised).
    """
    session = None
    try:
        # Imported lazily so that this module can be loaded without
        # onnxruntime installed; failures are logged below.
        from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            # Lazy %-style args: the message is only formatted when the
            # debug level is enabled.
            logger.debug("Session option: intra_op_num_threads=%d",
                         sess_options.intra_op_num_threads)

        sess_options.log_severity_level = 0 if verbose else 4

        logger.debug("Create session for onnx model: %s", onnx_model_path)
        if use_gpu:
            gpu_provider = 'DmlExecutionProvider' if use_dml else 'CUDAExecutionProvider'
            execution_providers = [gpu_provider, 'CPUExecutionProvider']
        else:
            execution_providers = ['CPUExecutionProvider']
        session = InferenceSession(onnx_model_path, sess_options,
                                   providers=execution_providers)
    except Exception:
        # Was a bare `except:` which also swallowed SystemExit/KeyboardInterrupt.
        logger.error("Exception", exc_info=True)
    return session
def create_onnxruntime_session(onnx_model_path,
                               use_gpu,
                               enable_all_optimization=True,
                               num_threads=-1,
                               enable_profiling=False,
                               verbose=False):
    """Create an ONNX Runtime inference session.

    Args:
        onnx_model_path: path of the ONNX model to load.
        use_gpu: when True, prefer CUDAExecutionProvider with CPU fallback.
        enable_all_optimization: use ORT_ENABLE_ALL graph optimizations,
            otherwise ORT_ENABLE_BASIC.
        num_threads: intra-op thread count; values <= 0 keep the default
            (except for the OpenMP special case below).
        enable_profiling: enable onnxruntime's built-in profiler.
        verbose: when True, set the most verbose log severity (0);
            otherwise only fatal messages (4).

    Returns:
        An InferenceSession, or None if creation failed (the exception
        is logged, not raised).
    """
    session = None
    try:
        # Imported lazily so that this module can be loaded without
        # onnxruntime installed; failures are logged below.
        from onnxruntime import (GraphOptimizationLevel, InferenceSession,
                                 SessionOptions,
                                 __version__ as onnxruntime_version)

        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            # Lazy %-style args: formatted only when debug level is enabled.
            logger.debug("Session option: intra_op_num_threads=%d",
                         sess_options.intra_op_num_threads)
        elif (not use_gpu) and (version.parse(onnxruntime_version) < version.parse('1.3.0')):
            # Set intra_op_num_threads = 1 to enable OpenMP for onnxruntime 1.2.0 (cpu)
            # onnxruntime-gpu is not built with openmp so it is better to use
            # default (0) or cpu_count instead.
            sess_options.intra_op_num_threads = 1

        sess_options.log_severity_level = 0 if verbose else 4

        logger.debug("Create session for onnx model: %s", onnx_model_path)
        execution_providers = (['CUDAExecutionProvider', 'CPUExecutionProvider']
                               if use_gpu else ['CPUExecutionProvider'])
        session = InferenceSession(onnx_model_path, sess_options,
                                   providers=execution_providers)
    except Exception:
        # Was a bare `except:` which also swallowed SystemExit/KeyboardInterrupt.
        logger.error("Exception", exc_info=True)
    return session
def run_onnx_runtime(case_name, onnx_model, data, expected, model_files,
                     rtol=1.e-3, atol=1.e-6, compare_perf=False,
                     enable_profiling=False):
    """Serialize *onnx_model*, run it with onnxruntime and compare outputs.

    The model is saved under ``tmp_path`` as ``temp_<case_name>.onnx`` and
    executed on *data* (dict feed, list, or single array). Outputs are
    compared against *expected* with ``np.allclose``. Successful model
    files are appended to *model_files*; failing ones are kept on disk
    for diagnosis. Returns True when onnxruntime is unavailable, the
    comparison result otherwise.
    """
    if not os.path.exists(tmp_path):
        os.mkdir(tmp_path)
    temp_model_file = os.path.join(tmp_path, 'temp_' + case_name + '.onnx')
    onnx.save_model(onnx_model, temp_model_file)

    try:
        import onnxruntime
        if not enable_profiling:
            sess = onnxruntime.InferenceSession(temp_model_file)
        else:
            from onnxruntime import SessionOptions
            sess_options = SessionOptions()
            sess_options.enable_profiling = True
            sess = onnxruntime.InferenceSession(temp_model_file, sess_options)
    except ImportError:
        mock_keras2onnx.common.k2o_logger().warning("Cannot import ONNXRuntime!")
        return True

    if isinstance(data, dict):
        feed_input = data
    else:
        if not isinstance(data, list):
            data = [data]
        # to avoid too complicated test code, we restrict the input name in
        # Keras test cases must be in alphabetical order. It's always true
        # unless there is any trick preventing that.
        ordered_names = sorted(inp.name for inp in sess.get_inputs())
        feed_input = dict(zip(ordered_names, data))

    actual = sess.run(None, feed_input)

    if compare_perf:
        count = 10
        time_start = time.time()
        for _ in range(count):
            sess.run(None, feed_input)
        time_end = time.time()
        print('avg ort time =' + str((time_end - time_start) / count))

    if enable_profiling:
        profile_file = sess.end_profiling()
        profile_records = load_profile_json(profile_file)
        print("Results:")
        print("-" * 64)
        for line in parse_profile_results(profile_records):
            print(line)

    if expected is None:
        return

    if isinstance(expected, tuple):
        expected = list(expected)
    elif not isinstance(expected, list):
        expected = [expected]

    res = all(np.allclose(exp, act, rtol=rtol, atol=atol)
              for exp, act in zip(expected, actual[:len(expected)]))

    # still keep the failed case files for the diagnosis.
    if res and temp_model_file not in model_files:
        model_files.append(temp_model_file)
    if not res:
        for idx, exp in enumerate(expected):
            print_mismatches(case_name, idx, exp.flatten(),
                             actual[idx].flatten(), rtol, atol)
    return res
def benchmark_op(repeat=10, number=10, name="Slice", shape_slice_fct=None,
                 save=None, opset=14, repeat_profile=1500, verbose=1):
    """Benchmark one ONNX operator against numpy and onnxruntime.

    For a range of dimensions, builds the model with *shape_slice_fct*
    and *build_ort_op*, measures numpy, onnxruntime CPU and (when
    available) onnxruntime GPU execution with :func:`measure_time`, then
    profiles CPU (and GPU) sessions on the last generated inputs.

    :param repeat: number of measurement repetitions
    :param number: number of calls per measurement
    :param name: operator name used in plot titles
    :param shape_slice_fct: ``dim -> (shape, slices)``
    :param save: forwarded to :func:`build_ort_op`
    :param opset: ONNX opset version
    :param repeat_profile: number of profiled runs
    :param verbose: print progress messages when truthy
    :return: ``(dfprof, dfprofgpu, df, rs, ax)`` — CPU profiling frame,
        GPU profiling frame (or None), raw measurements, speedup table,
        matplotlib axes.
    """
    if verbose:
        print("[benchmark_op] start repeat=%d number=%d repeat_profile=%d"
              " opset=%d." % (repeat, number, repeat_profile, opset))
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200, 256, 400, 512, 600, 784,
                     800, 1000, 1024, 1200]):
        shape, slices = shape_slice_fct(dim)
        onx, ort_fct, npy_fct, ort_fct_gpu = build_ort_op(
            save=save, op_version=opset, slices=slices)

        # Fewer arrays for the larger shapes to bound memory usage.
        n_arrays = 20
        if dim >= 512:
            n_arrays = 10
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        info = dict(shape=shape)
        ctx = dict(xs=xs, loop_fct=loop_fct)

        # numpy
        ctx['fct'] = npy_fct
        obs = measure_time(lambda: loop_fct(npy_fct, xs),
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        # onnxruntime
        ctx['fct'] = ort_fct
        obs = measure_time(lambda: loop_fct(ort_fct, xs),
                           div_by_number=True, context=ctx,
                           repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        if ort_fct_gpu is not None:
            # onnxruntime on GPU: move the inputs to the device first.
            dev = get_ort_device('cuda:0')
            ctx['xs'] = [C_OrtValue.ortvalue_from_numpy(x, dev) for x in xs]
            ctx['fct'] = ort_fct_gpu
            obs = measure_time(lambda: loop_fct(ort_fct_gpu, ctx['xs']),
                               div_by_number=True, context=ctx,
                               repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'ort_gpu'
            obs['shape'] = ",".join(map(str, shape))
            obs['slices'] = str(slices)
            obs.update(info)
            res.append(obs)

    # profiling CPU — runs on the model/inputs of the last dimension.
    if verbose:
        print("[benchmark_op] done.")
        print("[benchmark_op] profile CPU.")
    so = SessionOptions()
    so.enable_profiling = True
    sess = InferenceSession(onx.SerializeToString(), so,
                            providers=["CPUExecutionProvider"])
    for i in range(0, repeat_profile):
        sess.run(None, {'X': xs[-1]}, )
    prof = sess.end_profiling()
    with open(prof, "r") as f:
        js = json.load(f)
    dfprof = DataFrame(OnnxWholeSession.process_profiling(js))
    dfprof['shape'] = ",".join(map(str, shape))
    dfprof['slices'] = str(slices)
    if verbose:
        print("[benchmark_op] done.")

    # profiling GPU
    if ort_fct_gpu is not None:
        if verbose:
            print("[benchmark_op] profile GPU.")
        so = SessionOptions()
        so.enable_profiling = True
        sess = InferenceSession(onx.SerializeToString(), so,
                                providers=["CUDAExecutionProvider"])
        io_binding = sess.io_binding()._iobinding
        device = get_ort_device('cpu')
        for i in range(0, repeat_profile):
            x = ctx['xs'][-1]
            io_binding.bind_input('X', device, numpy.float32,
                                  x.shape(), x.data_ptr())
            io_binding.bind_output('Y', device)
            sess._sess.run_with_iobinding(io_binding, None)
        prof = sess.end_profiling()
        with open(prof, "r") as f:
            js = json.load(f)
        dfprofgpu = DataFrame(OnnxWholeSession.process_profiling(js))
        dfprofgpu['shape'] = ",".join(map(str, shape))
        dfprofgpu['slices'] = str(slices)
        if verbose:
            print("[benchmark_op] profile done.")
    else:
        dfprofgpu = None

    # Dataframes
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    # FIX: DataFrame.pivot positional arguments were deprecated and removed
    # in pandas 2.0 — keyword arguments are required.
    piv = df.pivot(index='shape', columns='fct', values='average')
    rs = piv.copy()
    for c in ['numpy', 'ort', 'ort_gpu']:
        if c in rs.columns:
            rs[f"numpy/{c}"] = rs['numpy'] / rs[c]
    # drops the degenerate numpy/numpy column
    rs = rs[[c for c in rs.columns if "/numpy" not in c]].copy()

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title=f"{name} benchmark\n{shape_name!r} lower better")
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title=f"{name} Speedup, baseline=numpy\n{shape_name!r} higher better")
    # guide lines at 0.5x and 2x speedup
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return dfprof, dfprofgpu, df, rs, ax
0) provider = 'CPUExecutionProvider' print(f"provider = {provider!r}") #################################### # We load the graph. with open(filename, 'rb') as f: onx = onnx.load(f) ############################### # Create of the session. so = SessionOptions() so.enable_profiling = True so.optimized_model_filepath = os.path.split(filename)[-1] + ".optimized.onnx" sess = InferenceSession(onx.SerializeToString(), so, providers=[provider]) bind = SessionIOBinding(sess._sess) print("graph_optimization_level:", so.graph_optimization_level) ##################################### # Creates random data feed = random_feed(sess, batch) ##################################### # moving the data on CPU or GPU feed_ort_value = OrderedDict( (name, (C_OrtValue.ortvalue_from_numpy(v, ort_device), v.dtype)) for name, v in feed.items())
def create_onnxruntime_session(
    onnx_model_path,
    use_gpu,
    provider=None,
    enable_all_optimization=True,
    num_threads=-1,
    enable_profiling=False,
    verbose=False,
    provider_options=None,  # map execution provider name to its option
):
    """Create an ONNX Runtime inference session.

    Args:
        onnx_model_path: path of the ONNX model to load.
        use_gpu: when True, select GPU providers according to *provider*
            ("dml", "rocm", "migraphx", "cuda", "tensorrt"; anything else
            defaults to CUDA) with CPU fallback; otherwise CPU only.
        provider: GPU provider name, only used when use_gpu is True.
        enable_all_optimization: use ORT_ENABLE_ALL graph optimizations,
            otherwise ORT_ENABLE_BASIC.
        num_threads: intra-op thread count; values <= 0 keep the default.
        enable_profiling: enable onnxruntime's built-in profiler.
        verbose: when True, set the most verbose log severity (0);
            otherwise only fatal messages (4).
        provider_options: optional dict mapping an execution provider name
            to its options; matching providers are passed as
            ``(name, options)`` tuples.

    Returns:
        An InferenceSession, or None if creation failed (the exception
        is logged, not raised).
    """
    session = None
    try:
        from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions

        sess_options = SessionOptions()

        if enable_all_optimization:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        if enable_profiling:
            sess_options.enable_profiling = True

        if num_threads > 0:
            sess_options.intra_op_num_threads = num_threads
            # Lazy %-style args: formatted only when debug level is enabled.
            logger.debug("Session option: intra_op_num_threads=%d",
                         sess_options.intra_op_num_threads)

        sess_options.log_severity_level = 0 if verbose else 4

        logger.debug("Create session for onnx model: %s", onnx_model_path)

        if use_gpu:
            if provider == "dml":
                providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
            elif provider == "rocm":
                providers = ["ROCMExecutionProvider", "CPUExecutionProvider"]
            elif provider == "migraphx":
                providers = [
                    "MIGraphXExecutionProvider",
                    "ROCMExecutionProvider",
                    "CPUExecutionProvider",
                ]
            elif provider == "cuda":
                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
            elif provider == "tensorrt":
                providers = [
                    "TensorrtExecutionProvider",
                    "CUDAExecutionProvider",
                    "CPUExecutionProvider",
                ]
            else:
                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]

        # FIX: the default was a mutable `{}` argument; None is the safe
        # sentinel and behaves identically in the falsy check below.
        if provider_options:
            providers = [
                (name, provider_options[name]) if name in provider_options else name
                for name in providers
            ]

        session = InferenceSession(onnx_model_path, sess_options, providers=providers)
    except Exception:
        # Was a bare `except:` which also swallowed SystemExit/KeyboardInterrupt.
        logger.error("Exception", exc_info=True)
    return session
def latency(model, law='normal', size=1, number=10, repeat=10, max_time=0,
            runtime="onnxruntime", device='cpu', profiling=None):
    """
    Measures the latency of a model (python API).

    :param model: ONNX graph
    :param law: random law used to generate fake inputs
    :param size: batch size, it replaces the first dimension
        of every input if it is left unknown
    :param number: number of calls to measure
    :param repeat: number of times to repeat the experiment
    :param max_time: if it is > 0, it runs as many time during that period of time
    :param runtime: available runtime
    :param device: device, `cpu`, `cuda:0`
    :param profiling: if True, profile the execution of every
        node, if can be sorted by name or type,
        the value for this parameter should be in `(None, 'name', 'type')`,
    :return: dictionary or a tuple (dictionary, dataframe)
        if the profiling is enable

    .. cmdref::
        :title: Measures model latency
        :cmd: -m mlprodict latency --help
        :lid: l-cmd-latency

        The command generates random inputs and call many times the
        model on these inputs. It returns the processing time for one
        iteration.

        Example::

            python -m mlprodict latency --model "model.onnx"
    """
    # Fail early on an obviously missing model file.
    if isinstance(model, str) and not os.path.exists(model):
        raise FileNotFoundError(  # pragma: no cover
            "Unable to find model %r." % model)
    if profiling not in (None, '', 'name', 'type'):
        raise ValueError("Unexpected value for profiling: %r." % profiling)
    # Normalise numeric parameters (they may arrive as strings from the CLI).
    size = int(size)
    number = int(number)
    repeat = int(repeat)
    if max_time in (None, 0, ""):
        max_time = None
    else:
        max_time = float(max_time)
        if max_time <= 0:
            max_time = None

    if law != "normal":
        raise ValueError(
            "Only law='normal' is supported, not %r." % law)

    # Map the device string to a list of execution providers.
    # NOTE(review): 'CPUExecutionProviders'/'CUDAExecutionProviders' carry a
    # trailing 's' unlike onnxruntime's real provider names
    # ('CPUExecutionProvider'); since `providers` is never passed to
    # InferenceSession below, this only affects the accepted device strings
    # and the explicit comma-separated branch — confirm before changing.
    if device in ('cpu', 'CPUExecutionProviders'):
        providers = ['CPUExecutionProviders']
    elif device in ('cuda:0', 'CUDAExecutionProviders'):
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or provider "
                "%r." % device)
        providers = ['CUDAExecutionProviders']
    elif ',' in device:
        # Explicit comma-separated list of providers; validated against the
        # providers onnxruntime was built with.
        if runtime != 'onnxruntime':
            raise NotImplementedError(  # pragma: no cover
                "Only runtime 'onnxruntime' supports this device or provider "
                "%r." % device)
        providers = device.split(',')
        allp = set(get_all_providers())
        for p in providers:
            if p not in allp:
                raise ValueError(
                    "One device or provider %r is not supported among %r."
                    "" % (p, allp))
    else:
        raise ValueError(  # pragma no cover
            "Device %r not supported." % device)

    if runtime == "onnxruntime":
        # Direct onnxruntime session; profiling enabled through SessionOptions.
        # NOTE(review): `providers` computed above is not forwarded here —
        # recent onnxruntime versions require an explicit providers argument;
        # verify against the onnxruntime version this project pins.
        if profiling in ('name', 'type'):
            so = SessionOptions()
            so.enable_profiling = True
            sess = InferenceSession(model, sess_options=so)
        else:
            sess = InferenceSession(model)
        fct = lambda feeds: sess.run(None, feeds)
        inputs = sess.get_inputs()
    else:
        # mlprodict runtime; only 'onnxruntime1' supports node profiling.
        if profiling in ('name', 'type'):
            runtime_options = {"enable_profiling": True}
            if runtime != 'onnxruntime1':
                raise NotImplementedError(  # pragma: no cover
                    "Profiling is not implemented for runtime=%r." % runtime)
        else:
            runtime_options = None
        oinf = OnnxInference(model, runtime=runtime,
                             runtime_options=runtime_options)
        fct = lambda feeds: oinf.run(feeds)
        inputs = oinf.obj.graph.input

    # Generate one random feed matching the model inputs and time the runs.
    feeds = random_feed(inputs, size)
    res = measure_time(lambda: fct(feeds), number=number, repeat=repeat,
                       context={}, max_time=max_time, div_by_number=True)
    # Record the concrete input shapes alongside the timings.
    for k, v in feeds.items():
        res["shape(%s)" % k] = "x".join(map(str, v.shape))
    if profiling in ('name', 'type'):
        if runtime == 'onnxruntime':
            # Collect the JSON profile produced by onnxruntime and convert
            # it into a DataFrame.
            profile_name = sess.end_profiling()
            with open(profile_name, 'r', encoding='utf-8') as f:
                js = json.load(f)
            js = OnnxWholeSession.process_profiling(js)
            df = DataFrame(js)
        else:
            df = oinf.get_profiling(as_df=True)
        # Aggregate durations either per (op type, node name) or per op type.
        if profiling == 'name':
            gr = df[['dur', "args_op_name", "name"]].groupby(
                ["args_op_name", "name"]).sum().sort_values('dur')
        else:
            gr = df[['dur', "args_op_name"]].groupby(
                "args_op_name").sum().sort_values('dur')
        return res, gr
    return res