def test_profile_pyinst(self):
    """
    Calls ``profile`` with every supported value of ``pyinst_format``
    and checks an unknown value raises ``ValueError``.
    """

    def simple():
        df = pandas.DataFrame([
            {"A": "x", "AA": "xx", "AAA": "xxx"},
            {"AA": "xxxxxxx", "AAA": "xxx"},
        ])
        # 99 discarded conversions followed by the returned one
        for _ in range(99):
            df2rst(df)
        return df2rst(df)

    # each format is paired with a marker expected in its report
    markers = [('text', '.py'), ('textu', 'Recorded'), ('html', "</script>")]
    for fmt, marker in markers:
        stats, report = profile(simple, pyinst_format=fmt)  # pylint: disable=W0632
        self.assertIn(marker, report)
        self.assertNotEmpty(stats)

    self.assertRaise(
        lambda: profile(simple, pyinst_format='htmlgg'), ValueError)

    stats, report = profile(simple, pyinst_format='json')  # pylint: disable=W0632
    self.assertIn('"start_time"', report)
    self.assertNotEmpty(stats)
def test_profile_df_verbose(self):
    """
    Profiles a small call tree and checks the verbose conversion
    into a dataframe reports one call for ``f4``.
    """
    counter = [0]

    def f0(t):
        counter[0] += 1
        time.sleep(t)

    def f1(t):
        counter[0] += 1
        time.sleep(t)

    def f2():
        counter[0] += 1
        f1(0.1)
        f1(0.01)

    def f3():
        counter[0] += 1
        f0(0.2)
        f1(0.5)

    def f4():
        counter[0] += 1
        f2()
        f3()

    stats = profile(f4)[0]  # pylint: disable=W0632
    captured = self.capture(
        lambda: profile2df(stats, verbose=True, fLOG=print))
    by_fct = captured[0].set_index('fct')
    self.assertEqual(by_fct.loc['f4', 'ncalls1'], 1)
    self.assertEqual(by_fct.loc['f4', 'ncalls2'], 1)
def test_profile_df(self):
    """
    Checks ``profile(..., as_df=True)`` returns a dataframe and
    ``profile2df`` returns a list of dictionaries or a dataframe
    depending on its second argument.
    """

    def simple():
        def simple2():
            df = pandas.DataFrame([
                {"A": "x", "AA": "xx", "AAA": "xxx"},
                {"AA": "xxxxxxx", "AAA": "xxx"},
            ])
            return df2rst(df)
        return simple2()

    parent = os.path.join(os.path.dirname(rootfile), '..')
    rootrem = os.path.normpath(os.path.abspath(parent))

    stats, table = profile(simple, rootrem=rootrem, as_df=True)  # pylint: disable=W0632
    self.assertIsInstance(table, pandas.DataFrame)
    first_name = table.loc[0, 'namefct']
    self.assertEqual(first_name.split('-')[-1], 'simple')
    self.assertNotEmpty(stats)

    as_rows = profile2df(stats, False)
    self.assertIsInstance(as_rows, list)
    self.assertIsInstance(as_rows[0], dict)

    as_frame = profile2df(stats, True)
    self.assertIsInstance(as_frame, pandas.DataFrame)
def test_profile(self):
    """
    Checks the text report of ``profile`` mentions the module
    implementing ``df2rst``, with and without ``rootrem``.
    """

    def simple():
        df = pandas.DataFrame([
            {"A": "x", "AA": "xx", "AAA": "xxx"},
            {"AA": "xxxxxxx", "AAA": "xxx"},
        ])
        return df2rst(df)

    parent = os.path.join(os.path.dirname(rootfile), '..')
    rootrem = os.path.normpath(os.path.abspath(parent))
    expected = 'pyquickhelper/pandashelper/tblformat.py'

    # first with the root removed from the paths, then with defaults
    for options in ({'rootrem': rootrem}, {}):
        stats, report = profile(simple, **options)  # pylint: disable=W0632
        # windows separators are normalized before the comparison
        report = report.replace('\\', '/')
        self.assertIn(expected, report)
        self.assertNotEmpty(stats)
def test_profile_graph(self):
    """
    Converts a profile into a graph of ``ProfileNode`` and checks
    its dictionary, text and json renderings.
    """
    counter = [0]

    def f0(t):
        counter[0] += 1
        time.sleep(t)

    def f1(t):
        counter[0] += 1
        time.sleep(t)

    def f2():
        counter[0] += 1
        f1(0.1)
        f1(0.01)

    def f3():
        counter[0] += 1
        f0(0.2)
        f1(0.5)

    def f4():
        counter[0] += 1
        f2()
        f3()

    def last_part(path):
        # keeps what follows the last '/'
        return path.split('/')[-1]

    stats = profile(f4)[0]  # pylint: disable=W0632
    profile2df(stats, verbose=False, clean_text=last_part)
    root, nodes = profile2graph(stats, clean_text=last_part)

    self.assertEqual(len(nodes), 6)
    self.assertIsInstance(nodes, dict)
    self.assertIsInstance(root, ProfileNode)
    self.assertIn("(", str(root))

    rows = root.as_dict()
    self.assertEqual(10, len(rows))

    rendered = root.to_text()
    self.assertIn("1 1", rendered)
    self.assertIn(' f1', rendered)
    rendered = root.to_text(fct_width=20)
    self.assertIn('...', rendered)

    # two sort keys are accepted, SortKey.NAME is not implemented
    for key in (SortKey.CUMULATIVE, SortKey.TIME):
        root.to_text(sort_key=key)
    self.assertRaise(
        lambda: root.to_text(sort_key=SortKey.NAME), NotImplementedError)

    as_json = root.to_json(indent=2)
    self.assertIn('"details"', as_json)
    as_json = root.to_json(as_str=False)
    self.assertIsInstance(as_json, dict)
def test_profile_graph_recursive1(self):
    """
    Builds the profile graph of a function calling itself
    and checks its text and json renderings.
    """

    def f0(t):
        if t < 0.1:
            time.sleep(t)
        else:
            f0(t - 0.1)

    def f4():
        f0(0.15)

    def last_part(path):
        # keeps what follows the last '/'
        return path.split('/')[-1]

    stats = profile(f4)[0]  # pylint: disable=W0632
    profile2df(stats, verbose=False, clean_text=last_part)
    root, nodes = profile2graph(stats, clean_text=last_part)
    self.assertEqual(len(nodes), 3)

    rendered = root.to_text()
    self.assertIn(" f0", rendered)
    as_json = root.to_json(indent=2)
    self.assertIn('"details"', as_json)
def test_profile_df(self):
    """
    Checks ``profile(..., as_df=True)`` returns a dataframe whose
    first row is named after the nested function ``simple2``.
    """

    def simple():
        def simple2():
            df = pandas.DataFrame([
                {"A": "x", "AA": "xx", "AAA": "xxx"},
                {"AA": "xxxxxxx", "AAA": "xxx"},
            ])
            return df2rst(df)
        return simple2()

    folder = os.path.dirname(rootfile)
    rootrem = os.path.normpath(os.path.abspath(os.path.join(folder, '..')))

    ps, table = profile(simple, rootrem=rootrem, as_df=True)
    self.assertIsInstance(table, pandas.DataFrame)
    suffix = table.loc[0, 'namefct'].split('-')[-1]
    self.assertEqual(suffix, 'simple2')
def profile_fct_graph(fct, title, highlights=None, nb=20, figsize=(10, 3)):
    """
    Profiles function *fct* and draws column ``cum_tall`` of the
    first rows of the profiling dataframe as a bar chart.
    See :ref:`onnxsklearnconsortiumrst`.

    :param fct: function to profile
    :param title: title of the graph
    :param highlights: labels of the bars to frame with a dashed red line,
        a label missing from the index selects every string index value
        containing it
    :param nb: number of rows of the profiling dataframe to draw
    :param figsize: figure size given to ``DataFrame.plot``
    :return: axis returned by ``DataFrame.plot``
    :raises ValueError: if a label of *highlights* matches no bar
    """
    root_paths = [
        os.path.dirname(sklearn.__file__),
        "site-packages",
        os.path.join(sys.prefix, "lib"),
    ]
    _, stats = profile(fct, as_df=True, rootrem=root_paths)

    # the name of the function column is not always the same
    if 'namefct' in stats.columns:
        colname = 'namefct'
    else:
        colname = 'fct'

    selection = stats[[colname, 'cum_tall']].head(n=nb)
    sdf = selection.set_index(colname)
    known = list(sdf.index)

    ax = sdf.plot(kind='bar', figsize=figsize, rot=30)
    ax.set_title(title)
    for tick in ax.get_xticklabels():
        tick.set_horizontalalignment('right')

    if not highlights:
        return ax

    height = 0.15
    for wanted in highlights:
        if wanted in known:
            matches = [wanted]
        else:
            # falls back on every string label containing the requested one
            matches = [
                name for name in known
                if isinstance(name, str) and wanted in name
            ]
            if not matches:
                raise ValueError("Unable to find '{}' in '{}'?".format(
                    wanted, ", ".join(sorted(map(str, known)))))
        for name in matches:
            pos = sdf.index.get_loc(name)
            x_left = pos - 0.35
            x_right = pos + 0.3
            # left side, right side and top of the frame
            ax.plot([x_left, x_left], [0, height], 'r--')
            ax.plot([x_right, x_right], [0, height], 'r--')
            ax.plot([x_left, x_right], [height, height], 'r--')
    return ax
################################### # Profiling and comparison with scikit-learn # ++++++++++++++++++++++++++++++++++++++++++ X32 = X.astype(numpy.float32) def runlocaldt(): for i in range(0, 5000): oinf.run({'X': X32[:10]}) dt.predict(X[:10]) print("profiling...") txt = profile(runlocaldt, pyinst_format='text') print(txt[1]) ########################################### # Profiling for AdaBoostRegressor # +++++++++++++++++++++++++++++++ # # The next example shows how long the python runtime # spends in each operator. ada = AdaBoostRegressor() ada.fit(X, y) onx = to_onnx(ada, X[:1].astype(numpy.float32), target_opset=11) oinf = OnnxInference(onx, runtime='python_compiled') print(oinf)
pos = text.find('onnxruntime') if pos >= 0: return text[pos:] pos = text.find('sklearn') if pos >= 0: return text[pos:] pos = text.find('onnxcustom') if pos >= 0: return text[pos:] pos = text.find('site-packages') if pos >= 0: return text[pos:] return text ps = profile(lambda: benchmark(X_train, y_train, nn, train_session, name='NN-CPU'))[0] root, nodes = profile2graph(ps, clean_text=clean_name) text = root.to_text() print(text) ###################################### # if GPU is available # +++++++++++++++++++ if get_device().upper() == 'GPU': train_session = OrtGradientForwardBackwardOptimizer( onx, device='cuda', learning_rate=1e-5, warm_start=False, max_iter=max_iter, batch_size=batch_size) benches.append(benchmark(X_train, y_train, nn,
######################################### # Graph # +++++ fig, ax = plt.subplots(1, 1) piv.plot(title="Time processing of serialization functions\n" "lower better", ax=ax) ax.set_xlabel("onnx size") ax.set_ylabel("s") ########################################### # Conclusion # ++++++++++ # # This graph shows that implementing check_model in python is much slower # than the C++ version. However, protobuf prevents from sharing # ModelProto from Python to C++ (see `Python Updates # <https://developers.google.com/protocol-buffers/docs/news/2022-05-06>`_) # unless the python package is compiled with a specific setting # (problably slower). A profiling shows that the code spends quite some time # in function :func:`getattr`. ps = profile(lambda: check_model_py(onx))[0] root, nodes = profile2graph(ps, clean_text=lambda x: x.split('/')[-1]) text = root.to_text() print(text) # plt.show()
# This step involves :epkg:`pyinstrument` to measure # where the time is spent. Both :epkg:`scikit-learn` # and :epkg:`mlprodict` runtime are called so that # the prediction times can be compared. X32 = X_test.astype(numpy.float32) def runlocal(): for i in range(0, 100): oinf.run({'X': X32[:1000]}) hgb.predict(X_test[:1000]) print("profiling...") txt = profile(runlocal, pyinst_format='text') print(txt[1]) ########################################## # Now let's measure the performance the average # computation time per observations for 2 to 100 # observations. The runtime implemented in # :epkg:`mlprodict` parallizes the computation # after a given number of observations. obs = [] for N in tqdm(list(range(2, 21))): m = measure_time("oinf.run({'X': x})", { 'oinf': oinf, 'x': X32[:N] },
def _run_training(train_fct, mode, prefix):
    """
    Runs *train_fct* once, optionally under a profiler, and times it.

    :param train_fct: function called without argument,
        it returns the running loss
    :param mode: `'cProfile'` or `'fct'` to profile with function
        ``profile`` from :epkg:`pyquickhelper`, `'event'` to use
        WithEventProfiler, any other value disables the profiling
    :param prefix: name of the dumped file without its extension,
        ``.prof`` is added for the first mode, ``.csv`` for the second
    :return: tuple *(running loss, duration in seconds)*
    """
    use_cprofile = mode in ('cProfile', 'fct')
    if use_cprofile:
        # Imported under an alias and before the timer starts: the caller
        # owns a parameter called ``profile`` which must not be rebound.
        from pyquickhelper.pycode.profiling import profile as run_profile

    begin = time.perf_counter()
    if use_cprofile:
        running_loss, prof, _ = run_profile(train_fct, return_results=True)
        duration = time.perf_counter() - begin
        prof.dump_stats(prefix + ".prof")
    elif mode == 'event':

        def clean_name(x):
            # keeps the last three parts of a path
            return "/".join(x.replace("\\", "/").split('/')[-3:])

        prof = WithEventProfiler(size=10000000, clean_file_name=clean_name)
        with prof:
            running_loss = train_fct()
        duration = time.perf_counter() - begin
        df = prof.report
        df.to_csv(prefix + ".csv", index=False)
    else:
        running_loss = train_fct()
        duration = time.perf_counter() - begin
    return running_loss, duration


def benchmark(N=1000, n_features=20, hidden_layer_sizes="26,25",
              max_iter=1000, learning_rate_init=1e-4, batch_size=100,
              run_torch=True, device='cpu', opset=12, profile='fct'):
    """
    Compares the training of the same network with :epkg:`torch` alone
    and with the network wrapped into ``ORTModule``
    (:epkg:`onnxruntime-training`). Training algorithm is SGD.
    The function prints the measured times and returns nothing.

    :param N: number of observations to train on
    :param n_features: number of features
    :param hidden_layer_sizes: hidden layer sizes, comma separated values
        or a tuple of integers
    :param max_iter: number of iterations
    :param learning_rate_init: initial learning rate
    :param batch_size: batch size
    :param run_torch: trains the plain torch network as well (True)
        or skips it (False), in that case its time is reported as 0
        and its loss as -1
    :param device: `'cpu'` or `'cuda'`
    :param opset: unused, only printed
    :param profile: 'fct' to use cProfile, 'event' to use WithEventProfiler,
        any other value disables the profiling
    """
    N = int(N)
    n_features = int(n_features)
    max_iter = int(max_iter)
    learning_rate_init = float(learning_rate_init)
    batch_size = int(batch_size)
    run_torch = run_torch in (1, True, '1', 'True')

    print("N=%d" % N)
    print("n_features=%d" % n_features)
    print(f"hidden_layer_sizes={hidden_layer_sizes!r}")
    print("max_iter=%d" % max_iter)
    print(f"learning_rate_init={learning_rate_init:f}")
    print("batch_size=%d" % batch_size)
    print(f"run_torch={run_torch!r}")
    print(f"opset={opset!r} (unused)")
    print(f"device={device!r}")
    print(f"profile={profile!r}")
    device0 = device
    device = torch.device(
        "cuda:0" if device in ('cuda', 'cuda:0', 'gpu') else "cpu")
    print(f"fixed device={device!r}")
    print('------------------')

    if not isinstance(hidden_layer_sizes, tuple):
        hidden_layer_sizes = tuple(map(int, hidden_layer_sizes.split(",")))
    X, y = make_regression(N, n_features=n_features, bias=2)
    X = X.astype(numpy.float32)
    y = y.astype(numpy.float32)
    # the test part of the split is not used
    X_train, _, y_train, _ = train_test_split(X, y)

    class Net(torch.nn.Module):
        def __init__(self, n_features, hidden, n_output):
            super().__init__()
            self.hidden = []
            size = n_features
            for i, hid in enumerate(hidden):
                layer = torch.nn.Linear(size, hid)
                self.hidden.append(layer)
                # a python list does not register its modules,
                # every layer is also stored as an attribute
                setattr(self, "hid%d" % i, layer)
                size = hid
            self.hidden.append(torch.nn.Linear(size, n_output))
            setattr(self, "predict", self.hidden[-1])

        def forward(self, x):
            # NOTE(review): relu also follows the last layer,
            # kept as in the original script
            for hid in self.hidden:
                x = hid(x)
                x = F.relu(x)
            return x

    # every dumped file starts with the initial device and the script name
    base_name = f"{device0}.{os.path.split(__file__)[-1]}"

    nn = Net(n_features, hidden_layer_sizes, 1)
    # follows the resolved device, an unknown device name falls back on cpu
    nn.to(device)
    print(
        f"n_parameters={len(list(nn.parameters()))}, n_layers={len(nn.hidden)}"
    )
    for i, p in enumerate(nn.parameters()):
        print(" p[%d].shape=%r" % (i, p.shape))

    optimizer = torch.optim.SGD(nn.parameters(), lr=learning_rate_init)
    # reduction='sum' replaces the deprecated size_average=False
    criterion = torch.nn.MSELoss(reduction='sum')
    batch_no = len(X_train) // batch_size

    # training: a first call happens outside of the measured section
    inputs = torch.tensor(X_train[:1], requires_grad=True, device=device)
    nn(inputs)

    def train_torch():
        # value returned when max_iter is 0, otherwise the loss is
        # reset at every epoch and the last one is returned
        running_loss = 0.0
        for _ in range(max_iter):
            running_loss = 0.0
            x, y = shuffle(X_train, y_train)
            for i in range(batch_no):
                start = i * batch_size
                end = start + batch_size
                inputs = torch.tensor(
                    x[start:end], requires_grad=True, device=device)
                labels = torch.tensor(
                    y[start:end], requires_grad=True, device=device)

                def step_torch():
                    optimizer.zero_grad()
                    outputs = nn(inputs)
                    loss = criterion(
                        outputs, torch.unsqueeze(labels, dim=1))
                    loss.backward()
                    optimizer.step()
                    return loss

                loss = step_torch()
                running_loss += loss.item()
        return running_loss

    if run_torch:
        running_loss, dur_torch = _run_training(
            train_torch, profile, base_name + ".tch")
        print(f"time_torch={dur_torch!r}, running_loss={running_loss!r}")
        running_loss0 = running_loss
    else:
        dur_torch = 0.0
        running_loss0 = -1

    # ORTModule
    nn = Net(n_features, hidden_layer_sizes, 1)
    nn.to(device)

    nn_ort = ORTModule(nn)
    optimizer = torch.optim.SGD(nn_ort.parameters(), lr=learning_rate_init)
    criterion = torch.nn.MSELoss(reduction='sum')

    # exclude onnx conversion
    inputs = torch.tensor(X_train[:1], requires_grad=True, device=device)
    nn_ort(inputs)

    def train_ort():
        running_loss = 0.0
        for _ in range(max_iter):
            running_loss = 0.0
            x, y = shuffle(X_train, y_train)
            for i in range(batch_no):
                start = i * batch_size
                end = start + batch_size
                inputs = torch.tensor(
                    x[start:end], requires_grad=True, device=device)
                labels = torch.tensor(
                    y[start:end], requires_grad=True, device=device)

                def step_ort():
                    optimizer.zero_grad()
                    outputs = nn_ort(inputs)
                    loss = criterion(
                        outputs, torch.unsqueeze(labels, dim=1))
                    loss.backward()
                    optimizer.step()
                    return loss

                loss = step_ort()
                running_loss += loss.item()
        return running_loss

    # ``profile`` still holds the requested mode at this point,
    # the ORT training is profiled the same way as the torch one
    running_loss, dur_ort = _run_training(
        train_ort, profile, base_name + ".ort")

    print(f"time_torch={dur_torch!r}, running_loss={running_loss0!r}")
    print(f"time_ort={dur_ort!r}, last_trained_error={running_loss!r}")
################################# # Training. for name, model in tqdm(models): model.fit(data) ################################# # Profiling of runtime `onnxruntime1`. def fct(): for i in range(1000): models[2][1].transform(data) res = profile(fct, pyinst_format="text") print(res[1]) ################################# # Profiling of runtime `numpy`. def fct(): for i in range(1000): models[3][1].transform(data) res = profile(fct, pyinst_format="text") print(res[1]) #################################
def test_profile_stat_gr(self):
    """
    Dumps a profile on disk and runs command line ``profile_stat``
    on it with no output file, then a text, a csv and an xlsx one.
    """

    def f0(t):
        time.sleep(t)

    def f1(t):
        time.sleep(t)

    def f2():
        f1(0.1)
        f1(0.01)

    def f3():
        f0(0.2)
        f1(0.5)

    def f4():
        f2()
        f3()

    stats = profile(f4)[0]  # pylint: disable=W0632
    stats.dump_stats("temp_gr_stat.prof")

    # arguments shared by every call to the command line
    common = ['profile_stat', '-f', "temp_gr_stat.prof", '--calls', '1']

    with self.subTest(calls=False, output=None):
        buffer = BufferedPrint()
        main(args=list(common), fLOG=buffer.fprint)
        self.assertIn('+++', str(buffer))

    with self.subTest(calls=False, output="txt"):
        buffer = BufferedPrint()
        main(args=common + ['-o', 'temp_gr_output.txt'],
             fLOG=buffer.fprint)
        with open("temp_gr_output.txt", "r", encoding='utf-8') as f:
            written = f.read()
        self.assertIn('+++', str(buffer))
        self.assertIn('+++', written)

    with self.subTest(calls=False, output='csv'):
        buffer = BufferedPrint()
        main(args=common + ['-o', 'temp_gr_output.csv'],
             fLOG=buffer.fprint)
        with open("temp_gr_output.csv", "r", encoding='utf-8') as f:
            written = f.read()
        self.assertIn('+++', str(buffer))
        self.assertIn(',+', written)

    with self.subTest(calls=False, output='xlsx'):
        buffer = BufferedPrint()
        main(args=common + ['-o', 'temp_gr_output.xlsx'],
             fLOG=buffer.fprint)
        self.assertIn('+++', str(buffer))
        self.assertExists('temp_gr_output.xlsx')
# C++ implementation vs numpy # +++++++++++++++++++++++++++ # # :epkg:`scikit-learn` uses :epkg:`numpy` to compute the top *k* elements. res = benchmark(X32, lambda x: topk_sorted_implementation(x, 5, 1, 0), lambda x: topk_sorted_implementation_cpp(x, 5, 1, 0), N=N) res ########################################### # It seems to be faster too. Let's profile. xr = randn(1000000, 100) text = profile(lambda: topk_sorted_implementation(xr, 5, 1, 0), pyinst_format='text')[1] print(text) #################################### # Parallelisation # +++++++++++++++ # # We need to disable the parallelisation to # really compare both implementation. # In[11]: def benchmark_test(X, fct1, fct2, N, K, repeat=10, number=10): res = {} for k in tqdm(K):