def test_profile_pyinst(self):
        """
        Runs :func:`profile` with every *pyinst_format* the test knows
        about and looks for a marker in each report. An unknown format
        is expected to raise :class:`ValueError`.
        """
        def simple():
            rows = [
                {"A": "x", "AA": "xx", "AAA": "xxx"},
                {"AA": "xxxxxxx", "AAA": "xxx"},
            ]
            table = pandas.DataFrame(rows)
            # 99 conversions whose result is dropped, then the one returned
            for _ in range(99):
                df2rst(table)
            return df2rst(table)

        # (format, text expected in the report); None means the format
        # must be rejected
        expectations = [
            ('text', '.py'),
            ('textu', 'Recorded'),
            ('html', "</script>"),
            ('htmlgg', None),
            ('json', '"start_time"'),
        ]
        for fmt, marker in expectations:
            if marker is None:
                self.assertRaise(
                    lambda: profile(simple, pyinst_format=fmt), ValueError)
                continue
            stats, report = profile(  # pylint: disable=W0632
                simple, pyinst_format=fmt)
            self.assertIn(marker, report)
            self.assertNotEmpty(stats)
    def test_profile_df_verbose(self):
        """
        Profiles a small tree of nested calls and checks the call counts
        :func:`profile2df` reports for the entry point in verbose mode.
        """
        # one-element list used as a mutable counter shared by the closures
        calls = [0]

        def f0(t):
            calls[0] += 1
            time.sleep(t)

        def f1(t):
            calls[0] += 1
            time.sleep(t)

        def f2():
            calls[0] += 1
            f1(0.1)
            f1(0.01)

        def f3():
            calls[0] += 1
            f0(0.2)
            f1(0.5)

        def f4():
            # entry point: f4 -> f2 -> f1 (twice) and f4 -> f3 -> f0, f1
            calls[0] += 1
            f2()
            f3()

        # only the first element returned by profile is kept
        ps = profile(f4)[0]  # pylint: disable=W0632
        # verbose messages are sent to print; self.capture presumably
        # intercepts the output and returns the callable's result first
        # -- TODO confirm against the test base class
        df = self.capture(lambda: profile2df(ps, verbose=True, fLOG=print))[0]
        dfi = df.set_index('fct')
        # f4 is invoked exactly once by the profiler
        self.assertEqual(dfi.loc['f4', 'ncalls1'], 1)
        self.assertEqual(dfi.loc['f4', 'ncalls2'], 1)
    def test_profile_df(self):
        """
        Checks :func:`profile` with ``as_df=True`` and both output
        flavours of :func:`profile2df` (list of dictionaries, DataFrame).
        """
        def simple():
            def simple2():
                rows = [
                    {"A": "x", "AA": "xx", "AAA": "xxx"},
                    {"AA": "xxxxxxx", "AAA": "xxx"},
                ]
                return df2rst(pandas.DataFrame(rows))

            return simple2()

        parent = os.path.join(os.path.dirname(rootfile), '..')
        rootrem = os.path.normpath(os.path.abspath(parent))
        stats, frame = profile(  # pylint: disable=W0632
            simple, rootrem=rootrem, as_df=True)
        self.assertIsInstance(frame, pandas.DataFrame)
        # the function name is what follows the last '-' in column namefct
        first_name = frame.loc[0, 'namefct'].split('-')[-1]
        self.assertEqual(first_name, 'simple')
        self.assertNotEmpty(stats)

        records = profile2df(stats, False)
        self.assertIsInstance(records, list)
        self.assertIsInstance(records[0], dict)

        table = profile2df(stats, True)
        self.assertIsInstance(table, pandas.DataFrame)
    def test_profile(self):
        """
        Profiles a conversion of a DataFrame into RST and checks the text
        report mentions ``tblformat.py``, with and without *rootrem*.
        """
        def simple():
            rows = [
                {"A": "x", "AA": "xx", "AAA": "xxx"},
                {"AA": "xxxxxxx", "AAA": "xxx"},
            ]
            return df2rst(pandas.DataFrame(rows))

        parent = os.path.join(os.path.dirname(rootfile), '..')
        rootrem = os.path.normpath(os.path.abspath(parent))
        expected = 'pyquickhelper/pandashelper/tblformat.py'

        # first call removes the root folder from the paths,
        # second call relies on the default behaviour
        for options in ({'rootrem': rootrem}, {}):
            stats, report = profile(simple, **options)  # pylint: disable=W0632
            # normalises Windows separators before looking for the path
            self.assertIn(expected, report.replace('\\', '/'))
            self.assertNotEmpty(stats)
    def test_profile_graph(self):
        """
        Profiles a small tree of nested calls, converts the statistics
        into a graph with :func:`profile2graph` and checks the text,
        dictionary and JSON renderings of the root node.
        """
        # one-element list used as a mutable counter shared by the closures
        calls = [0]

        def f0(t):
            calls[0] += 1
            time.sleep(t)

        def f1(t):
            calls[0] += 1
            time.sleep(t)

        def f2():
            calls[0] += 1
            f1(0.1)
            f1(0.01)

        def f3():
            calls[0] += 1
            f0(0.2)
            f1(0.5)

        def f4():
            # entry point: f4 -> f2 -> f1 (twice) and f4 -> f3 -> f0, f1
            calls[0] += 1
            f2()
            f3()

        ps = profile(f4)[0]  # pylint: disable=W0632
        # the lambda keeps what follows the last '/' of the text it receives
        profile2df(ps, verbose=False, clean_text=lambda x: x.split('/')[-1])
        root, nodes = profile2graph(ps, clean_text=lambda x: x.split('/')[-1])
        # six nodes: the five local functions plus, presumably, time.sleep
        self.assertEqual(len(nodes), 6)
        self.assertIsInstance(nodes, dict)
        self.assertIsInstance(root, ProfileNode)
        self.assertIn("(", str(root))
        dicts = root.as_dict()
        self.assertEqual(10, len(dicts))
        text = root.to_text()
        self.assertIn("1  1", text)
        # f1 is expected with an indentation of eight spaces in the text
        self.assertIn('        f1', text)
        # a narrow column for function names is expected to produce '...'
        text = root.to_text(fct_width=20)
        self.assertIn('...', text)
        # both sort keys below must be accepted, NAME must be rejected
        root.to_text(sort_key=SortKey.CUMULATIVE)
        root.to_text(sort_key=SortKey.TIME)
        self.assertRaise(lambda: root.to_text(sort_key=SortKey.NAME),
                         NotImplementedError)
        js = root.to_json(indent=2)
        self.assertIn('"details"', js)
        # as_str=False returns a dictionary instead of a string
        js = root.to_json(as_str=False)
        self.assertIsInstance(js, dict)
    def test_profile_graph_recursive1(self):
        """
        Checks :func:`profile2graph` on a function calling itself.
        """
        def f0(t):
            # recursive: f0(0.15) calls f0(0.15 - 0.1) which sleeps
            if t < 0.1:
                time.sleep(t)
            else:
                f0(t - 0.1)

        def f4():
            f0(0.15)

        ps = profile(f4)[0]  # pylint: disable=W0632
        # the lambda keeps what follows the last '/' of the text it receives
        profile2df(ps, verbose=False, clean_text=lambda x: x.split('/')[-1])
        root, nodes = profile2graph(ps, clean_text=lambda x: x.split('/')[-1])
        # three nodes: f4, f0 and, presumably, time.sleep
        self.assertEqual(len(nodes), 3)
        text = root.to_text()
        # f0 is expected with an indentation of four spaces in the text
        self.assertIn("    f0", text)
        js = root.to_json(indent=2)
        self.assertIn('"details"', js)
示例#7
0
    def test_profile_df(self):
        """
        Checks :func:`profile` returns a DataFrame when ``as_df=True``
        and looks at the function name stored in its first row.
        """
        def simple():
            def simple2():
                rows = [
                    {"A": "x", "AA": "xx", "AAA": "xxx"},
                    {"AA": "xxxxxxx", "AAA": "xxx"},
                ]
                return df2rst(pandas.DataFrame(rows))

            return simple2()

        parent = os.path.join(os.path.dirname(rootfile), '..')
        rootrem = os.path.normpath(os.path.abspath(parent))
        _stats, frame = profile(simple, rootrem=rootrem, as_df=True)
        self.assertIsInstance(frame, pandas.DataFrame)
        # the function name is what follows the last '-' in column namefct
        first_name = frame.loc[0, 'namefct'].split('-')[-1]
        self.assertEqual(first_name, 'simple2')
示例#8
0
def profile_fct_graph(fct, title, highlights=None, nb=20, figsize=(10, 3)):
    """
    Profiles function *fct* and draws column ``cum_tall`` of the first
    *nb* rows of the profiling results as a bar chart.
    See :ref:`onnxsklearnconsortiumrst`.

    :param fct: function to profile
    :param title: title of the graph
    :param highlights: labels to surround with a red dashed box,
        a label missing from the index is matched against every
        index entry containing it
    :param nb: number of rows to keep
    :param figsize: figure size
    :return: axis
    :raises ValueError: if a label to highlight matches no bar
    """
    rootrem = [
        os.path.dirname(sklearn.__file__), "site-packages",
        os.path.join(sys.prefix, "lib")
    ]
    _, stats = profile(fct, as_df=True, rootrem=rootrem)
    # the name of the column holding function names depends on the results
    key = 'namefct' if 'namefct' in stats.columns else 'fct'
    top = stats[[key, 'cum_tall']].head(n=nb).set_index(key)
    names = list(top.index)

    ax = top.plot(kind='bar', figsize=figsize, rot=30)
    ax.set_title(title)
    for tick in ax.get_xticklabels():
        tick.set_horizontalalignment('right')

    box_height = 0.15
    for wanted in highlights or ():
        if wanted in names:
            matches = [wanted]
        else:
            # falls back on every name containing the requested label
            matches = [
                name for name in names
                if isinstance(name, str) and wanted in name
            ]
            if not matches:
                raise ValueError("Unable to find '{}' in '{}'?".format(
                    wanted, ", ".join(sorted(map(str, names)))))
        for name in matches:
            pos = top.index.get_loc(name)
            left = pos - 0.35
            right = pos + 0.3
            # left side, right side, then top of the box
            ax.plot([left, left], [0, box_height], 'r--')
            ax.plot([right, right], [0, box_height], 'r--')
            ax.plot([left, right], [box_height, box_height], 'r--')
    return ax
示例#9
0
###################################
# Profiling and comparison with scikit-learn
# ++++++++++++++++++++++++++++++++++++++++++

# float32 copy of the features, this is what is given to ``oinf.run``
X32 = X.astype(numpy.float32)


def runlocaldt():
    """
    Workload to profile: calls ``oinf.run`` and ``dt.predict``
    5000 times each on the first ten rows.
    """
    for i in range(0, 5000):
        oinf.run({'X': X32[:10]})
        dt.predict(X[:10])


print("profiling...")
# a text report is requested, it is the second element
# of the value returned by profile
txt = profile(runlocaldt, pyinst_format='text')
print(txt[1])

###########################################
# Profiling for AdaBoostRegressor
# +++++++++++++++++++++++++++++++
#
# The next example shows how long the python runtime
# spends in each operator.

ada = AdaBoostRegressor()
ada.fit(X, y)
# a single row converted into float32 is given to the converter
onx = to_onnx(ada, X[:1].astype(numpy.float32), target_opset=11)
oinf = OnnxInference(onx, runtime='python_compiled')
print(oinf)
示例#10
0
    pos = text.find('onnxruntime')
    if pos >= 0:
        return text[pos:]
    pos = text.find('sklearn')
    if pos >= 0:
        return text[pos:]
    pos = text.find('onnxcustom')
    if pos >= 0:
        return text[pos:]
    pos = text.find('site-packages')
    if pos >= 0:
        return text[pos:]
    return text


# profiles one call to benchmark, only the first element
# returned by profile is kept
ps = profile(lambda: benchmark(X_train, y_train,
             nn, train_session, name='NN-CPU'))[0]
# clean_name is applied by profile2graph to the text it displays
root, nodes = profile2graph(ps, clean_text=clean_name)
text = root.to_text()
print(text)

######################################
# if GPU is available
# +++++++++++++++++++

if get_device().upper() == 'GPU':

    train_session = OrtGradientForwardBackwardOptimizer(
        onx, device='cuda', learning_rate=1e-5,
        warm_start=False, max_iter=max_iter, batch_size=batch_size)

    benches.append(benchmark(X_train, y_train, nn,
#########################################
# Graph
# +++++

# one figure with a single axis which receives the plot of ``piv``
fig, ax = plt.subplots(1, 1)
piv.plot(title="Time processing of serialization functions\n"
         "lower better",
         ax=ax)
ax.set_xlabel("onnx size")
ax.set_ylabel("s")

###########################################
# Conclusion
# ++++++++++
#
# This graph shows that implementing check_model in python is much slower
# than the C++ version. However, protobuf prevents sharing
# ModelProto from Python to C++ (see `Python Updates
# <https://developers.google.com/protocol-buffers/docs/news/2022-05-06>`_)
# unless the python package is compiled with a specific setting
# (probably slower). A profiling shows that the code spends quite some time
# in function :func:`getattr`.

# only the first element returned by profile is kept
ps = profile(lambda: check_model_py(onx))[0]
# the lambda keeps what follows the last '/' of the text it receives
root, nodes = profile2graph(ps, clean_text=lambda x: x.split('/')[-1])
text = root.to_text()
print(text)

# plt.show()
示例#12
0
# This step involves :epkg:`pyinstrument` to measure
# where the time is spent. Both :epkg:`scikit-learn`
# and :epkg:`mlprodict` runtime are called so that
# the prediction times can be compared.

# float32 copy of the test features, this is what is given to ``oinf.run``
X32 = X_test.astype(numpy.float32)


def runlocal():
    """
    Workload to profile: calls ``oinf.run`` and ``hgb.predict``
    100 times each on the first 1000 rows.
    """
    for i in range(0, 100):
        oinf.run({'X': X32[:1000]})
        hgb.predict(X_test[:1000])


print("profiling...")
# a text report is requested, it is the second element
# of the value returned by profile
txt = profile(runlocal, pyinst_format='text')
print(txt[1])

##########################################
# Now let's measure the average computation time
# per observation for 2 to 100 observations.
# The runtime implemented in :epkg:`mlprodict`
# parallelizes the computation after a given
# number of observations.

obs = []
for N in tqdm(list(range(2, 21))):
    m = measure_time("oinf.run({'X': x})", {
        'oinf': oinf,
        'x': X32[:N]
    },
def benchmark(N=1000,
              n_features=20,
              hidden_layer_sizes="26,25",
              max_iter=1000,
              learning_rate_init=1e-4,
              batch_size=100,
              run_torch=True,
              device='cpu',
              opset=12,
              profile='fct'):
    """
    Compares the training time of a torch module with the training time
    of the same architecture wrapped into ``ORTModule``
    (:epkg:`onnxruntime-training`). Training algorithm is SGD.
    Numerical parameters are converted, so they can be given as strings.

    :param N: number of observations generated before the
        train/test split
    :param n_features: number of features
    :param hidden_layer_sizes: hidden layer sizes, a tuple of integers
        or comma separated values in a string
    :param max_iter: number of iterations (epochs)
    :param learning_rate_init: initial learning rate
    :param batch_size: batch size
    :param run_torch: trains the plain torch module (True) or skips
        that step and only trains the ``ORTModule`` version
    :param device: `'cpu'`, or one of `'cuda'`, `'cuda:0'`, `'gpu'`
        to train on ``cuda:0``
    :param opset: only printed, not used
    :param profile: `'fct'` or `'cProfile'` to profile the training with
        function *profile* from :epkg:`pyquickhelper` and dump the
        statistics into a ``.prof`` file, `'event'` to use
        *WithEventProfiler* and save its report as csv, any other
        value disables the profiling

    The function prints its parameters and the measured durations,
    it does not return anything.
    """
    N = int(N)
    n_features = int(n_features)
    max_iter = int(max_iter)
    learning_rate_init = float(learning_rate_init)
    batch_size = int(batch_size)
    run_torch = run_torch in (1, True, '1', 'True')

    print("N=%d" % N)
    print("n_features=%d" % n_features)
    print(f"hidden_layer_sizes={hidden_layer_sizes!r}")
    print("max_iter=%d" % max_iter)
    print(f"learning_rate_init={learning_rate_init:f}")
    print("batch_size=%d" % batch_size)
    print(f"run_torch={run_torch!r}")
    print(f"opset={opset!r} (unused)")
    print(f"device={device!r}")
    print(f"profile={profile!r}")
    # the original string is kept to name the files written below
    device0 = device
    device = torch.device("cuda:0" if device in ('cuda', 'cuda:0',
                                                 'gpu') else "cpu")
    print(f"fixed device={device!r}")
    print('------------------')

    if not isinstance(hidden_layer_sizes, tuple):
        hidden_layer_sizes = tuple(map(int, hidden_layer_sizes.split(",")))
    X, y = make_regression(N, n_features=n_features, bias=2)
    X = X.astype(numpy.float32)
    y = y.astype(numpy.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    class Net(torch.nn.Module):
        """Stack of linear layers, each followed by a relu."""

        def __init__(self, n_features, hidden, n_output):
            super().__init__()
            self.hidden = []

            size = n_features
            for i, hid in enumerate(hidden):
                self.hidden.append(torch.nn.Linear(size, hid))
                size = hid
                # a plain list does not register submodules,
                # setattr makes the layer visible to parameters()
                setattr(self, "hid%d" % i, self.hidden[-1])
            self.hidden.append(torch.nn.Linear(size, n_output))
            setattr(self, "predict", self.hidden[-1])

        def forward(self, x):
            for hid in self.hidden:
                x = hid(x)
                x = F.relu(x)
            return x

    def place(model):
        # relies on the resolved device and not on the raw string so that
        # the model and the tensors always end up on the same device
        if device.type == 'cpu':
            model.cpu()
        else:
            model.cuda(device=device)

    nn = Net(n_features, hidden_layer_sizes, 1)
    place(nn)
    print(
        f"n_parameters={len(list(nn.parameters()))}, n_layers={len(nn.hidden)}"
    )
    for i, p in enumerate(nn.parameters()):
        print("  p[%d].shape=%r" % (i, p.shape))

    optimizer = torch.optim.SGD(nn.parameters(), lr=learning_rate_init)
    # reduction='sum' is the documented replacement
    # of the deprecated size_average=False
    criterion = torch.nn.MSELoss(reduction='sum')
    batch_no = len(X_train) // batch_size

    # training, a first call outside the measured section
    inputs = torch.tensor(X_train[:1], requires_grad=True, device=device)
    nn(inputs)

    def train_torch():
        # defined even if max_iter is 0, reset at every epoch:
        # the returned value is the loss cumulated over the last epoch
        running_loss = 0.0
        for epoch in range(max_iter):
            running_loss = 0.0
            x, y = shuffle(X_train, y_train)
            for i in range(batch_no):
                start = i * batch_size
                end = start + batch_size
                inputs = torch.tensor(x[start:end],
                                      requires_grad=True,
                                      device=device)
                labels = torch.tensor(y[start:end],
                                      requires_grad=True,
                                      device=device)

                def step_torch():
                    optimizer.zero_grad()
                    outputs = nn(inputs)
                    loss = criterion(outputs, torch.unsqueeze(labels, dim=1))
                    loss.backward()
                    optimizer.step()
                    return loss

                loss = step_torch()
                running_loss += loss.item()
        return running_loss

    begin = time.perf_counter()
    if run_torch:
        if profile in ('cProfile', 'fct'):
            # imported under another name: importing it as ``profile``
            # would overwrite the parameter and disable the profiling
            # of the ORTModule training below
            from pyquickhelper.pycode.profiling import profile as profile_fct
            running_loss, prof, _ = profile_fct(
                train_torch, return_results=True)
            dur_torch = time.perf_counter() - begin
            name = f"{device0}.{os.path.split(__file__)[-1]}.tch.prof"
            prof.dump_stats(name)
        elif profile == 'event':

            def clean_name(x):
                # keeps the last three parts of the path
                return "/".join(x.replace("\\", "/").split('/')[-3:])

            prof = WithEventProfiler(size=10000000, clean_file_name=clean_name)
            with prof:
                running_loss = train_torch()
            dur_torch = time.perf_counter() - begin
            df = prof.report
            name = f"{device0}.{os.path.split(__file__)[-1]}.tch.csv"
            df.to_csv(name, index=False)
        else:
            running_loss = train_torch()
            dur_torch = time.perf_counter() - begin
    else:
        dur_torch = time.perf_counter() - begin

    if run_torch:
        print(f"time_torch={dur_torch!r}, running_loss={running_loss!r}")
        running_loss0 = running_loss
    else:
        # marks the torch loss as not computed in the final summary
        running_loss0 = -1

    # ORTModule, trained from a new instance of the same architecture
    nn = Net(n_features, hidden_layer_sizes, 1)
    place(nn)

    nn_ort = ORTModule(nn)
    optimizer = torch.optim.SGD(nn_ort.parameters(), lr=learning_rate_init)
    criterion = torch.nn.MSELoss(reduction='sum')

    # exclude onnx conversion
    inputs = torch.tensor(X_train[:1], requires_grad=True, device=device)
    nn_ort(inputs)

    def train_ort():
        # same loop as train_torch but on the ORTModule
        running_loss = 0.0
        for epoch in range(max_iter):
            running_loss = 0.0
            x, y = shuffle(X_train, y_train)
            for i in range(batch_no):
                start = i * batch_size
                end = start + batch_size
                inputs = torch.tensor(x[start:end],
                                      requires_grad=True,
                                      device=device)
                labels = torch.tensor(y[start:end],
                                      requires_grad=True,
                                      device=device)

                def step_ort():
                    optimizer.zero_grad()
                    outputs = nn_ort(inputs)
                    loss = criterion(outputs, torch.unsqueeze(labels, dim=1))
                    loss.backward()
                    optimizer.step()
                    return loss

                loss = step_ort()
                running_loss += loss.item()
        return running_loss

    begin = time.perf_counter()
    if profile in ('cProfile', 'fct'):
        from pyquickhelper.pycode.profiling import profile as profile_fct
        running_loss, prof, _ = profile_fct(train_ort, return_results=True)
        dur_ort = time.perf_counter() - begin
        name = f"{device0}.{os.path.split(__file__)[-1]}.ort.prof"
        prof.dump_stats(name)
    elif profile == 'event':

        def clean_name(x):
            # keeps the last three parts of the path
            return "/".join(x.replace("\\", "/").split('/')[-3:])

        prof = WithEventProfiler(size=10000000, clean_file_name=clean_name)
        with prof:
            running_loss = train_ort()
        dur_ort = time.perf_counter() - begin
        df = prof.report
        name = f"{device0}.{os.path.split(__file__)[-1]}.ort.csv"
        df.to_csv(name, index=False)
    else:
        running_loss = train_ort()
        dur_ort = time.perf_counter() - begin

    print(f"time_torch={dur_torch!r}, running_loss={running_loss0!r}")
    print(f"time_ort={dur_ort!r}, last_trained_error={running_loss!r}")
示例#14
0
#################################
# Training.

# models is iterated as pairs (name, model), every model is trained
# on the same data
for name, model in tqdm(models):
    model.fit(data)

#################################
# Profiling of runtime `onnxruntime1`.


def fct():
    """Calls 1000 times method *transform* of the third model."""
    for i in range(1000):
        models[2][1].transform(data)


# a text report is requested, it is the second element
# of the value returned by profile
res = profile(fct, pyinst_format="text")
print(res[1])

#################################
# Profiling of runtime `numpy`.


def fct():
    """Calls 1000 times method *transform* of the fourth model."""
    for i in range(1000):
        models[3][1].transform(data)


res = profile(fct, pyinst_format="text")
print(res[1])

#################################
示例#15
0
    def test_profile_stat_gr(self):
        """
        Dumps profiling statistics into a file and runs command line
        ``profile_stat`` on it with ``--calls 1``, without any output
        file and then with a text, csv and xlsx output.
        """
        def f0(t):
            time.sleep(t)

        def f1(t):
            time.sleep(t)

        def f2():
            f1(0.1)
            f1(0.01)

        def f3():
            f0(0.2)
            f1(0.5)

        def f4():
            f2()
            f3()

        stat_file = "temp_gr_stat.prof"
        stats = profile(f4)[0]  # pylint: disable=W0632
        stats.dump_stats(stat_file)
        # arguments shared by every call to the command line
        base_args = ['profile_stat', '-f', stat_file, '--calls', '1']

        with self.subTest(calls=False, output=None):
            log = BufferedPrint()
            main(args=list(base_args), fLOG=log.fprint)
            self.assertIn('+++', str(log))

        # text based outputs: (extension, text expected in the file)
        for ext, marker in (("txt", '+++'), ('csv', ',+')):
            with self.subTest(calls=False, output=ext):
                target = "temp_gr_output." + ext
                log = BufferedPrint()
                main(args=base_args + ['-o', target], fLOG=log.fprint)
                with open(target, "r", encoding='utf-8') as f:
                    content = f.read()
                self.assertIn('+++', str(log))
                self.assertIn(marker, content)

        with self.subTest(calls=False, output='xlsx'):
            target = 'temp_gr_output.xlsx'
            log = BufferedPrint()
            main(args=base_args + ['-o', target], fLOG=log.fprint)
            self.assertIn('+++', str(log))
            self.assertExists(target)
示例#16
0
# C++ implementation vs numpy
# +++++++++++++++++++++++++++
#
# :epkg:`scikit-learn` uses :epkg:`numpy` to compute the top *k* elements.

# compares two callables on X32, both receive the same arguments
# (5, 1, 0) after the data
res = benchmark(X32,
                lambda x: topk_sorted_implementation(x, 5, 1, 0),
                lambda x: topk_sorted_implementation_cpp(x, 5, 1, 0),
                N=N)
# bare expression, it only displays something in a notebook
res

###########################################
# It seems to be faster too. Let's profile.

xr = randn(1000000, 100)
# a text report is requested, it is the second element
# of the value returned by profile
text = profile(lambda: topk_sorted_implementation(xr, 5, 1, 0),
               pyinst_format='text')[1]
print(text)

####################################
# Parallelisation
# +++++++++++++++
#
# We need to disable the parallelisation to
# really compare both implementations.

# In[11]:


def benchmark_test(X, fct1, fct2, N, K, repeat=10, number=10):
    res = {}
    for k in tqdm(K):