def test_pipeline(self):
    """Compare ONNX discrepancies of a plain pipeline and a cast-aware one.

    Model 1 chains ``StandardScaler`` and ``DecisionTreeRegressor``.
    Model 2 surrounds the scaler with ``CastTransformer`` steps, wraps
    the tree in ``CastRegressor`` and is converted with the
    ``'div': 'div_cast'`` option of ``StandardScaler``.
    The test asserts that model 2 is at least as accurate as model 1
    and that its ONNX predictions match scikit-learn exactly.
    """

    def maxdiff(a1, a2):
        # Largest absolute element-wise gap between two arrays.
        return numpy.abs(a1.ravel() - a2.ravel()).max()

    def onnx_predict(onx, features):
        # The execution provider is given explicitly: onnxruntime >= 1.9
        # requires it when the build ships more than the CPU provider.
        sess = InferenceSession(
            onx.SerializeToString(),
            providers=["CPUExecutionProvider"])
        return sess.run(None, {'X': features})[0]

    X, y = make_regression(10000, 10, random_state=3)
    X_train, X_test, y_train, _ = train_test_split(
        X, y, random_state=3)
    Xi_train, yi_train = X_train.copy(), y_train.copy()
    Xi_test = X_test.copy()
    # Every column is rescaled and truncated to integer values
    # (stored back into the float arrays).
    for i in range(X.shape[1]):
        Xi_train[:, i] = (Xi_train[:, i] * math.pi * 2**i).astype(
            numpy.int64)
        Xi_test[:, i] = (Xi_test[:, i] * math.pi * 2**i).astype(
            numpy.int64)
    max_depth = 10
    Xi_test = Xi_test.astype(numpy.float32)

    # model 1
    model1 = Pipeline([
        ('scaler', StandardScaler()),
        ('dt', DecisionTreeRegressor(max_depth=max_depth))
    ])
    model1.fit(Xi_train, yi_train)
    exp1 = model1.predict(Xi_test)
    onx1 = to_onnx(model1, X_train[:1].astype(numpy.float32),
                   target_opset=TARGET_OPSET)
    got1 = onnx_predict(onx1, Xi_test)
    md1 = maxdiff(exp1, got1)

    # model 2
    model2 = Pipeline([
        ('cast64', CastTransformer(dtype=numpy.float64)),
        ('scaler', StandardScaler()),
        ('cast', CastTransformer()),
        ('dt', CastRegressor(DecisionTreeRegressor(max_depth=max_depth),
                             dtype=numpy.float32))
    ])
    model2.fit(Xi_train, yi_train)
    exp2 = model2.predict(Xi_test)
    onx = to_onnx(model2, X_train[:1].astype(numpy.float32),
                  options={StandardScaler: {'div': 'div_cast'}},
                  target_opset=TARGET_OPSET)
    got2 = onnx_predict(onx, Xi_test)
    md2 = maxdiff(exp2, got2)

    assert md2 <= md1
    # maxdiff is never negative, so this requires an exact match.
    assert md2 <= 0.0
def test_onnx_no_test_data_double(self):
    """Convert a double-input ONNX-ML model without supplying test data.

    A gradient boosting classifier (preceded by a float32 cast when
    ``CastTransformer`` is available) is fitted on random data,
    exported with ``convert_sklearn`` using a ``DoubleTensorType``
    input, then handed to ``hummingbird.ml.convert`` with no test data.
    """
    warnings.filterwarnings("ignore")
    max_depth = 10
    num_classes = 2

    booster = GradientBoostingClassifier(
        n_estimators=10, max_depth=max_depth)
    if CastTransformer is None:
        model = booster
    else:
        # newer version of sklearn-onnx
        model = make_pipeline(CastTransformer(dtype=np.float32), booster)

    np.random.seed(0)
    X = np.random.rand(100, 200)
    y = np.random.randint(num_classes, size=100)
    model.fit(X, y)

    # Create ONNX-ML model
    n_features = X.shape[1]
    input_spec = [("input", DoubleTensorType([None, n_features]))]
    onnx_ml_model = convert_sklearn(
        model, initial_types=input_spec, target_opset=11)

    # Test onnx requires no test_data
    hb_model = hummingbird.ml.convert(onnx_ml_model, "onnx")
    assert hb_model
def common_test_cast_transformer(self, dtype, input_type):
    """Shared body for the CastTransformer conversion tests.

    Builds a two-step pipeline (cast to ``dtype``, then cast back to
    float32), checks that the first step produces ``dtype``, converts
    the pipeline to ONNX and dumps data and model.

    :param dtype: target dtype of the first ``CastTransformer``
    :param input_type: only used to build the dump basename

    NOTE(review): the converter input is always ``FloatTensorType``;
    ``input_type`` does not influence the conversion. The basename
    suffix is the name of the *class of* ``input_type``, which is
    ``'type'`` if callers pass a class rather than an instance --
    confirm this is intended.
    """
    data = numpy.array(
        [[0.1, 0.2, 3.1], [1, 1, 0], [0, 2, 1], [1, 0, 2]],
        dtype=numpy.float32)

    steps = [
        ('cast', CastTransformer(dtype=dtype)),
        ('invcast', CastTransformer(dtype=numpy.float32)),
    ]
    model = Pipeline(steps)
    model.fit(data)

    # Only the first step is applied here to inspect its output dtype.
    first_step = model.steps[0][1]
    pred = first_step.transform(data)
    assert pred.dtype == dtype

    initial_types = [("input", FloatTensorType([None, 3]))]
    model_onnx = convert_sklearn(
        model, "cast", initial_types, target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)

    suffix = input_type.__class__.__name__
    dump_data_and_model(
        data, model, model_onnx,
        basename="SklearnCastTransformer{}".format(suffix))
def test_cast_transformer_dataframe(self):
    """Convert CastTransformer steps nested in a ColumnTransformer.

    The pipeline is fitted on a pandas ``DataFrame`` with three
    columns; columns 0-1 and column 2 go through separate
    ``CastTransformer`` instances before a final float32 cast.
    """
    values = numpy.array(
        [[0.1, 0.2, 3.4], [1, 1, 0], [0, 2, 1], [1, 0, 2]],
        dtype=numpy.float32)
    data = DataFrame(values)

    column_casts = ColumnTransformer([
        ('prep1', CastTransformer(), [0, 1]),
        ('prep2', CastTransformer(), [2]),
    ])
    model = Pipeline([
        ('prep', column_casts),
        ('invcast', CastTransformer(dtype=numpy.float32)),
    ])
    model.fit(data)

    initial_types = [("input", FloatTensorType([None, 3]))]
    model_onnx = convert_sklearn(
        model, "cast", initial_types, target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)

    # The dump helper receives the raw array, not the DataFrame.
    dump_data_and_model(
        data.values, model, model_onnx,
        basename="SklearnCastTransformerCT")
#
# We could try to use double everywhere. Unfortunately,
# :epkg:`ONNX ML Operators` only allows float coefficients
# for the operator *TreeEnsembleRegressor*. We may want
# to compromise by casting the output of the normalizer into
# float in the :epkg:`scikit-learn` pipeline.
#
# .. blockdiag::
#
#    diagram {
#      x_float32 -> normalizer -> y_double ->
#      cast -> y_float -> dtree -> z_float
#    }
#

model2 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('cast', CastTransformer()),
    ('dt', DecisionTreeRegressor(max_depth=max_depth)),
])
model2.fit(Xi_train, yi_train)

##########################################
# The discrepancies.

onx2 = to_onnx(model2, Xi_train[:1].astype(numpy.float32))
sess2 = InferenceSession(
    onx2.SerializeToString(),
    providers=['CPUExecutionProvider'])

# scikit-learn and onnxruntime predictions on the same float32 input.
skl2 = model2.predict(X32)
ort2 = sess2.run(None, {'X': X32})[0]
def test_pandas_batch_onnxml(self):
    """Batch-convert an ONNX-ML pipeline fed with a pandas DataFrame.

    A pass-through ``ColumnTransformer`` followed by a gradient
    boosting classifier (with a float32 cast in between when
    ``CastTransformer`` is available) is fitted on the first three
    iris features, exported to ONNX-ML with one double input per
    column, converted with ``hummingbird.ml.convert_batch`` and
    compared against scikit-learn probabilities.
    """
    import pandas

    max_depth = 10
    iris = datasets.load_iris()
    X = iris.data[:, :3]
    y = iris.target
    columns = ["vA", "vB", "vC"]
    X_train = pandas.DataFrame(X, columns=columns)

    steps = [
        ("preprocessor", ColumnTransformer(
            transformers=[], remainder="passthrough")),
    ]
    if CastTransformer is not None:
        # newer version of sklearn-onnx
        steps.append(('cast', CastTransformer(dtype=np.float32)))
    steps.append(
        ("classifier", GradientBoostingClassifier(
            n_estimators=10, max_depth=max_depth)))
    pipeline = Pipeline(steps=steps)
    pipeline.fit(X_train, y)

    # Create ONNX-ML model: one (n_rows, 1) double input per column.
    n_rows = X.shape[0]
    initial_types = [
        (name, DoubleTensorType([n_rows, 1])) for name in columns
    ]
    onnx_ml_model = convert_sklearn(
        pipeline, initial_types=initial_types, target_opset=9)

    batch_size = 10
    remainder_size = n_rows % batch_size
    sample = pandas.DataFrame(X[:batch_size], columns=columns)
    hb_model = hummingbird.ml.convert_batch(
        onnx_ml_model, "onnx", sample, remainder_size)
    self.assertTrue(hb_model is not None)

    expected = pipeline.predict_proba(X_train)
    actual = hb_model.predict_proba(X_train)
    np.testing.assert_allclose(expected, actual, rtol=1e-06, atol=1e-06)
# ++++++++++++
#
# Fixing the conversion requires replacing ``x * (1 / y)``
# by ``x / y``, and this division must happen in double.
# By default, *sklearn-onnx* assumes every
# computation should happen in float. `ONNX 1.7 specifications
# <https://github.com/onnx/onnx/blob/master/docs/
# Operators-ml.md#ai.onnx.ml.Scaler>`_
# do not support double scaling (inputs and outputs do,
# but not the parameters). The solution is to
# change the conversion (remove node Scaler by using option
# `'div'`) and to use double by inserting an explicit
# Cast.

model2 = Pipeline([
    ('cast64', CastTransformer(dtype=np.float64)),
    ('scaler', StandardScaler()),
    ('cast', CastTransformer()),
    ('dt', DecisionTreeRegressor(max_depth=max_depth))
])

model2.fit(Xi_train, yi_train)
exp2 = model2.predict(Xi_test)

onx2 = to_onnx(model2, X_train[:1].astype(np.float32),
               options={StandardScaler: {'div': 'div_cast'}},
               target_opset=15)

# The execution provider is given explicitly: onnxruntime >= 1.9
# requires it when the build ships more than the CPU provider.
sess2 = InferenceSession(onx2.SerializeToString(),
                         providers=['CPUExecutionProvider'])
got2 = sess2.run(None, {'X': Xi_test})[0]
md2 = maxdiff(exp2, got2)
def make_pipelines(df_train, y_train, models=None,
                   sparse_threshold=1., replace_nan=False,
                   insert_replace=False, verbose=False):
    """Train one text pipeline per classifier and measure ONNX discrepancies.

    Each pipeline scales columns ``0`` and ``1``, vectorizes column
    ``"text"`` (CountVectorizer + TfidfTransformer, optionally followed
    by ``ReplaceTransformer``), casts the result and trains a classifier.
    The fitted pipeline is converted to ONNX, written to ``model.onnx``
    (overwritten at every iteration) and run with ``OnnxInference``.

    The ONNX model is evaluated on ``df_train``; earlier code read a
    module-level ``df`` here, which silently ignored the argument.

    :param df_train: training dataframe with columns ``0``, ``1``
        and ``"text"``
    :param y_train: training labels
    :param models: classifier classes to try; ``None`` entries are
        skipped; defaults to the four classifiers listed below
    :param sparse_threshold: forwarded to ``ColumnTransformer``
    :param replace_nan: if True, converts ``TfidfTransformer`` with
        option ``{'nan': True}``
    :param insert_replace: if True, appends ``ReplaceTransformer`` to
        the text sub-pipeline
    :param verbose: if True, runs the ONNX model a second time, stores
        the result under ``'discrepency2'`` and prints row-level details
        when the total discrepancy exceeds 0.1
    :return: list of dictionaries, one per model, with keys ``model``,
        ``pipe`` and either ``error`` (``fit`` raised ``TypeError``) or
        ``discrepencies`` and ``model_onnx``
    """

    def td(a):
        # Densify a sparse row, marking entries that are not stored
        # with NaN; anything without ``todense`` is returned untouched.
        if hasattr(a, 'todense'):
            b = a.todense()
            ind = set(a.indices)
            for i in range(b.shape[1]):
                if i not in ind:
                    b[0, i] = numpy.nan
            return b
        return a

    if models is None:
        models = [
            RandomForestClassifier, HistGradientBoostingClassifier,
            XGBClassifier, LGBMClassifier
        ]
    models = [_ for _ in models if _ is not None]

    pipes = []
    for model in tqdm(models):

        if model == HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model == XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        # Both variants share everything but the optional last text step.
        text_steps = [
            ('count', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
        ]
        if insert_replace:
            text_steps.append(('repl', ReplaceTransformer()))
        pipe = Pipeline([
            ('union', ColumnTransformer([
                ('scale1', StandardScaler(), [0, 1]),
                ('subject', Pipeline(text_steps), "text"),
            ], sparse_threshold=sparse_threshold)),
            ('cast', CastTransformer()),
            ('cls', model(max_depth=3, **kwargs)),
        ])

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            obs = dict(model=model.__name__, pipe=pipe, error=e)
            pipes.append(obs)
            continue

        options = {model: {'zipmap': False}}
        if replace_nan:
            options[TfidfTransformer] = {'nan': True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(
                pipe,
                initial_types=[('input', FloatTensorType([None, 2])),
                               ('text', StringTensorType([None, 1]))],
                target_opset={'': 14, 'ai.onnx.ml': 2},
                options=options)

        with open('model.onnx', 'wb') as f:
            f.write(model_onnx.SerializeToString())

        oinf = OnnxInference(model_onnx)
        inputs = {
            "input": df_train[[0, 1]].values.astype(numpy.float32),
            "text": df_train[["text"]].values
        }
        pred_onx = oinf.run(inputs)

        # scikit-learn reference, computed once and reused below.
        expected = pipe.predict_proba(df_train)
        diff = numpy.abs(
            pred_onx['probabilities'].ravel() - expected.ravel()).sum()

        if verbose:
            oinf = OnnxInference(model_onnx)
            pred_onx2 = oinf.run(inputs)
            diff2 = numpy.abs(
                pred_onx2['probabilities'].ravel() - expected.ravel()).sum()

        # Row-level details are only ever printed in verbose mode.
        if verbose and diff > 0.1:
            for i, (l1, l2) in enumerate(
                    zip(expected, pred_onx['probabilities'])):
                d = numpy.abs(l1 - l2).sum()
                if d > 0.1:
                    print("\nDISCREPENCY DETAILS")
                    print(d, i, l1, l2)
                    pre = pipe.steps[0][-1].transform(df_train)
                    print("idf", pre[i].dtype, td(pre[i]))
                    pre2 = pipe.steps[1][-1].transform(pre)
                    print("cas", pre2[i].dtype, td(pre2[i]))
                    inter = oinf.run(inputs, intermediate=True)
                    onx = inter['tfidftr_norm']
                    print("onx", onx.dtype, onx[i])
                    # NOTE(review): fetched but never displayed.
                    onx = inter['variable3']

        obs = dict(model=model.__name__,
                   discrepencies=diff,
                   model_onnx=model_onnx, pipe=pipe)
        if verbose:
            obs['discrepency2'] = diff2
        pipes.append(obs)

    return pipes
def make_pipelines(df_train, y_train, models=None,
                   sparse_threshold=1., replace_nan=False,
                   insert_replace=False):
    """Train one text pipeline per classifier and measure ONNX discrepancies.

    Each pipeline scales columns ``0`` and ``1``, vectorizes column
    ``"text"`` (CountVectorizer + TfidfTransformer, optionally followed
    by ``ReplaceTransformer``), casts the result and trains a classifier.
    The fitted pipeline is converted to ONNX, written to ``model.onnx``
    (overwritten at every iteration) and run with onnxruntime.

    The ONNX model is evaluated on ``df_train``; earlier code read a
    module-level ``df`` here, which silently ignored the argument.

    :param df_train: training dataframe with columns ``0``, ``1``
        and ``"text"``
    :param y_train: training labels
    :param models: classifier classes to try; ``None`` entries are
        skipped; defaults to the four classifiers listed below
    :param sparse_threshold: forwarded to ``ColumnTransformer``
    :param replace_nan: if True, converts ``TfidfTransformer`` with
        option ``{'nan': True}``
    :param insert_replace: if True, appends ``ReplaceTransformer`` to
        the text sub-pipeline
    :return: list of dictionaries, one per model, with keys ``model``,
        ``pipe`` and either ``error`` (``fit`` raised ``TypeError``) or
        ``discrepencies`` and ``model_onnx``
    """
    if models is None:
        models = [
            RandomForestClassifier, HistGradientBoostingClassifier,
            XGBClassifier, LGBMClassifier]
    models = [_ for _ in models if _ is not None]

    pipes = []
    for model in tqdm(models):

        if model == HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model == XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        # Both variants share everything but the optional last text step.
        text_steps = [
            ('count', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
        ]
        if insert_replace:
            text_steps.append(('repl', ReplaceTransformer()))
        pipe = Pipeline([
            ('union', ColumnTransformer([
                ('scale1', StandardScaler(), [0, 1]),
                ('subject', Pipeline(text_steps), "text"),
            ], sparse_threshold=sparse_threshold)),
            ('cast', CastTransformer()),
            ('cls', model(max_depth=3, **kwargs)),
        ])

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            obs = dict(model=model.__name__, pipe=pipe, error=e)
            pipes.append(obs)
            continue

        options = {model: {'zipmap': False}}
        if replace_nan:
            options[TfidfTransformer] = {'nan': True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(
                pipe,
                initial_types=[('input', FloatTensorType([None, 2])),
                               ('text', StringTensorType([None, 1]))],
                target_opset=12, options=options)

        serialized = model_onnx.SerializeToString()
        with open('model.onnx', 'wb') as f:
            f.write(serialized)

        # The execution provider is given explicitly: onnxruntime >= 1.9
        # requires it when the build ships more than the CPU provider.
        sess = rt.InferenceSession(
            serialized, providers=['CPUExecutionProvider'])
        inputs = {"input": df_train[[0, 1]].values.astype(numpy.float32),
                  "text": df_train[["text"]].values}
        pred_onx = sess.run(None, inputs)

        # The second ONNX output is compared with predict_proba.
        diff = numpy.abs(
            pred_onx[1].ravel() -
            pipe.predict_proba(df_train).ravel()).sum()

        obs = dict(model=model.__name__,
                   discrepencies=diff,
                   model_onnx=model_onnx, pipe=pipe)
        pipes.append(obs)

    return pipes