def test_model_count_vectorizer_wrong_ngram(self):
    """Convert a TfidfVectorizer using 1- and 2-grams and dump the result.

    The vectorizer is built with ``ngram_range=(1, 2)`` and an explicit
    ``token_pattern``; the corpus, the fitted vectorizer and the converted
    model are handed to ``dump_data_and_model``.
    """
    documents = ['A AABBB0', 'AAABB B1', 'AA ABBB2', 'AAAB BB3', 'AAA BBB4']
    corpus = numpy.array(documents).reshape((5, 1))

    vect = TfidfVectorizer(ngram_range=(1, 2),
                           token_pattern=r"(?u)\b\w\w+\b")
    vect.fit(corpus.ravel())

    input_types = [('input', StringTensorType([1]))]
    model_onnx = convert_sklearn(vect, 'TfidfVectorizer', input_types)
    self.assertIsNotNone(model_onnx)

    # Expression forwarded as ``allow_failure`` (onnxruntime <= 0.3.0).
    tolerated = ("StrictVersion(onnxruntime.__version__) <= "
                 "StrictVersion('0.3.0')")
    dump_data_and_model(
        corpus, vect, model_onnx,
        basename="SklearnTfidfVectorizer12Wngram-OneOff-SklCol",
        allow_failure=tolerated)
def test_model_tfidf_vectorizer11_empty_string_case1(self):
    """Convert a unigram TfidfVectorizer and dump it on rows with a blank.

    The vectorizer is fitted on the first three documents only; the data
    passed to ``dump_data_and_model`` is the tail of the corpus (the third
    document and the ``' '`` entry).
    """
    documents = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        ' ',
    ]
    corpus = numpy.array(documents).reshape((4, 1))

    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    training_rows = corpus[:3]
    vect.fit(training_rows.ravel())

    model_onnx = convert_sklearn(
        vect, 'TfidfVectorizer',
        [('input', StringTensorType([1]))],
        options=self.get_options(),
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)

    # TfidfVectorizer in onnxruntime fails with empty strings,
    # which was fixed in version 0.3.0 afterward
    tolerated = ("StrictVersion(onnxruntime.__version__)"
                 " <= StrictVersion('0.4.0')")
    dumped_rows = corpus[2:]
    dump_data_and_model(
        dumped_rows, vect, model_onnx,
        basename="SklearnTfidfVectorizer11EmptyStringSepCase1-"
                 "OneOff-SklCol",
        allow_failure=tolerated)
def test_one_hot_encoder_mixed_string_int_drop(self):
    """Convert a OneHotEncoder with an explicit ``drop`` list.

    The training data mixes two string columns with one integer column;
    the converted model is declared with one string input (2 columns) and
    one int64 input (1 column).
    """
    repeated_row = ["c0.2", "c2.2", 1]
    data = [["c0.4", "c0.2", 3], ["c1.4", "c1.2", 0]]
    # The same row appears four times at the end of the training data.
    data.extend(list(repeated_row) for _ in range(4))
    test = [list(repeated_row)]

    model = OneHotEncoder(categories="auto", drop=['c0.4', 'c0.2', 3])
    model.fit(data)

    string_input = ("input1", StringTensorType([None, 2]))
    int_input = ("input2", Int64TensorType([None, 1]))
    model_onnx = convert_sklearn(
        model, "one-hot encoder", [string_input, int_input])
    self.assertIsNotNone(model_onnx)

    dump_data_and_model(
        test, model, model_onnx, verbose=False,
        basename="SklearnOneHotEncoderMixedStringIntDrop")
def test_model_tfidf_vectorizer11_compose(self):
    """Compare ONNX and scikit-learn outputs for two stacked vectorizers.

    A ColumnTransformer applies one TfidfVectorizer per column of a
    two-column string array (the same corpus duplicated side by side).
    """
    documents = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]
    single_column = numpy.array(documents).reshape((4, 1))
    corpus = numpy.hstack([single_column, single_column])
    y = numpy.array([0, 1, 0, 1])

    transformers = [
        ('a', TfidfVectorizer(), 0),
        ('b', TfidfVectorizer(), 1),
    ]
    model = ColumnTransformer(transformers)
    model.fit(corpus, y)

    model_onnx = convert_sklearn(
        model, "TfIdfcomp",
        [("input", StringTensorType([4, 2]))],
        options=self.get_options(),
        target_opset=TARGET_OPSET)

    sess = InferenceSession(model_onnx.SerializeToString())
    outputs = sess.run(None, {'input': corpus})
    res = outputs[0]
    exp = model.transform(corpus)
    assert_almost_equal(res, exp)
def test_model_dict_vectorizer(self):
    """Convert a fitted DictVectorizer and dump it with its input data."""
    data = [{"amy": 1.0, "chin": 200.0}, {"nice": 3.0, "amy": 1.0}]
    model = DictVectorizer()
    model.fit_transform(data)

    # The input is declared as a dictionary from strings to floats.
    dict_type = DictionaryType(StringTensorType([1]), FloatTensorType([1]))
    model_onnx = convert_sklearn(
        model, "dictionary vectorizer", [("input", dict_type)])
    self.assertIsNotNone(model_onnx)

    tolerated = ("StrictVersion(onnxruntime.__version__)"
                 " <= StrictVersion('0.1.3') or "
                 "StrictVersion(onnx.__version__)"
                 " < StrictVersion('1.3.0')")
    dump_data_and_model(
        data, model, model_onnx,
        basename="SklearnDictVectorizer-OneOff-SkipDim1",
        allow_failure=tolerated,
    )
def test_model_tfidf_vectorizer11_empty_string_case2(self):
    """Convert a unigram TfidfVectorizer fitted on a corpus with ``""``.

    Unlike case 1, the vectorizer is fitted on, and dumped with, the whole
    corpus including the empty string.
    """
    documents = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "",
    ]
    corpus = numpy.array(documents).reshape((4, 1))

    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vect.fit(corpus.ravel())

    model_onnx = convert_sklearn(
        vect, "TfidfVectorizer",
        [("input", StringTensorType([1]))],
        options=self.get_options(),
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)

    # onnxruntime fails with empty strings
    tolerated = ("StrictVersion(onnxruntime.__version__)"
                 " <= StrictVersion('0.4.0')")
    dump_data_and_model(
        corpus, vect, model_onnx,
        basename="SklearnTfidfVectorizer11EmptyString-OneOff-SklCol",
        allow_failure=tolerated,
    )
def test_model_tfidf_vectorizer_binary(self):
    """Convert a TfidfVectorizer built with ``binary=True`` and dump it."""
    documents = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]
    corpus = numpy.array(documents).reshape((4, 1))

    vect = TfidfVectorizer(binary=True)
    vect.fit(corpus.ravel())

    model_onnx = convert_sklearn(
        vect, "TfidfVectorizer",
        [("input", StringTensorType([1]))],
        options=self.get_options(),
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)

    tolerated = ("StrictVersion(onnxruntime.__version__)"
                 " <= StrictVersion('0.4.0')")
    dump_data_and_model(
        corpus, vect, model_onnx,
        basename="SklearnTfidfVectorizerBinary-OneOff-SklCol",
        allow_failure=tolerated,
    )
def custom_parser(scope, model, inputs, custom_parsers=None):
    """Declare the operator and the two outputs of a classifier.

    When *custom_parsers* is given and contains *model*, the call is
    delegated to that entry and its result is returned unchanged.
    Otherwise an ``LgbmClassifier`` operator is declared on *scope* with
    *inputs* as inputs and two outputs, in this order: ``output_label``
    and ``output_probability``.

    The label type is ``Int64TensorType`` when every entry of
    ``model.classes_`` is a real number or a boolean, and
    ``StringTensorType`` otherwise.

    Returns the list of outputs of the declared operator.
    """
    if custom_parsers is not None and model in custom_parsers:
        delegate = custom_parsers[model]
        return delegate(scope, model, inputs, custom_parsers=custom_parsers)

    numeric_kinds = (numbers.Real, bool, np.bool_)
    only_numeric_labels = all(
        isinstance(label, numeric_kinds) for label in model.classes_)
    label_type = (Int64TensorType() if only_numeric_labels
                  else StringTensorType())

    # Declarations are kept in the original order: label variable,
    # operator, then probability variable.
    output_label = scope.declare_local_variable('output_label', label_type)
    this_operator = scope.declare_local_operator('LgbmClassifier', model)
    this_operator.inputs = inputs

    probability_type = SequenceType(
        DictionaryType(label_type, scope.tensor_type()))
    probability_map_variable = scope.declare_local_variable(
        'output_probability', probability_type)

    for variable in (output_label, probability_map_variable):
        this_operator.outputs.append(variable)
    return this_operator.outputs
def test_model_tfidf_vectorizer11_nolowercase(self):
    """Convert a case-sensitive unigram TfidfVectorizer.

    After dumping the model, it is also run through an InferenceSession
    on the flattened corpus and the output shape is checked to be
    ``(4, 11)``.
    """
    documents = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]
    corpus = numpy.array(documents).reshape((4, 1))

    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None, lowercase=False)
    vect.fit(corpus.ravel())

    model_onnx = convert_sklearn(
        vect, "TfidfVectorizer",
        [("input", StringTensorType())],
        options=self.get_options(),
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)

    dump_data_and_model(
        corpus, vect, model_onnx,
        basename="SklearnTfidfVectorizer11NoL-OneOff-SklCol")

    sess = InferenceSession(model_onnx.SerializeToString())
    feeds = {'input': corpus.ravel()}
    res = sess.run(None, feeds)[0]
    assert res.shape == (4, 11)
def test_model_tfidf_vectorizer11_out_vocabulary(self):
    """Convert a unigram TfidfVectorizer and dump it on unseen tokens.

    The vectorizer is fitted on a four-document corpus; the data dumped
    with the model prefixes every document with two tokens that were not
    part of the training corpus.
    """
    training_documents = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    corpus = numpy.array(training_documents).reshape((4, 1))
    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vect.fit(corpus.ravel())

    # target_opset is pinned like in the sibling tests that also use
    # self.get_options(), so the conversion does not depend on the
    # converter's default opset.
    model_onnx = convert_sklearn(
        vect, 'TfidfVectorizer',
        [('input', StringTensorType([1]))],
        options=self.get_options(),
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)

    unseen_documents = [
        'AZZ ZZ This is the first document.',
        'BZZ ZZ This document is the second document.',
        'ZZZ ZZ And this is the third one.',
        'WZZ ZZ Is this the first document?',
    ]
    corpus = numpy.array(unseen_documents).reshape((4, 1))
    dump_data_and_model(
        corpus, vect, model_onnx,
        basename="SklearnTfidfVectorizer11OutVocabRegex-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__) <= "
                      "StrictVersion('0.4.0')")
def test_model_tfidf_vectorizer11_word4(self):
    """Convert a TfidfVectorizer whose tokens are 1 to 4 ASCII letters."""
    documents = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    corpus = numpy.array(documents).reshape((4, 1))

    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None,
                           token_pattern="[a-zA-Z]{1,4}")
    vect.fit(corpus.ravel())

    model_onnx = convert_sklearn(
        vect, 'TfidfVectorizer',
        [('input', StringTensorType([1]))],
        options=self.get_options(),
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)

    tolerated = ("StrictVersion(onnxruntime.__version__) <= "
                 "StrictVersion('0.4.0')")
    dump_data_and_model(
        corpus, vect, model_onnx,
        basename="SklearnTfidfVectorizer11Regex4-OneOff-SklCol",
        allow_failure=tolerated)
def test_model_tfidf_transform_bug(self):
    """Convert a CountVectorizer + TfidfTransformer pipeline on 20newsgroups.

    The first training document is prefixed with a non-ASCII word before
    fitting; documents 5 to 9 are dumped with the converted model.
    """
    categories = [
        "alt.atheism",
        "soc.religion.christian",
        "comp.graphics",
        "sci.med",
    ]
    twenty_train = fetch_20newsgroups(
        subset="train", categories=categories,
        shuffle=True, random_state=0)

    steps = [("vect", CountVectorizer()), ("tfidf", TfidfTransformer())]
    text_clf = Pipeline(steps)

    first_document = twenty_train.data[0]
    twenty_train.data[0] = "bruît " + first_document
    text_clf.fit(twenty_train.data, twenty_train.target)

    model_onnx = convert_sklearn(
        text_clf,
        name="DocClassifierCV-Tfidf",
        initial_types=[("input", StringTensorType([5]))],
        target_opset=TARGET_OPSET)

    dumped_documents = twenty_train.data[5:10]
    dump_data_and_model(
        dumped_documents, text_clf, model_onnx,
        basename="SklearnPipelineTfidfTransformer")
def test_model_tfidf_vectorizer11_opset(self):
    """Convert the same TfidfVectorizer for every opset from 8 upward.

    Opsets for which the converter raises a ``RuntimeError`` mentioning
    "only works for opset" are skipped; any other error propagates.
    Models are dumped only for opset 10 and above.
    """
    documents = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    corpus = numpy.array(documents).reshape((4, 1))
    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vect.fit(corpus.ravel())

    for opset in range(8, TARGET_OPSET + 1):
        try:
            model_onnx = convert_sklearn(
                vect, 'TfidfVectorizer',
                [('input', StringTensorType([1]))],
                options=self.get_options(),
                target_opset=opset)
        except RuntimeError as e:
            if "only works for opset" not in str(e):
                raise e
            continue

        self.assertIsNotNone(model_onnx)
        if opset < 10:
            continue
        name = "SklearnTfidfVectorizer11Rx%d-OneOff-SklCol" % opset
        dump_data_and_model(corpus, vect, model_onnx, basename=name)
def test_model_tfidf_vectorizer11_short_word_spaces(self):
    """Convert a TfidfVectorizer whose token pattern is ``.{1,3}``.

    The pattern matches any characters, spaces included, in groups of up
    to three.
    """
    documents = [
        'This is the first document.',
        'This document is the second document.',
    ]
    corpus = numpy.array(documents).reshape((2, 1))

    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None,
                           analyzer='word', token_pattern=".{1,3}")
    vect.fit(corpus.ravel())

    model_onnx = convert_sklearn(
        vect, 'TfidfVectorizer',
        [('input', StringTensorType([1]))],
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)

    tolerated = ("StrictVersion(onnxruntime.__version__) <= "
                 "StrictVersion('0.3.0')")
    dump_data_and_model(
        corpus, vect, model_onnx,
        basename="SklearnTfidfVectorizer11CharW2-OneOff-SklCol",
        allow_failure=tolerated,
        verbose=False)
def common_test_model_tfidf_vectorizer_pipeline_cls(
        self, kind=None, verbose=False):
    """Convert a text pipeline of the requested *kind* and compare outputs.

    *kind* selects the pipeline:

    * ``None``: a TfidfVectorizer alone;
    * ``'cls'``: TfidfVectorizer, SelectKBest, SVC;
    * ``'stop'``: a CountVectorizer with stop words;
    * ``'reg'``: TfidfVectorizer, SelectKBest, SVR.

    Any other value raises ``AssertionError``. The ONNX outputs are
    compared to the scikit-learn ones with ``assert_almost_equal``; with
    *verbose* the vocabulary and the compared arrays are printed.
    """
    stopwords = None
    if kind == 'stop':
        if StrictVersion(ort_version) >= StrictVersion('1.4.0'):
            # regression with stopwords in onnxruntime 1.4+
            stopwords = ['theh']
        else:
            stopwords = ['the', 'and', 'is']

    X_train = numpy.array([
        "This is the first document",
        "This document is the second document.",
        "And this is the third one",
        "Is this the first document?",
    ]).reshape((4, 1))
    y_train = numpy.array([0, 1, 0, 1])

    def new_tfidf():
        # Every TF-IDF based pipeline shares the same vectorizer settings.
        return TfidfVectorizer(stop_words=stopwords, lowercase=True,
                               use_idf=True, ngram_range=(1, 3),
                               max_features=30000)

    if kind is None:
        steps = [('vectorizer', new_tfidf())]
    elif kind == 'cls':
        steps = [
            ('vectorizer', new_tfidf()),
            ('feature_selector', SelectKBest(k=10)),
            ('classifier', SVC(class_weight='balanced', kernel='rbf',
                               gamma='scale', probability=True)),
        ]
    elif kind == 'stop':
        steps = [
            ('vectorizer', CountVectorizer(stop_words=stopwords,
                                           lowercase=True,
                                           ngram_range=(1, 2),
                                           max_features=30000)),
        ]
    elif kind == 'reg':
        steps = [
            ('vectorizer', new_tfidf()),
            ('feature_selector', SelectKBest(k=10)),
            ('classifier', SVR(kernel='rbf', gamma='scale')),
        ]
    else:
        raise AssertionError(kind)
    model_pipeline = Pipeline(steps)

    flat_train = X_train.ravel()
    model_pipeline.fit(flat_train, y_train)

    initial_type = [('input', StringTensorType([None, 1]))]
    model_onnx = convert_sklearn(
        model_pipeline, "cv",
        initial_types=initial_type,
        options={SVC: {'zipmap': False}},
        target_opset=TARGET_OPSET)

    # Expected outputs, in the order the ONNX model returns them.
    if kind in (None, 'stop'):
        exp = [model_pipeline.transform(X_train.ravel()).toarray()]
    elif kind == 'cls':
        exp = [
            model_pipeline.predict(X_train.ravel()),
            model_pipeline.predict_proba(X_train.ravel()),
        ]
    elif kind == 'reg':
        exp = [model_pipeline.predict(X_train.ravel()).reshape((-1, 1))]

    sess = InferenceSession(model_onnx.SerializeToString())
    got = sess.run(None, {'input': X_train})

    if verbose:
        voc = model_pipeline.steps[0][-1].vocabulary_
        by_index = sorted((index, term) for term, index in voc.items())
        for pair in by_index:
            print(pair)

    for expected, computed in zip(exp, got):
        if verbose:
            print(stopwords)
            print(expected)
            print(computed)
        assert_almost_equal(expected, computed)
def test_pipeline_column_transformer(self):
    """Convert a ColumnTransformer + LogisticRegression pipeline.

    The pipeline scales three numeric iris features and one-hot encodes
    then reduces (TruncatedSVD) two derived categorical columns. The
    converted model is dumped with the first 11 training rows. When the
    module is run as a script, the ONNX graph is also rendered to
    ``graph.dot`` and converted to PNG with the ``dot`` command.
    """
    iris = datasets.load_iris()
    X = iris.data[:, :3]
    y = iris.target
    X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
    X_train["vcat"] = X_train["vA"].apply(
        lambda x: "cat1" if x > 0.5 else "cat2")
    X_train["vcat2"] = X_train["vB"].apply(
        lambda x: "cat3" if x > 0.5 else "cat4")
    y_train = y % 2
    numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
    categorical_features = [3, 4]  # ["vcat", "vcat2"]

    classifier = LogisticRegression(
        C=0.01,
        class_weight=dict(zip([False, True], [0.2, 0.8])),
        n_jobs=1,
        max_iter=10,
        solver="lbfgs",
        tol=1e-3,
    )

    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    categorical_transformer = Pipeline(steps=[
        (
            "onehot",
            OneHotEncoder(sparse=True, handle_unknown="ignore"),
        ),
        (
            "tsvd",
            TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4),
        ),
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ])

    model = Pipeline(steps=[("precprocessor", preprocessor),
                            ("classifier", classifier)])
    model.fit(X_train, y_train)

    initial_type = [
        ("numfeat", FloatTensorType([None, 3])),
        ("strfeat", StringTensorType([None, 2])),
    ]

    # Only a small slice of the training data is dumped with the model.
    X_train = X_train[:11]
    model_onnx = convert_sklearn(model, initial_types=initial_type)

    dump_data_and_model(
        X_train,
        model,
        model_onnx,
        basename="SklearnPipelineColumnTransformerPipeliner",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.3') or "
                      "StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.4.0')",
    )

    if __name__ == "__main__":
        from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer

        # Graphviz only accepts TB, BT, LR or RL as rank direction;
        # the former value "TP" is not one of them.
        pydot_graph = GetPydotGraph(
            model_onnx.graph,
            name=model_onnx.graph.name,
            rankdir="TB",
            node_producer=GetOpNodeProducer("docstring"),
        )
        pydot_graph.write_dot("graph.dot")

        import os

        os.system("dot -O -G=300 -Tpng graph.dot")
# The default one used by *scikit-learn* uses regular expressions # and is currently being implementing. The current implementation # only considers a list of separators which can is defined # in variable *seps*. seps = { TfidfVectorizer: { "sep": [ ' ', '.', '\\?', ',', ';', ':', '!', '\\(', '\\)', '\n', '"', "'", "-", "\\[", "\\]", "@" ] } } model_onnx = convert_sklearn(pipeline, "tfidf", initial_types=[("input", StringTensorType([1, 2]))], options=seps) ################################# # And save. with open("pipeline_tfidf.onnx", "wb") as f: f.write(model_onnx.SerializeToString()) ########################## # Predictions with onnxruntime. sess = rt.InferenceSession("pipeline_tfidf.onnx") print('---', train_data[0]) inputs = {'input': train_data[0]} pred_onx = sess.run(None, inputs) print("predict", pred_onx[0])
def test_pipeline_tfidf_pipeline_minmax(self):
    """Convert a two-column TF-IDF ColumnTransformer with custom separators.

    The 20newsgroups data is turned into subject/body columns by
    ``SubjectBodyExtractor``; each column gets its own TfidfVectorizer.
    If the dataset cannot be downloaded (``URLError``), a warning is
    emitted and the test returns without checking anything.
    """
    categories = ["alt.atheism", "talk.religion.misc"]
    try:
        train = fetch_20newsgroups(random_state=1,
                                   subset="train",
                                   categories=categories)
    except urllib.error.URLError:
        warnings.warn("Unit test may fail due to connectivity issue.")
        return

    train_data = SubjectBodyExtractor().fit_transform(train.data)

    union = ColumnTransformer(
        [
            ("subject", TfidfVectorizer(min_df=50), 0),
            ("body", TfidfVectorizer(min_df=40), 1),
        ],
        transformer_weights={"subject": 0.8},
    )
    pipeline = Pipeline([("union", union)])
    pipeline.fit(train_data[:300])

    # Separators handed to the converter for every TfidfVectorizer.
    separators = [
        " ", "[.]", "\\?", ",", ";", ":", "\\!",
        "\\(", "\\)", "\n", '"', "'",
        "-", "\\[", "\\]", "@",
    ]
    extra = {TfidfVectorizer: {"separators": separators}}

    model_onnx = convert_sklearn(
        pipeline, "tfidf",
        initial_types=[("input", StringTensorType([None, 2]))],
        options=extra,
        target_opset=TARGET_OPSET)

    test_data = np.array([
        ["Albert Einstein", "Not relatively."],
        ["Alan turing", "Not automatically."],
    ])
    tolerated = ("StrictVersion(onnxruntime.__version__)"
                 " <= StrictVersion('0.4.0')")
    dump_data_and_model(
        test_data, pipeline, model_onnx,
        verbose=False,
        basename="SklearnDocumentationTfIdfUnion1",
        allow_failure=tolerated,
    )
def make_pipelines(df_train, y_train, models=None,
                   sparse_threshold=1., replace_nan=False,
                   insert_replace=False, verbose=False):
    """Fit one pipeline per model class, convert it and measure discrepancies.

    Each pipeline scales columns ``[0, 1]``, applies CountVectorizer and
    TfidfTransformer to column ``"text"``, casts the result and ends with
    one of the classifiers in *models*.

    :param df_train: training data; it is also the data on which the
        scikit-learn and ONNX predictions are compared
    :param y_train: training labels
    :param models: classifier classes to try, ``None`` entries are
        ignored; defaults to RandomForestClassifier,
        HistGradientBoostingClassifier, XGBClassifier, LGBMClassifier
    :param sparse_threshold: forwarded to ``ColumnTransformer``
    :param replace_nan: if True, adds option ``{'nan': True}`` for
        TfidfTransformer at conversion time
    :param insert_replace: if True, appends a ``ReplaceTransformer`` step
        after the TF-IDF step
    :param verbose: if True, runs the ONNX model a second time and prints
        details for every row whose discrepancy exceeds 0.1
    :return: list of dictionaries, one per model. When fitting raises a
        ``TypeError`` the dictionary holds ``model``, ``pipe`` and
        ``error``; otherwise ``model``, ``discrepencies``, ``model_onnx``,
        ``pipe`` and, if *verbose*, ``discrepency2``.

    Side effect: the last converted model is written to ``model.onnx``.
    """

    def td(a):
        # Densify *a* and mark with NaN the columns of the first row
        # that are not listed in a.indices (presumably the implicit
        # zeros of a sparse row - TODO confirm). Non-sparse input is
        # returned unchanged.
        if hasattr(a, 'todense'):
            b = a.todense()
            ind = set(a.indices)
            for i in range(b.shape[1]):
                if i not in ind:
                    b[0, i] = numpy.nan
            return b
        return a

    if models is None:
        models = [
            RandomForestClassifier, HistGradientBoostingClassifier,
            XGBClassifier, LGBMClassifier
        ]
    models = [m for m in models if m is not None]

    pipes = []
    for model in tqdm(models):

        if model == HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model == XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        # Both variants share the same steps; the replace step is
        # only appended on request.
        text_steps = [
            ('count', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
        ]
        if insert_replace:
            text_steps.append(('repl', ReplaceTransformer()))

        pipe = Pipeline([
            ('union', ColumnTransformer([
                ('scale1', StandardScaler(), [0, 1]),
                ('subject', Pipeline(text_steps), "text"),
            ], sparse_threshold=sparse_threshold)),
            ('cast', CastTransformer()),
            ('cls', model(max_depth=3, **kwargs)),
        ])

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            # The failure is recorded and the next model is tried.
            obs = dict(model=model.__name__, pipe=pipe, error=e)
            pipes.append(obs)
            continue

        options = {model: {'zipmap': False}}
        if replace_nan:
            options[TfidfTransformer] = {'nan': True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(
                pipe,
                initial_types=[('input', FloatTensorType([None, 2])),
                               ('text', StringTensorType([None, 1]))],
                target_opset={'': 14, 'ai.onnx.ml': 2},
                options=options)

        with open('model.onnx', 'wb') as f:
            f.write(model_onnx.SerializeToString())

        # The comparison uses the df_train argument rather than a
        # module-level ``df`` so the function does not depend on a global.
        oinf = OnnxInference(model_onnx)
        inputs = {
            "input": df_train[[0, 1]].values.astype(numpy.float32),
            "text": df_train[["text"]].values
        }
        pred_onx = oinf.run(inputs)

        diff = numpy.abs(
            pred_onx['probabilities'].ravel() -
            pipe.predict_proba(df_train).ravel()).sum()

        if verbose:
            # Second run with a fresh runtime instance.
            oinf = OnnxInference(model_onnx)
            pred_onx2 = oinf.run(inputs)
            diff2 = numpy.abs(
                pred_onx2['probabilities'].ravel() -
                pipe.predict_proba(df_train).ravel()).sum()

        if diff > 0.1:
            for i, (l1, l2) in enumerate(
                    zip(pipe.predict_proba(df_train),
                        pred_onx['probabilities'])):
                d = numpy.abs(l1 - l2).sum()
                if verbose and d > 0.1:
                    print("\nDISCREPENCY DETAILS")
                    print(d, i, l1, l2)
                    pre = pipe.steps[0][-1].transform(df_train)
                    print("idf", pre[i].dtype, td(pre[i]))
                    pre2 = pipe.steps[1][-1].transform(pre)
                    print("cas", pre2[i].dtype, td(pre2[i]))
                    inter = oinf.run(inputs, intermediate=True)
                    onx = inter['tfidftr_norm']
                    print("onx", onx.dtype, onx[i])
                    # Fetched as in the original code but not displayed.
                    onx = inter['variable3']

        obs = dict(model=model.__name__,
                   discrepencies=diff,
                   model_onnx=model_onnx,
                   pipe=pipe)
        if verbose:
            obs['discrepency2'] = diff2
        pipes.append(obs)

    return pipes
# and is currently being implementing. The current implementation # only considers a list of separators which can is defined # in variable *seps*. seps = { TfidfVectorizer: { "separators": [ ' ', '.', '\\?', ',', ';', ':', '!', '\\(', '\\)', '\n', '"', "'", "-", "\\[", "\\]", "@" ] } } model_onnx = convert_sklearn(pipeline, "tfidf", initial_types=[("input", StringTensorType([None, 2]))], options=seps, target_opset=12) ################################# # And save. with open("pipeline_tfidf.onnx", "wb") as f: f.write(model_onnx.SerializeToString()) ########################## # Predictions with onnxruntime. sess = rt.InferenceSession("pipeline_tfidf.onnx") print('---', train_data[0]) inputs = {'input': train_data[:1]} pred_onx = sess.run(None, inputs)
def test_pipeline_column_transformer(self):
    """Check model names and option handling on a nested pipeline.

    The pipeline combines a ColumnTransformer (numeric and categorical
    branches) with a LogisticRegression. The test first verifies the
    names produced by ``enumerate_model_names``, then converts the model
    four times: without options, and with the ``zipmap`` option addressed
    by step name, by ``name__option`` key and by ``id(model)``.
    """
    iris = load_iris()
    X = iris.data[:, :3]
    y = iris.target
    X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
    X_train["vcat"] = X_train["vA"].apply(
        lambda x: "cat1" if x > 0.5 else "cat2")
    X_train["vcat2"] = X_train["vB"].apply(
        lambda x: "cat3" if x > 0.5 else "cat4")
    y_train = y % 2
    numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
    categorical_features = [3, 4]  # ["vcat", "vcat2"]

    classifier = LogisticRegression(
        C=0.01,
        class_weight=dict(zip([False, True], [0.2, 0.8])),
        n_jobs=1, max_iter=10, solver="lbfgs", tol=1e-3)

    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        ("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore")),
        ("tsvd", TruncatedSVD(n_components=1, algorithm="arpack",
                              tol=1e-4)),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ])

    model = Pipeline(steps=[("precprocessor", preprocessor),
                            ("classifier", classifier)])
    model.fit(X_train, y_train)

    # Long names must be unique; short names must be as numerous.
    names = list(enumerate_model_names(model, short=False))
    simple = [item[0] for item in names]
    assert len(set(simple)) == len(simple)
    names = list(enumerate_model_names(model))
    simple2 = [item[0] for item in names]
    assert len(simple2) == len(simple)
    exp = [
        '', 'precprocessor', 'precprocessor__num',
        'precprocessor__num__imputer', 'precprocessor__num__scaler',
        'precprocessor__cat', 'precprocessor__cat__onehot',
        'precprocessor__cat__onehot__categories___0',
        'precprocessor__cat__onehot__categories___1',
        'precprocessor__cat__tsvd', 'classifier'
    ]
    # The last two expected names are left out of the comparison.
    compared = len(exp) - 2
    self.assertEqual(simple2[:compared], exp[:-2])

    initial_type = [("numfeat", FloatTensorType([None, 3])),
                    ("strfeat", StringTensorType([None, 2]))]

    # 1. No option.
    model_onnx = convert_sklearn(model, initial_types=initial_type,
                                 target_opset=TARGET_OPSET)
    dump_data_and_model(
        X_train, model, model_onnx,
        basename="SklearnPipelineColumnTransformerPipelinerOptions1")

    # 2. Option addressed by step name.
    options = {'classifier': {'zipmap': False}}
    new_options = _process_options(model, options)
    assert len(new_options) == 2
    model_onnx = convert_sklearn(
        model, initial_types=initial_type,
        options={'classifier': {'zipmap': False}},
        target_opset=TARGET_OPSET)
    assert 'zipmap' not in str(model_onnx).lower()
    dump_data_and_model(
        X_train, model, model_onnx,
        basename="SklearnPipelineColumnTransformerPipelinerOptions2")

    # 3. Option addressed by a "name__option" key.
    options = {'classifier__zipmap': False}
    new_options = _process_options(model, options)
    assert len(new_options) == 2
    model_onnx = convert_sklearn(
        model, initial_types=initial_type,
        options=options,
        target_opset=TARGET_OPSET)
    assert 'zipmap' not in str(model_onnx).lower()
    dump_data_and_model(
        X_train, model, model_onnx,
        basename="SklearnPipelineColumnTransformerPipelinerOptions2")

    # 4. Option addressed by object identity.
    options = {id(model): {'zipmap': False}}
    new_options = _process_pipeline_options(model, options)
    model_onnx = convert_sklearn(
        model, initial_types=initial_type,
        options={id(model): {'zipmap': False}},
        target_opset=TARGET_OPSET)
    assert 'zipmap' not in str(model_onnx).lower()
    dump_data_and_model(
        X_train, model, model_onnx,
        basename="SklearnPipelineColumnTransformerPipelinerOptions2")
def make_pipelines(df_train, y_train, models=None,
                   sparse_threshold=1., replace_nan=False,
                   insert_replace=False):
    """Fit one pipeline per model class, convert it and measure discrepancies.

    Each pipeline scales columns ``[0, 1]``, applies CountVectorizer and
    TfidfTransformer to column ``"text"``, casts the result and ends with
    one of the classifiers in *models*. The converted model is run with
    onnxruntime and its second output is compared to ``predict_proba``.

    :param df_train: training data; it is also the data on which the
        scikit-learn and ONNX predictions are compared
    :param y_train: training labels
    :param models: classifier classes to try, ``None`` entries are
        ignored; defaults to RandomForestClassifier,
        HistGradientBoostingClassifier, XGBClassifier, LGBMClassifier
    :param sparse_threshold: forwarded to ``ColumnTransformer``
    :param replace_nan: if True, adds option ``{'nan': True}`` for
        TfidfTransformer at conversion time
    :param insert_replace: if True, appends a ``ReplaceTransformer`` step
        after the TF-IDF step
    :return: list of dictionaries, one per model. When fitting raises a
        ``TypeError`` the dictionary holds ``model``, ``pipe`` and
        ``error``; otherwise ``model``, ``discrepencies``, ``model_onnx``
        and ``pipe``.

    Side effect: the last converted model is written to ``model.onnx``.
    """
    if models is None:
        models = [
            RandomForestClassifier, HistGradientBoostingClassifier,
            XGBClassifier, LGBMClassifier]
    models = [m for m in models if m is not None]

    pipes = []
    for model in tqdm(models):

        if model == HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model == XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        # Both variants share the same steps; the replace step is
        # only appended on request.
        text_steps = [
            ('count', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
        ]
        if insert_replace:
            text_steps.append(('repl', ReplaceTransformer()))

        pipe = Pipeline([
            ('union', ColumnTransformer([
                ('scale1', StandardScaler(), [0, 1]),
                ('subject', Pipeline(text_steps), "text"),
            ], sparse_threshold=sparse_threshold)),
            ('cast', CastTransformer()),
            ('cls', model(max_depth=3, **kwargs)),
        ])

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            # The failure is recorded and the next model is tried.
            obs = dict(model=model.__name__, pipe=pipe, error=e)
            pipes.append(obs)
            continue

        options = {model: {'zipmap': False}}
        if replace_nan:
            options[TfidfTransformer] = {'nan': True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(
                pipe,
                initial_types=[('input', FloatTensorType([None, 2])),
                               ('text', StringTensorType([None, 1]))],
                target_opset=12, options=options)

        with open('model.onnx', 'wb') as f:
            f.write(model_onnx.SerializeToString())

        # The comparison uses the df_train argument rather than a
        # module-level ``df`` so the function does not depend on a global.
        sess = rt.InferenceSession(model_onnx.SerializeToString())
        inputs = {"input": df_train[[0, 1]].values.astype(numpy.float32),
                  "text": df_train[["text"]].values}
        pred_onx = sess.run(None, inputs)

        # Output 1 is compared to predict_proba.
        diff = numpy.abs(
            pred_onx[1].ravel() -
            pipe.predict_proba(df_train).ravel()).sum()

        obs = dict(model=model.__name__,
                   discrepencies=diff,
                   model_onnx=model_onnx, pipe=pipe)
        pipes.append(obs)

    return pipes
'And this is the third one.', ' ', ]).reshape((4, 1)) vect = TfidfVectorizer(ngram_range=(1, 2), norm=None) vect.fit(corpus.ravel()) pred = vect.transform(corpus.ravel()) ########################### # Convert a model into ONNX # +++++++++++++++++++++++++ from skl2onnx import convert_sklearn # noqa from skl2onnx.common.data_types import StringTensorType # noqa model_onnx = convert_sklearn(vect, 'TfidfVectorizer', [('input', StringTensorType([1, 1]))]) with open("TfidfVectorizer.onnx", "wb") as f: f.write(model_onnx.SerializeToString()) ########################### # Visualize # +++++++++ from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer # noqa pydot_graph = GetPydotGraph(model_onnx.graph, name=model_onnx.graph.name, rankdir="TB", node_producer=GetOpNodeProducer("docstring", color="yellow", fillcolor="yellow",
# only considers a list of separators which can is defined # in variable *seps*. seps = { TfidfVectorizer: { "separators": [ ' ', '.', '\\?', ',', ';', ':', '!', '\\(', '\\)', '\n', '"', "'", "-", "\\[", "\\]", "@" ] } } model_onnx = convert_sklearn(pipeline, "tfidf", initial_types=[ ("input", StringTensorType([None, 2]))], options=seps) ################################# # And save. with open("pipeline_tfidf.onnx", "wb") as f: f.write(model_onnx.SerializeToString()) ########################## # Predictions with onnxruntime. sess = rt.InferenceSession("pipeline_tfidf.onnx") print('---', train_data[0]) inputs = {'input': train_data[:1]} pred_onx = sess.run(None, inputs) print("predict", pred_onx[0])