def test_pipeline_voting_tfidf_svc(self):
    """Check the ONNX conversion of a soft-voting ensemble of text pipelines.

    Three pipelines (TF-IDF + SVC, TF-IDF + SGDClassifier, TF-IDF +
    MultinomialNB) are combined in a soft ``VotingClassifier`` and fitted
    on five short strings.  The ensemble is converted with
    ``convert_sklearn`` (``zipmap`` disabled) and run through
    ``InferenceSession``; the labels and probabilities it returns must
    match the scikit-learn predictions.
    """
    pipe1 = Pipeline([
        ('tfidf1', TfidfVectorizer()),
        ('svc', SVC(probability=True, kernel='linear'))])
    pipe2 = Pipeline([
        ('tfidf2', TfidfVectorizer(norm='l2', use_idf=False)),
        ('sgd', SGDClassifier(alpha=0.0001, penalty='l2',
                              loss='modified_huber'))])
    pipe3 = Pipeline([
        ('tfidf3', TfidfVectorizer()),
        ('mnb', MultinomialNB())])
    voting = VotingClassifier(
        [('p1', pipe1), ('p2', pipe2), ('p3', pipe3)],
        voting='soft', flatten_transform=False)

    data = numpy.array(["first sentance", "second sentence",
                        "many sentances", "dummy sentance",
                        "no sentance at all"])
    y = numpy.array([0, 0, 1, 0, 1])
    voting.fit(data, y)
    expected_label = voting.predict(data)
    expected_proba = voting.predict_proba(data)

    model_onnx = convert_sklearn(
        voting, initial_types=[('text', StringTensorType([None, 1]))],
        target_opset=TARGET_OPSET,
        options={id(voting): {'zipmap': False}})
    sess = InferenceSession(model_onnx.SerializeToString())
    # The input is declared as [None, 1], so the 1-D corpus is fed as a
    # column vector.
    got = sess.run(None, {'text': data.reshape((-1, 1))})
    assert_almost_equal(expected_proba, got[1], decimal=5)
    assert_almost_equal(expected_label, got[0])
def test_pipeline_tfidf_pipeline_minmax(self):
    """Convert a weighted two-column TF-IDF ``ColumnTransformer`` pipeline.

    The output of ``SubjectBodyExtractor`` on two 20newsgroups categories
    feeds a ``ColumnTransformer`` holding one ``TfidfVectorizer`` for
    column 0 ("subject") and one for column 1 ("body").  The pipeline is
    fitted on the first 300 rows, converted with a custom separator list
    and passed to ``dump_data_and_model`` with two sample rows.
    """
    newsgroups = fetch_20newsgroups(
        random_state=1, subset="train",
        categories=["alt.atheism", "talk.religion.misc"])
    train_data = SubjectBodyExtractor().fit_transform(newsgroups.data)

    union = ColumnTransformer(
        [
            ("subject", TfidfVectorizer(min_df=50), 0),
            ("body", TfidfVectorizer(min_df=40), 1),
        ],
        transformer_weights={"subject": 0.8},
    )
    pipeline = Pipeline([("union", union)])
    pipeline.fit(train_data[:300])

    separators = [
        " ", ".", "?", ",", ";", ":", "!", "(", ")", "\n", '"', "'",
        "-", "[", "]", "@",
    ]
    model_onnx = convert_sklearn(
        pipeline,
        "tfidf",
        initial_types=[("input", StringTensorType([1, 2]))],
        options={TfidfVectorizer: {"sep": separators}},
    )

    test_data = np.array([
        ["Albert Einstein", "Not relatively."],
        ["Alan turing", "Not automatically."],
    ])
    # Version condition handed unchanged to the helper.
    failure_condition = (
        "StrictVersion(onnx.__version__)"
        " <= StrictVersion('1.3') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        test_data,
        pipeline,
        model_onnx,
        verbose=False,
        basename="SklearnDocumentationTfIdfUnion1-OneOff-Dec2",
        allow_failure=failure_condition,
    )
def test_model_tfidf_vectorizer11_out_vocabulary(self):
    """Convert a unigram ``TfidfVectorizer`` and feed it unseen tokens.

    The vectorizer is fitted on four sentences, converted with
    ``convert_sklearn`` and then passed to ``dump_data_and_model`` with a
    second corpus whose rows start with tokens ("AZZ", "ZZ", ...) that are
    absent from the training corpus.
    """
    corpus = numpy.array([
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]).reshape((4, 1))
    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vect.fit(corpus.ravel())
    model_onnx = convert_sklearn(vect, 'TfidfVectorizer',
                                 [('input', StringTensorType([1, 1]))])
    self.assertTrue(model_onnx is not None)

    # Same sentences, each prefixed with tokens outside the vocabulary.
    corpus = numpy.array([
        'AZZ ZZ This is the first document.',
        'BZZ ZZ This document is the second document.',
        'ZZZ ZZ And this is the third one.',
        'WZZ ZZ Is this the first document?',
    ]).reshape((4, 1))
    dump_data_and_model(
        corpus, vect, model_onnx,
        basename="SklearnTfidfVectorizer11OutVocab-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__) <= "
                      "StrictVersion('0.2.1')")
def test_ordinal_encoder_mixed_string_int_drop(self):
    """Convert an ``OrdinalEncoder`` fitted on two string and one int column.

    The ONNX model is declared with a string input of width 2 and an
    int64 input of width 1, then handed to ``dump_data_and_model`` with a
    single test row.
    """
    training_rows = [
        ["c0.4", "c0.2", 3],
        ["c1.4", "c1.2", 0],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
    ]
    test_rows = [["c0.2", "c2.2", 1]]
    input_types = [
        ("input1", StringTensorType([None, 2])),
        ("input2", Int64TensorType([None, 1])),
    ]

    encoder = OrdinalEncoder(categories="auto")
    encoder.fit(training_rows)

    onnx_model = convert_sklearn(
        encoder, "ordinal encoder", input_types,
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        test_rows,
        encoder,
        onnx_model,
        basename="SklearnOrdinalEncoderMixedStringIntDrop",
        allow_failure="pv.Version("
        "onnxruntime.__version__)"
        "<= pv.Version('0.5.0')",
    )
def test_model_tfidf_vectorizer11_out_vocabulary(self):
    """Convert a unigram ``TfidfVectorizer`` and feed it unseen tokens.

    Conversion uses the options returned by ``self.get_options()``.  The
    data given to ``dump_data_and_model`` repeats the training sentences
    with leading tokens that do not occur in the training corpus.
    """
    train_corpus = numpy.array([
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]).reshape((4, 1))
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vectorizer.fit(train_corpus.ravel())

    input_types = [("input", StringTensorType([1, 1]))]
    onnx_model = convert_sklearn(vectorizer, "TfidfVectorizer",
                                 input_types,
                                 options=self.get_options())
    self.assertIsNotNone(onnx_model)

    unseen_corpus = numpy.array([
        "AZZ ZZ This is the first document.",
        "BZZ ZZ This document is the second document.",
        "ZZZ ZZ And this is the third one.",
        "WZZ ZZ Is this the first document?",
    ]).reshape((4, 1))
    dump_data_and_model(
        unseen_corpus,
        vectorizer,
        onnx_model,
        basename="SklearnTfidfVectorizer11OutVocab-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.4.0')",
    )
def test_model_tfidf_vectorizer11parenthesis_class(self):
    """Convert a ``TfidfVectorizer`` with escaped-punctuation separators.

    The corpus contains a parenthesised word; the converter receives a
    "separators" option whose punctuation entries are backslash-escaped.
    """
    documents = numpy.array([
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the (first) document?",
    ]).reshape((4, 1))
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vectorizer.fit(documents.ravel())

    separators = [
        " ", "\\.", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
    ]
    onnx_model = convert_sklearn(
        vectorizer,
        "TfidfVectorizer",
        [("input", StringTensorType([1, 1]))],
        options={TfidfVectorizer: {"separators": separators}},
    )
    self.assertIsNotNone(onnx_model)

    # This test depends on this issue:
    # https://github.com/Microsoft/onnxruntime/issues/957.
    dump_data_and_model(
        documents,
        vectorizer,
        onnx_model,
        basename="SklearnTfidfVectorizer11ParenthesisClass-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.4.0')",
    )
def test_model_count_vectorizer_wrong_ngram(self):
    """Convert a (1, 2)-gram ``TfidfVectorizer`` with an explicit pattern.

    The vectorizer is fitted on five short strings using the token
    pattern ``(?u)\\b\\w\\w+\\b``, converted for ``TARGET_OPSET`` and
    handed to ``dump_data_and_model``.
    """
    documents = numpy.array([
        'A AABBB0',
        'AAABB B1',
        'AA ABBB2',
        'AAAB BB3',
        'AAA BBB4',
    ]).reshape((5, 1))
    vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                 token_pattern=r"(?u)\b\w\w+\b")
    vectorizer.fit(documents.ravel())

    input_types = [('input', StringTensorType([1]))]
    onnx_model = convert_sklearn(vectorizer, 'TfidfVectorizer',
                                 input_types,
                                 target_opset=TARGET_OPSET)
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        documents, vectorizer, onnx_model,
        basename="SklearnTfidfVectorizer12Wngram-OneOff-SklCol",
        allow_failure="pv.Version(onnxruntime.__version__) <= "
                      "pv.Version('0.3.0')")
def test_pipeline_column_transformer(self):
    """Convert a ``ColumnTransformer`` + ``LogisticRegression`` pipeline.

    Three iris features plus two derived string columns are preprocessed
    by a numeric branch (median imputer, standard scaler) and a
    categorical branch (one-hot encoder, truncated SVD) before the
    classifier.  The fitted pipeline is converted with one float and one
    string input and passed to ``dump_data_and_model`` with the first 11
    rows.
    """
    iris = datasets.load_iris()
    frame = pandas.DataFrame(iris.data[:, :3], columns=["vA", "vB", "vC"])
    frame["vcat"] = frame["vA"].apply(
        lambda value: "cat1" if value > 0.5 else "cat2")
    frame["vcat2"] = frame["vB"].apply(
        lambda value: "cat3" if value > 0.5 else "cat4")
    labels = iris.target % 2

    # Column positions: 0-2 are vA/vB/vC, 3-4 are vcat/vcat2.
    numeric_columns = [0, 1, 2]
    categorical_columns = [3, 4]

    numeric_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore")),
        ("tsvd", TruncatedSVD(n_components=1, algorithm="arpack",
                              tol=1e-4)),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_branch, numeric_columns),
        ("cat", categorical_branch, categorical_columns),
    ])
    classifier = LogisticRegression(
        C=0.01,
        class_weight={False: 0.2, True: 0.8},
        n_jobs=1,
        max_iter=10,
        solver="lbfgs",
        tol=1e-3)

    model = Pipeline(steps=[("precprocessor", preprocessor),
                            ("classifier", classifier)])
    model.fit(frame, labels)

    input_types = [
        ("numfeat", FloatTensorType([None, 3])),
        ("strfeat", StringTensorType([None, 2])),
    ]
    sample = frame[:11]
    onnx_model = convert_sklearn(model, initial_types=input_types)
    dump_data_and_model(
        sample, model, onnx_model,
        basename="SklearnPipelineColumnTransformerPipeliner")
def test_model_tfidf_vectorizer11parenthesis_class(self):
    """Convert a ``TfidfVectorizer`` using an explicit "sep" option.

    The corpus contains a parenthesised word and the converter is given a
    list of plain separator characters that includes the parentheses.
    """
    corpus = numpy.array([
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the (first) document?',
    ]).reshape((4, 1))
    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vect.fit(corpus.ravel())
    extra = {
        TfidfVectorizer: {
            'sep': [' ', '.', '?', ',', ';', ':', '!', '(', ')']
        }
    }
    model_onnx = convert_sklearn(vect, 'TfidfVectorizer',
                                 [('input', StringTensorType([1, 1]))],
                                 options=extra)
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        corpus, vect, model_onnx,
        basename="SklearnTfidfVectorizer11ParenthesisClass-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__) <= "
                      "StrictVersion('0.2.0') or "
                      "StrictVersion(onnx.__version__) <= "
                      "StrictVersion('1.3')")
def test_model_count_vectorizer_custom_tokenizer(self):
    """Convert a ``CountVectorizer`` fitted with a custom tokenizer.

    The vectorizer is fitted with a tokenizer that returns the whole
    string as one token.  The tokenizer is cleared while
    ``convert_sklearn`` runs and the "separators" option is set to
    ``["ZZZZ"]`` instead; the original tokenizer is restored before the
    model is handed to ``dump_data_and_model``.
    """
    corpus = numpy.array([
        '9999',
        '999 99',
        '1234',
        '1 2 3 4',
        '1 2 3 4+',
    ]).reshape((5, 1))
    vect = CountVectorizer(ngram_range=(1, 1),
                           tokenizer=lambda s: [s])
    vect.fit(corpus.ravel())

    extra = {
        CountVectorizer: {
            "separators": ["ZZZZ"]
        }
    }

    # NOTE(review): presumably the converter cannot handle an arbitrary
    # Python callable, hence the temporary removal - confirm.
    original_tokenizer = vect.tokenizer
    vect.tokenizer = None
    try:
        model_onnx = convert_sklearn(vect, 'CountVectorizer',
                                     [('input', StringTensorType([1, 1]))],
                                     options=extra)
    finally:
        # Restore the tokenizer even when the conversion raises.
        vect.tokenizer = original_tokenizer

    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        corpus, vect, model_onnx,
        basename="SklearnTfidfVectorizer11CustomTokenizer-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__) <= "
                      "StrictVersion('0.4.0')")
def test_one_hot_encoder_string_drop_first(self):
    """Convert a ``OneHotEncoder(drop='first')`` fitted on string columns.

    Each of the two string columns is declared as its own ONNX input; the
    model is then passed to ``dump_data_and_model`` with one test row.
    """
    training_rows = [['Male', 'First'], ['Female', 'First'],
                     ['Female', 'Second']]
    probe_rows = [['Male', 'Second']]
    input_types = [
        ("input1", StringTensorType([None, 1])),
        ("input2", StringTensorType([None, 1])),
    ]

    encoder = OneHotEncoder(drop='first', categories='auto')
    encoder.fit(training_rows)

    onnx_model = convert_sklearn(
        encoder, "one-hot encoder", input_types,
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        probe_rows, encoder, onnx_model,
        basename="SklearnOneHotEncoderStringDropFirst")
def test_onnxrt_tokenizer_word_stop(self):
    """Run ``OnnxTokenizer`` with separators and a stopword.

    The operator is built with separators ' ', ',' and '/', ``mark=0``
    and the stopword 'd', then executed through ``OnnxInference``.  The
    result must equal the expected rows, in which '#' fills the unused
    positions.
    """
    texts = numpy.array(['abc ef zoo', 'abc,d', 'ab/e'])
    expected = numpy.array([['abc', 'ef', 'zoo'],
                            ['abc', '#', '#'],
                            ['ab', 'e', '#']])

    tokenizer = OnnxTokenizer(
        'text', op_version=TARGET_OPSET, output_names=['out'],
        separators=[' ', ',', '/'], mark=0, stopwords=['d'])
    onnx_model = tokenizer.to_onnx(
        inputs=[('text', StringTensorType())],
        outputs=[('out', StringTensorType())])

    outputs = OnnxInference(onnx_model).run({'text': texts})
    self.assertEqual(outputs['out'].tolist(), expected.tolist())
def test_onnxrt_tokenizer_word_regex_mark_findall(self):
    """Run ``OnnxTokenizer`` with a token regex and ``mark=1``.

    The operator uses ``tokenexp='[a-c]+'`` with ``tokenexpsplit=0`` and
    is executed through ``OnnxInference``.  Every expected row is the
    matched token with a '#' entry on each side.
    """
    texts = numpy.array(['abc ef zoo', 'abc,d', 'ab/e'])
    expected = numpy.array([['#', 'abc', '#'],
                            ['#', 'abc', '#'],
                            ['#', 'ab', '#']])

    tokenizer = OnnxTokenizer(
        'text', op_version=TARGET_OPSET, output_names=['out'],
        mark=1, tokenexp='[a-c]+', tokenexpsplit=0)
    onnx_model = tokenizer.to_onnx(
        inputs=[('text', StringTensorType())],
        outputs=[('out', StringTensorType())])

    outputs = OnnxInference(onnx_model).run({'text': texts})
    self.assertEqual(outputs['out'].tolist(), expected.tolist())
def test_model_tfidf_vectorizer11parenthesis_class(self):
    """Convert a ``TfidfVectorizer`` using an explicit "sep" option.

    The corpus contains a parenthesised word and the separator list
    passed to the converter includes both parentheses.
    """
    documents = numpy.array([
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the (first) document?",
    ]).reshape((4, 1))
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vectorizer.fit(documents.ravel())

    separators = [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
    onnx_model = convert_sklearn(
        vectorizer,
        "TfidfVectorizer",
        [("input", StringTensorType([1, 1]))],
        options={TfidfVectorizer: {"sep": separators}},
    )
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        documents,
        vectorizer,
        onnx_model,
        basename="SklearnTfidfVectorizer11ParenthesisClass-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.0') or "
        "StrictVersion(onnx.__version__)"
        " <= StrictVersion('1.3')",
    )
def test_model_tfidf_vectorizer11parenthesis_class(self):
    """Convert a ``TfidfVectorizer`` with regex separators and no tokenexp.

    The converter options set "separators" to escaped punctuation and
    "tokenexp" to ``None``; the model is converted for ``TARGET_OPSET``.
    """
    documents = numpy.array([
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the (first) document?',
    ]).reshape((4, 1))
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vectorizer.fit(documents.ravel())

    vectorizer_options = {
        'separators': [' ', '[.]', '\\?', ',', ';', ':',
                       '\\!', '\\(', '\\)'],
        'tokenexp': None,
    }
    onnx_model = convert_sklearn(
        vectorizer, 'TfidfVectorizer',
        [('input', StringTensorType([1]))],
        options={TfidfVectorizer: vectorizer_options},
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(onnx_model)

    # This test depends on this issue:
    # https://github.com/Microsoft/onnxruntime/issues/957.
    dump_data_and_model(
        documents, vectorizer, onnx_model,
        basename="SklearnTfidfVectorizer11ParenthesisClassRegex-"
                 "OneOff-SklCol")
def test_model_tfidf_vectorizer11_out_vocabulary(self):
    """Convert a unigram ``TfidfVectorizer`` and feed it unseen tokens.

    Conversion uses ``self.get_options()`` and ``TARGET_OPSET``.  The
    corpus given to ``dump_data_and_model`` prefixes every training
    sentence with tokens that are absent from the training corpus.
    """
    train_corpus = numpy.array([
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]).reshape((4, 1))
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vectorizer.fit(train_corpus.ravel())

    onnx_model = convert_sklearn(
        vectorizer, 'TfidfVectorizer',
        [('input', StringTensorType([1]))],
        options=self.get_options(),
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(onnx_model)

    unseen_corpus = numpy.array([
        'AZZ ZZ This is the first document.',
        'BZZ ZZ This document is the second document.',
        'ZZZ ZZ And this is the third one.',
        'WZZ ZZ Is this the first document?',
    ]).reshape((4, 1))
    dump_data_and_model(
        unseen_corpus, vectorizer, onnx_model,
        basename="SklearnTfidfVectorizer11OutVocabRegex-OneOff-SklCol")
def test_model_tfidf_vectorizer11_custom_vocabulary(self):
    """Convert a ``TfidfVectorizer`` built with a fixed vocabulary.

    After fitting, the test asserts that the vectorizer has no
    ``stop_words_`` attribute, converts it with ``self.get_options()``
    and passes the model to ``dump_data_and_model``.
    """
    documents = numpy.array([
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]).reshape((4, 1))
    vocabulary = ["first", "second", "third", "document", "this"]
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), norm=None,
                                 vocabulary=vocabulary)
    vectorizer.fit(documents.ravel())
    self.assertFalse(hasattr(vectorizer, "stop_words_"))

    onnx_model = convert_sklearn(
        vectorizer, "TfidfVectorizer",
        [("input", StringTensorType())],
        options=self.get_options(),
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        documents,
        vectorizer,
        onnx_model,
        basename="SklearnTfidfVectorizer11CustomVocab-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.4.0')",
    )
def test_one_hot_encoder_mixed_string_int_drop(self):
    """Convert a ``OneHotEncoder`` with an explicit per-column ``drop`` list.

    The encoder is fitted on two string columns and one integer column,
    with ``drop=['c0.4', 'c0.2', 3]``, then converted using a string
    input of width 2 and an int64 input of width 1.
    """
    training_rows = [
        ["c0.4", "c0.2", 3],
        ["c1.4", "c1.2", 0],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
    ]
    probe_rows = [["c0.2", "c2.2", 1]]
    input_types = [
        ("input1", StringTensorType([None, 2])),
        ("input2", Int64TensorType([None, 1])),
    ]

    encoder = OneHotEncoder(categories="auto",
                            drop=['c0.4', 'c0.2', 3])
    encoder.fit(training_rows)

    onnx_model = convert_sklearn(encoder, "one-hot encoder", input_types,
                                 target_opset=TARGET_OPSET)
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        probe_rows, encoder, onnx_model, verbose=False,
        basename="SklearnOneHotEncoderMixedStringIntDrop")
def test_model_tfidf_vectorizer11parenthesis_class(self):
    """Convert a ``TfidfVectorizer`` with "sep" set and "regex" disabled.

    The converter options pass a list of plain separator characters
    under "sep" and ``None`` under "regex".
    """
    documents = numpy.array([
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the (first) document?',
    ]).reshape((4, 1))
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vectorizer.fit(documents.ravel())

    vectorizer_options = {
        'sep': [' ', '.', '?', ',', ';', ':', '!', '(', ')'],
        'regex': None,
    }
    onnx_model = convert_sklearn(
        vectorizer, 'TfidfVectorizer',
        [('input', StringTensorType([1, 1]))],
        options={TfidfVectorizer: vectorizer_options})
    self.assertIsNotNone(onnx_model)

    # This test depends on this issue:
    # https://github.com/Microsoft/onnxruntime/issues/957.
    dump_data_and_model(
        documents, vectorizer, onnx_model,
        basename="SklearnTfidfVectorizer11ParenthesisClassRegex-"
                 "OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__) <= "
                      "StrictVersion('0.4.0')")
def test_one_hot_encoder_mixed_string_int(self):
    """Convert a ``OneHotEncoder`` fitted on mixed string/int columns.

    The encoder uses ``categories="auto"``.  The training rows are reused
    as the data passed to ``dump_data_and_model``.
    """
    rows = [
        ["c0.4", "c0.2", 3],
        ["c1.4", "c1.2", 0],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
        ["c0.2", "c2.2", 1],
    ]
    input_types = [
        ("input1", StringTensorType([None, 2])),
        ("input2", Int64TensorType([None, 1])),
    ]

    encoder = OneHotEncoder(categories="auto")
    encoder.fit(rows)

    onnx_model = convert_sklearn(
        encoder, "one-hot encoder mixed-type inputs", input_types)
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        rows,
        encoder,
        onnx_model,
        basename="SklearnOneHotEncoderStringInt64",
        verbose=False,
    )
def test_model_tfidf_transform_bug(self):
    """Convert a ``CountVectorizer`` + ``TfidfTransformer`` pipeline.

    The pipeline is fitted on four 20newsgroups categories after a
    non-ASCII word has been prepended to the first document, converted
    with a string input of size 5 and passed to ``dump_data_and_model``
    with documents 5 to 9.
    """
    newsgroups = fetch_20newsgroups(
        subset="train",
        categories=[
            "alt.atheism",
            "soc.religion.christian",
            "comp.graphics",
            "sci.med",
        ],
        shuffle=True,
        random_state=0)
    documents = newsgroups.data
    # Mutates the fetched list in place, as the pipeline is fitted on it.
    documents[0] = "bruît " + documents[0]

    text_clf = Pipeline([("vect", CountVectorizer()),
                         ("tfidf", TfidfTransformer())])
    text_clf.fit(documents, newsgroups.target)

    onnx_model = convert_sklearn(
        text_clf,
        name="DocClassifierCV-Tfidf",
        initial_types=[("input", StringTensorType([5]))],
    )
    dump_data_and_model(
        documents[5:10],
        text_clf,
        onnx_model,
        basename="SklearnPipelineTfidfTransformer",
        # Operator mul is not implemented in onnxruntime
        allow_failure="StrictVersion(onnx.__version__)"
        " <= StrictVersion('1.5')",
    )
def test_pipeline_tfidf(self):
    """Convert a ``TfidfVectorizer`` fitted on one extracted text column.

    The first column of the first 300 rows produced by
    ``SubjectBodyExtractor`` is used for fitting; the converter receives
    an explicit "sep" list and the first five rows are passed to
    ``dump_data_and_model``.
    """
    newsgroups = fetch_20newsgroups(
        random_state=1, subset="train",
        categories=["alt.atheism", "talk.religion.misc"])
    extracted = SubjectBodyExtractor().fit_transform(newsgroups.data)
    first_column = extracted[:300, :1]

    vectorizer = TfidfVectorizer(min_df=30)
    vectorizer.fit(first_column.ravel())

    separators = [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
    onnx_model = convert_sklearn(
        vectorizer,
        "tfidf",
        initial_types=[("input", StringTensorType([1, 1]))],
        options={TfidfVectorizer: {"sep": separators}},
    )
    dump_data_and_model(
        first_column[:5],
        vectorizer,
        onnx_model,
        basename="SklearnDocumentationTfIdf-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.4.0')",
    )
def test_one_hot_encoder_one_string_one_int_cat(self):
    """Convert a ``OneHotEncoder`` that ignores unknown categories.

    The constructor keyword is chosen by inspecting the signature of
    ``OneHotEncoder``: ``categorical_features='all'`` when that parameter
    exists, otherwise ``categories='auto'``.  The test row passed to
    ``dump_data_and_model`` holds values absent from the training data.
    """
    training_rows = [['Male', 1], ['Female', 3], ['Female', 2]]
    probe_rows = [['Unknown', 4]]

    # Pick whichever keyword the installed scikit-learn still accepts.
    parameters = inspect.signature(OneHotEncoder).parameters
    if "categorical_features" in parameters:
        encoder_kwargs = {"categorical_features": 'all'}
    elif "categories" in parameters:
        encoder_kwargs = {"categories": 'auto'}
    else:
        raise AssertionError("scikit-learn's API has changed.")
    encoder = OneHotEncoder(handle_unknown='ignore', **encoder_kwargs)
    encoder.fit(training_rows)

    input_types = [("input1", StringTensorType([None, 1])),
                   ("input2", Int64TensorType([None, 1]))]
    onnx_model = convert_sklearn(
        encoder, "one-hot encoder one string and int categories",
        input_types)
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        probe_rows,
        encoder,
        onnx_model,
        basename="SklearnOneHotEncoderOneStringOneIntCat",
    )
def test_model_tfidf_vectorizer11_nolowercase(self):
    """Convert a case-sensitive ``TfidfVectorizer`` and check its width.

    The vectorizer is built with ``lowercase=False``, converted with
    ``self.get_options()`` and passed to ``dump_data_and_model``.  The
    model is then run through ``InferenceSession`` and its first output
    must have shape ``(4, 11)``.
    """
    corpus = numpy.array([
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]).reshape((4, 1))
    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None,
                           lowercase=False)
    vect.fit(corpus.ravel())
    model_onnx = convert_sklearn(vect, "TfidfVectorizer",
                                 [("input", StringTensorType())],
                                 options=self.get_options(),
                                 target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        corpus,
        vect,
        model_onnx,
        basename="SklearnTfidfVectorizer11NoL-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.4.0')",
    )

    sess = InferenceSession(model_onnx.SerializeToString())
    res = sess.run(None, {'input': corpus.ravel()})[0]
    # A unittest assertion is not stripped under ``python -O`` and
    # reports both shapes on failure.
    self.assertEqual(res.shape, (4, 11))
def test_model_tfidf_vectorizer11_empty_string_case1(self):
    """Convert a ``TfidfVectorizer`` and run it on a blank document.

    The vectorizer is fitted on the first three sentences only; the last
    two rows, including the single-space document, are passed to
    ``dump_data_and_model``.
    """
    documents = numpy.array([
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        ' ',
    ]).reshape((4, 1))
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vectorizer.fit(documents[:3].ravel())

    onnx_model = convert_sklearn(
        vectorizer, 'TfidfVectorizer',
        [('input', StringTensorType([1]))],
        options=self.get_options(),
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(onnx_model)

    # TfidfVectorizer in onnxruntime fails with empty strings,
    # which was fixed in version 0.3.0 afterward
    dump_data_and_model(
        documents[2:], vectorizer, onnx_model,
        basename="SklearnTfidfVectorizer11EmptyStringSepCase1-"
                 "OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.4.0')")
def test_model_tfidf_vectorizer11_opset(self):
    """Convert a ``TfidfVectorizer`` for every opset from 8 to the target.

    An opset for which ``convert_sklearn`` raises a ``RuntimeError``
    whose message contains "only works for opset" is skipped; any other
    ``RuntimeError`` is re-raised.  Models for opset 10 and above are
    also passed to ``dump_data_and_model``.
    """
    corpus = numpy.array([
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]).reshape((4, 1))
    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vect.fit(corpus.ravel())
    for opset in range(8, TARGET_OPSET + 1):
        try:
            model_onnx = convert_sklearn(
                vect, 'TfidfVectorizer',
                [('input', StringTensorType([1]))],
                options=self.get_options(), target_opset=opset)
        except RuntimeError as e:
            if "only works for opset" in str(e):
                continue
            # Bare ``raise`` is the idiomatic re-raise of the active
            # exception.
            raise
        self.assertTrue(model_onnx is not None)
        if opset >= 10:
            name = "SklearnTfidfVectorizer11Rx%d-OneOff-SklCol" % opset
            dump_data_and_model(
                corpus, vect, model_onnx, basename=name,
                allow_failure="StrictVersion(onnxruntime.__version__) <= "
                              "StrictVersion('0.4.0')")
def test_pipeline_tfidf(self):
    """Convert a ``TfidfVectorizer`` fitted on one extracted text column.

    When the dataset download raises ``urllib.error.URLError`` the test
    emits a warning and returns without checking anything.  Otherwise the
    vectorizer is fitted on the first column of the first 300 extracted
    rows, converted with regex-style separators for ``TARGET_OPSET`` and
    passed to ``dump_data_and_model`` with the first five rows.
    """
    try:
        newsgroups = fetch_20newsgroups(
            random_state=1, subset="test",
            categories=["alt.atheism", "talk.religion.misc"])
    except urllib.error.URLError:
        warnings.warn("Unit test may fail due to connectivity issue.")
        return

    extracted = SubjectBodyExtractor().fit_transform(newsgroups.data)
    first_column = extracted[:300, :1]
    vectorizer = TfidfVectorizer(min_df=30)
    vectorizer.fit(first_column.ravel())

    separators = [
        " ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
    ]
    onnx_model = convert_sklearn(
        vectorizer,
        "tfidf",
        initial_types=[("input", StringTensorType([1]))],
        options={TfidfVectorizer: {"separators": separators}},
        target_opset=TARGET_OPSET
    )
    dump_data_and_model(
        first_column[:5],
        vectorizer,
        onnx_model,
        basename="SklearnDocumentationTfIdf-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.4.0')",
    )
def test_pipeline_tfidf_svc(self):
    """Convert a TF-IDF + SVC pipeline with 2-D and 1-D string inputs.

    The pipeline is converted twice (``zipmap`` disabled): once with the
    input declared as ``[None, 1]`` and once as ``[None]``.  In both
    cases the labels and probabilities returned by ``InferenceSession``
    must match the scikit-learn predictions.
    """
    pipe = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf_svc', SVC(probability=True, kernel='linear')),
    ])
    data = numpy.array([
        "first sentance", "second sentence", "many sentances",
        "dummy sentance", "no sentance at all"
    ])
    y = numpy.array([0, 0, 1, 0, 1])
    pipe.fit(data, y)
    expected_label = pipe.predict(data)
    expected_proba = pipe.predict_proba(data)

    frame = pandas.DataFrame(data)
    frame.columns = ['text']
    no_zipmap = {id(pipe): {'zipmap': False}}

    # first conversion if shape=[None, 1]
    onnx_2d = convert_sklearn(
        pipe,
        initial_types=[('text', StringTensorType([None, 1]))],
        target_opset=TARGET_OPSET,
        options=no_zipmap)
    session = InferenceSession(onnx_2d.SerializeToString())
    outputs = session.run(None, {'text': data.reshape((-1, 1))})
    assert_almost_equal(expected_proba, outputs[1])
    assert_almost_equal(expected_label, outputs[0])
    # session.run(None, {'text': frame}) --> failures
    # session.run(None, {'text': frame["text"]}) --> failures

    # second conversion with shape=[None]
    onnx_1d = convert_sklearn(
        pipe,
        initial_types=[('text', StringTensorType([None]))],
        target_opset=TARGET_OPSET,
        options=no_zipmap)
    session = InferenceSession(onnx_1d.SerializeToString())
    outputs = session.run(None, {'text': data})
    assert_almost_equal(expected_proba, outputs[1])
    assert_almost_equal(expected_label, outputs[0])
    # session.run(None, {'text': frame}) failure
    # session.run(None, {'text': frame["text"]}) failure
    session.run(None, {'text': frame["text"].values})  # success
def test_model_dict_vectorizer_issue(self):
    """Expect ``RuntimeError`` when converting an int-to-string ``DictVectorizer``.

    The model is fitted on dictionaries mapping integer keys to string
    values and converted with a ``DictionaryType`` of int64 keys and
    string values.
    """
    samples = [{1: 'A', 2: 'B'}, {1: 'C', 3: 'D'}, {1: 'C', 3: 'A'}]
    vectorizer = DictVectorizer(sparse=False).fit(samples)
    with self.assertRaises(RuntimeError):
        # Built inside the context so every call stays under the check.
        dict_type = DictionaryType(Int64TensorType([1]),
                                   StringTensorType([1]))
        convert_sklearn(vectorizer, 'dv', [("input", dict_type)])
def _problem_for_tfidf_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the
    :epkg:`sklearn:feature_extraction:text:TfidfVectorizer`.

    The first element of every ``text_alpha_num`` entry becomes a row of
    *X*, the second one a value of *y* cast to *dtype*.  *n_features* is
    accepted but not used.  The returned tuple is
    ``(X, y, initial_types, 'transform', 0, X)``.
    """
    texts = [entry[0] for entry in text_alpha_num]
    targets = [entry[1] for entry in text_alpha_num]
    X = numpy.array(texts)
    y = numpy.array(targets, dtype=dtype)
    initial_types = [("X", StringTensorType([None]))]
    return (X, y, initial_types, 'transform', 0, X)