def test_model_pipeline_2_stage(self):
    """Convert a two-stage-per-column Spark ML pipeline to ONNX and dump it.

    Each of the three categorical columns goes through a ``StringIndexer``
    followed by a ``OneHotEncoderEstimator``. The fitted pipeline is
    converted with ``convert_sparkml`` and the inputs, expected outputs and
    both models are handed to ``dump_data_and_sparkml_model``.
    """
    import inspect
    import os
    import numpy
    import pandas

    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(
        *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(
            StringIndexer(inputCol=col, outputCol=col + '_index',
                          handleInvalid='skip'))
        stages.append(
            OneHotEncoderEstimator(inputCols=[col + '_index'],
                                   outputCols=[col + '_vec']))
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)

    # One string input per column, named after the column itself.
    model_onnx = convert_sparkml(
        model, 'Sparkml Pipeline',
        [(col, StringTensorType([1, 1])) for col in cols])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    # Collect each Spark DataFrame once: every toPandas() call re-runs the
    # Spark job, and a single collection keeps all columns row-aligned.
    test_pd = test_data.toPandas()
    predicted_pd = predicted.toPandas()
    data_np = {col: test_pd[[col]].values for col in cols}
    predicted_np = [
        predicted_pd[col + '_vec'].apply(
            lambda x: pandas.Series(x.toArray())).values
        for col in cols
    ]
    expected = [
        numpy.asarray([expand_one_hot_vec(x) for x in row])
        for row in predicted_np
    ]
    dump_data_and_sparkml_model(data_np, expected, model, model_onnx,
                                basename="SparkmlPipeline_2Stage")
def test_model_pipeline_3_stage(self):
    """Convert an index / one-hot / assemble Spark ML pipeline to ONNX and dump it.

    Every categorical column is indexed and one-hot encoded, then all
    encoded vectors are combined by a ``VectorAssembler`` into a single
    ``features`` column. The fitted pipeline is converted with
    ``convert_sparkml`` and handed to ``dump_data_and_sparkml_model``.
    """
    import inspect
    import os
    import pandas

    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(
        *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(
            StringIndexer(inputCol=col, outputCol=col + '_index',
                          handleInvalid='skip'))
        # we need the dropLast option otherwise when assembled together
        # (below) we won't be able to expand the features without
        # difficulties
        stages.append(
            OneHotEncoderEstimator(inputCols=[col + '_index'],
                                   outputCols=[col + '_vec'],
                                   dropLast=False))
    stages.append(
        VectorAssembler(inputCols=[c + '_vec' for c in cols],
                        outputCol='features'))
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)

    # One string input per column, named after the column itself.
    model_onnx = convert_sparkml(
        model, 'Sparkml Pipeline',
        [(col, StringTensorType([1, 1])) for col in cols])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    # Collect the inputs once so all columns come from the same Spark job
    # and stay row-aligned.
    test_pd = test_data.toPandas()
    data_np = {col: test_pd[[col]].values for col in cols}
    predicted_np = predicted.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values
    dump_data_and_sparkml_model(data_np, predicted_np, model, model_onnx,
                                basename="SparkmlPipeline_3Stage")
def test_combine_inputs_with_string(self):
    """Convert a LabelEncoder declared with two string inputs.

    Checks that conversion succeeds and that the last node of the
    resulting graph has exactly one output.
    """
    from sklearn.preprocessing import LabelEncoder

    model = LabelEncoder()
    model.fit(['a', 'b', 'b', 'a', 'c'])
    model_onnx = convert_sklearn(model, 'pipeline',
                                 [('input1', StringTensorType([1, 1])),
                                  ('input2', StringTensorType([1, 4]))])
    # Check for None first: dereferencing .graph on a failed conversion
    # would raise AttributeError instead of reporting a clean failure.
    self.assertIsNotNone(model_onnx)
    self.assertEqual(len(model_onnx.graph.node[-1].output), 1)
def test_label_encoder_converter(self):
    """Check that a fitted LabelEncoder converts to a model with graph nodes."""
    labels = ['str3', 'str2', 'str0', 'str1', 'str3']
    encoder = LabelEncoder()
    encoder.fit(labels)

    input_types = [('input', StringTensorType([1, 1]))]
    model_onnx = convert_sklearn(encoder, 'scikit-learn label encoder',
                                 input_types)
    self.assertIsNotNone(model_onnx.graph.node)
def test_stop_words_remover(self):
    """Convert a StopWordsRemover and compare ONNX output with Spark's.

    A single-row DataFrame of tokens is run through the Spark transformer;
    the saved ONNX model is run on the same input and the two results are
    compared.
    """
    frame = self.spark.createDataFrame([(["a", "b", "c"], )], ["text"])
    remover = StopWordsRemover(inputCol="text", outputCol="words",
                               stopWords=["b"])

    # The declared input width is the number of DataFrame columns.
    n_features = len(frame.columns)
    input_types = [('text', StringTensorType([1, n_features]))]
    model_onnx = convert_sparkml(remover, 'Sparkml StopWordsRemover',
                                 input_types)
    self.assertIsNotNone(model_onnx)

    # run the model
    spark_output = remover.transform(frame)
    expected = spark_output.toPandas().words.values
    data_np = frame.toPandas().text.values
    saved_paths = save_data_models(data_np, expected, remover, model_onnx,
                                   basename="SparkmlStopWordsRemover")
    onnx_model_path = saved_paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_string_indexer(self):
    """Convert a fitted StringIndexer and compare ONNX output with Spark's."""
    rows = [("a", ), ("b", ), ("c", ), ("a", ), ("a", ), ("c", )]
    frame = self.spark.createDataFrame(rows, ['cat1'])
    fitted = StringIndexer(inputCol='cat1', outputCol='cat1_index',
                           handleInvalid='skip').fit(frame)

    # the input name should match that of what StringIndexer.inputCol
    input_types = [('cat1', StringTensorType([1, 1]))]
    model_onnx = convert_sparkml(fitted, 'Sparkml StringIndexer',
                                 input_types)
    self.assertIsNotNone(model_onnx)
    self.assertIsNotNone(model_onnx.graph.node)

    # run the model
    spark_output = fitted.transform(frame)
    expected = spark_output.select("cat1_index").toPandas().values
    data_np = frame.select('cat1').toPandas().values
    saved_paths = save_data_models(data_np, expected, fitted, model_onnx,
                                   basename="SparkmlStringIndexer")
    onnx_model_path = saved_paths[3]
    output, output_shapes = run_onnx_model(['cat1_index'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_random_forrest_regression(self):
    """Convert a RandomForestRegressor pipeline and compare ONNX with Spark.

    The libsvm sample data is reduced to five features per row (positions
    125-129 of the original vector), the label is cast to string and
    indexed, and a 10-tree random forest regressor is fitted. The ONNX
    model is then run on a single sampled row and its ``indexedLabel`` and
    ``prediction`` outputs are compared with Spark's.
    """
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    feature_count = 5
    self.spark.udf.register(
        "truncateFeatures",
        lambda x: SparseVector(feature_count, range(0, feature_count),
                               x.toArray()[125:130]),
        VectorUDT())
    data = original_data.selectExpr(
        "cast(label as string) as label",
        "truncateFeatures(features) as features")
    label_indexer = StringIndexer(inputCol="label",
                                  outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=10,
                                    handleInvalid='error')

    rf = RandomForestRegressor(labelCol="indexedLabel",
                               featuresCol="indexedFeatures",
                               numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml RandomForest Regressor',
        [('label', StringTensorType([1, 1])),
         ('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    # Build the one-row sample once and collect each DataFrame a single
    # time: repeated data.limit(1).toPandas() calls re-run the Spark job
    # and are not guaranteed to pick the same row for label and features.
    sample = data.limit(1)
    predicted = model.transform(sample)
    sample_pd = sample.toPandas()
    predicted_pd = predicted.toPandas()
    data_np = {
        'label': sample_pd.label.values,
        'features': sample_pd.features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(
                numpy.float32)
    }
    expected = [
        predicted_pd.indexedLabel.values.astype(numpy.int64),
        predicted_pd.prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_pipeline_4_stage(self):
    """Convert a full classification pipeline and compare ONNX with Spark.

    The categorical columns are indexed, one-hot encoded and assembled
    into ``features``; ``income`` is indexed into ``label``; a
    ``LogisticRegression`` is the final stage. The ONNX model's ``label``,
    ``prediction`` and ``probability`` outputs are compared with Spark's.
    """
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(
        'income', *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(
            StringIndexer(inputCol=col, outputCol=col + '_index',
                          handleInvalid='skip'))
        stages.append(
            OneHotEncoderEstimator(inputCols=[col + '_index'],
                                   outputCols=[col + '_vec'],
                                   dropLast=False))
    stages.append(
        VectorAssembler(inputCols=[c + '_vec' for c in cols],
                        outputCol='features'))
    stages.append(
        StringIndexer(inputCol='income', outputCol='label',
                      handleInvalid='skip'))
    stages.append(LogisticRegression(maxIter=100, tol=0.0001))
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)

    # Same order as the original schema: the label source column first,
    # then the categorical feature columns.
    input_names = ['income'] + cols
    model_onnx = convert_sparkml(
        model, 'Sparkml Pipeline',
        [(name, StringTensorType([1, 1])) for name in input_names])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    # Collect each Spark DataFrame once: every toPandas() call re-runs the
    # Spark job, and a single collection keeps all columns row-aligned.
    test_pd = test_data.toPandas()
    predicted_pd = predicted.toPandas()
    data_np = {name: test_pd[[name]].values for name in input_names}
    expected = [
        predicted_pd.label.values.astype(numpy.float32),
        predicted_pd.prediction.values.astype(numpy.float32),
        predicted_pd.probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(
                numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_4Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(
        ['label', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_one_hot_encoder_mixed_string_int(self):
    """Convert a OneHotEncoder fitted on rows mixing string and int columns."""
    # NOTE(review): the previous comment here warned that
    # `categorical_features` will be removed in scikit-learn 0.22 and that
    # this test would then fail; the encoder below is built with
    # `categories='auto'` only -- confirm whether that warning still applies.
    rows = [["0.4", "0.2", 3], ["1.4", "1.2", 0], ["0.2", "2.2", 1]]
    encoder = OneHotEncoder(categories='auto')
    encoder.fit(rows)

    # Two string columns followed by one int64 column.
    input_types = [
        ('input1', StringTensorType([1, 2])),
        ('input2', Int64TensorType([1, 1])),
    ]
    model_onnx = convert_sklearn(encoder,
                                 'one-hot encoder mixed-type inputs',
                                 input_types)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(rows, encoder, model_onnx,
                        basename="SklearnOneHotEncoderStringInt64",
                        allow_failure=True)
def test_model_label_encoder(self):
    """Convert a fitted LabelEncoder and dump it with its input data."""
    labels = ['str3', 'str2', 'str0', 'str1', 'str3']
    encoder = LabelEncoder()
    encoder.fit(labels)

    input_types = [('input', StringTensorType([1, 1]))]
    model_onnx = convert_sklearn(encoder, 'scikit-learn label encoder',
                                 input_types)
    self.assertIsNotNone(model_onnx)
    self.assertIsNotNone(model_onnx.graph.node)
    dump_data_and_model(numpy.array(labels), encoder, model_onnx,
                        basename="SklearnLabelEncoder")
def test_model_tfidf_vectorizer13(self):
    """Check that a TfidfVectorizer with ngram_range=(1, 3) converts."""
    documents = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    vectorizer = TfidfVectorizer(ngram_range=(1, 3))
    vectorizer.fit(documents)
    # The transform is executed but its result is not inspected.
    pred = vectorizer.transform(documents)

    input_types = [('input', StringTensorType([1, 1]))]
    model_onnx = convert_sklearn(vectorizer,
                                 'scikit-learn count vectorizer',
                                 input_types)
    self.assertIsNotNone(model_onnx)
def test_model_string_indexer(self):
    """Convert a fitted StringIndexer and dump it with Spark's predictions."""
    rows = [("a",), ("b",), ("c",), ("a",), ("a",), ("c",)]
    frame = self.spark.createDataFrame(rows, ['cat1'])
    fitted = StringIndexer(inputCol='cat1', outputCol='cat1_index',
                           handleInvalid='skip').fit(frame)

    # the input name should match that of what StringIndexer.inputCol
    input_types = [('cat1', StringTensorType([1, 1]))]
    model_onnx = convert_sparkml(fitted, 'Sparkml StringIndexer',
                                 input_types)
    self.assertIsNotNone(model_onnx)
    self.assertIsNotNone(model_onnx.graph.node)

    # run the model
    spark_output = fitted.transform(frame)
    predicted_np = spark_output.select("cat1_index").toPandas().values
    data_np = frame.select('cat1').toPandas().values
    dump_data_and_sparkml_model(data_np, predicted_np, fitted, model_onnx,
                                basename="SparkmlStringIndexer")
def getTensorTypeFromSpark(sparktype):
    """Map a Spark SQL type name to the ONNX tensor type used for conversion.

    :param sparktype: name of the Spark type, e.g. ``'StringType'``
    :return: ``StringTensorType([1, 1])`` for ``'StringType'``;
        ``FloatTensorType([1, 1])`` for the numeric and boolean type names
    :raises TypeError: if the type name has no ONNX mapping
    """
    numeric_types = (
        'DecimalType',
        'DoubleType',
        'FloatType',
        'LongType',
        'IntegerType',
        'ShortType',
        'ByteType',
        'BooleanType',
    )
    if sparktype == 'StringType':
        return StringTensorType([1, 1])
    if sparktype in numeric_types:
        return FloatTensorType([1, 1])
    # str() keeps the message intact for strings and avoids an unrelated
    # "can only concatenate str" error when a non-string value is passed.
    raise TypeError("Cannot map this type to Onnx types: " + str(sparktype))
def test_tokenizer(self):
    """Convert a Tokenizer and compare ONNX output with Spark's."""
    frame = self.spark.createDataFrame([("a b c",)], ["text"])
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    spark_output = tokenizer.transform(frame)

    input_types = [('text', StringTensorType([None]))]
    model_onnx = convert_sparkml(tokenizer, 'Sparkml Tokenizer',
                                 input_types)
    self.assertIsNotNone(model_onnx)

    # run the model
    expected = spark_output.toPandas().words.apply(pandas.Series).values
    data_np = frame.toPandas().text.values.reshape([-1])
    saved_paths = save_data_models(data_np, expected, tokenizer, model_onnx,
                                   basename="SparkmlTokenizer")
    onnx_model_path = saved_paths[-1]
    output, output_shapes = run_onnx_model(['words'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_dict_vectorizer(self):
    """Convert a fitted DictVectorizer and dump it with its input data."""
    records = [{'amy': 1., 'chin': 200.}, {'nice': 3., 'amy': 1.}]
    vectorizer = DictVectorizer()
    vectorizer.fit_transform(records)

    # A single dictionary input mapping string keys to float values.
    dict_type = DictionaryType(StringTensorType([1]), FloatTensorType([1]))
    model_onnx = convert_sklearn(vectorizer, 'dictionary vectorizer',
                                 [('input', dict_type)])
    self.assertIsNotNone(model_onnx)

    # Version condition passed to the dump helper as a string.
    failure_condition = (
        "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.3') or "
        "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')"
    )
    dump_data_and_model(records, vectorizer, model_onnx,
                        basename="SklearnDictVectorizer-OneOff-SkipDim1",
                        allow_failure=failure_condition)
def test_word2vec(self):
    """Convert a fitted Word2Vec model and compare ONNX output with Spark's.

    The model is fitted on three tokenised sentences; the comparison is
    done on the first row only.
    """
    sentences = [
        ("Hi I heard about Spark".split(" "), ),
        ("I wish Java could use case classes".split(" "), ),
        ("Logistic regression models are neat".split(" "), ),
    ]
    frame = self.spark.createDataFrame(sentences, ["text"])
    estimator = Word2Vec(vectorSize=3, minCount=0, inputCol="text",
                         outputCol="result")
    fitted = estimator.fit(frame)

    # Print the learned vectors and the transformed rows.
    fitted.getVectors().show(100, False)
    fitted.transform(frame).show(100, False)

    # the input name should match that of inputCol
    n_tokens = len(frame.first()[0])
    input_types = [('text', StringTensorType([None, n_tokens]))]
    model_onnx = convert_sparkml(fitted, 'Sparkml Word2Vec', input_types)
    self.assertIsNotNone(model_onnx)

    # run the model
    spark_output = fitted.transform(frame.limit(1))
    expected = spark_output.toPandas().result.apply(
        lambda vec: pandas.Series(vec.toArray())
    ).values.astype(numpy.float32)
    data_np = frame.limit(1).toPandas().text.values
    saved_paths = save_data_models(data_np, expected, fitted, model_onnx,
                                   basename="SparkmlWord2Vec")
    onnx_model_path = saved_paths[-1]
    # Reshape the first row's tokens into a (1, n) array for the ONNX run.
    data_np = numpy.array(data_np[0]).reshape((1, -1))
    output, output_shapes = run_onnx_model(['result'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def calculate_sparkml_stop_words_remover_output_shapes(operator):
    """Set the output type of a StopWordsRemover operator.

    After validating the operator's output count and input type, the first
    output becomes a ``StringTensorType`` carrying an independent copy of
    the first input's shape.

    :param operator: operator whose ``outputs[0].type`` is assigned
    """
    check_input_and_output_numbers(operator, output_count_range=1)
    check_input_and_output_types(operator,
                                 good_input_types=[StringTensorType])

    source_type = operator.inputs[0].type
    # Deep copy so the output type does not share the input's shape object.
    output_shape = copy.deepcopy(source_type.shape)
    operator.outputs[0].type = StringTensorType(output_shape)
def calculate_sparkml_tokenizer_output_shapes(operator):
    """Set the output type of a Tokenizer operator.

    After validating the operator's output count and input type, the first
    output becomes a ``StringTensorType`` constructed without an explicit
    shape.

    :param operator: operator whose ``outputs[0].type`` is assigned
    """
    check_input_and_output_numbers(operator, output_count_range=1)
    check_input_and_output_types(operator,
                                 good_input_types=[StringTensorType])

    first_output = operator.outputs[0]
    first_output.type = StringTensorType()