def test_SparkMLGraph(self):
    """Export a SparkML model three ways and check each file loads back.

    The model returned by ``get_spark_model_and_prototype()`` is saved with
    ``save_sparkml`` using, in turn, a prototype, an explicit shape/dtype
    pair, and a pre-built ``initial_types`` list. Every export must create a
    file on disk that ``load_model`` accepts.

    Raises:
        RuntimeError: if the prototype's dtype is not ``np.float32``; the
            shape/dtype and initial_types variants are only set up for it.
    """
    spark_model, prototype = get_spark_model_and_prototype()

    def check_roundtrip(**save_kwargs):
        # Timestamp-based file name in the current working directory.
        path = f'{time.time()}.onnx'
        try:
            save_sparkml(spark_model, path, **save_kwargs)
            # Assert before loading so a missing file is reported as such
            # rather than as a loader error.
            assert os.path.exists(path)
            load_model(path)
        finally:
            # Clean up even when a step above fails, so a broken run does
            # not leave stray .onnx files behind.
            if os.path.exists(path):
                os.remove(path)

    # saving with prototype
    check_roundtrip(prototype=prototype)

    # saving with shape and dtype
    shape = prototype.shape
    if prototype.dtype == np.float32:
        dtype = prototype.dtype
    else:
        raise RuntimeError(
            "Test is not configured to run with another type")
    check_roundtrip(shape=shape, dtype=dtype)

    # saving with initial_types
    initial_types = utils.guess_onnx_tensortype(shape=shape, dtype=dtype)
    check_roundtrip(initial_types=[initial_types])
# Train a VectorIndexer + DecisionTreeRegressor pipeline on a narrow slice of
# the libsvm sample data and export the fitted pipeline to 'spark.onnx'.

# Run PySpark from the installed pyspark package with the current interpreter.
executable = sys.executable
os.environ["SPARK_HOME"] = pyspark.__path__[0]
os.environ["PYSPARK_PYTHON"] = executable
os.environ["PYSPARK_DRIVER_PYTHON"] = executable

spark = SparkSession.builder.appName("redisai_trial").getOrCreate()
original_data = spark.read.format("libsvm").load("sample_libsvm_data.txt")

# Only `feature_count` consecutive values, starting at `feature_offset`, are
# kept from each original feature vector. The slice is derived from both
# names so the vector size and the number of values cannot drift apart.
feature_count = 5
feature_offset = 125
spark.udf.register(
    "truncateFeatures",
    lambda x: SparseVector(
        feature_count,
        range(feature_count),
        x.toArray()[feature_offset:feature_offset + feature_count],
    ),
    VectorUDT(),
)
data = original_data.selectExpr(
    "label", "truncateFeatures(features) as features")

feature_indexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
    handleInvalid='error',
)
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")
pipeline = Pipeline(stages=[feature_indexer, dt])

# The pipeline is fit on the full dataset; no train/test split is made.
model = pipeline.fit(data)

# Describe the model input as a float32 tensor of shape (1, feature_count)
# named 'features', then write the ONNX file.
featurestype = utils.guess_onnx_tensortype(
    node_name='features', dtype='float32', shape=(1, feature_count))
save_sparkml(
    model, 'spark.onnx', initial_types=[featurestype], spark_session=spark)
# Fit a one-vs-rest logistic regression on the multiclass libsvm sample and
# export the fitted model to 'spark.onnx' through ml2rt.
import os
import sys

import pyspark
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.sql import SparkSession

from ml2rt import save_sparkml
from ml2rt import utils

# Run PySpark from the installed pyspark package with the current interpreter.
executable = sys.executable
os.environ.update({
    "SPARK_HOME": pyspark.__path__[0],
    "PYSPARK_PYTHON": executable,
    "PYSPARK_DRIVER_PYTHON": executable,
})

spark = (
    SparkSession.builder
    .appName("redisai_trial")
    .getOrCreate()
)
data = spark.read.format("libsvm").load('multiclass_classification_data.txt')

# Logistic regression is the base classifier wrapped by OneVsRest.
lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
ovr = OneVsRest(classifier=lr)
model = ovr.fit(data)

# The input width is read from the size of the second column of the first row.
first_row = data.first()
feature_count = first_row[1].size

# Describe the model input as a float32 tensor of shape (1, feature_count)
# named 'features', then write the ONNX file.
tensor_types = utils.guess_onnx_tensortype(
    node_name='features',
    dtype='float32',
    shape=(1, feature_count),
)
save_sparkml(model, 'spark.onnx', initial_types=[tensor_types])
# Fit a single-feature linear regression on a tiny in-memory dataset and
# export the fitted model to 'linear_regression.onnx' through ml2rt.
import os
import sys

import pyspark
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

from ml2rt import save_sparkml

# Run PySpark from the installed pyspark package with the current interpreter.
executable = sys.executable
os.environ.update({
    "SPARK_HOME": pyspark.__path__[0],
    "PYSPARK_PYTHON": executable,
    "PYSPARK_DRIVER_PYTHON": executable,
})

spark = (
    SparkSession.builder
    .appName("redisai_trial")
    .getOrCreate()
)

# label is input + 1
data = spark.createDataFrame(
    [(x + 1.0, Vectors.dense(x)) for x in (1.0, 2.0, 3.0, 4.0, 5.0, 6.0)],
    ["label", "features"],
)

lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal")
model = lr.fit(data)

# the name of the input is 'features'
C = model.numFeatures
save_sparkml(
    model,
    'linear_regression.onnx',
    shape=(1, C),
    dtype='float32',
)