Example #1
from pyspark.ml import PipelineModel


def score_data(abt_to_score, modelPath):
    
    '''
    Score a DataFrame with a previously serialized MLeap pipeline.
    :param abt_to_score: A PySpark DataFrame to score
    :param modelPath: Path to the model's .zip MLeap bundle
    :return: scoredData, the input DataFrame with prediction columns appended
    '''
    print('Scoring process starts...')
    
    deserializedPipeline = PipelineModel.deserializeFromBundle("jar:file:{}".format(modelPath))
    scoredData = deserializedPipeline.transform(abt_to_score)
    return scoredData  
Example #2
# MAGIC %md Serialize to bundle and deserialize

# COMMAND ----------

import mleap.pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer

model.serializeToBundle(
    "jar:file:/dbfs/mnt/nycitibike/spark-model/lr-spark-model.zip",
    sparkTransformed)

# COMMAND ----------

from pyspark.ml import PipelineModel

deserializedPipeline = PipelineModel.deserializeFromBundle(
    "jar:file:/dbfs/mnt/nycitibike/spark-model/lr-spark-model.zip")

# COMMAND ----------

test_df = testData.limit(10)

# COMMAND ----------

exampleResults = deserializedPipeline.transform(test_df)
display(exampleResults)

# COMMAND ----------

# MAGIC %md Register model
# MAGIC
# MAGIC https://www.mlflow.org/docs/latest/model-registry.html#registering-a-model
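
The snippet ends at the "Register model" cell before the registration call itself. A minimal sketch of that step follows, assuming the fitted `model` from the cells above; the artifact path and registry name below are illustrative placeholders, not values from the original notebook.

# COMMAND ----------

import mlflow
import mlflow.spark

# Log the fitted Spark pipeline to MLflow and register it in the Model Registry
# in a single call. "nycitibike-lr-model" is a hypothetical name.
with mlflow.start_run():
    mlflow.spark.log_model(
        model,
        artifact_path="spark-model",
        registered_model_name="nycitibike-lr-model")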
Example #3
# MAGIC rm -rf /tmp/mleap_python_model_export
# MAGIC mkdir /tmp/mleap_python_model_export
# MAGIC rm -rf /FileStore/future.zip


from pyspark.ml import PipelineModel

# Save the model for future use.
model.save("/home/sriram/ma_future")

# This saves the pipeline using MLeap; it is not used below, but demonstrates the call.
model.serializeToBundle("jar:file:/home/sriram/future4.zip", transformed)
#dbutils.fs.cp("file:/tmp/mleap_python_model_export/future.zip", "file:/home/sriram/future.zip")
#display(dbutils.fs.ls("dbfs:/FileStore"))


# Deserialize the model from the saved MLeap bundle.
deserializedPipeline = PipelineModel.deserializeFromBundle("jar:file:/home/sriram/future4.zip")
new_model = PipelineModel.load("/home/sriram/ma_future")


# Time to test the model: replace the comment text below with your own and check the prediction.
d = [{"Comment":"this sucks"}]
df2 = spark.createDataFrame(d)
df2.show()
new_predictions = new_model.transform(df2)
predictions = deserializedPipeline.transform(df2)
abc = new_predictions.select("prediction")

# get the prediction.
print(abc.collect()[0].prediction)
Example #4
from pyspark.ml import PipelineModel


def load_model(run, artifact_path):
    bundle_uri = f"{run.info.artifact_uri}/{artifact_path}"
    print("bundle_uri:", bundle_uri)
    return PipelineModel.deserializeFromBundle(bundle_uri)
Example #5
from pyspark.ml import PipelineModel


def load_model_as_spark_bundle(run, artifact_path):
    # MLeap expects a local file URI, so map the "dbfs:" scheme to the /dbfs FUSE mount.
    bundle_uri = f"file:{run.info.artifact_uri}/{artifact_path}"
    bundle_uri = bundle_uri.replace("dbfs:", "/dbfs")
    print("bundle_uri:", bundle_uri)
    return PipelineModel.deserializeFromBundle(bundle_uri)
Example #6
import os

from pyspark.ml import PipelineModel
from mleap.pyspark.spark_support import SimpleSparkSerializer

# serialize the model to a local zip file in JSON format
model_name_export = "adult_census_pipeline.zip"
model_name_path = os.getcwd()
model_file = os.path.join(model_name_path, model_name_export)

# remove an old model file, if needed.
if os.path.isfile(model_file):
    os.remove(model_file)

model_file_path = "jar:file:{}".format(model_file)
model.serializeToBundle(model_file_path, model.transform(train))

## deserialize the MLeap model and sanity-check it
model_deserialized = PipelineModel.deserializeFromBundle(model_file_path)
assert str(model_deserialized) == str(model)

print("The deserialized model is ", model_deserialized)
print("The deserialized model stages are", model_deserialized.stages)

##############################################################################
## export the final model with mleap

## remove the stringIndexer for the label column so it won't be required for prediction
model_final = model.copy()

si_label_index = -3
model_final.stages.pop(si_label_index)  #si_label

## append an IndexToString transformer to the model pipeline to get the original labels
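
The excerpt stops at the comment above. A minimal sketch of that final step, assuming the fitted StringIndexerModel for the label still sits at `si_label_index` in the original (untrimmed) `model`, and using "prediction"/"predictedLabel" as illustrative column names:

from pyspark.ml.feature import IndexToString

# Grab the fitted StringIndexerModel for the label from the original pipeline so its
# learned labels can map numeric predictions back to the original string values.
si_label_model = model.stages[si_label_index]

label_converter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=si_label_model.labels)

# Append the converter to the trimmed pipeline before exporting it with MLeap.
model_final.stages.append(label_converter)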
Example #7
# COMMAND ----------

import mlflow

client = mlflow.tracking.MlflowClient()
run = client.get_run(run_id)
run.info.artifact_uri

# COMMAND ----------

mleap_path = "{}/mleap-model/mleap/model".format(run.info.artifact_uri)
mleap_path = mleap_path.replace("dbfs:", "/dbfs")
bundle_path = "file:" + mleap_path
bundle_path

# COMMAND ----------

from pyspark.ml import PipelineModel
from mleap.pyspark.spark_support import SimpleSparkSerializer

model = PipelineModel.deserializeFromBundle(bundle_path)
predictions = model.transform(data)
display(predictions.select(colPrediction, colLabel, colFeatures))

# COMMAND ----------

# MAGIC %md #### Return

# COMMAND ----------

dbutils.notebook.exit(run_id)
Example #8
from pyspark.ml import PipelineModel


def read_model_as_spark_bundle(bundle_uri):
    return PipelineModel.deserializeFromBundle(bundle_uri)
Example #9
    def test_profile_sparkml_pipeline(self):
        import inspect
        import os
        import numpy
        import pandas
        import time
        import pathlib
        import mleap.pyspark
        from mleap.pyspark.spark_support import SimpleSparkSerializer
        from pyspark.ml import PipelineModel

        # add additional jar files before creating SparkSession
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv') \
            .options(header='true', inferschema='true').load(input_path)
        training_data, test_data = full_data.randomSplit([0.9, 0.1], seed=1)

        label = "income"
        dtypes = dict(training_data.dtypes)
        dtypes.pop(label)

        si_xvars = []
        ohe_xvars = []
        feature_cols = []
        for idx, key in enumerate(dtypes):
            if dtypes[key] == "string":
                feature_col = "-".join([key, "encoded"])
                feature_cols.append(feature_col)

                tmp_col = "-".join([key, "tmp"])
                si_xvars.append(StringIndexer(inputCol=key, outputCol=tmp_col, handleInvalid="skip"))
                ohe_xvars.append(OneHotEncoderEstimator(inputCols=[tmp_col], outputCols=[feature_col], dropLast=False))
            else:
                feature_cols.append(key)
        si_label = StringIndexer(inputCol=label, outputCol='label')
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        lr = LogisticRegression(regParam=0.001)
        pipeline = Pipeline(stages=si_xvars + ohe_xvars + [si_label, assembler, lr])

        # filter out the records which will cause error
        # use only one record for prediction
        test_data = test_data.limit(1)
        # create Spark and Onnx models
        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline', buildInitialTypesSimple(test_data))
        # save Onnx model for runtime usage
        if model_onnx is None: raise AssertionError("Failed to create the onnx model")
        model_path = os.path.join("tests", "profile_pipeline_model.onnx")
        with open(model_path, "wb") as f:
            f.write(model_onnx.SerializeToString())

        # Create MLeap model
        model_zip_path = os.path.join(this_script_dir, "tests", "mleap-pipeline.zip")
        if os.path.exists(model_zip_path):
            os.remove(model_zip_path)
        model_zip_url = "jar:" + pathlib.Path(model_zip_path).as_uri()
        # save the pipeline also in MLeap format
        empty_df = self.spark.createDataFrame([], model.transform(test_data).schema)
        model.serializeToBundle(model_zip_url, empty_df)
        mleap_pipeline = PipelineModel.deserializeFromBundle(model_zip_url)

        spark_times = []
        mleap_times = []
        runtime_times = []
        for i in range(0, 20):
            data_np = buildInputDictSimple(test_data)
            # run the model in Spark
            start = time.time()
            spark_prediction = model.transform(test_data)
            end = time.time()
            spark_times.append(1000 * (end - start))

            # run with MLeap
            start = time.time()
            mleap_prediction = mleap_pipeline.transform(test_data)
            end = time.time()
            mleap_times.append(1000 * (end - start))

            if i == 0:  # compare only once
                _compare_mleap_pyspark(mleap_prediction, spark_prediction)

            # run the model in onnx runtime
            start = time.time()
            output, session = run_with_runtime(data_np, model_path)
            end = time.time()
            runtime_times.append(1000 * (end - start))

            # compare results
            if i == 0:  # compare only once
                expected = [
                    spark_prediction.toPandas().label.values.astype(numpy.float32),
                    spark_prediction.toPandas().prediction.values.astype(numpy.float32),
                    spark_prediction.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(
                        numpy.float32)
                ]
                _compare_expected(expected, output, session, model_path, decimal=5, onnx_shape=None)

        gen_plot(spark_times, mleap_times, runtime_times)
Example #10
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel

# refer to this post for more details: http://stackoverflow.com/questions/38669206/spark-2-0-relative-path-in-absolute-uri-spark-warehouse
# optional, per the post above:
#     .config('spark.sql.warehouse.dir', 'file:///random/path/as/we/need/to/config/this/but/dont/use/it')
spark = SparkSession \
    .builder \
    .appName("MNIST Classifier") \
    .config('spark.executor.instances', 10) \
    .getOrCreate()
    
fileNameTest = './mnist_test.csv'


testData = spark.read.csv(fileNameTest, header=True, inferSchema=True)

deserializedPipeline = PipelineModel.deserializeFromBundle("jar:file:/tmp/pipeline-mnist-classifier-json.zip")

result = deserializedPipeline.transform(testData)
print("Result: " + str(result))
#testprediction = bestModel.transform(testData)
#evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="labelIndex", metricName="f1")
#print("Precision: " + str(evaluator.evaluate(testprediction)))