Example #1
import os
from unittest import mock

import numpy as np
import pandas as pd
from pyspark.sql.types import (ArrayType, DoubleType, FloatType, IntegerType,
                               LongType, StringType)

import mlflow
import tests  # the repo's test package, shipped via code_path below
from mlflow.pyfunc import spark_udf


# `spark` and `model_path` are pytest fixtures supplied by the test suite.
def test_spark_udf(spark, model_path):
    mlflow.pyfunc.save_model(
        path=model_path,
        loader_module=__name__,
        code_path=[os.path.dirname(tests.__file__)],
    )

    with mock.patch("mlflow.pyfunc._warn_dependency_requirement_mismatches"
                    ) as mock_check_fn:
        reloaded_pyfunc_model = mlflow.pyfunc.load_pyfunc(model_path)
        mock_check_fn.assert_called_once()

    pandas_df = pd.DataFrame(data=np.ones((10, 10)),
                             columns=[str(i) for i in range(10)])
    spark_df = spark.createDataFrame(pandas_df)

    # Test all supported return types
    type_map = {
        "float": (FloatType(), np.number),
        "int": (IntegerType(), np.int32),
        "double": (DoubleType(), np.number),
        "long": (LongType(), np.int),
        "string": (StringType(), None),
    }

    for tname, tdef in type_map.items():
        spark_type, np_type = tdef
        prediction_df = reloaded_pyfunc_model.predict(pandas_df)
        for is_array in [True, False]:
            t = ArrayType(spark_type) if is_array else spark_type
            if tname == "string":
                expected = prediction_df.applymap(str)
            else:
                expected = prediction_df.select_dtypes(np_type)
                if tname == "float":
                    expected = expected.astype(np.float32)

            expected = [
                list(row[1]) if is_array else row[1][0]
                for row in expected.iterrows()
            ]
            pyfunc_udf = spark_udf(spark, model_path, result_type=t)
            new_df = spark_df.withColumn("prediction",
                                         pyfunc_udf(*pandas_df.columns))
            actual = list(new_df.select("prediction").toPandas()["prediction"])
            assert expected == actual
            if not is_array:
                pyfunc_udf = spark_udf(spark, model_path, result_type=tname)
                new_df = spark_df.withColumn("prediction",
                                             pyfunc_udf(*pandas_df.columns))
                actual = list(
                    new_df.select("prediction").toPandas()["prediction"])
                assert expected == actual
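Because `save_model` is called with `loader_module=__name__`, MLflow imports this same test module at load time and invokes its module-level `_load_pyfunc(path)`. A minimal sketch of such a loader, assuming an echo-style model (the suite's real fixture may differ):

import pandas as pd

class _EchoModel:
    def predict(self, model_input):
        # Return the input frame unchanged; the test above only exercises
        # result-type casting, not model logic.
        return pd.DataFrame(model_input)

def _load_pyfunc(path):
    return _EchoModel()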
Example #2
    def _model_udf(self) -> Any:
        from mlflow import pyfunc

        spark = default_session()
        return pyfunc.spark_udf(spark,
                                model_uri=self._model_uri,
                                result_type=self._return_type)
import argparse
import json

import mlflow
from mlflow import pyfunc
from pyspark.sql.functions import col

# `spark` is assumed to be the ambient SparkSession (e.g. the Databricks
# global), and `download_wine_file` is a project helper defined elsewhere.


def main():
    parser = argparse.ArgumentParser(description="Deploy and test batch model")
    parser.add_argument("-m", "--model_name", help="Model name", required=True)
    parser.add_argument("-r", "--root_path", help="Prefix path", required=True)
    parser.add_argument("-s",
                        "--stage",
                        help="Stage",
                        default="staging",
                        required=True)
    parser.add_argument("-d",
                        "--db_name",
                        help="Output Database name",
                        default="wine",
                        required=False)
    parser.add_argument("-t",
                        "--table_name",
                        help="Output Table name",
                        default="mlops_wine_quality_regression",
                        required=False)
    # parser.add_argument("-p", "--phase", help="Phase", default="qa", required=True)

    args = parser.parse_args()
    model_name = args.model_name
    home = args.root_path
    stage = args.stage
    db = args.db_name.replace("@", "_").replace(".", "_")
    ml_output_predictions_table = args.table_name
    # phase = args.phase

    temp_data_path = "/dbfs/tmp/mlflow-wine-quality.csv"
    data_uri = "https://raw.githubusercontent.com/mlflow/mlflow/master/examples/sklearn_elasticnet_wine/wine-quality.csv"
    dbfs_wine_data_path = download_wine_file(data_uri, home, temp_data_path)
    wine_df = spark.read.format("csv").option(
        "header", "true").load(dbfs_wine_data_path).drop("quality").cache()
    wine_df = wine_df.select(
        *(col(column).cast("float").alias(column.replace(" ", "_"))
          for column in wine_df.columns))
    data_spark = wine_df

    # wine_data_path = dbfs_wine_data_path.replace("dbfs:", "/dbfs")

    client = mlflow.tracking.MlflowClient()
    latest_model = client.get_latest_versions(name=model_name, stages=[stage])
    print(f"Latest Model: {latest_model}")
    model_uri = "runs:/{}/model".format(latest_model[0].run_id)
    print(f"model_uri: {model_uri}")
    udf = pyfunc.spark_udf(spark, model_uri)

    # data_spark = spark.read.csv(dbfs_wine_data_path, header=True)
    predictions = data_spark.select(
        udf(*data_spark.columns).alias('prediction'), "*")

    spark.sql(f"CREATE DATABASE IF NOT EXISTS {db}")
    spark.sql(f"DROP TABLE IF EXISTS {db}.{ml_output_predictions_table}")
    predictions.write.format("delta").mode("overwrite").saveAsTable(
        f"{db}.{ml_output_predictions_table}")

    output = json.dumps({"model_name": model_name, "model_uri": model_uri})

    print(output)
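A quick verification step, not part of the original script, that reads the Delta table back and sanity-checks the prediction column (assuming the same `spark`, `db`, and `ml_output_predictions_table` are in scope):

preds = spark.table(f"{db}.{ml_output_predictions_table}")
preds.select("prediction").summary("min", "max").show()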
Example #4
    def test_spark_udf(self):
        pandas_df = pd.DataFrame(data=np.ones((10, 10)),
                                 columns=[str(i) for i in range(10)])
        spark_df = self.spark.createDataFrame(pandas_df)

        # Test all supported return types
        type_map = {
            "float": (FloatType(), np.number),
            "int": (IntegerType(), np.int32),
            "double": (DoubleType(), np.number),
            "long": (LongType(), np.int),
            "string": (StringType(), None)
        }

        for tname, tdef in type_map.items():
            spark_type, np_type = tdef
            prediction_df = ConstPyfunc.predict(pandas_df)
            for is_array in [True, False]:
                t = ArrayType(spark_type) if is_array else spark_type
                if tname == "string":
                    expected = prediction_df.applymap(str)
                else:
                    expected = prediction_df.select_dtypes(np_type)
                    if tname == "float":
                        expected = expected.astype(np.float32)

                expected = [
                    list(row[1]) if is_array else row[1][0]
                    for row in expected.iterrows()
                ]
                pyfunc_udf = spark_udf(self.spark,
                                       self._model_path,
                                       result_type=t)
                new_df = spark_df.withColumn("prediction",
                                             pyfunc_udf(*pandas_df.columns))
                actual = list(
                    new_df.select("prediction").toPandas()['prediction'])
                assert expected == actual
                if not is_array:
                    pyfunc_udf = spark_udf(self.spark,
                                           self._model_path,
                                           result_type=tname)
                    new_df = spark_df.withColumn(
                        "prediction", pyfunc_udf(*pandas_df.columns))
                    actual = list(
                        new_df.select("prediction").toPandas()['prediction'])
                    assert expected == actual
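`ConstPyfunc` above is the suite's stub model; its definition is not shown. A minimal sketch consistent with how the test uses it, with the echo behavior being an assumption:

import pandas as pd

class ConstPyfunc:
    @staticmethod
    def predict(df):
        # Return the numeric input unchanged; the test only checks how Spark
        # casts the result to each requested type.
        return pd.DataFrame(df)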
Example #5
import pyspark
from mlflow.pyfunc import spark_udf


# `get_spark_session` is a test helper that returns (or creates) a
# SparkSession for the given SparkConf.
def score_model_as_udf(model_uri, pandas_df, result_type="double"):
    spark = get_spark_session(pyspark.SparkConf())
    spark_df = spark.createDataFrame(pandas_df)
    pyfunc_udf = spark_udf(spark=spark,
                           model_uri=model_uri,
                           result_type=result_type)
    new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
    return [x["prediction"] for x in new_df.collect()]
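A hypothetical call, assuming `model_uri` points at a registered pyfunc model whose predict returns one double per row:

import pandas as pd

preds = score_model_as_udf(
    "models:/my_model/1",
    pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]}),
)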
Example #6
import pyspark
from mlflow.pyfunc import spark_udf


def score_model_as_udf(model_path, run_id, pandas_df, result_type="double"):
    spark = pyspark.sql.SparkSession.builder \
        .config(key="spark.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark.createDataFrame(pandas_df)
    pyfunc_udf = spark_udf(spark, model_path, run_id, result_type=result_type)
    new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
    return [x['prediction'] for x in new_df.collect()]
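Note the positional `run_id`: this is the legacy (pre-1.0) `spark_udf` signature, which took an artifact path and a run ID separately. Against current MLflow the same call folds both into a single model URI, roughly:

# Modern equivalent (MLflow >= 1.0), assuming the model was logged under
# `model_path` within run `run_id`:
pyfunc_udf = spark_udf(spark, f"runs:/{run_id}/{model_path}", result_type=result_type)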
Example #7
import numpy as np
import pandas as pd

import mlflow
from mlflow.pyfunc import spark_udf


# `spark`, `sklearn_model`, `model_path`, and `env_manager` are pytest
# fixtures supplied by the surrounding test suite.
def test_spark_udf_env_manager_predict_sklearn_model(spark, sklearn_model,
                                                     model_path, env_manager):
    model, inference_data = sklearn_model

    mlflow.sklearn.save_model(model, model_path)
    expected_pred_result = model.predict(inference_data)

    infer_data = pd.DataFrame(inference_data, columns=["a", "b"])
    infer_spark_df = spark.createDataFrame(infer_data)

    pyfunc_udf = spark_udf(spark, model_path, env_manager=env_manager)
    result = (infer_spark_df.select(pyfunc_udf(
        "a", "b").alias("predictions")).toPandas().predictions.to_numpy())

    np.testing.assert_allclose(result, expected_pred_result, rtol=1e-5)
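A minimal sketch of the `sklearn_model` fixture this test assumes; the name comes from the test signature, the implementation is illustrative:

import numpy as np
import pytest
from sklearn.linear_model import LinearRegression

@pytest.fixture
def sklearn_model():
    X = np.array([[1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0]])
    y = np.array([1.0, 2.0, 3.0, 4.0])
    # Return the fitted model together with data to run inference on.
    return LinearRegression().fit(X, y), X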
Example #8
    def test_spark_udf(self):
        pandas_df = self._pandas_df
        spark_df = self.spark.createDataFrame(pandas_df)
        pyfunc_udf = spark_udf(self.spark,
                               self._model_path,
                               result_type="integer")
        new_df = spark_df.withColumn("prediction",
                                     pyfunc_udf(*self._pandas_df.columns))
        spark_results = new_df.collect()

        # Compare against directly running the model.
        direct_model = load_pyfunc(self._model_path)
        pandas_results = direct_model.predict(pandas_df)
        self.assertEqual(178, len(pandas_results))
        self.assertEqual(178, len(spark_results))
        for i in range(0, len(pandas_results)):  # noqa
            self.assertEqual(self._predict[i], pandas_results[i])
            self.assertEqual(pandas_results[i], spark_results[i]['prediction'])
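The hard-coded 178 matches the row count of scikit-learn's wine dataset, which this suite presumably trains on. A hypothetical setup consistent with that expectation:

import pandas as pd
from sklearn.datasets import load_wine

wine = load_wine()
pandas_df = pd.DataFrame(wine.data, columns=wine.feature_names)  # 178 rows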
Example #9
File: mlflow.py  Project: zimaxeg/koalas
    def _model_udf(self):
        spark = default_session()
        return pyfunc.spark_udf(spark,
                                model_uri=self._model_uri,
                                result_type=self._return_type)
Example #10
import pyspark

from pyspark.sql.types import StringType

from mlflow.pyfunc import spark_udf

if __name__ == '__main__':

    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    spark_df = spark.createDataFrame([(4, "spark i j k"), (5, "l m n"),
                                      (6, "spark hadoop spark"),
                                      (7, "apache hadoop")],
                                     [str(1), str(2)])
    # Legacy (pre-1.0) spark_udf signature: artifact path ("spark-model") and
    # run ID as positional arguments; newer MLflow takes a single model URI.
    pyfunc_udf = spark_udf(spark,
                           "spark-model",
                           "f2ccde5b33ce456d973ce9f91de8cadf",
                           result_type=StringType())
    new_df = spark_df.withColumn("prediction", pyfunc_udf(str(1), str(2)))
    new_df.show()
Example #11
import pyspark
import os

from pyspark.sql.types import DoubleType
from sklearn.model_selection import train_test_split

from mlflow.pyfunc import spark_udf
import pandas as pd

if __name__ == '__main__':

    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    # Read the wine-quality csv file (make sure you're running this from the root of MLflow!)
    wine_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "wine-quality.csv")
    data = pd.read_csv(wine_path)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    test_y = test[["quality"]]

    pdf = pd.DataFrame(test_y)
    spark_df = spark.createDataFrame(pdf)
    # Legacy (pre-1.0) signature: artifact path plus run ID; newer MLflow
    # expects a single model URI instead.
    pyfunc_udf = spark_udf(spark,
                           "model",
                           "3774808880c14057abcc89106caa70f9",
                           result_type=DoubleType())
    new_df = spark_df.withColumn("prediction", pyfunc_udf("quality"))
    new_df.show()
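Note that the UDF above is applied to the label column "quality" itself, which only makes sense if the logged model was trained on that single column. A hypothetical variant that scores on the feature columns instead, assuming the model expects all non-"quality" features:

feature_cols = [c for c in data.columns if c != "quality"]
spark_features = spark.createDataFrame(test[feature_cols])
predictions = spark_features.withColumn("prediction", pyfunc_udf(*feature_cols))
predictions.show()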