예제 #1
0
def test_log_model_with_code_paths(spark_model_iris):
    """Log a Spark model with ``code_paths`` and load it back.

    ``mlflow.spark._add_code_from_conf_to_system_path`` is patched for the
    duration of the run; the test passes when ``_compare_logged_code_paths``
    accepts the logged artifacts and the patched function has been called by
    the time the model is loaded.
    """
    artifact_path = "model"
    patch_target = "mlflow.spark._add_code_from_conf_to_system_path"
    with mlflow.start_run(), mock.patch(patch_target) as add_code_mock:
        sparkm.log_model(
            spark_model=spark_model_iris.model,
            artifact_path=artifact_path,
            code_paths=[__file__],
        )
        logged_model_uri = mlflow.get_artifact_uri(artifact_path)
        # This very file was passed as the only code path, so it is the
        # reference handed to the comparison helper.
        _compare_logged_code_paths(
            __file__, logged_model_uri, mlflow.spark.FLAVOR_NAME
        )
        sparkm.load_model(logged_model_uri)
        add_code_mock.assert_called()
예제 #2
0
def test_sparkml_estimator_model_log(tmpdir, spark_model_estimator,
                                     should_start_run, use_dfs_tmpdir):
    """Log an estimator-based Spark model, reload it, and compare predictions.

    :param tmpdir: pytest temporary directory; hosts the file-based tracking
                   store and, optionally, the explicit DFS temp directory.
    :param spark_model_estimator: fixture exposing ``model``, ``spark_df`` and
                                  the expected ``predictions``.
    :param should_start_run: when truthy, a run is started explicitly before
                             ``log_model``; otherwise the test relies on a run
                             being active after ``log_model`` returns.
    :param use_dfs_tmpdir: when truthy, ``None`` is passed as ``dfs_tmpdir``
                           (presumably selecting the library default; confirm
                           against ``mlflow.spark``); otherwise a directory
                           under ``tmpdir`` is passed.
    """
    previous_tracking_uri = mlflow.get_tracking_uri()
    dfs_tmpdir = None if use_dfs_tmpdir else tmpdir.join("test").strpath

    try:
        tracking_dir = os.path.abspath(str(tmpdir.join("mlruns")))
        mlflow.set_tracking_uri("file://%s" % tracking_dir)
        if should_start_run:
            mlflow.start_run()

        artifact_path = "model"
        sparkm.log_model(
            artifact_path=artifact_path,
            spark_model=spark_model_estimator.model,
            dfs_tmpdir=dfs_tmpdir,
        )
        active_run_id = mlflow.active_run().info.run_id
        model_uri = "runs:/{run_id}/{artifact_path}".format(
            run_id=active_run_id, artifact_path=artifact_path)

        # Round-trip: the reloaded model must score the fixture data exactly
        # like the original model did.
        reloaded = sparkm.load_model(model_uri=model_uri, dfs_tmpdir=dfs_tmpdir)
        scored_rows = (
            reloaded.transform(spark_model_estimator.spark_df)
            .select("prediction")
            .collect()
        )
        preds = [row.prediction for row in scored_rows]
        assert spark_model_estimator.predictions == preds
    finally:
        # Always close the run and restore the global tracking URI so later
        # tests are not affected.
        mlflow.end_run()
        mlflow.set_tracking_uri(previous_tracking_uri)
예제 #3
0
def test_sparkml_estimator_model_log(tmpdir, spark_model_estimator):
    """Log an estimator-based Spark model, reload it, and compare predictions.

    Covers all four combinations of ``should_start_run`` (``False``/``True``)
    and ``dfs_tmp_dir`` (``None`` or an explicit directory under ``tmpdir``).
    Each iteration logs under a distinct artifact path, reloads the model via
    a ``runs:/`` URI and checks the reloaded model's predictions against
    ``spark_model_estimator.predictions``.

    :param tmpdir: pytest temporary directory hosting the tracking store and
                   the explicit DFS temp directory.
    :param spark_model_estimator: fixture exposing ``model``, ``spark_df`` and
                                  the expected ``predictions``.
    """
    old_tracking_uri = mlflow.get_tracking_uri()
    # Resolved before any ``try`` so the cleanup code can always refer to it.
    tracking_dir = os.path.abspath(str(tmpdir.join("mlruns")))
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                # A fresh artifact path per iteration keeps the runs apart.
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(
                    artifact_path=artifact_path,
                    spark_model=spark_model_estimator.model,
                    dfs_tmpdir=dfs_tmp_dir,
                )
                model_uri = "runs:/{run_id}/{artifact_path}".format(
                    run_id=mlflow.active_run().info.run_id, artifact_path=artifact_path
                )

                # test reloaded model
                reloaded_model = sparkm.load_model(model_uri=model_uri, dfs_tmpdir=dfs_tmp_dir)
                preds_df = reloaded_model.transform(spark_model_estimator.spark_df)
                preds = [row.prediction for row in preds_df.select("prediction").collect()]
                assert spark_model_estimator.predictions == preds
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                # Best-effort cleanup: if the body failed before these
                # directories were created, a raising rmtree would pile a
                # second error on top of the real failure.
                shutil.rmtree(dfs_tmp_dir or sparkm.DFS_TMP, ignore_errors=True)
                shutil.rmtree(tracking_dir, ignore_errors=True)
예제 #4
0
def test_model_log(tmpdir):
    """Log a Spark pipeline model and verify it through three load paths.

    A ``VectorAssembler`` + ``LogisticRegression`` pipeline is fitted on the
    iris data. For every combination of ``should_start_run`` and
    ``dfs_tmp_dir`` the model is logged and then scored via pyfunc, via
    ``sparkm.load_model`` and via ``score_model_as_udf``; each result must
    equal the predictions of the in-memory model. The test also asserts that
    the DFS temp directory still exists and is non-empty after logging.

    :param tmpdir: pytest temporary directory hosting the conda env file, the
                   tracking store and the explicit DFS temp directory.
    """
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]
    pandas_df = pd.DataFrame(iris.data, columns=feature_names)  # to make spark_udf work
    pandas_df['label'] = pd.Series(iris.target)
    # Spark properties live under the "spark." prefix; the worker-reuse
    # setting is "spark.python.worker.reuse".
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Reference predictions from the in-memory model; every reloaded variant
    # below is compared against these.
    preds_df = model.transform(spark_df)
    preds1 = [row.prediction for row in preds_df.select("prediction").collect()]
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            # Created outside the ``try`` so the ``finally`` clause never
            # refers to an unbound or stale ``tracking_dir``.
            tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
            try:
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(artifact_path=artifact_path, spark_model=model,
                                 dfs_tmpdir=dfs_tmp_dir)
                run_id = active_run().info.run_uuid
                # test pyfunc
                pyfunc_model = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = pyfunc_model.predict(pandas_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path, run_id=run_id,
                                                   dfs_tmpdir=dfs_tmp_dir)
                preds_df_1 = reloaded_model.transform(spark_df)
                preds3 = [row.prediction for row in preds_df_1.select("prediction").collect()]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id, pandas_df)
                assert preds1 == preds4
                # We expect not to delete the DFS tempdir.
                dfs_dir = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(dfs_dir)
                assert os.listdir(dfs_dir)
                shutil.rmtree(dfs_dir)
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
예제 #5
0
def test_estimator_model_export(spark_model_estimator, model_path, spark_custom_env):
    """Save an estimator-based Spark model and verify both reload paths.

    The saved model is reloaded once through ``sparkm.load_model`` and once
    through ``pyfunc.load_pyfunc``; each must reproduce
    ``spark_model_estimator.predictions``.
    """
    sparkm.save_model(
        spark_model_estimator.model,
        path=model_path,
        conda_env=spark_custom_env,
    )

    # 1. score and compare the reloaded sparkml model
    spark_reloaded = sparkm.load_model(model_uri=model_path)
    scored_rows = (
        spark_reloaded.transform(spark_model_estimator.spark_df)
        .select("prediction")
        .collect()
    )
    spark_preds = [row.prediction for row in scored_rows]
    assert spark_model_estimator.predictions == spark_preds

    # 2. score and compare reloaded pyfunc (fed the pandas copy of the data)
    pyfunc_reloaded = pyfunc.load_pyfunc(model_path)
    pandas_input = spark_model_estimator.spark_df.toPandas()
    pyfunc_preds = pyfunc_reloaded.predict(pandas_input)
    assert spark_model_estimator.predictions == pyfunc_preds
예제 #6
0
def test_sparkml_model_load_from_remote_uri_succeeds(spark_model_iris, model_path, mock_s3_bucket):
    """Load a Spark model from an ``s3://`` URI and compare its predictions.

    The model is first saved locally, then uploaded with
    ``S3ArtifactRepository`` to the bucket named by ``mock_s3_bucket``, and
    finally loaded back through the resulting remote URI.
    """
    sparkm.save_model(spark_model=spark_model_iris.model, path=model_path)

    # Upload the locally saved model directory under "<bucket>/model".
    artifact_path = "model"
    artifact_root = "s3://{bucket_name}".format(bucket_name=mock_s3_bucket)
    S3ArtifactRepository(artifact_root).log_artifacts(
        model_path, artifact_path=artifact_path)

    remote_model_uri = artifact_root + "/" + artifact_path
    remote_model = sparkm.load_model(model_uri=remote_model_uri)
    scored_rows = (
        remote_model.transform(spark_model_iris.spark_df)
        .select("prediction")
        .collect()
    )
    preds = [row.prediction for row in scored_rows]
    assert spark_model_iris.predictions == preds
예제 #7
0
def test_model_log_with_sparkml_format(tmpdir, spark_model_iris):
    """Log the iris Spark model and verify it through three load paths.

    For every combination of ``should_start_run`` and ``dfs_tmp_dir`` the
    model is logged and then scored via pyfunc, via ``sparkm.load_model`` and
    via ``score_model_as_udf``; each result must equal the predictions of the
    in-memory model on ``spark_model_iris.training_df``. The test also asserts
    that the DFS temp directory still exists and is non-empty after logging.

    :param tmpdir: pytest temporary directory hosting the tracking store and
                   the explicit DFS temp directory.
    :param spark_model_iris: fixture exposing ``model``, ``training_df`` and
                             ``inference_df``.
    """
    # Reference predictions from the in-memory model; every reloaded variant
    # below is compared against these.
    preds_df = spark_model_iris.model.transform(spark_model_iris.training_df)
    preds1 = [row.prediction for row in preds_df.select("prediction").collect()]
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =",
                  dfs_tmp_dir)
            # Created outside the ``try`` so the ``finally`` clause never
            # refers to an unbound or stale ``tracking_dir``.
            tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
            try:
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(artifact_path=artifact_path,
                                 spark_model=spark_model_iris.model,
                                 dfs_tmpdir=dfs_tmp_dir)
                run_id = active_run().info.run_uuid
                # test pyfunc
                pyfunc_model = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = pyfunc_model.predict(spark_model_iris.inference_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path,
                                                   run_id=run_id,
                                                   dfs_tmpdir=dfs_tmp_dir)
                preds_df_1 = reloaded_model.transform(
                    spark_model_iris.training_df)
                preds3 = [
                    row.prediction
                    for row in preds_df_1.select("prediction").collect()
                ]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id,
                                            spark_model_iris.inference_df)
                assert preds1 == preds4
                # We expect not to delete the DFS tempdir.
                dfs_dir = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(dfs_dir)
                assert os.listdir(dfs_dir)
                shutil.rmtree(dfs_dir)
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
예제 #8
0
def test_model_export(spark_model_iris, model_path, spark_custom_env):
    """Save the iris Spark model and verify three ways of scoring it.

    The saved model is scored as a reloaded Spark ML model, as a pyfunc model
    and through ``score_model_as_udf``; each must reproduce
    ``spark_model_iris.predictions``. Finally the DFS temp directory is
    required to exist.
    """
    sparkm.save_model(
        spark_model_iris.model, path=model_path, conda_env=spark_custom_env)

    # 1. score and compare reloaded sparkml model
    spark_reloaded = sparkm.load_model(model_uri=model_path)
    scored_rows = (
        spark_reloaded.transform(spark_model_iris.spark_df)
        .select("prediction")
        .collect()
    )
    spark_preds = [row.prediction for row in scored_rows]
    assert spark_model_iris.predictions == spark_preds

    # 2. score and compare reloaded pyfunc
    pyfunc_reloaded = pyfunc.load_pyfunc(model_path)
    pyfunc_preds = pyfunc_reloaded.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == pyfunc_preds

    # 3. score and compare reloaded pyfunc Spark udf
    udf_preds = score_model_as_udf(
        model_uri=model_path, pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == udf_preds

    assert os.path.exists(sparkm.DFS_TMP)
예제 #9
0
def test_model_log(tmpdir):
    """Log a Spark pipeline model and verify it via pyfunc and ``load_model``.

    A ``VectorAssembler`` + ``LogisticRegression`` pipeline is fitted on all
    iris feature columns. With and without an explicitly started run, the
    model is logged under the artifact path ``"model"`` and then scored via
    pyfunc and via ``sparkm.load_model``; both results must equal the
    predictions of the in-memory model.

    :param tmpdir: pytest temporary directory hosting the conda env file and
                   the tracking store.
    """
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    X = iris.data  # all feature columns are used
    y = iris.target
    pandas_df = pd.DataFrame(X, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(y)
    # Spark properties live under the "spark." prefix; the worker-reuse
    # setting is "spark.python.worker.reuse".
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=iris.feature_names,
                                outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Reference predictions from the in-memory model; every reloaded variant
    # below is compared against these.
    preds_df = model.transform(spark_df)
    preds1 = [row.prediction for row in preds_df.select("prediction").collect()]
    old_tracking_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        # Created outside the ``try`` so the ``finally`` clause never refers
        # to an unbound or stale ``tracking_dir``.
        tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
        try:
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            if should_start_run:
                tracking.start_run()
            sparkm.log_model(artifact_path="model", spark_model=model)
            run_id = tracking.active_run().info.run_uuid
            # test pyfunc
            pyfunc_model = pyfunc.load_pyfunc("model", run_id=run_id)
            preds2 = pyfunc_model.predict(pandas_df)
            assert preds1 == preds2
            # test load model
            reloaded_model = sparkm.load_model("model", run_id=run_id)
            preds_df_1 = reloaded_model.transform(spark_df)
            preds3 = [
                row.prediction
                for row in preds_df_1.select("prediction").collect()
            ]
            assert preds1 == preds3
        finally:
            tracking.end_run()
            tracking.set_tracking_uri(old_tracking_uri)
            shutil.rmtree(tracking_dir)
예제 #10
0
def test__NgramSet(numbers_dataframe):
    """Round-trip a tokenizer -> go-word filter -> ``NgramSet`` pipeline.

    The fitted pipeline is saved to ``unit_test_model``, loaded back, and both
    versions are applied to ``numbers_dataframe``; the test passes when every
    row produced by the original model is also produced by the reloaded one.
    The reloaded output is printed for visual inspection.

    :param numbers_dataframe: fixture DataFrame; its ``text`` column feeds the
                              tokenizer.
    """
    import shutil  # local import: only needed to clear the previous model dir

    # Create the transformer
    tokenizer = ct.NLTKWordPunctTokenizer(inputCol="text", outputCol="tokens")

    # Filter to go words
    goWords = ['two', 'three', 'four', 'five']
    gofilt = ct.GoWordFilter(inputCol="tokens",
                             outputCol="go_word_filtered_tokens",
                             goWords=goWords)

    # Create the transformer
    ngrams = ct.NgramSet(inputCol="go_word_filtered_tokens",
                         outputCol="ngram_set",
                         maxN=5)

    # Create a pipeline from the transformer
    pipeline = Pipeline(stages=[tokenizer, gofilt, ngrams])

    # fit the test data (which also builds the pipeline)
    model = pipeline.fit(numbers_dataframe)

    # Test the pipeline
    df_original_transformed = model.transform(numbers_dataframe)

    # Delete any previously saved model so save_model() starts from a clean
    # path. shutil.rmtree avoids spawning a shell and works on any platform.
    if os.path.exists("unit_test_model"):
        shutil.rmtree("unit_test_model")

    # Save the model and load it back
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retrieved_transformed = retrieved_model.transform(numbers_dataframe)

    # Assert the retrieved model gives the same results as the saved model.
    # NOTE(review): intersect() follows SQL INTERSECT semantics (distinct
    # rows), so this check presumably assumes the fixture has no duplicate
    # rows -- confirm.
    rows_in_common = df_original_transformed.intersect(
        df_retrieved_transformed).count()
    assert (df_original_transformed.count() == rows_in_common)

    # Print results for visual inspection
    print("\n")
    print("test__NgramSet: should see a set of 1-5 ngram set")
    df_retrieved_transformed.show(truncate=False)
예제 #11
0
def test__LevenshteinSubstituter(numbers_dataframe):
    """Round-trip a tokenizer -> ``LevenshteinSubstituter`` pipeline.

    The fitted pipeline is saved to ``unit_test_model``, loaded back, and both
    versions are applied to ``numbers_dataframe``; the test passes when every
    row produced by the original model is also produced by the reloaded one.
    The reloaded output is printed for visual inspection.

    :param numbers_dataframe: fixture DataFrame; its ``text`` column feeds the
                              tokenizer.
    """
    import shutil  # local import: only needed to clear the previous model dir

    # Create the transformer
    tokenizer = ct.NLTKWordPunctTokenizer(inputCol="text", outputCol="tokens")

    # Create the transformer
    tokenMatchers = ['two1', 'four2', 'nineee']
    toksub = ct.LevenshteinSubstituter(inputCol="tokens",
                                       outputCol="swapped_tokens",
                                       tokenMatchers=tokenMatchers,
                                       levenshteinThresh=1)

    # Create a pipeline from the transformer
    pipeline = Pipeline(stages=[tokenizer, toksub])

    # fit the test data (which also builds the pipeline)
    model = pipeline.fit(numbers_dataframe)

    # Test the pipeline
    df_original_transformed = model.transform(numbers_dataframe)

    # Delete any previously saved model so save_model() starts from a clean
    # path. shutil.rmtree avoids spawning a shell and works on any platform.
    if os.path.exists("unit_test_model"):
        shutil.rmtree("unit_test_model")

    # Save the model and load it back
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retrieved_transformed = retrieved_model.transform(numbers_dataframe)

    # Assert the retrieved model gives the same results as the saved model.
    # NOTE(review): intersect() follows SQL INTERSECT semantics (distinct
    # rows), so this check presumably assumes the fixture has no duplicate
    # rows -- confirm.
    rows_in_common = df_original_transformed.intersect(
        df_retrieved_transformed).count()
    assert (df_original_transformed.count() == rows_in_common)

    # Print results for visual inspection
    print("\n")
    print(
        "test__LevenshteinSubstituter: two and four shold be substituted and nine should not"
    )
    df_retrieved_transformed.show(truncate=False)
예제 #12
0
def test__RegexSubstituter(simple_test_dataframe):
    """Round-trip a single-stage ``RegexSubstituter`` pipeline.

    The fitted pipeline is saved to ``unit_test_model``, loaded back, and both
    versions are applied to ``simple_test_dataframe``; the test passes when
    every row produced by the original model is also produced by the reloaded
    one. The reloaded output is printed for visual inspection.

    :param simple_test_dataframe: fixture DataFrame; its ``text`` column is
                                  the transformer input.
    """
    import shutil  # local import: only needed to clear the previous model dir

    # Create the transformer. The first pattern is a raw string so that the
    # backslash reaches the regex engine without relying on Python leaving an
    # unknown escape sequence untouched.
    regexMatchers = [r'(?<=[a-zA-Z])\.(?=[A-Z])', '<BR>', '<br>']
    substitutions = ['. ', '. ', '. ']
    transformer = ct.RegexSubstituter(inputCol="text",
                                      outputCol="regexcorrected",
                                      regexMatchers=regexMatchers,
                                      substitutions=substitutions)

    # Create a pipeline from the transformer
    pipeline = Pipeline(stages=[transformer])

    # fit the test data (which also builds the pipeline)
    model = pipeline.fit(simple_test_dataframe)

    # Test the pipeline
    df_original_transformed = model.transform(simple_test_dataframe)

    # Delete any previously saved model so save_model() starts from a clean
    # path. shutil.rmtree avoids spawning a shell and works on any platform.
    if os.path.exists("unit_test_model"):
        shutil.rmtree("unit_test_model")

    # Save the model and load it back
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retrieved_transformed = retrieved_model.transform(simple_test_dataframe)

    # Assert the retrieved model gives the same results as the saved model.
    # NOTE(review): intersect() follows SQL INTERSECT semantics (distinct
    # rows), so this check presumably assumes the fixture has no duplicate
    # rows -- confirm.
    rows_in_common = df_original_transformed.intersect(
        df_retrieved_transformed).count()
    assert (df_original_transformed.count() == rows_in_common)

    # Print results for visual inspection
    print("\n")
    print(
        "test__RegexSubstituter: The following should show sentences broken into words"
    )
    df_retrieved_transformed.show(truncate=False)
예제 #13
0
def test_model_export(tmpdir):
    """Save a Spark pipeline model and verify three ways of scoring it.

    A ``VectorAssembler`` + ``LogisticRegression`` pipeline is fitted on all
    iris feature columns and saved with a generated conda environment. The
    saved model is then scored as a reloaded Spark ML model, as a pyfunc model
    and through ``score_model_in_sagemaker_docker_container``; each result
    must equal the predictions of the in-memory model. Finally the DFS temp
    directory must exist and be empty.

    :param tmpdir: pytest temporary directory hosting the conda env file and
                   the saved model.
    """
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    X = iris.data  # all feature columns are used
    y = iris.target
    pandas_df = pd.DataFrame(X, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(y)
    # Spark properties live under the "spark." prefix; the worker-reuse
    # setting is "spark.python.worker.reuse".
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    model_path = tmpdir.mkdir("model")
    assembler = VectorAssembler(inputCols=iris.feature_names,
                                outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Reference predictions from the in-memory model; every reloaded variant
    # below is compared against these.
    preds_df = model.transform(spark_df)
    preds1 = [row.prediction for row in preds_df.select("prediction").collect()]
    sparkm.save_model(model, path=str(model_path), conda_env=conda_env)
    # 1. score and compare the reloaded sparkml model
    reloaded_model = sparkm.load_model(path=str(model_path))
    preds_df_1 = reloaded_model.transform(spark_df)
    preds1_1 = [
        row.prediction for row in preds_df_1.select("prediction").collect()
    ]
    assert preds1 == preds1_1
    # 2. score and compare the reloaded pyfunc
    m = pyfunc.load_pyfunc(str(model_path))
    preds2 = m.predict(pandas_df)
    assert preds1 == preds2
    # 3. score and compare through the sagemaker docker container helper
    preds3 = score_model_in_sagemaker_docker_container(
        model_path=str(model_path), data=pandas_df)
    assert preds1 == preds3
    # The DFS temp directory must still exist but hold no leftovers.
    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    assert not os.listdir(sparkm.DFS_TMP)
예제 #14
0
def test_model_export(spark_model_iris, model_path, spark_conda_env):
    """Save the iris Spark model and verify three ways of scoring it.

    Reference predictions are taken from the in-memory model on
    ``training_df``. The saved model is then scored as a reloaded Spark ML
    model, as a pyfunc model and through
    ``score_model_in_sagemaker_docker_container``; each must match the
    reference. Finally the DFS temp directory must exist and be non-empty.
    """
    reference_rows = (
        spark_model_iris.model.transform(spark_model_iris.training_df)
        .select("prediction")
        .collect()
    )
    preds1 = [row.prediction for row in reference_rows]

    sparkm.save_model(
        spark_model_iris.model, path=model_path, conda_env=spark_conda_env)

    # Reloaded Spark ML model, scored on the same training data.
    spark_reloaded = sparkm.load_model(path=model_path)
    reloaded_rows = (
        spark_reloaded.transform(spark_model_iris.training_df)
        .select("prediction")
        .collect()
    )
    preds1_1 = [row.prediction for row in reloaded_rows]
    assert preds1 == preds1_1

    # Reloaded pyfunc model, scored on the inference data.
    pyfunc_reloaded = pyfunc.load_pyfunc(model_path)
    preds2 = pyfunc_reloaded.predict(spark_model_iris.inference_df)
    assert preds1 == preds2

    # Sagemaker docker container helper, scored on the inference data.
    preds3 = score_model_in_sagemaker_docker_container(
        model_path=model_path, data=spark_model_iris.inference_df)
    assert preds1 == preds3

    assert os.path.exists(sparkm.DFS_TMP)
    dfs_entries = os.listdir(sparkm.DFS_TMP)
    print(dfs_entries)
    # We expect not to delete the DFS tempdir.
    assert os.listdir(sparkm.DFS_TMP)
예제 #15
0
def test_model_export(spark_model_iris, model_path, spark_conda_env):
    """Save the iris Spark model and verify three ways of scoring it.

    The saved model is scored as a reloaded Spark ML model, as a pyfunc model
    and through ``score_model_as_udf``; each must reproduce
    ``spark_model_iris.predictions``. Finally the DFS temp directory must
    exist and be non-empty.
    """
    sparkm.save_model(
        spark_model_iris.model, path=model_path, conda_env=spark_conda_env)

    # 1. score and compare reloaded sparkml model
    spark_reloaded = sparkm.load_model(path=model_path)
    scored_rows = (
        spark_reloaded.transform(spark_model_iris.spark_df)
        .select("prediction")
        .collect()
    )
    preds1 = [row.prediction for row in scored_rows]
    assert spark_model_iris.predictions == preds1

    # 2. score and compare reloaded pyfunc
    pyfunc_reloaded = pyfunc.load_pyfunc(model_path)
    preds2 = pyfunc_reloaded.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds2

    # 3. score and compare reloaded pyfunc Spark udf
    preds3 = score_model_as_udf(
        model_path, run_id=None, pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds3

    assert os.path.exists(sparkm.DFS_TMP)
    dfs_entries = os.listdir(sparkm.DFS_TMP)
    print(dfs_entries)
    # We expect not to delete the DFS tempdir.
    assert os.listdir(sparkm.DFS_TMP)
예제 #16
0
from __future__ import print_function
import sys
import mlflow
import mlflow.spark as mlflow_spark
from pyspark.sql import SparkSession

# Diagnostic output emitted whenever this module is executed or imported.
print("MLflow Version:", mlflow.version.VERSION)
print("Tracking URI:", mlflow.tracking.get_tracking_uri())

if __name__ == "__main__":
    # The run ID is the single required command-line argument; fail with a
    # usage message instead of a bare IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit("Usage: {} <run_id>".format(sys.argv[0]))
    run_id = sys.argv[1]
    print("run_id:", run_id)
    spark = SparkSession.builder.appName("Predict").getOrCreate()

    try:
        data_path = "../data/sample_libsvm_data.txt"
        print("data_path:", data_path)
        data = spark.read.format("libsvm").load(data_path)

        # Load the model logged under the "spark-model" artifact path of the
        # given run and score the sample data with it.
        model = mlflow_spark.load_model("spark-model", run_id=run_id)
        predictions = model.transform(data)

        print("Prediction Dataframe")
        predictions.printSchema()

        # Keep only the rows where the prediction differs from the label.
        print("Filtered Prediction Dataframe")
        df = predictions.select("prediction", "indexedLabel",
                                "probability").filter("prediction <> indexedLabel")
        df.printSchema()
        df.show(5, False)
    finally:
        # Release the Spark session even when loading or scoring fails.
        spark.stop()