def test_log_model_with_code_paths(spark_model_iris):
    artifact_path = "model"
    with mlflow.start_run(), mock.patch(
        "mlflow.spark._add_code_from_conf_to_system_path"
    ) as add_mock:
        sparkm.log_model(
            spark_model=spark_model_iris.model,
            artifact_path=artifact_path,
            code_paths=[__file__],
        )
        model_uri = mlflow.get_artifact_uri(artifact_path)
        _compare_logged_code_paths(__file__, model_uri, mlflow.spark.FLAVOR_NAME)
        sparkm.load_model(model_uri)
        add_mock.assert_called()
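The tests in this section lean on pytest fixtures (`spark_model_iris`, `spark_model_estimator`, `model_path`, `spark_custom_env`, and others) defined elsewhere in the suite. For orientation, here is a minimal sketch of what a `spark_model_iris`-style fixture might provide, inferred from the attributes the tests access (`model`, `spark_df`, `pandas_df`, `predictions`); the namedtuple shape, fixture scope, Spark master, and training details are assumptions, not the suite's actual code:

# Hypothetical sketch of a spark_model_iris-style fixture; the suite's real
# fixture may differ. The shape is inferred from attribute access in the tests.
from collections import namedtuple

import pandas as pd
import pytest
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from sklearn import datasets

SparkModelWithData = namedtuple(
    "SparkModelWithData", ["model", "spark_df", "pandas_df", "predictions"])


@pytest.fixture(scope="session")
def spark_model_iris():
    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]  # string column names, as in the tests below
    pandas_df = pd.DataFrame(iris.data, columns=feature_names)
    pandas_df["label"] = pd.Series(iris.target)
    spark = SparkSession.builder.master("local[2]").getOrCreate()
    spark_df = spark.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    model = Pipeline(stages=[assembler, lr]).fit(spark_df)
    preds_df = model.transform(spark_df)
    predictions = [row.prediction for row in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model, spark_df=spark_df,
                              pandas_df=pandas_df, predictions=predictions)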
def test_sparkml_estimator_model_log(tmpdir, spark_model_estimator, should_start_run, use_dfs_tmpdir):
    old_tracking_uri = mlflow.get_tracking_uri()
    if use_dfs_tmpdir:
        dfs_tmpdir = None
    else:
        dfs_tmpdir = tmpdir.join("test").strpath
    try:
        tracking_dir = os.path.abspath(str(tmpdir.join("mlruns")))
        mlflow.set_tracking_uri("file://%s" % tracking_dir)
        if should_start_run:
            mlflow.start_run()
        artifact_path = "model"
        sparkm.log_model(
            artifact_path=artifact_path,
            spark_model=spark_model_estimator.model,
            dfs_tmpdir=dfs_tmpdir,
        )
        model_uri = "runs:/{run_id}/{artifact_path}".format(
            run_id=mlflow.active_run().info.run_id, artifact_path=artifact_path
        )
        reloaded_model = sparkm.load_model(model_uri=model_uri, dfs_tmpdir=dfs_tmpdir)
        preds_df = reloaded_model.transform(spark_model_estimator.spark_df)
        preds = [x.prediction for x in preds_df.select("prediction").collect()]
        assert spark_model_estimator.predictions == preds
    finally:
        mlflow.end_run()
        mlflow.set_tracking_uri(old_tracking_uri)
def test_sparkml_estimator_model_log(tmpdir, spark_model_estimator):
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.join("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(
                    artifact_path=artifact_path,
                    spark_model=spark_model_estimator.model,
                    dfs_tmpdir=dfs_tmp_dir,
                )
                model_uri = "runs:/{run_id}/{artifact_path}".format(
                    run_id=mlflow.active_run().info.run_id, artifact_path=artifact_path
                )
                # test reloaded model
                reloaded_model = sparkm.load_model(model_uri=model_uri, dfs_tmpdir=dfs_tmp_dir)
                preds_df = reloaded_model.transform(spark_model_estimator.spark_df)
                preds = [x.prediction for x in preds_df.select("prediction").collect()]
                assert spark_model_estimator.predictions == preds
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                x = dfs_tmp_dir or sparkm.DFS_TMP
                shutil.rmtree(x)
                shutil.rmtree(tracking_dir)
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]  # string column names to make spark_udf work
    pandas_df = pd.DataFrame(iris.data, columns=feature_names)
    pandas_df['label'] = pd.Series(iris.target)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Compute predictions on the training data
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(artifact_path=artifact_path, spark_model=model,
                                 dfs_tmpdir=dfs_tmp_dir)
                run_id = active_run().info.run_uuid
                # test pyfunc
                x = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = x.predict(pandas_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path, run_id=run_id,
                                                   dfs_tmpdir=dfs_tmp_dir)
                preds_df_1 = reloaded_model.transform(spark_df)
                preds3 = [x.prediction for x in preds_df_1.select("prediction").collect()]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id, pandas_df)
                assert preds1 == preds4
                # Loading the model should not have deleted the DFS tempdir.
                x = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(x)
                assert os.listdir(x)
                shutil.rmtree(x)
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
def test_estimator_model_export(spark_model_estimator, model_path, spark_custom_env):
    sparkm.save_model(spark_model_estimator.model, path=model_path, conda_env=spark_custom_env)
    # 1. score and compare the reloaded sparkml model
    reloaded_model = sparkm.load_model(model_uri=model_path)
    preds_df = reloaded_model.transform(spark_model_estimator.spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_estimator.predictions == preds
    # 2. score and compare the reloaded pyfunc
    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_estimator.spark_df.toPandas())
    assert spark_model_estimator.predictions == preds2
def test_sparkml_model_load_from_remote_uri_succeeds(spark_model_iris, model_path, mock_s3_bucket):
    sparkm.save_model(spark_model=spark_model_iris.model, path=model_path)

    artifact_root = "s3://{bucket_name}".format(bucket_name=mock_s3_bucket)
    artifact_path = "model"
    artifact_repo = S3ArtifactRepository(artifact_root)
    artifact_repo.log_artifacts(model_path, artifact_path=artifact_path)

    model_uri = artifact_root + "/" + artifact_path
    reloaded_model = sparkm.load_model(model_uri=model_uri)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds
def test_model_log_with_sparkml_format(tmpdir, spark_model_iris):
    # Compute predictions on the training data
    preds_df = spark_model_iris.model.transform(spark_model_iris.training_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(artifact_path=artifact_path,
                                 spark_model=spark_model_iris.model,
                                 dfs_tmpdir=dfs_tmp_dir)
                run_id = active_run().info.run_uuid
                # test pyfunc
                x = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = x.predict(spark_model_iris.inference_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path, run_id=run_id,
                                                   dfs_tmpdir=dfs_tmp_dir)
                preds_df_1 = reloaded_model.transform(spark_model_iris.training_df)
                preds3 = [x.prediction for x in preds_df_1.select("prediction").collect()]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id, spark_model_iris.inference_df)
                assert preds1 == preds4
                # Loading the model should not have deleted the DFS tempdir.
                x = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(x)
                assert os.listdir(x)
                shutil.rmtree(x)
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
def test_model_export(spark_model_iris, model_path, spark_custom_env):
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_custom_env)
    # 1. score and compare the reloaded sparkml model
    reloaded_model = sparkm.load_model(model_uri=model_path)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds1
    # 2. score and compare the reloaded pyfunc
    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds2
    # 3. score and compare the reloaded pyfunc Spark udf
    preds3 = score_model_as_udf(model_uri=model_path, pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds3
    assert os.path.exists(sparkm.DFS_TMP)
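Several of these tests call a `score_model_as_udf` helper that is defined elsewhere in the suite. Here is a minimal sketch of what it might do, matching the newer `model_uri`-based calls (the older run_id-based signature differed); the feature-column selection and `result_type` default are assumptions:

# Hypothetical sketch of the score_model_as_udf helper referenced above; the
# suite's actual helper may differ. It wraps the model in mlflow.pyfunc.spark_udf,
# scores the pandas frame through Spark, and returns the predictions as a list.
from pyspark.sql import SparkSession
import mlflow.pyfunc


def score_model_as_udf(model_uri, pandas_df, result_type="double"):
    spark = SparkSession.builder.getOrCreate()
    spark_df = spark.createDataFrame(pandas_df)
    udf = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri, result_type=result_type)
    # Assumption: every column except an optional "label" column is a feature.
    feature_cols = [c for c in spark_df.columns if c != "label"]
    scored = spark_df.withColumn("prediction", udf(*feature_cols))
    return [row.prediction for row in scored.select("prediction").collect()]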
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    pandas_df = pd.DataFrame(X, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(y)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    model_path = tmpdir.mkdir("model")
    assembler = VectorAssembler(inputCols=iris.feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Compute predictions on the training data
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        try:
            tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            if should_start_run:
                tracking.start_run()
            sparkm.log_model(artifact_path="model", spark_model=model)
            run_id = tracking.active_run().info.run_uuid
            # test pyfunc
            x = pyfunc.load_pyfunc("model", run_id=run_id)
            preds2 = x.predict(pandas_df)
            assert preds1 == preds2
            # test load model
            reloaded_model = sparkm.load_model("model", run_id=run_id)
            preds_df_1 = reloaded_model.transform(spark_df)
            preds3 = [x.prediction for x in preds_df_1.select("prediction").collect()]
            assert preds1 == preds3
        finally:
            tracking.end_run()
            tracking.set_tracking_uri(old_tracking_uri)
            shutil.rmtree(tracking_dir)
def test__NgramSet(numbers_dataframe):
    # Create the tokenizer
    tokenizer = ct.NLTKWordPunctTokenizer(inputCol="text", outputCol="tokens")
    # Filter to go words
    goWords = ['two', 'three', 'four', 'five']
    gofilt = ct.GoWordFilter(inputCol="tokens",
                             outputCol="go_word_filtered_tokens",
                             goWords=goWords)
    # Create the ngram transformer
    ngrams = ct.NgramSet(inputCol="go_word_filtered_tokens",
                         outputCol="ngram_set",
                         maxN=5)
    # Create a pipeline from the transformers
    pipeline = Pipeline(stages=[tokenizer, gofilt, ngrams])
    # Fit the test data (which also builds the pipeline)
    model = pipeline.fit(numbers_dataframe)
    # Test the pipeline
    df_original_transformed = model.transform(numbers_dataframe)
    # Delete any previously saved model (if it exists)
    # (There may be a more elegant way to do this)
    if os.path.exists("unit_test_model"):
        os.system("rm -rf unit_test_model")
    # Save and reload the model
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retrieved_transformed = retrieved_model.transform(numbers_dataframe)
    # Assert the retrieved model gives the same results as the saved model
    rows_in_common = df_original_transformed.intersect(
        df_retrieved_transformed).count()
    assert df_original_transformed.count() == rows_in_common
    # Print results for visual inspection
    print("\n")
    print("test__NgramSet: should see a set of 1-5 ngrams")
    df_retrieved_transformed.show(truncate=False)
    # If we make it this far without crashing we pass (plus I'm visually reviewing results)
    assert True
def test__LevenshteinSubstituter(numbers_dataframe):
    # Create the tokenizer
    tokenizer = ct.NLTKWordPunctTokenizer(inputCol="text", outputCol="tokens")
    # Create the substituter
    tokenMatchers = ['two1', 'four2', 'nineee']
    toksub = ct.LevenshteinSubstituter(inputCol="tokens",
                                       outputCol="swapped_tokens",
                                       tokenMatchers=tokenMatchers,
                                       levenshteinThresh=1)
    # Create a pipeline from the transformers
    pipeline = Pipeline(stages=[tokenizer, toksub])
    # Fit the test data (which also builds the pipeline)
    model = pipeline.fit(numbers_dataframe)
    # Test the pipeline
    df_original_transformed = model.transform(numbers_dataframe)
    # Delete any previously saved model (if it exists)
    # (There may be a more elegant way to do this)
    if os.path.exists("unit_test_model"):
        os.system("rm -rf unit_test_model")
    # Save and reload the model
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retrieved_transformed = retrieved_model.transform(numbers_dataframe)
    # Assert the retrieved model gives the same results as the saved model
    rows_in_common = df_original_transformed.intersect(
        df_retrieved_transformed).count()
    assert df_original_transformed.count() == rows_in_common
    # Print results for visual inspection
    print("\n")
    print("test__LevenshteinSubstituter: two and four should be substituted and nine should not")
    df_retrieved_transformed.show(truncate=False)
    # If we make it this far without crashing we pass (plus I'm visually reviewing results)
    assert True
def test__RegexSubstituter(simple_test_dataframe):
    # Create the transformer
    regexMatchers = [r'(?<=[a-zA-Z])\.(?=[A-Z])', '<BR>', '<br>']
    substitutions = ['. ', '. ', '. ']
    transformer = ct.RegexSubstituter(inputCol="text",
                                      outputCol="regexcorrected",
                                      regexMatchers=regexMatchers,
                                      substitutions=substitutions)
    # Create a pipeline from the transformer
    pipeline = Pipeline(stages=[transformer])
    # Fit the test data (which also builds the pipeline)
    model = pipeline.fit(simple_test_dataframe)
    # Test the pipeline
    df_original_transformed = model.transform(simple_test_dataframe)
    # Delete any previously saved model (if it exists)
    # (There may be a more elegant way to do this)
    if os.path.exists("unit_test_model"):
        os.system("rm -rf unit_test_model")
    # Save and reload the model
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retrieved_transformed = retrieved_model.transform(simple_test_dataframe)
    # Assert the retrieved model gives the same results as the saved model
    rows_in_common = df_original_transformed.intersect(
        df_retrieved_transformed).count()
    assert df_original_transformed.count() == rows_in_common
    # Print results for visual inspection
    print("\n")
    print("test__RegexSubstituter: the following should show sentences broken into words")
    df_retrieved_transformed.show(truncate=False)
    # If we make it this far without crashing we pass (plus I'm visually reviewing results)
    assert True
def test_model_export(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    pandas_df = pd.DataFrame(X, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(y)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    model_path = tmpdir.mkdir("model")
    assembler = VectorAssembler(inputCols=iris.feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Compute predictions on the training data
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    sparkm.save_model(model, path=str(model_path), conda_env=conda_env)
    reloaded_model = sparkm.load_model(path=str(model_path))
    preds_df_1 = reloaded_model.transform(spark_df)
    preds1_1 = [x.prediction for x in preds_df_1.select("prediction").collect()]
    assert preds1 == preds1_1
    m = pyfunc.load_pyfunc(str(model_path))
    preds2 = m.predict(pandas_df)
    assert preds1 == preds2
    preds3 = score_model_in_sagemaker_docker_container(
        model_path=str(model_path), data=pandas_df)
    assert preds1 == preds3
    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    assert not os.listdir(sparkm.DFS_TMP)
def test_model_export(spark_model_iris, model_path, spark_conda_env):
    preds_df = spark_model_iris.model.transform(spark_model_iris.training_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_conda_env)
    reloaded_model = sparkm.load_model(path=model_path)
    preds_df_1 = reloaded_model.transform(spark_model_iris.training_df)
    preds1_1 = [x.prediction for x in preds_df_1.select("prediction").collect()]
    assert preds1 == preds1_1
    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_iris.inference_df)
    assert preds1 == preds2
    preds3 = score_model_in_sagemaker_docker_container(
        model_path=model_path, data=spark_model_iris.inference_df)
    assert preds1 == preds3
    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    # We expect the DFS tempdir not to have been deleted.
    assert os.listdir(sparkm.DFS_TMP)
def test_model_export(spark_model_iris, model_path, spark_conda_env):
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_conda_env)
    # 1. score and compare the reloaded sparkml model
    reloaded_model = sparkm.load_model(path=model_path)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds1
    # 2. score and compare the reloaded pyfunc
    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds2
    # 3. score and compare the reloaded pyfunc Spark udf
    preds3 = score_model_as_udf(model_path, run_id=None, pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds3
    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    # We expect the DFS tempdir not to have been deleted.
    assert os.listdir(sparkm.DFS_TMP)
from __future__ import print_function
import sys

import mlflow
import mlflow.spark as mlflow_spark
from pyspark.sql import SparkSession

print("MLflow Version:", mlflow.version.VERSION)
print("Tracking URI:", mlflow.tracking.get_tracking_uri())

if __name__ == "__main__":
    run_id = sys.argv[1]
    print("run_id:", run_id)
    spark = SparkSession.builder.appName("Predict").getOrCreate()

    data_path = "../data/sample_libsvm_data.txt"
    print("data_path:", data_path)
    data = spark.read.format("libsvm").load(data_path)

    model = mlflow_spark.load_model("spark-model", run_id=run_id)
    predictions = model.transform(data)
    print("Prediction Dataframe")
    predictions.printSchema()

    print("Filtered Prediction Dataframe")
    df = predictions.select("prediction", "indexedLabel", "probability") \
        .filter("prediction <> indexedLabel")
    df.printSchema()
    df.show(5, False)
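Usage note: the script takes a single argument, the MLflow run ID of a run that is assumed to have logged its model under the `spark-model` artifact path, e.g. `python predict.py <run_id>` (or the equivalent `spark-submit` invocation on a cluster).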