def test_sagemaker_docker_model_scoring_with_default_conda_env(spark_model_iris, model_path):
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=None)

    scoring_response = score_model_in_sagemaker_docker_container(
        model_uri=model_path,
        data=spark_model_iris.pandas_df,
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
        flavor=mlflow.pyfunc.FLAVOR_NAME,
    )
    deployed_model_preds = np.array(json.loads(scoring_response.content))

    np.testing.assert_array_almost_equal(
        deployed_model_preds, spark_model_iris.predictions, decimal=4
    )
def test_estimator_model_export(spark_model_estimator, model_path, spark_custom_env):
    sparkm.save_model(spark_model_estimator.model, path=model_path, conda_env=spark_custom_env)
    # 1. score and compare the reloaded sparkml model
    reloaded_model = sparkm.load_model(model_uri=model_path)
    preds_df = reloaded_model.transform(spark_model_estimator.spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_estimator.predictions == preds
    # 2. score and compare the reloaded pyfunc
    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_estimator.spark_df.toPandas())
    assert spark_model_estimator.predictions == preds2
def test_spark_module_model_save_with_mleap_and_unsupported_transformer_raises_exception(
    spark_model_iris, model_path
):
    class CustomTransformer(JavaModel):
        def _transform(self, dataset):
            return dataset

    unsupported_pipeline = Pipeline(stages=[CustomTransformer()])
    unsupported_model = unsupported_pipeline.fit(spark_model_iris.spark_df)

    with pytest.raises(ValueError, match="CustomTransformer"):
        sparkm.save_model(
            spark_model=unsupported_model, path=model_path, sample_input=spark_model_iris.spark_df
        )
def test_save_with_sample_input_containing_unsupported_data_type_raises_serialization_exception(
    spark_context, model_path
):
    sql_context = SQLContext(spark_context)
    unsupported_df = sql_context.createDataFrame([(1, "2016-09-30"), (2, "2017-02-27")])
    unsupported_df = unsupported_df.withColumn("_2", unsupported_df._2.cast(DateType()))
    pipeline = Pipeline(stages=[])
    model = pipeline.fit(unsupported_df)
    # The Spark `DateType` is not supported by MLeap, so we expect serialization to fail.
    with pytest.raises(mleap.MLeapSerializationException):
        sparkm.save_model(spark_model=model, path=model_path, sample_input=unsupported_df)
def test_sparkml_model_save_accepts_conda_env_as_dict(spark_model_iris, model_path):
    conda_env = dict(mlflow.spark.get_default_conda_env())
    conda_env["dependencies"].append("pytest")
    sparkm.save_model(spark_model=spark_model_iris.model, path=model_path, conda_env=conda_env)

    pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME)
    saved_conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV])
    assert os.path.exists(saved_conda_env_path)

    with open(saved_conda_env_path, "r") as f:
        saved_conda_env_parsed = yaml.safe_load(f)
    assert saved_conda_env_parsed == conda_env
def test_spark_module_model_save_with_relative_path_and_valid_sample_input_produces_mleap_flavor(
    spark_model_iris,
):
    with TempDir(chdr=True) as tmp:
        model_path = os.path.basename(tmp.path("model"))
        mlflow_model = Model()
        sparkm.save_model(
            spark_model=spark_model_iris.model,
            path=model_path,
            sample_input=spark_model_iris.spark_df,
            mlflow_model=mlflow_model,
        )
        assert mleap.FLAVOR_NAME in mlflow_model.flavors

        config_path = os.path.join(model_path, "MLmodel")
        assert os.path.exists(config_path)
        config = Model.load(config_path)
        assert mleap.FLAVOR_NAME in config.flavors
def test_model_export(spark_model_iris, model_path, spark_custom_env):
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_custom_env)
    # 1. score and compare reloaded sparkml model
    reloaded_model = sparkm.load_model(model_uri=model_path)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds1
    # 2. score and compare reloaded pyfunc
    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds2
    # 3. score and compare reloaded pyfunc Spark udf
    preds3 = score_model_as_udf(model_uri=model_path, pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds3
    assert os.path.exists(sparkm.DFS_TMP)
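# Sketch only (not a helper from this suite): score_model_as_udf above presumably
# evaluates the saved model through mlflow.pyfunc.spark_udf, roughly as follows.
# The explicit spark session argument and result_type are assumptions for illustration;
# it relies on the module-level `import mlflow` already used by these tests.
def _score_model_as_udf_sketch(model_uri, pandas_df, spark):
    spark_df = spark.createDataFrame(pandas_df)
    pyfunc_udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type="double")
    scored_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
    return [row.prediction for row in scored_df.select("prediction").collect()]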
def test_spark_module_model_save_with_sample_input_produces_sparkml_and_mleap_flavors(
    spark_model_iris, model_path
):
    mlflow_model = Model()
    sparkm.save_model(
        spark_model=spark_model_iris.model,
        path=model_path,
        sample_input=spark_model_iris.training_df,
        mlflow_model=mlflow_model,
    )
    assert sparkm.FLAVOR_NAME in mlflow_model.flavors
    assert mleap.FLAVOR_NAME in mlflow_model.flavors

    config_path = os.path.join(model_path, "MLmodel")
    assert os.path.exists(config_path)
    config = Model.load(config_path)
    assert sparkm.FLAVOR_NAME in config.flavors
    assert mleap.FLAVOR_NAME in config.flavors
def test_sparkml_model_load_from_remote_uri_succeeds(spark_model_iris, model_path, mock_s3_bucket):
    sparkm.save_model(spark_model=spark_model_iris.model, path=model_path)

    artifact_root = "s3://{bucket_name}".format(bucket_name=mock_s3_bucket)
    artifact_path = "model"
    artifact_repo = S3ArtifactRepository(artifact_root)
    artifact_repo.log_artifacts(model_path, artifact_path=artifact_path)

    model_uri = artifact_root + "/" + artifact_path
    reloaded_model = sparkm.load_model(model_uri=model_uri)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds
def test_model_deployment(spark_model_iris, model_path, spark_custom_env):
    sparkm.save_model(
        spark_model_iris.model,
        path=model_path,
        conda_env=spark_custom_env,
    )
    scoring_response = score_model_in_sagemaker_docker_container(
        model_uri=model_path,
        data=spark_model_iris.pandas_df,
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
        flavor=mlflow.pyfunc.FLAVOR_NAME,
    )
    np.testing.assert_array_almost_equal(
        spark_model_iris.predictions, np.array(json.loads(scoring_response.content)), decimal=4
    )
def test_sparkml_model_save_persists_specified_conda_env_in_mlflow_model_directory(
    spark_model_iris, model_path, spark_custom_env
):
    sparkm.save_model(
        spark_model=spark_model_iris.model, path=model_path, conda_env=spark_custom_env
    )
    pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME)
    saved_conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV])
    assert os.path.exists(saved_conda_env_path)
    assert saved_conda_env_path != spark_custom_env

    with open(spark_custom_env, "r") as f:
        spark_custom_env_parsed = yaml.safe_load(f)
    with open(saved_conda_env_path, "r") as f:
        saved_conda_env_parsed = yaml.safe_load(f)
    assert saved_conda_env_parsed == spark_custom_env_parsed
def test_model_deployment(spark_model_iris, model_path, spark_conda_env):
    sparkm.save_model(
        spark_model_iris.model,
        path=model_path,
        conda_env=spark_conda_env,
        # Test both spark ml and mleap
        sample_input=spark_model_iris.spark_df,
    )
    # 1. score and compare pyfunc deployed in Sagemaker docker container
    preds1 = score_model_in_sagemaker_docker_container(
        model_path=model_path,
        data=spark_model_iris.pandas_df,
        flavor=mlflow.pyfunc.FLAVOR_NAME,
    )
    assert spark_model_iris.predictions == preds1
    # 2. score and compare mleap deployed in Sagemaker docker container
    preds2 = score_model_in_sagemaker_docker_container(
        model_path=model_path,
        data=spark_model_iris.pandas_df,
        flavor=mlflow.mleap.FLAVOR_NAME,
    )
    assert spark_model_iris.predictions == preds2
def test__NgramSet(numbers_dataframe):
    # Create the transformers
    tokenizer = ct.NLTKWordPunctTokenizer(inputCol="text", outputCol="tokens")

    # Filter to go words
    goWords = ['two', 'three', 'four', 'five']
    gofilt = ct.GoWordFilter(inputCol="tokens",
                             outputCol="go_word_filtered_tokens",
                             goWords=goWords)

    ngrams = ct.NgramSet(inputCol="go_word_filtered_tokens",
                         outputCol="ngram_set",
                         maxN=5)

    # Create a pipeline from the transformers
    pipeline = Pipeline(stages=[tokenizer, gofilt, ngrams])

    # Fit the test data (which also builds the pipeline)
    model = pipeline.fit(numbers_dataframe)

    # Test the pipeline
    df_original_transformed = model.transform(numbers_dataframe)

    # Delete any previously saved model (if it exists)
    # (There may be a more elegant way to do this)
    if os.path.exists("unit_test_model"):
        os.system("rm -rf unit_test_model")

    # Save the model and reload it
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retreived_transformed = retrieved_model.transform(numbers_dataframe)

    # Assert the retrieved model gives the same results as the saved model
    rows_in_common = df_original_transformed.intersect(
        df_retreived_transformed).count()
    assert (df_original_transformed.count() == rows_in_common)

    # Print results for visual inspection
    print("\n")
    print("test__NgramSet: should see a set of 1-5 ngrams")
    df_retreived_transformed.show(truncate=False)

    # If we make it this far without crashing, we pass
    # (plus the results are reviewed visually)
    assert True
def test_model_export_with_signature_and_examples(iris_df, spark_model_iris):
    _, _, iris_spark_df = iris_df
    signature_ = infer_signature(iris_spark_df)
    example_ = iris_spark_df.toPandas().head(3)
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                sparkm.save_model(
                    spark_model_iris.model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
def test__LevenshteinSubstituter(numbers_dataframe):
    # Create the transformers
    tokenizer = ct.NLTKWordPunctTokenizer(inputCol="text", outputCol="tokens")

    tokenMatchers = ['two1', 'four2', 'nineee']
    toksub = ct.LevenshteinSubstituter(inputCol="tokens",
                                       outputCol="swapped_tokens",
                                       tokenMatchers=tokenMatchers,
                                       levenshteinThresh=1)

    # Create a pipeline from the transformers
    pipeline = Pipeline(stages=[tokenizer, toksub])

    # Fit the test data (which also builds the pipeline)
    model = pipeline.fit(numbers_dataframe)

    # Test the pipeline
    df_original_transformed = model.transform(numbers_dataframe)

    # Delete any previously saved model (if it exists)
    # (There may be a more elegant way to do this)
    if os.path.exists("unit_test_model"):
        os.system("rm -rf unit_test_model")

    # Save the model and reload it
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retreived_transformed = retrieved_model.transform(numbers_dataframe)

    # Assert the retrieved model gives the same results as the saved model
    rows_in_common = df_original_transformed.intersect(
        df_retreived_transformed).count()
    assert (df_original_transformed.count() == rows_in_common)

    # Print results for visual inspection
    print("\n")
    print("test__LevenshteinSubstituter: two and four should be substituted and nine should not")
    df_retreived_transformed.show(truncate=False)

    # If we make it this far without crashing, we pass
    # (plus the results are reviewed visually)
    assert True
def test__RegexSubstituter(simple_test_dataframe):
    # Create the transformer
    regexMatchers = ['(?<=[a-zA-Z])\.(?=[A-Z])', '<BR>', '<br>']
    substitutions = ['. ', '. ', '. ']
    transformer = ct.RegexSubstituter(inputCol="text",
                                      outputCol="regexcorrected",
                                      regexMatchers=regexMatchers,
                                      substitutions=substitutions)

    # Create a pipeline from the transformer
    pipeline = Pipeline(stages=[transformer])

    # Fit the test data (which also builds the pipeline)
    model = pipeline.fit(simple_test_dataframe)

    # Test the pipeline
    df_original_transformed = model.transform(simple_test_dataframe)

    # Delete any previously saved model (if it exists)
    # (There may be a more elegant way to do this)
    if os.path.exists("unit_test_model"):
        os.system("rm -rf unit_test_model")

    # Save the model and reload it
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retreived_transformed = retrieved_model.transform(simple_test_dataframe)

    # Assert the retrieved model gives the same results as the saved model
    rows_in_common = df_original_transformed.intersect(
        df_retreived_transformed).count()
    assert (df_original_transformed.count() == rows_in_common)

    # Print results for visual inspection
    print("\n")
    print("test__RegexSubstituter: matched periods and <BR>/<br> tags should be replaced with '. '")
    df_retreived_transformed.show(truncate=False)

    # If we make it this far without crashing, we pass
    # (plus the results are reviewed visually)
    assert True
def test_model_export(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])

    # Build a labeled pandas DataFrame from all four iris features
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    pandas_df = pd.DataFrame(X, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(y)

    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    model_path = tmpdir.mkdir("model")

    assembler = VectorAssembler(inputCols=iris.feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])

    # Fit the model and score with it before saving
    model = pipeline.fit(spark_df)
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]

    sparkm.save_model(model, path=str(model_path), conda_env=conda_env)

    # 1. score and compare reloaded sparkml model
    reloaded_model = sparkm.load_model(path=str(model_path))
    preds_df_1 = reloaded_model.transform(spark_df)
    preds1_1 = [x.prediction for x in preds_df_1.select("prediction").collect()]
    assert preds1 == preds1_1

    # 2. score and compare reloaded pyfunc
    m = pyfunc.load_pyfunc(str(model_path))
    preds2 = m.predict(pandas_df)
    assert preds1 == preds2

    # 3. score and compare pyfunc deployed in Sagemaker docker container
    preds3 = score_model_in_sagemaker_docker_container(model_path=str(model_path), data=pandas_df)
    assert preds1 == preds3

    # The DFS tempdir should exist but be empty after saving
    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    assert not os.listdir(sparkm.DFS_TMP)
def test_model_export(spark_model_iris, model_path, spark_conda_env):
    preds_df = spark_model_iris.model.transform(spark_model_iris.training_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]

    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_conda_env)
    reloaded_model = sparkm.load_model(path=model_path)
    preds_df_1 = reloaded_model.transform(spark_model_iris.training_df)
    preds1_1 = [x.prediction for x in preds_df_1.select("prediction").collect()]
    assert preds1 == preds1_1

    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_iris.inference_df)
    assert preds1 == preds2

    preds3 = score_model_in_sagemaker_docker_container(
        model_path=model_path, data=spark_model_iris.inference_df)
    assert preds1 == preds3

    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    # We expect not to delete the DFS tempdir.
    assert os.listdir(sparkm.DFS_TMP)
def test_model_export(spark_model_iris, model_path, spark_conda_env):
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_conda_env)
    # 1. score and compare reloaded sparkml model
    reloaded_model = sparkm.load_model(path=model_path)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds1
    # 2. score and compare reloaded pyfunc
    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds2
    # 3. score and compare reloaded pyfunc Spark udf
    preds3 = score_model_as_udf(model_path, run_id=None, pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds3
    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    # We expect not to delete the DFS tempdir.
    assert os.listdir(sparkm.DFS_TMP)
def test_transformer_model_export(spark_model_transformer, model_path, spark_custom_env):
    with pytest.raises(MlflowException, match="Cannot serialize this model"):
        sparkm.save_model(
            spark_model_transformer.model, path=model_path, conda_env=spark_custom_env
        )
def model_selection_via_crossvalidation(num_features, reg_param, net_param, cv_num_folds):
    # Start a new MLflow run
    with mlflow.start_run():
        tokenizer, remover, counts, lr = build_ml_pipeline()
        pipeline = Pipeline().setStages([tokenizer, remover, counts, lr])
        evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

        # Hyperparameter grid for the cross validator
        paramGrid = ParamGridBuilder() \
            .addGrid(counts.numFeatures, num_features) \
            .addGrid(lr.regParam, reg_param) \
            .addGrid(lr.elasticNetParam, net_param) \
            .build()

        crossval = CrossValidator(estimator=pipeline,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=evaluator,
                                  numFolds=cv_num_folds)

        # Run cross-validation and choose the best set of parameters.
        training_df, validate_df, test_df = prepare_data()
        logger.warn("training classifier")
        cv_model = crossval.fit(training_df)

        # cv_best_pipeline_model is the best PipelineModel found by the cross-validation run
        cv_best_pipeline_model = cv_model.bestModel

        logger.info("evaluate trained classifier")
        prediction = cv_model.transform(validate_df)
        prediction.show(n=10)

        area_under_ROC = evaluator.evaluate(prediction)
        logger.info("Area under the ROC curve for the best model selected by CV: "
                    + str(area_under_ROC))
        print("\n area_under_ROC: " + str(area_under_ROC))

        accuracy = cv_best_pipeline_model.stages[-1].summary.accuracy
        logger.info("Accuracy of the best model selected by CV: " + str(accuracy))
        print("\n accuracy: " + str(accuracy))

        # Save the trained model to a local directory
        mlflow_spark.save_model(cv_best_pipeline_model,
                                path="pyfunc-cv-model",
                                conda_env=None)

        # Save the trained model to DBFS
        mlflow_spark.log_model(
            cv_best_pipeline_model,
            artifact_path="/dbfs/tmp/dbconnect-demo/uap/reviews/pyfunc-cv-model",
            conda_env=None)

        # Log the model as a spark flavor on the hosted MLflow server
        logger.info("logging cv_best_pipeline_model as a spark flavor on hosted mlflow server")
        spark_cv_model_path = "spark-cv-model"
        mlflow_spark.log_model(cv_best_pipeline_model, spark_cv_model_path)

        # Log the model as an mleap flavor
        # mleap_cv_model_path = "mleap-cv-model"
        # mlflow.mleap.log_model(cv_best_pipeline_model, test_df, mleap_cv_model_path)

        mlflow.log_param("max_iterations",
                         cv_best_pipeline_model.stages[-1]._java_obj.getMaxIter())
        mlflow.log_param("reg_param",
                         cv_best_pipeline_model.stages[-1]._java_obj.getRegParam())
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("area_under_ROC", area_under_ROC)

        cv_runid = mlflow.active_run().info.run_uuid
        cv_artifactUri = mlflow.get_artifact_uri()
        logger.warn("\ncv_runid: " + str(cv_runid))
        logger.warn("\ncv_artifactUri: " + str(cv_artifactUri))
        return cv_runid, cv_artifactUri
def test_sparkml_model_save_without_specified_conda_env_uses_default_env_with_expected_dependencies(
    spark_model_iris, model_path
):
    sparkm.save_model(spark_model=spark_model_iris.model, path=model_path)
    _assert_pip_requirements(model_path, sparkm.get_default_pip_requirements())
def test_transformer_model_export(spark_model_transformer, model_path, spark_custom_env):
    with pytest.raises(MlflowException) as e:
        sparkm.save_model(
            spark_model_transformer.model, path=model_path, conda_env=spark_custom_env
        )
    assert "Cannot serialize this model" in e.value.message
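# Sketch only: the pytest fixtures used throughout these tests (model_path,
# spark_custom_env, spark_model_iris, ...) are defined elsewhere in the suite.
# Hypothetical minimal versions of the two simplest ones, reusing the
# _mlflow_conda_env helper already shown in test_model_export(tmpdir) above,
# might look roughly like this:
import os

import pyspark
import pytest

from mlflow.utils.environment import _mlflow_conda_env


@pytest.fixture
def model_path(tmpdir):
    # Target directory for sparkm.save_model; it must not exist yet.
    return os.path.join(str(tmpdir), "model")


@pytest.fixture
def spark_custom_env(tmpdir):
    # Conda environment YAML pinning pyspark, passed via conda_env= in the tests above.
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark.__version__)]
    )
    return conda_env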