def test_sagemaker_docker_model_scoring_with_default_conda_env(
        spark_model_iris, model_path):
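    # conda_env=None means the model is saved with mlflow.spark's default conda environment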
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=None)

    scoring_response = score_model_in_sagemaker_docker_container(
        model_uri=model_path,
        data=spark_model_iris.pandas_df,
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
        flavor=mlflow.pyfunc.FLAVOR_NAME)
    deployed_model_preds = np.array(json.loads(scoring_response.content))

    np.testing.assert_array_almost_equal(deployed_model_preds,
                                         spark_model_iris.predictions,
                                         decimal=4)
Example #2
def test_estimator_model_export(spark_model_estimator, model_path,
                                spark_custom_env):
    sparkm.save_model(spark_model_estimator.model,
                      path=model_path,
                      conda_env=spark_custom_env)
    # 1. score and compare the reloaded sparkml model
    reloaded_model = sparkm.load_model(model_uri=model_path)
    preds_df = reloaded_model.transform(spark_model_estimator.spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_estimator.predictions == preds
    # 2. score and compare reloaded pyfunc
    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_estimator.spark_df.toPandas())
    assert spark_model_estimator.predictions == preds2
Example #3
def test_spark_module_model_save_with_mleap_and_unsupported_transformer_raises_exception(
    spark_model_iris, model_path
):
    class CustomTransformer(JavaModel):
        def _transform(self, dataset):
            return dataset

    unsupported_pipeline = Pipeline(stages=[CustomTransformer()])
    unsupported_model = unsupported_pipeline.fit(spark_model_iris.spark_df)

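    # MLeap cannot serialize custom Python transformers, so saving with a sample_input should raise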
    with pytest.raises(ValueError, match="CustomTransformer"):
        sparkm.save_model(
            spark_model=unsupported_model, path=model_path, sample_input=spark_model_iris.spark_df
        )
Example #4
def test_save_with_sample_input_containing_unsupported_data_type_raises_serialization_exception(
        spark_context, model_path):
    sql_context = SQLContext(spark_context)
    unsupported_df = sql_context.createDataFrame([(1, "2016-09-30"),
                                                  (2, "2017-02-27")])
    unsupported_df = unsupported_df.withColumn(
        "_2", unsupported_df._2.cast(DateType()))
    pipeline = Pipeline(stages=[])
    model = pipeline.fit(unsupported_df)
    # The Spark `DateType` is not supported by MLeap, so we expect serialization to fail.
    with pytest.raises(mleap.MLeapSerializationException):
        sparkm.save_model(spark_model=model,
                          path=model_path,
                          sample_input=unsupported_df)
Example #5
def test_sparkml_model_save_accepts_conda_env_as_dict(spark_model_iris, model_path):
    conda_env = dict(mlflow.spark.get_default_conda_env())
    conda_env["dependencies"].append("pytest")
    sparkm.save_model(spark_model=spark_model_iris.model,
                      path=model_path,
                      conda_env=conda_env)

    pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME)
    saved_conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV])
    assert os.path.exists(saved_conda_env_path)

    with open(saved_conda_env_path, "r") as f:
        saved_conda_env_parsed = yaml.safe_load(f)
    assert saved_conda_env_parsed == conda_env
Example #6
def test_spark_module_model_save_with_relative_path_and_valid_sample_input_produces_mleap_flavor(
        spark_model_iris):
    with TempDir(chdr=True) as tmp:
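        # TempDir(chdr=True) switches the working directory into the temp dir, so model_path is a relative path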
        model_path = os.path.basename(tmp.path("model"))
        mlflow_model = Model()
        sparkm.save_model(spark_model=spark_model_iris.model,
                          path=model_path,
                          sample_input=spark_model_iris.spark_df,
                          mlflow_model=mlflow_model)
        assert mleap.FLAVOR_NAME in mlflow_model.flavors

        config_path = os.path.join(model_path, "MLmodel")
        assert os.path.exists(config_path)
        config = Model.load(config_path)
        assert mleap.FLAVOR_NAME in config.flavors
Example #7
def test_model_export(spark_model_iris, model_path, spark_custom_env):
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_custom_env)
    # 1. score and compare reloaded sparkml model
    reloaded_model = sparkm.load_model(model_uri=model_path)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds1
    m = pyfunc.load_pyfunc(model_path)
    # 2. score and compare reloaded pyfunc
    preds2 = m.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds2
    # 3. score and compare reloaded pyfunc Spark udf
    preds3 = score_model_as_udf(model_uri=model_path, pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds3
    assert os.path.exists(sparkm.DFS_TMP)
Example #8
def test_spark_module_model_save_with_sample_input_produces_sparkml_and_mleap_flavors(
        spark_model_iris, model_path):
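    # Saving with a sample_input should add the MLeap flavor alongside the Spark ML flavor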
    mlflow_model = Model()
    sparkm.save_model(spark_model=spark_model_iris.model,
                      path=model_path,
                      sample_input=spark_model_iris.training_df,
                      mlflow_model=mlflow_model)
    assert sparkm.FLAVOR_NAME in mlflow_model.flavors
    assert mleap.FLAVOR_NAME in mlflow_model.flavors

    config_path = os.path.join(model_path, "MLmodel")
    assert os.path.exists(config_path)
    config = Model.load(config_path)
    assert sparkm.FLAVOR_NAME in config.flavors
    assert mleap.FLAVOR_NAME in config.flavors
Example #9
def test_sparkml_model_load_from_remote_uri_succeeds(spark_model_iris,
                                                     model_path,
                                                     mock_s3_bucket):
    sparkm.save_model(spark_model=spark_model_iris.model, path=model_path)

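    # Upload the saved model to the mocked S3 bucket, then reload it via its s3:// URI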
    artifact_root = "s3://{bucket_name}".format(bucket_name=mock_s3_bucket)
    artifact_path = "model"
    artifact_repo = S3ArtifactRepository(artifact_root)
    artifact_repo.log_artifacts(model_path, artifact_path=artifact_path)

    model_uri = artifact_root + "/" + artifact_path
    reloaded_model = sparkm.load_model(model_uri=model_uri)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds
Example #10
def test_model_deployment(spark_model_iris, model_path, spark_custom_env):
    sparkm.save_model(
        spark_model_iris.model,
        path=model_path,
        conda_env=spark_custom_env,
    )
    scoring_response = score_model_in_sagemaker_docker_container(
        model_uri=model_path,
        data=spark_model_iris.pandas_df,
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
        flavor=mlflow.pyfunc.FLAVOR_NAME,
    )
    np.testing.assert_array_almost_equal(
        spark_model_iris.predictions, np.array(json.loads(scoring_response.content)), decimal=4
    )
Example #11
def test_sparkml_model_save_persists_specified_conda_env_in_mlflow_model_directory(
        spark_model_iris, model_path, spark_custom_env):
    sparkm.save_model(spark_model=spark_model_iris.model,
                      path=model_path,
                      conda_env=spark_custom_env)

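    # The custom env file should be copied into the model directory, not referenced in place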
    pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME)
    saved_conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV])
    assert os.path.exists(saved_conda_env_path)
    assert saved_conda_env_path != spark_custom_env

    with open(spark_custom_env, "r") as f:
        spark_custom_env_parsed = yaml.safe_load(f)
    with open(saved_conda_env_path, "r") as f:
        saved_conda_env_parsed = yaml.safe_load(f)
    assert saved_conda_env_parsed == spark_custom_env_parsed
Example #12
def test_model_deployment(spark_model_iris, model_path, spark_conda_env):
    sparkm.save_model(spark_model_iris.model, path=model_path,
                      conda_env=spark_conda_env,
                      # Test both spark ml and mleap
                      sample_input=spark_model_iris.spark_df)

    # 1. score and compare pyfunc deployed in Sagemaker docker container
    preds1 = score_model_in_sagemaker_docker_container(model_path=model_path,
                                                       data=spark_model_iris.pandas_df,
                                                       flavor=mlflow.pyfunc.FLAVOR_NAME)
    assert spark_model_iris.predictions == preds1
    # 2. score and compare mleap deployed in Sagemaker docker container
    preds2 = score_model_in_sagemaker_docker_container(model_path=model_path,
                                                       data=spark_model_iris.pandas_df,
                                                       flavor=mlflow.mleap.FLAVOR_NAME)
    assert spark_model_iris.predictions == preds2
Example #13
def test__NgramSet(numbers_dataframe):

    # Create the transformer
    tokenizer = ct.NLTKWordPunctTokenizer(inputCol="text", outputCol="tokens")

    # Filter to go words
    goWords = ['two', 'three', 'four', 'five']
    gofilt = ct.GoWordFilter(inputCol="tokens",
                             outputCol="go_word_filtered_tokens",
                             goWords=goWords)

    # Create the transformer
    ngrams = ct.NgramSet(inputCol="go_word_filtered_tokens",
                         outputCol="ngram_set",
                         maxN=5)

    # Create a pipeline from the transformer
    pipeline = Pipeline(stages=[tokenizer, gofilt, ngrams])

    # fit the test data (which also builds the pipeline)
    model = pipeline.fit(numbers_dataframe)

    # Test the pipeline
    df_original_transformed = model.transform(numbers_dataframe)

    # Delete any previously saved model (if it exists)
    # (There may be a more elegant way to do this)
    if os.path.exists("unit_test_model"):
        os.system("rm -rf unit_test_model")

    # Log the model and performance
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retreived_transformed = retrieved_model.transform(numbers_dataframe)

    # Assert the retrieved model gives the same results as the original model
    rows_in_common = df_original_transformed.intersect(
        df_retreived_transformed).count()
    assert (df_original_transformed.count() == rows_in_common)

    # Print results for visual inspection
    print("\n")
    print("test__NgramSet: should see a set of 1-5 ngram set")
    df_retreived_transformed.show(truncate=False)

    # If we make it this far without crashing we pass (plus I'm visually reviewing results)
    assert True
Example #14
def test_model_export_with_signature_and_examples(iris_df, spark_model_iris):
    _, _, iris_spark_df = iris_df
    signature_ = infer_signature(iris_spark_df)
    example_ = iris_spark_df.toPandas().head(3)
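    # Exercise every combination of signature and input_example (each omitted or provided)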
    for signature in (None, signature_):
        for example in (None, example_):
            with TempDir() as tmp:
                path = tmp.path("model")
                sparkm.save_model(
                    spark_model_iris.model, path=path, signature=signature, input_example=example
                )
                mlflow_model = Model.load(path)
                assert signature == mlflow_model.signature
                if example is None:
                    assert mlflow_model.saved_input_example_info is None
                else:
                    assert all((_read_example(mlflow_model, path) == example).all())
Example #15
def test__LevenshteinSubstituter(numbers_dataframe):

    # Create the transformer
    tokenizer = ct.NLTKWordPunctTokenizer(inputCol="text", outputCol="tokens")

    # Create the transformer
    tokenMatchers = ['two1', 'four2', 'nineee']
    toksub = ct.LevenshteinSubstituter(inputCol="tokens",
                                       outputCol="swapped_tokens",
                                       tokenMatchers=tokenMatchers,
                                       levenshteinThresh=1)

    # Create a pipeline from the transformer
    pipeline = Pipeline(stages=[tokenizer, toksub])

    # fit the test data (which also builds the pipeline)
    model = pipeline.fit(numbers_dataframe)

    # Test the pipeline
    df_original_transformed = model.transform(numbers_dataframe)

    # Delete any previously saved model (if it exists)
    # (There may be a more elegant way to do this)
    if os.path.exists("unit_test_model"):
        os.system("rm -rf unit_test_model")

    # Log the model and performance
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retreived_transformed = retrieved_model.transform(numbers_dataframe)

    # Assert the retrieved model gives the same results as the original model
    rows_in_common = df_original_transformed.intersect(
        df_retreived_transformed).count()
    assert (df_original_transformed.count() == rows_in_common)

    # Print results for visual inspection
    print("\n")
    print(
        "test__LevenshteinSubstituter: two and four should be substituted and nine should not"
    )
    df_retreived_transformed.show(truncate=False)

    # If we make it this far without crashing we pass (plus I'm visually reviewing results)
    assert True
Example #16
def test__RegexSubstituter(simple_test_dataframe):

    # Create the transformer
    regexMatchers = [r'(?<=[a-zA-Z])\.(?=[A-Z])', '<BR>', '<br>']
    substitutions = ['. ', '. ', '. ']
    transformer = ct.RegexSubstituter(inputCol="text",
                                      outputCol="regexcorrected",
                                      regexMatchers=regexMatchers,
                                      substitutions=substitutions)

    # Create a pipeline from the transformer
    pipeline = Pipeline(stages=[transformer])

    # fit the test data (which also builds the pipeline)
    model = pipeline.fit(simple_test_dataframe)

    # Test the pipeline
    df_original_transformed = model.transform(simple_test_dataframe)

    # Delete any previously saved model (if it exists)
    # (There may be a more elegant way to do this)
    if os.path.exists("unit_test_model"):
        os.system("rm -rf unit_test_model")

    # Log the model and performance
    save_model(model, "unit_test_model")
    retrieved_model = load_model("unit_test_model")
    df_retreived_transformed = retrieved_model.transform(simple_test_dataframe)

    # Assert the retrieved model gives the same results as the original model
    rows_in_common = df_original_transformed.intersect(
        df_retreived_transformed).count()
    assert (df_original_transformed.count() == rows_in_common)

    # Print results for visual inspection
    print("\n")
    print(
        "test__RegexSubstituter: The following should show sentences broken into words"
    )
    df_retreived_transformed.show(truncate=False)

    # If we make it this far without crashing we pass (plus I'm visually reviewing results)
    assert True
Example #17
def test_model_export(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    X = iris.data  # use all four iris features
    y = iris.target
    pandas_df = pd.DataFrame(X, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(y)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    model_path = tmpdir.mkdir("model")
    assembler = VectorAssembler(inputCols=iris.feature_names,
                                outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Score the training data with the fitted pipeline
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    sparkm.save_model(model, path=str(model_path), conda_env=conda_env)
    reloaded_model = sparkm.load_model(path=str(model_path))
    preds_df_1 = reloaded_model.transform(spark_df)
    preds1_1 = [
        x.prediction for x in preds_df_1.select("prediction").collect()
    ]
    assert preds1 == preds1_1
    m = pyfunc.load_pyfunc(str(model_path))
    preds2 = m.predict(pandas_df)
    assert preds1 == preds2
    preds3 = score_model_in_sagemaker_docker_container(
        model_path=str(model_path), data=pandas_df)
    assert preds1 == preds3
    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    assert not os.listdir(sparkm.DFS_TMP)
Example #18
def test_model_export(spark_model_iris, model_path, spark_conda_env):
    preds_df = spark_model_iris.model.transform(spark_model_iris.training_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    sparkm.save_model(spark_model_iris.model,
                      path=model_path,
                      conda_env=spark_conda_env)
    reloaded_model = sparkm.load_model(path=model_path)
    preds_df_1 = reloaded_model.transform(spark_model_iris.training_df)
    preds1_1 = [
        x.prediction for x in preds_df_1.select("prediction").collect()
    ]
    assert preds1 == preds1_1
    m = pyfunc.load_pyfunc(model_path)
    preds2 = m.predict(spark_model_iris.inference_df)
    assert preds1 == preds2
    preds3 = score_model_in_sagemaker_docker_container(
        model_path=model_path, data=spark_model_iris.inference_df)
    assert preds1 == preds3
    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    # We expect the DFS temp directory not to be deleted.
    assert os.listdir(sparkm.DFS_TMP)
Example #19
def test_model_export(spark_model_iris, model_path, spark_conda_env):
    sparkm.save_model(spark_model_iris.model,
                      path=model_path,
                      conda_env=spark_conda_env)
    # 1. score and compare reloaded sparkml model
    reloaded_model = sparkm.load_model(path=model_path)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)

    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds1
    m = pyfunc.load_pyfunc(model_path)
    # 2. score and compare reloaded pyfunc
    preds2 = m.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds2
    # 3. score and compare reloaded pyfunc Spark udf
    preds3 = score_model_as_udf(model_path,
                                run_id=None,
                                pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds3
    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    # We expect the DFS temp directory not to be deleted.
    assert os.listdir(sparkm.DFS_TMP)
Example #20
def test_transformer_model_export(spark_model_transformer, model_path, spark_custom_env):
    with pytest.raises(MlflowException, match="Cannot serialize this model"):
        sparkm.save_model(
            spark_model_transformer.model, path=model_path, conda_env=spark_custom_env
        )
Example #21
def model_selection_via_crossvalidation(num_features, reg_param, net_param,
                                        cv_num_folds):
    # num_features, reg_param, net_param, and cv_num_folds hold the
    # hyperparameter grids and fold count for the cross validator

    # Start a new MLflow run
    with mlflow.start_run():
        tokenizer, remover, counts, lr = build_ml_pipeline()
        pipeline = Pipeline().setStages([tokenizer, remover, counts, lr])
        evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="rawPrediction")

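        # Build the hyperparameter grid over feature count, regularization strength, and elastic-net mixing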
        paramGrid = ParamGridBuilder() \
            .addGrid(counts.numFeatures, num_features) \
            .addGrid(lr.regParam, reg_param) \
            .addGrid(lr.elasticNetParam, net_param) \
            .build()

        crossval = CrossValidator(estimator=pipeline,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=evaluator,
                                  numFolds=cv_num_folds)

        # Run cross-validation, and choose the best set of parameters.
        training_df, validate_df, test_df = prepare_data()
        logger.warning("training classifier")
        cv_model = crossval.fit(training_df)

        # cv_model.bestModel is the best PipelineModel found during cross-validation
        cv_best_pipeline_model = cv_model.bestModel

        logger.info("evaluate trained classifier")
        prediction = cv_model.transform(validate_df)
        prediction.show(n=10)

        area_under_ROC = evaluator.evaluate(prediction)
        logger.info(
            "Area under the ROC curve for the best model selected by CV: "
            + str(area_under_ROC))
        print("\n area_under_ROC: " + str(area_under_ROC))

        accuracy = cv_best_pipeline_model.stages[-1].summary.accuracy
        logger.info("Accuracy metric for the best model selected out of CV: " +
                    str(accuracy))
        print("\n accuracy: " + str(accuracy))

        # save the trained model to a local directory, e.g. /uap/nlp/pyfunc-cv-model on your local system
        mlflow_spark.save_model(cv_best_pipeline_model,
                                path="pyfunc-cv-model",
                                conda_env=None)

        # log the trained model to DBFS
        mlflow_spark.log_model(
            cv_best_pipeline_model,
            artifact_path="/dbfs/tmp/dbconnect-demo/uap/reviews/pyfunc-cv-model",
            conda_env=None)

        # save model as spark flavor
        logger.info(
            "logging cv_best_pipeline_model as a spark flavor on hosted mlflow server"
        )
        spark_cv_model_path = "spark-cv-model"
        mlflow_spark.log_model(cv_best_pipeline_model, spark_cv_model_path)

        # save model as mleap flavor
        # mleap_cv_model_path = "mleap-cv-model"
        # mlflow.mleap.log_model(cv_best_pipeline_model, test_df, mleap_cv_model_path)

        mlflow.log_param(
            "max_iterations",
            cv_best_pipeline_model.stages[-1]._java_obj.getMaxIter())
        mlflow.log_param(
            "reg_param",
            cv_best_pipeline_model.stages[-1]._java_obj.getRegParam())

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("area_under_ROC", area_under_ROC)

        cv_runid = mlflow.active_run().info.run_uuid
        cv_artifactUri = mlflow.get_artifact_uri()

        logger.warning("\ncv_runid: " + str(cv_runid))
        logger.warning("\ncv_artifactUri: " + str(cv_artifactUri))

        return cv_runid, cv_artifactUri
Example #22
def test_sparkml_model_save_without_specified_conda_env_uses_default_env_with_expected_dependencies(
        spark_model_iris, model_path):
    sparkm.save_model(spark_model=spark_model_iris.model, path=model_path)
    _assert_pip_requirements(model_path, sparkm.get_default_pip_requirements())
Example #23
def test_transformer_model_export(spark_model_transformer, model_path, spark_custom_env):
    with pytest.raises(MlflowException) as e:
        sparkm.save_model(
            spark_model_transformer.model, path=model_path, conda_env=spark_custom_env
        )
    assert "Cannot serialize this model" in e.value.message