Example #1
def test_custom_model_save_load(custom_model, custom_layer, data,
                                custom_predicted, model_path):
    x, _ = data
    custom_objects = {"MyDense": custom_layer}
    mlflow.keras.save_model(custom_model,
                            model_path,
                            custom_objects=custom_objects)

    # Loading Keras model
    model_loaded = mlflow.keras.load_model(model_path)
    assert all(model_loaded.predict(x) == custom_predicted)
    # pyfunc serve
    scoring_response = pyfunc_serve_and_score_model(
        model_uri=os.path.abspath(model_path),
        data=pd.DataFrame(x),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
    )
    assert np.allclose(
        pd.read_json(scoring_response.content,
                     orient="records",
                     encoding="utf8").values.astype(np.float32),
        custom_predicted,
        rtol=1e-5,
        atol=1e-9,
    )
    # Loading pyfunc model
    pyfunc_loaded = mlflow.pyfunc.load_model(model_path)
    assert all(pyfunc_loaded.predict(x).values == custom_predicted)
    # test spark udf
    spark_udf_preds = score_model_as_udf(model_uri=os.path.abspath(model_path),
                                         pandas_df=pd.DataFrame(x),
                                         result_type="float")
    assert np.allclose(np.array(spark_udf_preds),
                       custom_predicted.reshape(len(spark_udf_preds)))
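The fixtures this test depends on (custom_layer, custom_model, custom_predicted, data, model_path) are not shown. A minimal sketch of what they plausibly look like; the layer definition, shapes, and training settings are assumptions, not taken from the source:

import os

import numpy as np
import pytest
import tensorflow as tf


class MyDense(tf.keras.layers.Dense):
    """Hypothetical custom layer; any serializable Layer subclass would do."""


@pytest.fixture
def custom_layer():
    return MyDense


@pytest.fixture
def data():
    # Toy regression data; shapes are assumed.
    x = np.random.random((10, 4)).astype(np.float32)
    y = np.random.random((10, 1)).astype(np.float32)
    return x, y


@pytest.fixture
def custom_model(custom_layer, data):
    x, y = data
    model = tf.keras.Sequential([custom_layer(1, input_shape=(x.shape[1],))])
    model.compile(loss="mean_squared_error", optimizer="sgd")
    model.fit(x, y, epochs=1, verbose=0)
    return model


@pytest.fixture
def custom_predicted(custom_model, data):
    x, _ = data
    return custom_model.predict(x)


@pytest.fixture
def model_path(tmpdir):
    return os.path.join(str(tmpdir), "model")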
Example #2
def test_model_save_load(model, model_path, data, predicted):
    x, y = data
    mlflow.keras.save_model(model, model_path)

    # Loading Keras model
    model_loaded = mlflow.keras.load_model(model_path)
    assert all(model_loaded.predict(x) == predicted)

    # Loading pyfunc model
    pyfunc_loaded = mlflow.pyfunc.load_pyfunc(model_path)
    assert all(pyfunc_loaded.predict(x).values == predicted)

    # pyfunc serve
    scoring_response = pyfunc_serve_and_score_model(
        model_path=os.path.abspath(model_path),
        data=pd.DataFrame(x),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED)
    assert all(pd.read_json(scoring_response.content, orient="records").values.astype(np.float32)
               == predicted)

    # test spark udf
    spark_udf_preds = score_model_as_udf(os.path.abspath(model_path),
                                         run_id=None,
                                         pandas_df=pd.DataFrame(x),
                                         result_type="float")
    np.testing.assert_array_almost_equal(
        np.array(spark_udf_preds), predicted.reshape(len(spark_udf_preds)), decimal=6)
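CONTENT_TYPE_JSON_SPLIT_ORIENTED tells the scoring server that the request body is a pandas DataFrame serialized with pandas' orient="split" layout. A quick illustration of that payload shape:

import pandas as pd

df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], columns=["a", "b"])
print(df.to_json(orient="split"))
# {"columns":["a","b"],"index":[0,1],"data":[[1.0,2.0],[3.0,4.0]]}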
Example #3
def test_model_save_load(build_model, model_path, data):
    x, _ = data
    keras_model = build_model(data)
    if build_model == tf_keras_model:
        model_path = os.path.join(model_path, "tf")
    else:
        model_path = os.path.join(model_path, "plain")
    expected = keras_model.predict(x)
    mlflow.keras.save_model(keras_model, model_path)
    # Loading Keras model
    model_loaded = mlflow.keras.load_model(model_path)
    assert type(keras_model) == type(model_loaded)
    assert all(expected == model_loaded.predict(x))
    # Loading pyfunc model
    pyfunc_loaded = mlflow.pyfunc.load_model(model_path)
    assert all(pyfunc_loaded.predict(x).values == expected)

    # pyfunc serve
    scoring_response = pyfunc_serve_and_score_model(
        model_uri=os.path.abspath(model_path),
        data=pd.DataFrame(x),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED)
    assert all(
        pd.read_json(scoring_response.content,
                     orient="records",
                     encoding="utf8").values.astype(np.float32) == expected)
    # test spark udf
    spark_udf_preds = score_model_as_udf(model_uri=os.path.abspath(model_path),
                                         pandas_df=pd.DataFrame(x),
                                         result_type="float")
    assert np.allclose(np.array(spark_udf_preds),
                       expected.reshape(len(spark_udf_preds)))
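build_model is evidently a parametrized fixture choosing between a tf.keras model (tf_keras_model) and a plain-keras one. A hypothetical sketch of the tf.keras branch; the plain-keras variant would be analogous with the standalone keras package:

import pytest
import tensorflow as tf


def tf_keras_model(data):
    # Build and briefly train a small regression model; architecture is assumed.
    x, y = data
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(3, input_shape=(x.shape[1],), activation="relu"),
        tf.keras.layers.Dense(1),
    ])
    model.compile(loss="mean_squared_error", optimizer="sgd")
    model.fit(x, y, epochs=1, verbose=0)
    return model


@pytest.fixture(params=[tf_keras_model])
def build_model(request):
    return request.param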
Example #4
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]
    pandas_df = pd.DataFrame(iris.data, columns=feature_names)  # to make spark_udf work
    pandas_df['label'] = pd.Series(iris.target)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Print the coefficients and intercept for multinomial logistic regression
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(artifact_path=artifact_path, spark_model=model,
                                 dfs_tmpdir=dfs_tmp_dir)
                run_id = active_run().info.run_uuid
                # test pyfunc
                x = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = x.predict(pandas_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path, run_id=run_id,
                                                   dfs_tmpdir=dfs_tmp_dir)
                preds_df_1 = reloaded_model.transform(spark_df)
                preds3 = [x.prediction for x in preds_df_1.select("prediction").collect()]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id, pandas_df)
                assert preds1 == preds4
                # We expect not to delete the DFS tempdir.
                x = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(x)
                assert os.listdir(x)
                shutil.rmtree(x)
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
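score_model_as_udf is a test helper rather than a public MLflow API. A plausible reconstruction on top of mlflow.pyfunc.spark_udf; note the call signature shifts between the positional (artifact_path, run_id, df) form used here and the keyword model_uri form used in the other examples, and the helper below is an assumption that papers over both:

import mlflow.pyfunc
import pyspark


def score_model_as_udf(model_uri, run_id=None, pandas_df=None, result_type="double"):
    # Older call sites pass an artifact path plus a run id; build a runs:/ URI.
    if run_id is not None:
        model_uri = "runs:/{}/{}".format(run_id, model_uri)
    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    spark_df = spark.createDataFrame(pandas_df)
    udf = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri, result_type=result_type)
    # Assumes every column of pandas_df is a model input.
    return [
        row.prediction
        for row in spark_df.withColumn("prediction", udf(*spark_df.columns)).collect()
    ]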
Example #5
def test_model_log_with_sparkml_format(tmpdir, spark_model_iris):
    # Print the coefficients and intercept for multinomial logistic regression
    preds_df = spark_model_iris.model.transform(spark_model_iris.training_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =",
                  dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(artifact_path=artifact_path,
                                 spark_model=spark_model_iris.model,
                                 dfs_tmpdir=dfs_tmp_dir)
                run_id = active_run().info.run_uuid
                # test pyfunc
                x = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = x.predict(spark_model_iris.inference_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path,
                                                   run_id=run_id,
                                                   dfs_tmpdir=dfs_tmp_dir)
                preds_df_1 = reloaded_model.transform(
                    spark_model_iris.training_df)
                preds3 = [
                    x.prediction
                    for x in preds_df_1.select("prediction").collect()
                ]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id,
                                            spark_model_iris.inference_df)
                assert preds1 == preds4
                # We expect not to delete the DFS tempdir.
                x = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(x)
                assert os.listdir(x)
                shutil.rmtree(x)
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
Example #6
def test_model_export(spark_model_iris, model_path, spark_custom_env):
    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_custom_env)
    # 1. score and compare reloaded sparkml model
    reloaded_model = sparkm.load_model(model_uri=model_path)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds1
    m = pyfunc.load_pyfunc(model_path)
    # 2. score and compare reloaded pyfunc
    preds2 = m.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds2
    # 3. score and compare reloaded pyfunc Spark udf
    preds3 = score_model_as_udf(model_uri=model_path, pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds3
    assert os.path.exists(sparkm.DFS_TMP)
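The spark_model_iris fixture bundles the fitted pipeline with its data; its field names vary across these examples (training_df/inference_df in Example #5 versus spark_df/pandas_df/predictions here and in Example #8). A sketch matching this example's fields, built the same way as Example #4:

import collections

import pandas as pd
import pyspark
import pytest
from sklearn import datasets
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

SparkModelWithData = collections.namedtuple(
    "SparkModelWithData", ["model", "spark_df", "pandas_df", "predictions"])


@pytest.fixture(scope="session")
def spark_model_iris():
    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]
    pandas_df = pd.DataFrame(iris.data, columns=feature_names)
    pandas_df["label"] = pd.Series(iris.target)
    spark = pyspark.sql.SparkSession.builder.master("local[2]").getOrCreate()
    spark_df = spark.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    model = Pipeline(stages=[assembler, lr]).fit(spark_df)
    preds_df = model.transform(spark_df)
    predictions = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model, spark_df, pandas_df, predictions)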
Example #7
def test_model_save_load(build_model, save_format, model_path, data):
    x, _ = data
    keras_model = build_model(data)
    if build_model == get_tf_keras_model:
        model_path = os.path.join(model_path, "tf")
    else:
        model_path = os.path.join(model_path, "plain")
    expected = keras_model.predict(x.values)
    kwargs = {"save_format": save_format} if save_format else {}
    mlflow.keras.save_model(keras_model, model_path, **kwargs)
    # Loading Keras model
    model_loaded = mlflow.keras.load_model(model_path)
    # When saving as SavedModel, we actually convert the model
    # to a slightly different format, so we cannot assume it is
    # exactly the same.
    if save_format != "tf":
        assert type(keras_model) == type(model_loaded)
    np.testing.assert_allclose(model_loaded.predict(x.values),
                               expected,
                               rtol=1e-5)
    # Loading pyfunc model
    pyfunc_loaded = mlflow.pyfunc.load_model(model_path)
    np.testing.assert_allclose(pyfunc_loaded.predict(x).values,
                               expected,
                               rtol=1e-5)

    # pyfunc serve
    scoring_response = pyfunc_serve_and_score_model(
        model_uri=os.path.abspath(model_path),
        data=pd.DataFrame(x),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
        extra_args=EXTRA_PYFUNC_SERVING_TEST_ARGS,
    )
    print(scoring_response.content)
    actual_scoring_response = pd.read_json(
        scoring_response.content.decode("utf-8"),
        orient="records",
        encoding="utf8").values.astype(np.float32)
    np.testing.assert_allclose(actual_scoring_response, expected, rtol=1e-5)

    # test spark udf
    spark_udf_preds = score_model_as_udf(model_uri=os.path.abspath(model_path),
                                         pandas_df=pd.DataFrame(x),
                                         result_type="float")
    assert np.allclose(np.array(spark_udf_preds),
                       expected.reshape(len(spark_udf_preds)))
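save_format presumably comes from a parametrized fixture covering Keras' two on-disk formats plus the default; a hypothetical sketch:

import pytest


@pytest.fixture(params=[None, "h5", "tf"])
def save_format(request):
    return request.param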
Example #8
def test_model_export(spark_model_iris, model_path, spark_conda_env):
    sparkm.save_model(spark_model_iris.model,
                      path=model_path,
                      conda_env=spark_conda_env)
    # 1. score and compare reloaded sparkml model
    reloaded_model = sparkm.load_model(path=model_path)
    preds_df = reloaded_model.transform(spark_model_iris.spark_df)

    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    assert spark_model_iris.predictions == preds1
    m = pyfunc.load_pyfunc(model_path)
    # 2. score and compare reloaded pyfunc
    preds2 = m.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds2
    # 3. score and compare reloaded pyfunc Spark udf
    preds3 = score_model_as_udf(model_path,
                                run_id=None,
                                pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == preds3
    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    # We expect not to delete the DFS tempdir.
    assert os.listdir(sparkm.DFS_TMP)
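spark_conda_env here (and spark_custom_env in Example #6) are fixtures that write a conda environment file, presumably via the same _mlflow_conda_env helper Example #4 calls inline. A sketch under that assumption:

import os

import pyspark
import pytest
from mlflow.utils.environment import _mlflow_conda_env


@pytest.fixture
def spark_conda_env(tmpdir):
    # Write a conda env spec pinning pyspark to the installed version.
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(conda_env,
                      additional_pip_deps=["pyspark=={}".format(pyspark.__version__)])
    return conda_env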