def test_pipeline(dataset_text):
    """Autolog the fit of a flat and a nested ``Pipeline`` and verify the run.

    For each pipeline the test checks that the logged ``estimator_info.json``
    hierarchy matches the generated metadata, that the run's params, tags and
    artifact list match values derived from the estimator, and that the
    logged model loads back with the same uid as the fitted model.
    """
    mlflow.pyspark.ml.autolog()

    word_splitter = Tokenizer(inputCol="text", outputCol="words")
    term_hasher = HashingTF(
        inputCol=word_splitter.getOutputCol(),
        outputCol="features",
    )
    classifier = LogisticRegression(maxIter=2, regParam=0.001)

    # The same three stages, once flat and once with the last two stages
    # wrapped in an inner pipeline.
    flat_pipeline = Pipeline(stages=[word_splitter, term_hasher, classifier])
    inner_pipeline = Pipeline(stages=[term_hasher, classifier])
    nested_pipeline = Pipeline(stages=[word_splitter, inner_pipeline])

    for estimator in (flat_pipeline, nested_pipeline):
        with mlflow.start_run() as run:
            fitted_model = estimator.fit(dataset_text)
            logged_info = load_json_artifact("estimator_info.json")
            metadata = _gen_estimator_metadata(estimator)
            assert metadata.hierarchy == logged_info["hierarchy"]

        run_id = run.info.run_id
        run_data = get_run_data(run_id)

        raw_params = _get_instance_param_map(
            estimator, metadata.uid_to_indexed_name_map)
        expected_params = truncate_param_dict(stringify_dict_values(raw_params))
        assert run_data.params == expected_params

        assert run_data.tags == get_expected_class_tags(estimator)
        assert MODEL_DIR in run_data.artifacts

        reloaded_model = load_model_by_run_id(run_id)
        assert reloaded_model.uid == fitted_model.uid
        assert run_data.artifacts == ["estimator_info.json", "model"]
def test_param_map_captures_wrapped_params(dataset_binomial):
    """Check that params of an estimator wrapped by ``OneVsRest`` are captured.

    First inspects the param map returned by ``get_params_to_log`` directly,
    then fits under autologging and compares the logged hierarchy and run
    params against the same helpers.
    """
    base_classifier = LogisticRegression(maxIter=3, standardization=False)
    ova = OneVsRest(classifier=base_classifier, labelCol="abcd")

    captured = get_params_to_log(ova)
    assert captured["labelCol"] == "abcd"
    assert captured["classifier"] == "LogisticRegression"
    assert captured["LogisticRegression.maxIter"] == 3
    assert not captured["LogisticRegression.standardization"]
    # ``tol`` was never set explicitly, so compare with whatever the wrapped
    # estimator itself reports for it.
    expected_tol = base_classifier.getOrDefault(base_classifier.tol)
    assert captured["LogisticRegression.tol"] == expected_tol

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        # The estimator reads its label from "abcd", so expose the existing
        # label column under that name.
        training_df = dataset_binomial.withColumn("abcd", dataset_binomial.label)
        ova.fit(training_df)
        metadata = _gen_estimator_metadata(ova)
        logged_info = load_json_artifact("estimator_info.json")
        assert metadata.hierarchy == logged_info["hierarchy"]

    run_data = get_run_data(run.info.run_id)
    expected_params = truncate_param_dict(
        stringify_dict_values(get_params_to_log(ova)))
    assert run_data.params == expected_params
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    """Verify ``_gen_estimator_metadata`` on a deeply nested estimator tree.

    The tree is a top-level pipeline holding two tokenizer/hashing
    sub-pipelines and a ``CrossValidator`` that tunes a third sub-pipeline
    (``VectorAssembler`` followed by ``OneVsRest`` over ``LogisticRegression``).
    The test checks the generated hierarchy, the uid-to-indexed-name map and
    the first entry of ``param_search_estimators``.
    """
    tokenizer1 = Tokenizer(inputCol="text1", outputCol="words1")
    hashingTF1 = HashingTF(
        inputCol=tokenizer1.getOutputCol(), outputCol="features1")

    tokenizer2 = Tokenizer(inputCol="text2", outputCol="words2")
    hashingTF2 = HashingTF(
        inputCol=tokenizer2.getOutputCol(), outputCol="features2")

    vecAssembler = VectorAssembler(
        inputCols=["features1", "features2"], outputCol="features")

    lor = LogisticRegression(maxIter=10)
    ova = OneVsRest(classifier=lor)
    sub_pipeline1 = Pipeline(stages=[tokenizer1, hashingTF1])
    sub_pipeline2 = Pipeline(stages=[tokenizer2, hashingTF2])
    sub_pipeline3 = Pipeline(stages=[vecAssembler, ova])

    paramGrid = (
        ParamGridBuilder()
        .addGrid(lor.maxIter, [10, 20])
        .addGrid(lor.regParam, [0.1, 0.01])
        .build()
    )
    eva = MulticlassClassificationEvaluator()
    crossval = CrossValidator(
        estimator=sub_pipeline3,
        estimatorParamMaps=paramGrid,
        evaluator=eva,
        numFolds=2,
    )

    top_pipeline = Pipeline(stages=[sub_pipeline1, sub_pipeline2, crossval])

    metadata = _gen_estimator_metadata(top_pipeline)

    # Expected hierarchy, assembled from its sub-trees. Class names that occur
    # more than once carry a numeric suffix; the others appear bare.
    first_branch = {
        "name": "Pipeline_2",
        "stages": [{"name": "Tokenizer_1"}, {"name": "HashingTF_1"}],
    }
    second_branch = {
        "name": "Pipeline_3",
        "stages": [{"name": "Tokenizer_2"}, {"name": "HashingTF_2"}],
    }
    tuned_branch = {
        "name": "Pipeline_4",
        "stages": [
            {"name": "VectorAssembler"},
            {
                "name": "OneVsRest",
                "classifier": {"name": "LogisticRegression"},
            },
        ],
    }
    cross_validator_node = {
        "name": "CrossValidator",
        "evaluator": {"name": "MulticlassClassificationEvaluator"},
        "tuned_estimator": tuned_branch,
    }
    expected_hierarchy = {
        "name": "Pipeline_1",
        "stages": [first_branch, second_branch, cross_validator_node],
    }
    assert metadata.hierarchy == expected_hierarchy

    expected_names_by_uid = {
        top_pipeline.uid: "Pipeline_1",
        sub_pipeline1.uid: "Pipeline_2",
        tokenizer1.uid: "Tokenizer_1",
        hashingTF1.uid: "HashingTF_1",
        sub_pipeline2.uid: "Pipeline_3",
        tokenizer2.uid: "Tokenizer_2",
        hashingTF2.uid: "HashingTF_2",
        crossval.uid: "CrossValidator",
        sub_pipeline3.uid: "Pipeline_4",
        vecAssembler.uid: "VectorAssembler",
        ova.uid: "OneVsRest",
        lor.uid: "LogisticRegression",
        eva.uid: "MulticlassClassificationEvaluator",
    }
    assert metadata.uid_to_indexed_name_map == expected_names_by_uid

    first_search_estimator = metadata.param_search_estimators[0]
    first_search_name = metadata.uid_to_indexed_name_map[
        first_search_estimator.uid]
    assert first_search_name == "CrossValidator"
def test_param_search_estimator(  # pylint: disable=unused-argument
        metric_name, param_search_estimator, spark_session,
        dataset_regression):
    """Autolog a parameter-search fit and verify the parent and child runs.

    Args:
        metric_name: Metric name handed to ``RegressionEvaluator``.
        param_search_estimator: Callable invoked with ``estimator``,
            ``estimatorParamMaps`` and ``evaluator`` keyword arguments to
            build the search estimator; the test branches on whether the
            result is a ``CrossValidator``.
        spark_session: Unused directly by the test body.
        dataset_regression: Dataset the search estimator is fitted on.

    The parent run is checked for its logged estimator info, best parameters,
    params, tags, artifacts and loadable models. Each row of
    ``search_results.csv`` is then matched to exactly one child run whose
    params, tags and metrics are compared with the fitted model.
    """
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    candidate_param_maps = [
        {lr.maxIter: 1, lr.standardization: False},
        {lr.maxIter: 200, lr.standardization: True},
        {lr.maxIter: 2, lr.standardization: False},
    ]
    # The second candidate is the one the test expects to be selected.
    best_params = {
        "LinearRegression.maxIter": 200,
        "LinearRegression.standardization": True,
    }
    eva = RegressionEvaluator(metricName=metric_name)
    estimator = param_search_estimator(
        estimator=lr,
        estimatorParamMaps=candidate_param_maps,
        evaluator=eva,
    )

    with mlflow.start_run() as run:
        model = estimator.fit(dataset_regression)
        estimator_info = load_json_artifact("estimator_info.json")
        metadata = _gen_estimator_metadata(estimator)
        assert metadata.hierarchy == estimator_info["hierarchy"]

        search_estimator_name = metadata.uid_to_indexed_name_map[estimator.uid]
        search_estimator_info = estimator_info[search_estimator_name]

        expected_tuned_params = _get_instance_param_map_recursively(
            lr, 1, metadata.uid_to_indexed_name_map)
        assert (search_estimator_info["tuned_estimator_parameter_map"] ==
                expected_tuned_params)

        expected_tuning_maps = _get_tuning_param_maps(
            estimator, metadata.uid_to_indexed_name_map)
        assert (search_estimator_info["tuning_parameter_map_list"] ==
                expected_tuning_maps)

        assert best_params == load_json_artifact("best_parameters.json")

        search_results = load_json_csv("search_results.csv")

    uid_to_indexed_name_map = metadata.uid_to_indexed_name_map
    run_id = run.info.run_id
    run_data = get_run_data(run_id)

    # Parent run params: the estimator's own params plus the best params,
    # the latter logged under a "best_" prefix.
    expected_parent_params = dict(
        _get_instance_param_map(estimator, uid_to_indexed_name_map))
    expected_parent_params.update(
        {f"best_{k}": v for k, v in best_params.items()})
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(expected_parent_params))

    assert run_data.tags == get_expected_class_tags(estimator)
    assert MODEL_DIR in run_data.artifacts
    loaded_model = load_model_by_run_id(run_id)
    assert loaded_model.stages[0].uid == model.uid
    loaded_best_model = load_model_by_run_id(run_id, "best_model")
    assert loaded_best_model.stages[0].uid == model.bestModel.uid
    assert run_data.artifacts == [
        "best_model",
        "best_parameters.json",
        "estimator_info.json",
        "model",
        "search_results.csv",
    ]

    client = mlflow.tracking.MlflowClient()
    experiment_id = run.info.experiment_id
    parent_filter = f"tags.`mlflow.parentRunId` = '{run_id}'"
    all_child_runs = client.search_runs(experiment_id, parent_filter)
    assert len(all_child_runs) == len(search_results)

    # These values do not vary between rows, so compute them once.
    evaluator_metric = estimator.getEvaluator().getMetricName()
    is_cross_validator = isinstance(estimator, CrossValidator)
    has_std_metrics = is_cross_validator and Version(
        pyspark.__version__) >= Version("3.3")
    finished_status = RunStatus.to_string(RunStatus.FINISHED)

    for row_index, row in search_results.iterrows():
        row_params = json.loads(row.get("params", "{}"))
        for param_name, param_value in row_params.items():
            assert param_value == row.get(f"param.{param_name}")

        # Child runs are looked up by the param name that follows the first
        # "." of each key in the row's params.
        param_clauses = []
        for key, value in row_params.items():
            short_name = key.split(".")[1]
            param_clauses.append(f"params.`{short_name}` = '{value}'")
        params_search_clause = " and ".join(param_clauses)
        search_filter = f"{parent_filter} and {params_search_clause}"

        matching_runs = client.search_runs(experiment_id, search_filter)
        assert len(matching_runs) == 1
        child_run = matching_runs[0]
        assert child_run.info.status == finished_status

        child_run_data = get_run_data(child_run.info.run_id)
        row_param_map = estimator.getEstimatorParamMaps()[row_index]
        child_estimator = estimator.getEstimator().copy(row_param_map)
        assert child_run_data.tags == get_expected_class_tags(child_estimator)

        expected_child_params = dict(
            _get_instance_param_map(child_estimator, uid_to_indexed_name_map))
        assert child_run_data.params == truncate_param_dict(
            stringify_dict_values(expected_child_params))
        assert (child_run.data.tags.get(MLFLOW_AUTOLOGGING) ==
                mlflow.pyspark.ml.AUTOLOGGING_INTEGRATION_NAME)

        # A CrossValidator model exposes averaged metrics, checked under an
        # "avg_" prefix; otherwise the model's validation metrics are
        # checked under the plain metric name.
        if is_cross_validator:
            avg_metric_value = model.avgMetrics[row_index]
            avg_metric_name = f"avg_{evaluator_metric}"
        else:
            avg_metric_value = model.validationMetrics[row_index]
            avg_metric_name = evaluator_metric

        assert math.isclose(
            avg_metric_value,
            child_run_data.metrics[avg_metric_name],
            rel_tol=1e-6,
        )
        assert math.isclose(
            avg_metric_value,
            float(row.get(avg_metric_name)),
            rel_tol=1e-6,
        )

        # Standard-deviation metrics are only checked for CrossValidator on
        # pyspark 3.3 or newer.
        if has_std_metrics:
            std_metric_name = f"std_{evaluator_metric}"
            std_metric_value = model.stdMetrics[row_index]
            assert math.isclose(
                std_metric_value,
                child_run_data.metrics[std_metric_name],
                rel_tol=1e-6,
            )
            assert math.isclose(
                std_metric_value,
                float(row.get(std_metric_name)),
                rel_tol=1e-6,
            )
def test_log_stage_type_params(spark_session):
    """Check params whose values are a transformer, a model and an evaluator.

    A custom estimator is given such params; the test verifies they appear by
    class name in the param map, and that after an autologged fit the logged
    ``estimator_info.json`` hierarchy lists them under ``params`` with the
    same names.
    """
    from pyspark.ml.base import Estimator, Transformer, Model
    from pyspark.ml.evaluation import Evaluator
    from pyspark.ml.param import Param, Params
    from pyspark.ml.feature import Binarizer, OneHotEncoder

    class TestingModel(Model):
        """Model returned by ``TestingEstimator``; passes data through unchanged."""

        def _transform(self, dataset):
            return dataset

    class TestingEstimator(Estimator):
        """Estimator with three params that each hold another ML object."""

        transformer = Param(
            Params._dummy(), "transformer", "a transformer param")
        model = Param(Params._dummy(), "model", "a model param")
        evaluator = Param(Params._dummy(), "evaluator", "an evaluator param")

        def setTransformer(self, transformer: Transformer):
            return self._set(transformer=transformer)

        def setModel(self, model: Model):
            return self._set(model=model)

        def setEvaluator(self, evaluator: Evaluator):
            return self._set(evaluator=evaluator)

        def _fit(self, dataset):
            return TestingModel()

    binarizer = Binarizer(
        threshold=1.0, inputCol="values", outputCol="features")
    df = spark_session.createDataFrame([(0.0, ), (1.0, ), (2.0, )], ["input"])
    ohe = OneHotEncoder().setInputCols(["input"]).setOutputCols(["output"])
    ohemodel = ohe.fit(df)
    bcd = BinaryClassificationEvaluator(metricName="areaUnderROC")

    estimator = TestingEstimator()
    estimator = estimator.setTransformer(binarizer)
    estimator = estimator.setModel(ohemodel)
    estimator = estimator.setEvaluator(bcd)

    param_map = get_params_to_log(estimator)
    assert param_map["transformer"] == "Binarizer"
    assert param_map["model"] == "OneHotEncoderModel"
    assert param_map["evaluator"] == "BinaryClassificationEvaluator"

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        estimator.fit(df)
        metadata = _gen_estimator_metadata(estimator)
        estimator_info = load_json_artifact("estimator_info.json")
        logged_hierarchy = estimator_info["hierarchy"]
        assert metadata.hierarchy == logged_hierarchy

        logged_params = logged_hierarchy["params"]
        assert isinstance(logged_params, dict)
        assert logged_params["transformer"]["name"] == "Binarizer"
        assert logged_params["model"]["name"] == "OneHotEncoderModel"
        assert (logged_params["evaluator"]["name"] ==
                "BinaryClassificationEvaluator")

    run_data = get_run_data(run.info.run_id)
    expected_params = truncate_param_dict(
        stringify_dict_values(get_params_to_log(estimator)))
    assert run_data.params == expected_params
def get_params_to_log(estimator):
    """Return the instance param map of ``estimator``.

    Generates the estimator's metadata and passes its
    ``uid_to_indexed_name_map`` to ``_get_instance_param_map``.
    """
    uid_to_name = _gen_estimator_metadata(estimator).uid_to_indexed_name_map
    return _get_instance_param_map(estimator, uid_to_name)