def test_pipeline(dataset_text):
    """Autologging records params, tags, and artifacts identically for a flat
    pipeline and for a pipeline that nests another pipeline as a stage."""
    mlflow.pyspark.ml.autolog()

    tok = Tokenizer(inputCol="text", outputCol="words")
    tf = HashingTF(inputCol=tok.getOutputCol(), outputCol="features")
    log_reg = LogisticRegression(maxIter=2, regParam=0.001)

    flat = Pipeline(stages=[tok, tf, log_reg])
    nested = Pipeline(stages=[tok, Pipeline(stages=[tf, log_reg])])

    for estimator in (flat, nested):
        with mlflow.start_run() as run:
            fitted = estimator.fit(dataset_text)
            # estimator_info.json is written by autolog during fit; compare it
            # against the metadata recomputed directly from the estimator.
            estimator_info = load_json_artifact("estimator_info.json")
            metadata = _gen_estimator_metadata(estimator)
            assert metadata.hierarchy == estimator_info["hierarchy"]

        run_data = get_run_data(run.info.run_id)
        expected_params = truncate_param_dict(
            stringify_dict_values(
                _get_instance_param_map(estimator, metadata.uid_to_indexed_name_map)
            )
        )
        assert run_data.params == expected_params
        assert run_data.tags == get_expected_class_tags(estimator)
        assert MODEL_DIR in run_data.artifacts
        assert load_model_by_run_id(run.info.run_id).uid == fitted.uid
        assert run_data.artifacts == ["estimator_info.json", "model"]
def test_param_map_captures_wrapped_params(dataset_binomial):
    """Params of an estimator wrapped by a meta-estimator (OneVsRest) are
    captured under a ``<WrappedClassName>.<param>`` prefix, and autologging
    records the same param map at fit time."""
    inner_lr = LogisticRegression(maxIter=3, standardization=False)
    one_vs_rest = OneVsRest(classifier=inner_lr, labelCol="abcd")

    logged = get_params_to_log(one_vs_rest)
    assert logged["labelCol"] == "abcd"
    # The wrapped classifier is logged by class name, its params by prefix.
    assert logged["classifier"] == "LogisticRegression"
    assert logged["LogisticRegression.maxIter"] == 3
    assert not logged["LogisticRegression.standardization"]
    assert logged["LogisticRegression.tol"] == inner_lr.getOrDefault(inner_lr.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        one_vs_rest.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))
        metadata = _gen_estimator_metadata(one_vs_rest)
        estimator_info = load_json_artifact("estimator_info.json")
        assert metadata.hierarchy == estimator_info["hierarchy"]

    run_data = get_run_data(run.info.run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(get_params_to_log(one_vs_rest))
    )
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    """_gen_estimator_metadata assigns stable indexed names (Tokenizer_1,
    Pipeline_2, ...) to every stage of a deeply nested estimator tree and
    exposes param-search estimators found inside it."""
    # Two parallel text-feature branches feeding a shared assembler.
    tokenizer1 = Tokenizer(inputCol="text1", outputCol="words1")
    hashingTF1 = HashingTF(inputCol=tokenizer1.getOutputCol(), outputCol="features1")
    tokenizer2 = Tokenizer(inputCol="text2", outputCol="words2")
    hashingTF2 = HashingTF(inputCol=tokenizer2.getOutputCol(), outputCol="features2")
    vecAssembler = VectorAssembler(inputCols=["features1", "features2"], outputCol="features")
    lor = LogisticRegression(maxIter=10)
    ova = OneVsRest(classifier=lor)
    sub_pipeline1 = Pipeline(stages=[tokenizer1, hashingTF1])
    sub_pipeline2 = Pipeline(stages=[tokenizer2, hashingTF2])
    sub_pipeline3 = Pipeline(stages=[vecAssembler, ova])
    # 2x2 grid over the inner LogisticRegression's maxIter/regParam.
    paramGrid = (ParamGridBuilder().addGrid(lor.maxIter, [10, 20]).addGrid(
        lor.regParam, [0.1, 0.01]).build())
    eva = MulticlassClassificationEvaluator()
    crossval = CrossValidator(
        estimator=sub_pipeline3, estimatorParamMaps=paramGrid, evaluator=eva, numFolds=2)
    top_pipeline = Pipeline(stages=[sub_pipeline1, sub_pipeline2, crossval])

    metadata = _gen_estimator_metadata(top_pipeline)

    # Indexed names are assigned per class: repeated classes get _1, _2, ...
    # suffixes; classes that appear once (CrossValidator, VectorAssembler,
    # OneVsRest, ...) keep their bare class name.
    expected_hierarchy = {
        "name": "Pipeline_1",
        "stages": [
            {
                "name": "Pipeline_2",
                "stages": [{
                    "name": "Tokenizer_1"
                }, {
                    "name": "HashingTF_1"
                }]
            },
            {
                "name": "Pipeline_3",
                "stages": [{
                    "name": "Tokenizer_2"
                }, {
                    "name": "HashingTF_2"
                }]
            },
            {
                "name": "CrossValidator",
                "evaluator": {
                    "name": "MulticlassClassificationEvaluator"
                },
                "tuned_estimator": {
                    "name": "Pipeline_4",
                    "stages": [
                        {
                            "name": "VectorAssembler"
                        },
                        {
                            "name": "OneVsRest",
                            "classifier": {
                                "name": "LogisticRegression"
                            }
                        },
                    ],
                },
            },
        ],
    }
    assert metadata.hierarchy == expected_hierarchy
    # Every uid in the tree maps to exactly one indexed name.
    assert metadata.uid_to_indexed_name_map == {
        top_pipeline.uid: "Pipeline_1",
        sub_pipeline1.uid: "Pipeline_2",
        tokenizer1.uid: "Tokenizer_1",
        hashingTF1.uid: "HashingTF_1",
        sub_pipeline2.uid: "Pipeline_3",
        tokenizer2.uid: "Tokenizer_2",
        hashingTF2.uid: "HashingTF_2",
        crossval.uid: "CrossValidator",
        sub_pipeline3.uid: "Pipeline_4",
        vecAssembler.uid: "VectorAssembler",
        ova.uid: "OneVsRest",
        lor.uid: "LogisticRegression",
        eva.uid: "MulticlassClassificationEvaluator",
    }
    # The CrossValidator is discovered as a param-search estimator.
    assert (metadata.uid_to_indexed_name_map[
        metadata.param_search_estimators[0].uid] == "CrossValidator")
def test_param_search_estimator(  # pylint: disable=unused-argument
        metric_name, param_search_estimator, spark_session, dataset_regression):
    """End-to-end autologging for param-search estimators (CrossValidator /
    TrainValidationSplit): the parent run gets the search config, best params,
    best model and a search-results table; one child run is created per
    candidate param map with its own params, tags, and metrics.

    Fix: renamed the misspelled local ``param_search_estiamtor_info`` to
    ``param_search_estimator_info`` (used in two assertions below).
    """
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    lrParamMaps = [
        {
            lr.maxIter: 1,
            lr.standardization: False
        },
        {
            lr.maxIter: 200,
            lr.standardization: True
        },
        {
            lr.maxIter: 2,
            lr.standardization: False
        },
    ]
    # The middle candidate is expected to win regardless of metric_name.
    best_params = {
        "LinearRegression.maxIter": 200,
        "LinearRegression.standardization": True
    }
    eva = RegressionEvaluator(metricName=metric_name)
    estimator = param_search_estimator(estimator=lr,
                                       estimatorParamMaps=lrParamMaps,
                                       evaluator=eva)
    with mlflow.start_run() as run:
        model = estimator.fit(dataset_regression)
        # Artifacts written by autolog during fit, read back from the active run.
        estimator_info = load_json_artifact("estimator_info.json")
        metadata = _gen_estimator_metadata(estimator)
        assert metadata.hierarchy == estimator_info["hierarchy"]
        param_search_estimator_info = estimator_info[
            metadata.uid_to_indexed_name_map[estimator.uid]]
        assert param_search_estimator_info[
            "tuned_estimator_parameter_map"] == _get_instance_param_map_recursively(
                lr, 1, metadata.uid_to_indexed_name_map)
        assert param_search_estimator_info[
            "tuning_parameter_map_list"] == _get_tuning_param_maps(
                estimator, metadata.uid_to_indexed_name_map)
        assert best_params == load_json_artifact("best_parameters.json")
        search_results = load_json_csv("search_results.csv")
    uid_to_indexed_name_map = metadata.uid_to_indexed_name_map
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    # Parent-run params = estimator params plus "best_"-prefixed winners.
    assert run_data.params == truncate_param_dict(
        stringify_dict_values({
            **_get_instance_param_map(estimator, uid_to_indexed_name_map),
            **{f"best_{k}": v for k, v in best_params.items()},
        }))
    assert run_data.tags == get_expected_class_tags(estimator)
    assert MODEL_DIR in run_data.artifacts
    loaded_model = load_model_by_run_id(run_id)
    assert loaded_model.stages[0].uid == model.uid
    loaded_best_model = load_model_by_run_id(run_id, "best_model")
    assert loaded_best_model.stages[0].uid == model.bestModel.uid
    assert run_data.artifacts == [
        "best_model",
        "best_parameters.json",
        "estimator_info.json",
        "model",
        "search_results.csv",
    ]

    client = mlflow.tracking.MlflowClient()
    child_runs = client.search_runs(
        run.info.experiment_id, "tags.`mlflow.parentRunId` = '{}'".format(run_id))
    # One child run per row of the search-results table.
    assert len(child_runs) == len(search_results)

    for row_index, row in search_results.iterrows():
        row_params = json.loads(row.get("params", "{}"))
        for param_name, param_value in row_params.items():
            assert param_value == row.get(f"param.{param_name}")
        # Locate the unique child run matching this row's param values.
        params_search_clause = " and ".join([
            "params.`{}` = '{}'".format(key.split(".")[1], value)
            for key, value in row_params.items()
        ])
        search_filter = "tags.`mlflow.parentRunId` = '{}' and {}".format(
            run_id, params_search_clause)
        child_runs = client.search_runs(run.info.experiment_id, search_filter)
        assert len(child_runs) == 1
        child_run = child_runs[0]
        assert child_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
        run_data = get_run_data(child_run.info.run_id)
        child_estimator = estimator.getEstimator().copy(
            estimator.getEstimatorParamMaps()[row_index])
        assert run_data.tags == get_expected_class_tags(child_estimator)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values({
                **_get_instance_param_map(child_estimator, uid_to_indexed_name_map)
            }))
        assert (child_run.data.tags.get(MLFLOW_AUTOLOGGING) ==
                mlflow.pyspark.ml.AUTOLOGGING_INTEGRATION_NAME)

        # CrossValidator logs fold-averaged metrics under an "avg_" prefix;
        # TrainValidationSplit logs the raw validation metric.
        metric_name = estimator.getEvaluator().getMetricName()
        if isinstance(estimator, CrossValidator):
            avg_metric_value = model.avgMetrics[row_index]
            avg_metric_name = f"avg_{metric_name}"
        else:
            avg_metric_value = model.validationMetrics[row_index]
            avg_metric_name = metric_name
        assert math.isclose(avg_metric_value,
                            run_data.metrics[avg_metric_name],
                            rel_tol=1e-6)
        assert math.isclose(avg_metric_value,
                            float(row.get(avg_metric_name)),
                            rel_tol=1e-6)
        # stdMetrics only exists on CrossValidatorModel for pyspark >= 3.3.
        if isinstance(estimator, CrossValidator) and Version(
                pyspark.__version__) >= Version("3.3"):
            std_metric_name = f"std_{metric_name}"
            std_metric_value = model.stdMetrics[row_index]
            assert math.isclose(std_metric_value,
                                run_data.metrics[std_metric_name],
                                rel_tol=1e-6)
            assert math.isclose(std_metric_value,
                                float(row.get(std_metric_name)),
                                rel_tol=1e-6)
def test_log_stage_type_params(spark_session):
    """Params whose values are themselves Transformer/Model/Evaluator instances
    are logged by class name, both in the flat param map and in the
    estimator_info.json hierarchy."""
    from pyspark.ml.base import Estimator, Transformer, Model
    from pyspark.ml.evaluation import Evaluator
    from pyspark.ml.param import Param, Params
    from pyspark.ml.feature import Binarizer, OneHotEncoder

    class TestingEstimator(Estimator):
        # One Param of each stage-like type to exercise the logging paths.
        transformer = Param(Params._dummy(), "transformer", "a transformer param")
        model = Param(Params._dummy(), "model", "a model param")
        evaluator = Param(Params._dummy(), "evaluator", "an evaluator param")

        def setTransformer(self, transformer: Transformer):
            return self._set(transformer=transformer)

        def setModel(self, model: Model):
            return self._set(model=model)

        def setEvaluator(self, evaluator: Evaluator):
            return self._set(evaluator=evaluator)

        def _fit(self, dataset):
            return TestingModel()

    class TestingModel(Model):
        def _transform(self, dataset):
            return dataset

    input_df = spark_session.createDataFrame([(0.0, ), (1.0, ), (2.0, )], ["input"])

    # Stage-valued params: a plain transformer, a fitted model, an evaluator.
    bin_stage = Binarizer(threshold=1.0, inputCol="values", outputCol="features")
    encoder = OneHotEncoder().setInputCols(["input"]).setOutputCols(["output"])
    encoder_model = encoder.fit(input_df)
    roc_eval = BinaryClassificationEvaluator(metricName="areaUnderROC")

    stage_estimator = TestingEstimator()
    stage_estimator.setTransformer(bin_stage)
    stage_estimator.setModel(encoder_model)
    stage_estimator.setEvaluator(roc_eval)

    logged = get_params_to_log(stage_estimator)
    assert logged["transformer"] == "Binarizer"
    assert logged["model"] == "OneHotEncoderModel"
    assert logged["evaluator"] == "BinaryClassificationEvaluator"

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        stage_estimator.fit(input_df)
        metadata = _gen_estimator_metadata(stage_estimator)
        estimator_info = load_json_artifact("estimator_info.json")
        assert metadata.hierarchy == estimator_info["hierarchy"]
        hierarchy_params = estimator_info["hierarchy"]["params"]
        assert isinstance(hierarchy_params, dict)
        assert hierarchy_params["transformer"]["name"] == "Binarizer"
        assert hierarchy_params["model"]["name"] == "OneHotEncoderModel"
        assert hierarchy_params["evaluator"]["name"] == "BinaryClassificationEvaluator"

    run_data = get_run_data(run.info.run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(get_params_to_log(stage_estimator))
    )
def get_params_to_log(estimator):
    """Return the flat param map autologging would record for ``estimator``,
    keyed by the indexed stage names from its generated metadata."""
    name_map = _gen_estimator_metadata(estimator).uid_to_indexed_name_map
    return _get_instance_param_map(estimator, name_map)