def test_pipeline(dataset_text):
    """Autologging a (possibly nested) Pipeline logs params, tags, the model,
    and an ``estimator_info.json`` artifact whose hierarchy matches the
    generated estimator metadata.

    NOTE(review): a later function in this file redefines ``test_pipeline``,
    so pytest only collects that later definition — confirm which version of
    the test is intended to survive.
    """
    mlflow.pyspark.ml.autolog()
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    log_reg = LogisticRegression(maxIter=2, regParam=0.001)
    flat_pipeline = Pipeline(stages=[tokenizer, hashing_tf, log_reg])
    inner_pipeline = Pipeline(stages=[hashing_tf, log_reg])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])

    for estimator in (flat_pipeline, nested_pipeline):
        with mlflow.start_run() as run:
            model = estimator.fit(dataset_text)
            estimator_info = load_json_artifact("estimator_info.json")
            metadata = _gen_estimator_metadata(estimator)
            assert metadata.hierarchy == estimator_info["hierarchy"]
            uid_to_indexed_name_map = metadata.uid_to_indexed_name_map
            run_id = run.info.run_id
            run_data = get_run_data(run_id)
            expected_params = truncate_param_dict(
                stringify_dict_values(
                    _get_instance_param_map(estimator, uid_to_indexed_name_map)
                )
            )
            assert run_data.params == expected_params
            assert run_data.tags == get_expected_class_tags(estimator)
            assert MODEL_DIR in run_data.artifacts
            loaded_model = load_model_by_run_id(run_id)
            assert loaded_model.uid == model.uid
            assert run_data.artifacts == ["estimator_info.json", "model"]
def test_pipeline(dataset_text):
    """Pipeline autologging: stage hierarchy JSON plus params/tags/artifacts.

    NOTE(review): this redefines ``test_pipeline`` declared earlier in the
    file (apparently from an older API revision that logged
    ``pipeline_hierarchy.json``) — only this later definition is collected by
    pytest; confirm which one should be kept.
    """
    mlflow.pyspark.ml.autolog()
    tok = Tokenizer(inputCol="text", outputCol="words")
    tf = HashingTF(inputCol=tok.getOutputCol(), outputCol="features")
    clf = LogisticRegression(maxIter=2, regParam=0.001)
    flat = Pipeline(stages=[tok, tf, clf])
    inner = Pipeline(stages=[tf, clf])
    nested = Pipeline(stages=[tok, inner])

    # The hierarchy helper flattens plain pipelines and nests inner ones.
    assert _get_pipeline_stage_hierarchy(flat) == {flat.uid: [tok.uid, tf.uid, clf.uid]}
    assert _get_pipeline_stage_hierarchy(nested) == {
        nested.uid: [tok.uid, {inner.uid: [tf.uid, clf.uid]}]
    }

    for estimator in (flat, nested):
        with mlflow.start_run() as run:
            model = estimator.fit(dataset_text)
            run_id = run.info.run_id
            run_data = get_run_data(run_id)
            assert run_data.params == truncate_param_dict(
                stringify_dict_values(_get_instance_param_map(estimator))
            )
            assert run_data.tags == get_expected_class_tags(estimator)
            assert MODEL_DIR in run_data.artifacts
            loaded = load_model_by_run_id(run_id)
            assert loaded.uid == model.uid
            assert run_data.artifacts == ["model", "pipeline_hierarchy.json"]
def test_param_map_captures_wrapped_params(dataset_binomial):
    """Params of a classifier wrapped in OneVsRest are captured under
    ``<uid>.<param>`` keys, and the same map is what autologging records."""
    base_clf = LogisticRegression(maxIter=3, standardization=False)
    ova = OneVsRest(classifier=base_clf, labelCol="abcd")
    param_map = _get_instance_param_map(ova)
    prefix = base_clf.uid
    assert param_map["labelCol"] == "abcd"
    # The wrapped classifier itself is referenced by its uid.
    assert param_map["classifier"] == prefix
    assert param_map[prefix + ".maxIter"] == 3
    assert not param_map[prefix + ".standardization"]
    # Defaults of the wrapped estimator are captured too.
    assert param_map[prefix + ".tol"] == base_clf.getOrDefault(base_clf.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        ova.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))
        run_id = run.info.run_id
        run_data = get_run_data(run_id)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values(_get_instance_param_map(ova))
        )
def test_fit_with_params(dataset_binomial):
    """``fit(..., params=...)`` logs the params of the estimator copy that
    carries the overrides, not the original defaults."""
    mlflow.pyspark.ml.autolog()
    estimator = LinearRegression()
    overrides = {estimator.maxIter: 3, estimator.standardization: False}
    with mlflow.start_run() as run:
        fitted = estimator.fit(dataset_binomial, params=overrides)
        run_id = run.info.run_id
        run_data = get_run_data(run_id)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values(_get_instance_param_map(estimator.copy(overrides)))
        )
        assert run_data.tags == get_expected_class_tags(estimator)
        assert MODEL_DIR in run_data.artifacts
        loaded = load_model_by_run_id(run_id)
        assert loaded.stages[0].uid == fitted.uid
def test_get_instance_param_map(spark_session):  # pylint: disable=unused-argument
    """Exercise ``_get_instance_param_map`` on a plain estimator, a wrapped
    (OneVsRest) estimator, and both flat and nested pipelines."""
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = _get_instance_param_map(lor)
    assert lor_params["maxIter"] == 3
    assert not lor_params["standardization"]
    assert lor_params["family"] == lor.getOrDefault(lor.family)

    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = _get_instance_param_map(ova)
    assert ova_params["classifier"] == lor.uid
    assert ova_params["labelCol"] == "abcd"
    # Wrapped-estimator params are namespaced by the wrapped uid.
    assert ova_params[f"{lor.uid}.maxIter"] == 3
    assert ova_params[f"{lor.uid}.family"] == lor.getOrDefault(lor.family)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    flat = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner = Pipeline(stages=[hashingTF, ova])
    nested = Pipeline(stages=[tokenizer, inner])
    flat_params = _get_instance_param_map(flat)
    nested_params = _get_instance_param_map(nested)

    # "stages" mirrors the pipeline structure: flat list of uids, or a nested
    # dict for an inner pipeline.
    assert flat_params["stages"] == [tokenizer.uid, hashingTF.uid, ova.uid]
    assert nested_params["stages"] == [
        tokenizer.uid,
        {inner.uid: [hashingTF.uid, ova.uid]},
    ]
    for params in (flat_params, nested_params):
        assert params[f"{tokenizer.uid}.inputCol"] == "text"
        assert params[f"{tokenizer.uid}.outputCol"] == "words"
        assert params[f"{hashingTF.uid}.outputCol"] == "features"
        assert params[f"{ova.uid}.classifier"] == lor.uid
        assert params[f"{lor.uid}.maxIter"] == 3
def test_meta_estimator_fit(dataset_binomial):
    """Fitting a meta estimator (OneVsRest) logs to a single run and does not
    spawn nested child runs."""
    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        svc = LinearSVC()
        ova = OneVsRest(classifier=svc)
        fitted = ova.fit(dataset_binomial)
        run_id = run.info.run_id
        run_data = get_run_data(run_id)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values(_get_instance_param_map(ova))
        )
        assert run_data.tags == get_expected_class_tags(ova)
        assert MODEL_DIR in run_data.artifacts
        loaded = load_model_by_run_id(run_id)
        assert loaded.stages[0].uid == fitted.uid

    # assert no nested run spawned
    query = f"tags.{MLFLOW_PARENT_RUN_ID} = '{run.info.run_id}'"
    assert len(mlflow.search_runs([run.info.experiment_id])) == 1
    assert len(mlflow.search_runs([run.info.experiment_id], query)) == 0
def test_basic_estimator(dataset_binomial):
    """Basic estimators log params and tags; the model artifact is asserted
    absent for MultilayerPerceptronClassifier (presumably excluded by the
    autologging model-logging policy — confirm against the allowlist)."""
    mlflow.pyspark.ml.autolog()
    estimators = (
        LinearRegression(),
        MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123, blockSize=1),
    )
    for estimator in estimators:
        with mlflow.start_run() as run:
            model = estimator.fit(dataset_binomial)
            run_id = run.info.run_id
            run_data = get_run_data(run_id)
            assert run_data.params == truncate_param_dict(
                stringify_dict_values(_get_instance_param_map(estimator))
            )
            assert run_data.tags == get_expected_class_tags(estimator)
            if isinstance(estimator, MultilayerPerceptronClassifier):
                assert MODEL_DIR not in run_data.artifacts
            else:
                assert MODEL_DIR in run_data.artifacts
                loaded = load_model_by_run_id(run_id)
                assert loaded.stages[0].uid == model.uid
def test_param_search_estimator(  # pylint: disable=unused-argument
    metric_name, param_search_estimator, spark_session, dataset_regression
):
    """Param-search estimators (CrossValidator / TrainValidationSplit) log the
    parent run's artifacts (best model, best params, search results, estimator
    info) and one child run per candidate param map, with per-candidate
    metrics matching the fitted model's metric arrays.

    Fixes applied in review: the local ``param_search_estiamtor_info`` typo is
    corrected, a redundant ``{**dict}`` splat around the child-run param map
    is removed, and string formatting is unified on f-strings.
    """
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    lrParamMaps = [
        {lr.maxIter: 1, lr.standardization: False},
        {lr.maxIter: 200, lr.standardization: True},
        {lr.maxIter: 2, lr.standardization: False},
    ]
    best_params = {
        "LinearRegression.maxIter": 200,
        "LinearRegression.standardization": True,
    }
    eva = RegressionEvaluator(metricName=metric_name)
    estimator = param_search_estimator(
        estimator=lr, estimatorParamMaps=lrParamMaps, evaluator=eva
    )
    with mlflow.start_run() as run:
        model = estimator.fit(dataset_regression)
        estimator_info = load_json_artifact("estimator_info.json")
        metadata = _gen_estimator_metadata(estimator)
        assert metadata.hierarchy == estimator_info["hierarchy"]
        # Entry for the search estimator itself, keyed by its indexed name.
        param_search_estimator_info = estimator_info[
            metadata.uid_to_indexed_name_map[estimator.uid]
        ]
        assert param_search_estimator_info[
            "tuned_estimator_parameter_map"
        ] == _get_instance_param_map_recursively(lr, 1, metadata.uid_to_indexed_name_map)
        assert param_search_estimator_info[
            "tuning_parameter_map_list"
        ] == _get_tuning_param_maps(estimator, metadata.uid_to_indexed_name_map)
        assert best_params == load_json_artifact("best_parameters.json")
        search_results = load_json_csv("search_results.csv")
        uid_to_indexed_name_map = metadata.uid_to_indexed_name_map
        run_id = run.info.run_id
        run_data = get_run_data(run_id)
        # Parent run carries the full param map plus "best_"-prefixed winners.
        assert run_data.params == truncate_param_dict(
            stringify_dict_values(
                {
                    **_get_instance_param_map(estimator, uid_to_indexed_name_map),
                    **{f"best_{k}": v for k, v in best_params.items()},
                }
            )
        )
        assert run_data.tags == get_expected_class_tags(estimator)
        assert MODEL_DIR in run_data.artifacts
        loaded_model = load_model_by_run_id(run_id)
        assert loaded_model.stages[0].uid == model.uid
        loaded_best_model = load_model_by_run_id(run_id, "best_model")
        assert loaded_best_model.stages[0].uid == model.bestModel.uid
        assert run_data.artifacts == [
            "best_model",
            "best_parameters.json",
            "estimator_info.json",
            "model",
            "search_results.csv",
        ]

        client = mlflow.tracking.MlflowClient()
        child_runs = client.search_runs(
            run.info.experiment_id, f"tags.`mlflow.parentRunId` = '{run_id}'"
        )
        assert len(child_runs) == len(search_results)

        for row_index, row in search_results.iterrows():
            row_params = json.loads(row.get("params", "{}"))
            for param_name, param_value in row_params.items():
                assert param_value == row.get(f"param.{param_name}")
            # Build a filter that pins every candidate param value; the key's
            # leading "LinearRegression." prefix is stripped for the query.
            params_search_clause = " and ".join(
                "params.`{}` = '{}'".format(key.split(".")[1], value)
                for key, value in row_params.items()
            )
            search_filter = (
                f"tags.`mlflow.parentRunId` = '{run_id}' and {params_search_clause}"
            )
            child_runs = client.search_runs(run.info.experiment_id, search_filter)
            assert len(child_runs) == 1
            child_run = child_runs[0]
            assert child_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
            run_data = get_run_data(child_run.info.run_id)
            child_estimator = estimator.getEstimator().copy(
                estimator.getEstimatorParamMaps()[row_index]
            )
            assert run_data.tags == get_expected_class_tags(child_estimator)
            assert run_data.params == truncate_param_dict(
                stringify_dict_values(
                    _get_instance_param_map(child_estimator, uid_to_indexed_name_map)
                )
            )
            assert (
                child_run.data.tags.get(MLFLOW_AUTOLOGGING)
                == mlflow.pyspark.ml.AUTOLOGGING_INTEGRATION_NAME
            )

            metric_name = estimator.getEvaluator().getMetricName()
            if isinstance(estimator, CrossValidator):
                avg_metric_value = model.avgMetrics[row_index]
                avg_metric_name = f"avg_{metric_name}"
            else:
                avg_metric_value = model.validationMetrics[row_index]
                avg_metric_name = metric_name
            assert math.isclose(
                avg_metric_value, run_data.metrics[avg_metric_name], rel_tol=1e-6
            )
            assert math.isclose(
                avg_metric_value, float(row.get(avg_metric_name)), rel_tol=1e-6
            )
            # std metrics only exist for CrossValidator on pyspark >= 3.3.
            if isinstance(estimator, CrossValidator) and Version(
                pyspark.__version__
            ) >= Version("3.3"):
                std_metric_name = f"std_{metric_name}"
                std_metric_value = model.stdMetrics[row_index]
                assert math.isclose(
                    std_metric_value, run_data.metrics[std_metric_name], rel_tol=1e-6
                )
                assert math.isclose(
                    std_metric_value, float(row.get(std_metric_name)), rel_tol=1e-6
                )
def get_params_to_log(estimator):
    """Return the autolog param map for *estimator*, keyed by indexed names."""
    name_map = _gen_estimator_metadata(estimator).uid_to_indexed_name_map
    return _get_instance_param_map(estimator, name_map)