def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, iris_dataset):
    """End-to-end check of the "default" evaluator on a multiclass classifier.

    Runs ``evaluate()`` on the iris dataset, then verifies:
      * every expected global classifier metric is logged to the run (with the
        ``_on_data_<dataset>`` suffix) and returned in ``result.metrics``
        (without the suffix),
      * the ``mlflow.datasets`` run tag records the dataset metadata plus the
        evaluated model's uuid,
      * the expected per-class/ROC/SHAP artifacts are logged and exposed on
        the returned ``EvaluationResult``.
    """
    with mlflow.start_run() as run:
        result = evaluate(
            multiclass_logistic_regressor_model_uri,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            dataset_name=iris_dataset.name,
            evaluators="default",
        )

    _, metrics, tags, artifacts = get_run_data(run.info.run_id)

    # Recompute the expected metrics directly from the raw model's
    # predictions/probabilities to compare against what the evaluator logged.
    model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
    _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model)
    y = iris_dataset.labels_data
    y_pred = predict_fn(iris_dataset.features_data)
    y_probs = predict_proba_fn(iris_dataset.features_data)
    expected_metrics = _get_classifier_global_metrics(False, y, y_pred, y_probs, labels=None)

    for metric_key in expected_metrics:
        # Logged metric names carry the dataset suffix; the returned
        # EvaluationResult uses the bare metric name.
        assert np.isclose(
            expected_metrics[metric_key],
            metrics[metric_key + "_on_data_iris_dataset"],
            rtol=1e-3,
        )
        assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3)

    assert json.loads(tags["mlflow.datasets"]) == [
        {**iris_dataset._metadata, "model": model.metadata.model_uuid}
    ]

    assert set(artifacts) == {
        "shap_beeswarm_plot_on_data_iris_dataset.png",
        "per_class_metrics_on_data_iris_dataset.csv",
        "roc_curve_plot_on_data_iris_dataset.png",
        "precision_recall_curve_plot_on_data_iris_dataset.png",
        "shap_feature_importance_plot_on_data_iris_dataset.png",
        "explainer_on_data_iris_dataset",
        "confusion_matrix_on_data_iris_dataset.png",
        "shap_summary_plot_on_data_iris_dataset.png",
    }
    assert result.artifacts.keys() == {
        "per_class_metrics",
        "roc_curve_plot",
        "precision_recall_curve_plot",
        "confusion_matrix",
        "shap_beeswarm_plot",
        "shap_summary_plot",
        "shap_feature_importance_plot",
    }
def test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset):
    """The dummy evaluator logs MAE/MSE for a regressor and returns the same
    values on the ``EvaluationResult``, whether given a loaded pyfunc model or
    a model URI."""
    labels = diabetes_dataset.labels_data
    loaded_model = mlflow.pyfunc.load_model(linear_regressor_model_uri)
    predictions = loaded_model.predict(diabetes_dataset.features_data)

    # Reference values computed directly with sklearn.
    mae = mean_absolute_error(labels, predictions)
    mse = mean_squared_error(labels, predictions)
    expected_metrics = {
        "mean_absolute_error": mae,
        "mean_squared_error": mse,
    }
    # Metric names logged to the run carry the dataset-name suffix.
    expected_saved_metrics = {
        "mean_absolute_error_on_diabetes_dataset": mae,
        "mean_squared_error_on_diabetes_dataset": mse,
    }

    # evaluate() must accept either a loaded model object or a model URI.
    for candidate in (loaded_model, linear_regressor_model_uri):
        with mlflow.start_run() as run:
            outcome = evaluate(
                candidate,
                diabetes_dataset._constructor_args["data"],
                model_type="regressor",
                targets=diabetes_dataset._constructor_args["targets"],
                dataset_name=diabetes_dataset.name,
                evaluators="dummy_evaluator",
            )
        _, saved_metrics, _, _ = get_run_data(run.info.run_id)
        assert saved_metrics == expected_saved_metrics
        assert outcome.metrics == expected_metrics
def test_pandas_df_regressor_evaluation(linear_regressor_model_uri):
    """The default evaluator accepts a plain pandas DataFrame plus a target
    column name, and every returned metric matches its logged counterpart."""
    diabetes = sklearn.datasets.load_diabetes()
    frame = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
    frame["y"] = diabetes.target

    loaded_model = mlflow.pyfunc.load_model(linear_regressor_model_uri)
    dataset_name = "diabetes_pd"
    suffix = f"_on_data_{dataset_name}"

    # evaluate() must accept either a loaded model object or a model URI.
    for candidate in (loaded_model, linear_regressor_model_uri):
        with mlflow.start_run() as run:
            outcome = evaluate(
                candidate,
                data=frame,
                targets="y",
                model_type="regressor",
                dataset_name=dataset_name,
                evaluators=["default"],
            )
        _, logged_metrics, _, _ = get_run_data(run.info.run_id)
        for metric_name, metric_value in outcome.metrics.items():
            assert metric_value == logged_metrics[f"{metric_name}{suffix}"]
def test_custom_metric(binary_logistic_regressor_model_uri, breast_cancer_dataset):
    """Custom metric functions passed via ``custom_metrics`` are invoked with
    the evaluation dataframe and the built-in metrics, and their returned
    values are both logged to the run (suffixed) and exposed on the
    ``EvaluationResult`` (unsuffixed)."""

    def example_custom_metric(eval_df, given_metrics):
        # Derives two extra metrics: one from built-in confusion-matrix
        # counts, one from the raw predictions column.
        return {
            "true_count": given_metrics["true_negatives"] + given_metrics["true_positives"],
            "positive_count": np.sum(eval_df["prediction"]),
        }

    with mlflow.start_run() as run:
        result = evaluate(
            binary_logistic_regressor_model_uri,
            breast_cancer_dataset._constructor_args["data"],
            model_type="classifier",
            targets=breast_cancer_dataset._constructor_args["targets"],
            dataset_name=breast_cancer_dataset.name,
            evaluators="default",
            custom_metrics=[example_custom_metric],
        )

    _, metrics, _, _ = get_run_data(run.info.run_id)

    # Recompute the expected confusion-matrix counts from the raw model.
    model = mlflow.pyfunc.load_model(binary_logistic_regressor_model_uri)
    _, _, predict_fn, _ = _extract_raw_model_and_predict_fn(model)
    y = breast_cancer_dataset.labels_data
    y_pred = predict_fn(breast_cancer_dataset.features_data)
    expected_metrics = _get_classifier_per_class_metrics(y, y_pred)

    assert "true_count_on_data_breast_cancer_dataset" in metrics
    assert np.isclose(
        metrics["true_count_on_data_breast_cancer_dataset"],
        expected_metrics["true_negatives"] + expected_metrics["true_positives"],
        rtol=1e-3,
    )
    assert "true_count" in result.metrics
    assert np.isclose(
        result.metrics["true_count"],
        expected_metrics["true_negatives"] + expected_metrics["true_positives"],
        rtol=1e-3,
    )
    assert "positive_count_on_data_breast_cancer_dataset" in metrics
    assert np.isclose(
        metrics["positive_count_on_data_breast_cancer_dataset"], np.sum(y_pred), rtol=1e-3
    )
    assert "positive_count" in result.metrics
    assert np.isclose(result.metrics["positive_count"], np.sum(y_pred), rtol=1e-3)
def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset):
    """Default evaluator on an SVM (binary) classifier.

    The expected metrics are computed with ``y_probs=None`` — this model path
    provides no predict_proba — and the asserted artifact set contains no
    ROC / precision-recall curve files, unlike the probability-based
    classifier tests in this module.
    """
    with mlflow.start_run() as run:
        result = evaluate(
            svm_model_uri,
            breast_cancer_dataset._constructor_args["data"],
            model_type="classifier",
            targets=breast_cancer_dataset._constructor_args["targets"],
            dataset_name=breast_cancer_dataset.name,
            evaluators="default",
        )

    _, metrics, tags, artifacts = get_run_data(run.info.run_id)

    model = mlflow.pyfunc.load_model(svm_model_uri)
    _, _, predict_fn, _ = _extract_raw_model_and_predict_fn(model)
    y = breast_cancer_dataset.labels_data
    y_pred = predict_fn(breast_cancer_dataset.features_data)
    expected_metrics = _get_classifier_global_metrics(True, y, y_pred, None, labels=None)

    for metric_key in expected_metrics:
        # Logged names carry the dataset suffix; result.metrics uses bare names.
        assert np.isclose(
            expected_metrics[metric_key],
            metrics[metric_key + "_on_data_breast_cancer_dataset"],
            rtol=1e-3,
        )
        assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3)

    assert json.loads(tags["mlflow.datasets"]) == [
        {**breast_cancer_dataset._metadata, "model": model.metadata.model_uuid}
    ]

    assert set(artifacts) == {
        "confusion_matrix_on_data_breast_cancer_dataset.png",
        "shap_feature_importance_plot_on_data_breast_cancer_dataset.png",
        "shap_beeswarm_plot_on_data_breast_cancer_dataset.png",
        "shap_summary_plot_on_data_breast_cancer_dataset.png",
    }
    assert result.artifacts.keys() == {
        "confusion_matrix",
        "shap_beeswarm_plot",
        "shap_summary_plot",
        "shap_feature_importance_plot",
    }
def _get_results_for_custom_metrics_tests(model_uri, dataset, custom_metrics):
    """Run the default evaluator with the given ``custom_metrics`` and return
    the tuple ``(EvaluationResult, logged metrics, logged artifact names)``."""
    with mlflow.start_run() as run:
        eval_result = evaluate(
            model_uri,
            dataset._constructor_args["data"],
            model_type="classifier",
            targets=dataset._constructor_args["targets"],
            dataset_name=dataset.name,
            evaluators="default",
            custom_metrics=custom_metrics,
        )
    _, logged_metrics, _, logged_artifacts = get_run_data(run.info.run_id)
    return eval_result, logged_metrics, logged_artifacts
def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset):
    """Default evaluator on a linear regressor: logged metrics match the
    reference regressor metrics, the dataset tag records the model uuid, and
    the three SHAP plots are produced."""
    with mlflow.start_run() as run:
        result = evaluate(
            linear_regressor_model_uri,
            diabetes_dataset._constructor_args["data"],
            model_type="regressor",
            targets=diabetes_dataset._constructor_args["targets"],
            dataset_name=diabetes_dataset.name,
            evaluators="default",
        )

    _, logged_metrics, logged_tags, logged_artifacts = get_run_data(run.info.run_id)

    # Reference metrics computed directly from the model's own predictions.
    pyfunc_model = mlflow.pyfunc.load_model(linear_regressor_model_uri)
    predictions = pyfunc_model.predict(diabetes_dataset.features_data)
    expected = _get_regressor_metrics(diabetes_dataset.labels_data, predictions)

    for name, value in expected.items():
        # Run metrics carry the dataset suffix; result.metrics uses bare names.
        assert np.isclose(value, logged_metrics[name + "_on_data_diabetes_dataset"], rtol=1e-3)
        assert np.isclose(value, result.metrics[name], rtol=1e-3)

    expected_dataset_entry = {
        **diabetes_dataset._metadata,
        "model": pyfunc_model.metadata.model_uuid,
    }
    assert json.loads(logged_tags["mlflow.datasets"]) == [expected_dataset_entry]

    assert set(logged_artifacts) == {
        "shap_beeswarm_plot_on_data_diabetes_dataset.png",
        "shap_feature_importance_plot_on_data_diabetes_dataset.png",
        "shap_summary_plot_on_data_diabetes_dataset.png",
    }
    assert result.artifacts.keys() == {
        "shap_beeswarm_plot",
        "shap_feature_importance_plot",
        "shap_summary_plot",
    }
def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diabetes_spark_dataset):
    """Default evaluator on a Spark linear-regressor pyfunc model.

    Verifies the logged regressor metrics and the ``mlflow.datasets`` tag;
    ``log_model_explainability`` is requested, but the assertions below expect
    no artifacts to be logged for this model.
    """
    with mlflow.start_run() as run:
        result = evaluate(
            spark_linear_regressor_model_uri,
            diabetes_spark_dataset._constructor_args["data"],
            model_type="regressor",
            targets=diabetes_spark_dataset._constructor_args["targets"],
            dataset_name=diabetes_spark_dataset.name,
            evaluators="default",
            evaluator_config={"log_model_explainability": True},
        )

    _, metrics, tags, artifacts = get_run_data(run.info.run_id)

    # Load the model once and reuse it both for the reference predictions and
    # for the model-uuid check (the original code reloaded the same URI a
    # second time before the tag assertion — redundant work, now removed).
    model = mlflow.pyfunc.load_model(spark_linear_regressor_model_uri)
    X = diabetes_spark_dataset.features_data
    y = diabetes_spark_dataset.labels_data
    y_pred = model.predict(X)
    expected_metrics = _get_regressor_metrics(y, y_pred)

    for metric_key in expected_metrics:
        # Run metrics carry the dataset suffix; result.metrics uses bare names.
        assert np.isclose(
            expected_metrics[metric_key],
            metrics[metric_key + "_on_data_diabetes_spark_dataset"],
            rtol=1e-3,
        )
        assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3)

    assert json.loads(tags["mlflow.datasets"]) == [
        {**diabetes_spark_dataset._metadata, "model": model.metadata.model_uuid}
    ]

    # No artifacts are expected for this evaluation.
    assert set(artifacts) == set()
    assert result.artifacts == {}
def test_evaluate_with_multi_evaluators(multiclass_logistic_regressor_model_uri, iris_dataset):
    """``evaluate()`` fans out to every applicable registered evaluator and
    merges their metrics and artifacts into a single ``EvaluationResult``.

    The evaluator registry is patched to contain two fake evaluators; each
    fake's ``can_evaluate``/``evaluate`` is mocked so the exact call arguments
    (model, model_type, dataset, run_id, per-evaluator config) can be asserted.
    """
    with mock.patch.object(
        _model_evaluation_registry,
        "_registry",
        {"test_evaluator1": FakeEvauator1, "test_evaluator2": FakeEvauator2},
    ):
        evaluator1_config = {"eval1_confg": 3}
        evaluator2_config = {"eval2_confg": 4}
        evaluator1_return_value = EvaluationResult(
            metrics={"m1": 5}, artifacts={"a1": FakeArtifact1(uri="uri1")}
        )
        evaluator2_return_value = EvaluationResult(
            metrics={"m2": 6}, artifacts={"a2": FakeArtifact2(uri="uri2")}
        )
        # evaluators = None is the case evaluators unspecified, it should fetch all registered
        # evaluators, and the evaluation results should equal to the case of
        # evaluators=["test_evaluator1", "test_evaluator2"]
        for evaluators in [None, ["test_evaluator1", "test_evaluator2"]]:
            with mock.patch.object(
                FakeEvauator1, "can_evaluate", return_value=True
            ) as mock_can_evaluate1, mock.patch.object(
                FakeEvauator1, "evaluate", return_value=evaluator1_return_value
            ) as mock_evaluate1, mock.patch.object(
                FakeEvauator2, "can_evaluate", return_value=True
            ) as mock_can_evaluate2, mock.patch.object(
                FakeEvauator2, "evaluate", return_value=evaluator2_return_value
            ) as mock_evaluate2:
                classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
                with mlflow.start_run() as run:
                    eval_result = evaluate(
                        classifier_model,
                        iris_dataset._constructor_args["data"],
                        model_type="classifier",
                        targets=iris_dataset._constructor_args["targets"],
                        dataset_name=iris_dataset.name,
                        evaluators=evaluators,
                        # evaluator_config is keyed by evaluator name; each
                        # evaluator must receive only its own config.
                        evaluator_config={
                            "test_evaluator1": evaluator1_config,
                            "test_evaluator2": evaluator2_config,
                        },
                    )
                    # Result is the union of both evaluators' outputs.
                    assert eval_result.metrics == {
                        **evaluator1_return_value.metrics,
                        **evaluator2_return_value.metrics,
                    }
                    assert eval_result.artifacts == {
                        **evaluator1_return_value.artifacts,
                        **evaluator2_return_value.artifacts,
                    }
                    mock_can_evaluate1.assert_called_once_with(
                        model_type="classifier", evaluator_config=evaluator1_config
                    )
                    mock_evaluate1.assert_called_once_with(
                        model=classifier_model,
                        model_type="classifier",
                        dataset=iris_dataset,
                        run_id=run.info.run_id,
                        evaluator_config=evaluator1_config,
                    )
                    mock_can_evaluate2.assert_called_once_with(
                        model_type="classifier",
                        evaluator_config=evaluator2_config,
                    )
                    mock_evaluate2.assert_called_once_with(
                        model=classifier_model,
                        model_type="classifier",
                        dataset=iris_dataset,
                        run_id=run.info.run_id,
                        evaluator_config=evaluator2_config,
                    )
def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_dataset):
    """Contract between ``evaluate()`` and a registered evaluator.

    Two cases, with the registry patched to contain a single fake evaluator:
      1. ``can_evaluate`` returns False → ``evaluate()`` raises ValueError and
         the evaluator's ``evaluate`` is never invoked.
      2. ``can_evaluate`` returns True → the evaluator's ``evaluate`` is
         called exactly once with (model, model_type, dataset, run_id,
         evaluator_config) and its EvaluationResult is returned unchanged.
    """
    with mock.patch.object(
        _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvauator1}
    ):
        evaluator1_config = {"eval1_confg_a": 3, "eval1_confg_b": 4}
        evaluator1_return_value = EvaluationResult(
            metrics={"m1": 5, "m2": 6},
            artifacts={"a1": FakeArtifact1(uri="uri1"), "a2": FakeArtifact2(uri="uri2")},
        )
        # Case 1: evaluator declines the model.
        with mock.patch.object(
            FakeEvauator1, "can_evaluate", return_value=False
        ) as mock_can_evaluate, mock.patch.object(
            FakeEvauator1, "evaluate", return_value=evaluator1_return_value
        ) as mock_evaluate:
            with mlflow.start_run():
                with pytest.raises(
                    ValueError,
                    match="The model could not be evaluated by any of the registered evaluators",
                ):
                    evaluate(
                        multiclass_logistic_regressor_model_uri,
                        data=iris_dataset._constructor_args["data"],
                        model_type="classifier",
                        targets=iris_dataset._constructor_args["targets"],
                        dataset_name=iris_dataset.name,
                        evaluators="test_evaluator1",
                        evaluator_config=evaluator1_config,
                    )
                mock_can_evaluate.assert_called_once_with(
                    model_type="classifier", evaluator_config=evaluator1_config
                )
                mock_evaluate.assert_not_called()
        # Case 2: evaluator accepts the model and its result is passed through.
        with mock.patch.object(
            FakeEvauator1, "can_evaluate", return_value=True
        ) as mock_can_evaluate, mock.patch.object(
            FakeEvauator1, "evaluate", return_value=evaluator1_return_value
        ) as mock_evaluate:
            classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
            with mlflow.start_run() as run:
                eval1_result = evaluate(
                    classifier_model,
                    iris_dataset._constructor_args["data"],
                    model_type="classifier",
                    targets=iris_dataset._constructor_args["targets"],
                    dataset_name=iris_dataset.name,
                    evaluators="test_evaluator1",
                    evaluator_config=evaluator1_config,
                )
                assert eval1_result.metrics == evaluator1_return_value.metrics
                assert eval1_result.artifacts == evaluator1_return_value.artifacts
                mock_can_evaluate.assert_called_once_with(
                    model_type="classifier", evaluator_config=evaluator1_config
                )
                mock_evaluate.assert_called_once_with(
                    model=classifier_model,
                    model_type="classifier",
                    dataset=iris_dataset,
                    run_id=run.info.run_id,
                    evaluator_config=evaluator1_config,
                )
def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset):
    """dummy_evaluator on a classifier: accuracy metric plus a
    confusion-matrix CSV artifact, and an ``EvaluationResult`` save/load
    round-trip via TempDir."""
    # NOTE(review): a later function in this module defines the same name
    # `test_classifier_evaluate`, which shadows this one — under pytest only
    # the later definition is collected. Consider renaming one of the two.
    y_true = iris_dataset.labels_data
    classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
    y_pred = classifier_model.predict(iris_dataset.features_data)
    expected_accuracy_score = accuracy_score(y_true, y_pred)
    expected_metrics = {
        "accuracy_score": expected_accuracy_score,
    }
    # Metric names logged to the run carry the dataset-name suffix.
    expected_saved_metrics = {
        "accuracy_score_on_iris_dataset": expected_accuracy_score,
    }
    expected_artifact = confusion_matrix(y_true, y_pred)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            classifier_model,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            dataset_name=iris_dataset.name,
            evaluators="dummy_evaluator",
        )

    artifact_name = "confusion_matrix_on_iris_dataset.csv"
    saved_artifact_path = get_local_artifact_path(run.info.run_id, artifact_name)

    _, saved_metrics, _, saved_artifacts = get_run_data(run.info.run_id)
    assert saved_metrics == expected_saved_metrics
    assert saved_artifacts == [artifact_name]

    assert eval_result.metrics == expected_metrics
    confusion_matrix_artifact = eval_result.artifacts[artifact_name]
    assert np.array_equal(confusion_matrix_artifact.content, expected_artifact)
    assert confusion_matrix_artifact.uri == get_artifact_uri(run.info.run_id, artifact_name)
    # _load with an explicit path reads the locally saved artifact file.
    assert np.array_equal(confusion_matrix_artifact._load(saved_artifact_path), expected_artifact)

    # Round-trip: save the EvaluationResult to a temp dir, check the on-disk
    # layout, then load it back and compare.
    with TempDir() as temp_dir:
        temp_dir_path = temp_dir.path()
        eval_result.save(temp_dir_path)

        with open(temp_dir.path("metrics.json"), "r") as fp:
            assert json.load(fp) == eval_result.metrics

        with open(temp_dir.path("artifacts_metadata.json"), "r") as fp:
            assert json.load(fp) == {
                "confusion_matrix_on_iris_dataset.csv": {
                    "uri": confusion_matrix_artifact.uri,
                    "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact",
                }
            }

        assert os.listdir(temp_dir.path("artifacts")) == ["confusion_matrix_on_iris_dataset.csv"]

        loaded_eval_result = EvaluationResult.load(temp_dir_path)
        assert loaded_eval_result.metrics == eval_result.metrics
        loaded_confusion_matrix_artifact = loaded_eval_result.artifacts[artifact_name]
        assert confusion_matrix_artifact.uri == loaded_confusion_matrix_artifact.uri
        assert np.array_equal(
            confusion_matrix_artifact.content,
            loaded_confusion_matrix_artifact.content,
        )

        # _load with no path argument fetches the content from the artifact URI.
        new_confusion_matrix_artifact = Array2DEvaluationArtifact(uri=confusion_matrix_artifact.uri)
        new_confusion_matrix_artifact._load()
        assert np.array_equal(
            confusion_matrix_artifact.content,
            new_confusion_matrix_artifact.content,
        )
def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset):
    """dummy_evaluator on a classifier, extended variant: accuracy metric plus
    BOTH a confusion-matrix CSV artifact and a confusion-matrix PNG image
    artifact, with a full ``EvaluationResult`` save/load round-trip.

    NOTE(review): this re-defines `test_classifier_evaluate` from earlier in
    the module, shadowing it under pytest.
    """
    y_true = iris_dataset.labels_data
    classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
    y_pred = classifier_model.predict(iris_dataset.features_data)
    expected_accuracy_score = accuracy_score(y_true, y_pred)
    expected_metrics = {
        "accuracy_score": expected_accuracy_score,
    }
    # Metric names logged to the run carry the dataset-name suffix.
    expected_saved_metrics = {
        "accuracy_score_on_iris_dataset": expected_accuracy_score,
    }

    expected_csv_artifact = confusion_matrix(y_true, y_pred)
    # Build the reference PNG: render the confusion-matrix figure into an
    # in-memory buffer and reopen it as a PIL image.
    cm_figure = sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_true, y_pred).figure_
    img_buf = io.BytesIO()
    cm_figure.savefig(img_buf)
    img_buf.seek(0)
    expected_image_artifact = Image.open(img_buf)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            classifier_model,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            dataset_name=iris_dataset.name,
            evaluators="dummy_evaluator",
        )

    csv_artifact_name = "confusion_matrix_on_iris_dataset"
    saved_csv_artifact_path = get_local_artifact_path(run.info.run_id, csv_artifact_name + ".csv")
    png_artifact_name = "confusion_matrix_image_on_iris_dataset"
    saved_png_artifact_path = get_local_artifact_path(run.info.run_id, png_artifact_name) + ".png"

    _, saved_metrics, _, saved_artifacts = get_run_data(run.info.run_id)
    assert saved_metrics == expected_saved_metrics
    assert set(saved_artifacts) == {csv_artifact_name + ".csv", png_artifact_name + ".png"}

    assert eval_result.metrics == expected_metrics
    confusion_matrix_artifact = eval_result.artifacts[csv_artifact_name]
    assert np.array_equal(confusion_matrix_artifact.content, expected_csv_artifact)
    assert confusion_matrix_artifact.uri == get_artifact_uri(
        run.info.run_id, csv_artifact_name + ".csv"
    )
    assert np.array_equal(
        confusion_matrix_artifact._load(saved_csv_artifact_path), expected_csv_artifact
    )

    confusion_matrix_image_artifact = eval_result.artifacts[png_artifact_name]
    # ImageChops.difference(...).getbbox() is None <=> the two images are
    # pixel-identical.
    assert (
        ImageChops.difference(
            confusion_matrix_image_artifact.content, expected_image_artifact
        ).getbbox()
        is None
    )
    assert confusion_matrix_image_artifact.uri == get_artifact_uri(
        run.info.run_id, png_artifact_name + ".png"
    )
    assert (
        ImageChops.difference(
            confusion_matrix_image_artifact._load(saved_png_artifact_path),
            expected_image_artifact,
        ).getbbox()
        is None
    )

    # Round-trip: save the EvaluationResult, check the on-disk layout, then
    # load it back and compare both artifacts.
    with TempDir() as temp_dir:
        temp_dir_path = temp_dir.path()
        eval_result.save(temp_dir_path)

        with open(temp_dir.path("metrics.json"), "r") as fp:
            assert json.load(fp) == eval_result.metrics

        with open(temp_dir.path("artifacts_metadata.json"), "r") as fp:
            json_dict = json.load(fp)
            assert "confusion_matrix_on_iris_dataset" in json_dict
            assert json_dict["confusion_matrix_on_iris_dataset"] == {
                "uri": confusion_matrix_artifact.uri,
                "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact",
            }
            assert "confusion_matrix_image_on_iris_dataset" in json_dict
            assert json_dict["confusion_matrix_image_on_iris_dataset"] == {
                "uri": confusion_matrix_image_artifact.uri,
                "class_name": "mlflow.models.evaluation.artifacts.ImageEvaluationArtifact",
            }

        assert set(os.listdir(temp_dir.path("artifacts"))) == {
            "confusion_matrix_on_iris_dataset.csv",
            "confusion_matrix_image_on_iris_dataset.png",
        }

        loaded_eval_result = EvaluationResult.load(temp_dir_path)
        assert loaded_eval_result.metrics == eval_result.metrics
        loaded_confusion_matrix_artifact = loaded_eval_result.artifacts[csv_artifact_name]
        assert confusion_matrix_artifact.uri == loaded_confusion_matrix_artifact.uri
        assert np.array_equal(
            confusion_matrix_artifact.content,
            loaded_confusion_matrix_artifact.content,
        )
        loaded_confusion_matrix_image_artifact = loaded_eval_result.artifacts[png_artifact_name]
        assert confusion_matrix_image_artifact.uri == loaded_confusion_matrix_image_artifact.uri
        assert (
            ImageChops.difference(
                confusion_matrix_image_artifact.content,
                loaded_confusion_matrix_image_artifact.content,
            ).getbbox()
            is None
        )

        # _load with no path argument fetches the content from the artifact URI.
        new_confusion_matrix_artifact = Array2DEvaluationArtifact(uri=confusion_matrix_artifact.uri)
        new_confusion_matrix_artifact._load()
        assert np.array_equal(
            confusion_matrix_artifact.content,
            new_confusion_matrix_artifact.content,
        )
        new_confusion_matrix_image_artifact = ImageEvaluationArtifact(
            uri=confusion_matrix_image_artifact.uri
        )
        new_confusion_matrix_image_artifact._load()
        assert np.array_equal(
            confusion_matrix_image_artifact.content,
            new_confusion_matrix_image_artifact.content,
        )