def evaluate(self, model, model_type, dataset, run_id, evaluator_config=None, **kwargs) -> EvaluationResult:
    client = mlflow.tracking.MlflowClient()
    X, y = dataset._extract_features_and_labels()
    y_pred = model.predict(X)
    if model_type == "classifier":
        accuracy_score = sk_metrics.accuracy_score(y, y_pred)
        metrics = EvaluationMetrics(accuracy_score=accuracy_score)
        self._log_metrics(run_id, metrics, dataset.name)

        confusion_matrix = sk_metrics.confusion_matrix(y, y_pred)
        confusion_matrix_artifact_name = f"confusion_matrix_on_{dataset.name}.csv"
        confusion_matrix_artifact = Array2DEvaluationArtifact(
            uri=get_artifact_uri(run_id, confusion_matrix_artifact_name),
            content=confusion_matrix,
        )
        confusion_matrix_csv_buff = io.StringIO()
        confusion_matrix_artifact.save(confusion_matrix_csv_buff)
        client.log_text(run_id, confusion_matrix_csv_buff.getvalue(), confusion_matrix_artifact_name)

        artifacts = {confusion_matrix_artifact_name: confusion_matrix_artifact}
    elif model_type == "regressor":
        mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred)
        mean_squared_error = sk_metrics.mean_squared_error(y, y_pred)
        metrics = EvaluationMetrics(
            mean_absolute_error=mean_absolute_error,
            mean_squared_error=mean_squared_error,
        )
        self._log_metrics(run_id, metrics, dataset.name)
        artifacts = {}
    else:
        raise ValueError(f"Unsupported model type {model_type}")

    return EvaluationResult(metrics=metrics, artifacts=artifacts)
def evaluate(
    self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs
) -> EvaluationResult:
    client = mlflow.tracking.MlflowClient()
    X = dataset.features_data
    y = dataset.labels_data
    y_pred = model.predict(X)
    if model_type == "classifier":
        accuracy_score = sk_metrics.accuracy_score(y, y_pred)
        metrics = {"accuracy_score": accuracy_score}
        self._log_metrics(run_id, metrics, dataset.name)

        confusion_matrix = sk_metrics.confusion_matrix(y, y_pred)
        confusion_matrix_artifact_name = f"confusion_matrix_on_{dataset.name}"
        confusion_matrix_artifact = Array2DEvaluationArtifact(
            uri=get_artifact_uri(run_id, confusion_matrix_artifact_name + ".csv"),
            content=confusion_matrix,
        )
        confusion_matrix_csv_buff = io.StringIO()
        confusion_matrix_artifact._save(confusion_matrix_csv_buff)
        client.log_text(
            run_id,
            confusion_matrix_csv_buff.getvalue(),
            confusion_matrix_artifact_name + ".csv",
        )

        confusion_matrix_figure = sk_metrics.ConfusionMatrixDisplay.from_predictions(
            y, y_pred
        ).figure_
        img_buf = io.BytesIO()
        confusion_matrix_figure.savefig(img_buf)
        img_buf.seek(0)
        confusion_matrix_image = Image.open(img_buf)

        confusion_matrix_image_artifact_name = f"confusion_matrix_image_on_{dataset.name}"
        confusion_matrix_image_artifact = ImageEvaluationArtifact(
            uri=get_artifact_uri(run_id, confusion_matrix_image_artifact_name + ".png"),
            content=confusion_matrix_image,
        )
        confusion_matrix_image_artifact._save(confusion_matrix_image_artifact_name + ".png")
        client.log_image(
            run_id, confusion_matrix_image, confusion_matrix_image_artifact_name + ".png"
        )

        artifacts = {
            confusion_matrix_artifact_name: confusion_matrix_artifact,
            confusion_matrix_image_artifact_name: confusion_matrix_image_artifact,
        }
    elif model_type == "regressor":
        mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred)
        mean_squared_error = sk_metrics.mean_squared_error(y, y_pred)
        metrics = {
            "mean_absolute_error": mean_absolute_error,
            "mean_squared_error": mean_squared_error,
        }
        self._log_metrics(run_id, metrics, dataset.name)
        artifacts = {}
    else:
        raise ValueError(f"Unsupported model type {model_type}")

    return EvaluationResult(metrics=metrics, artifacts=artifacts)
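A note on dispatch: `evaluate` is only invoked after the evaluator's `can_evaluate` hook accepts the model type, and the keyword-only calling convention is pinned down by the `assert_called_once_with(model_type=..., evaluator_config=...)` checks in the tests below. A minimal sketch of a matching `can_evaluate` follows; only the signature is taken from the tests, while the body and the import path are assumptions.

from mlflow.models.evaluation import ModelEvaluator  # import path assumed


class DummyEvaluator(ModelEvaluator):
    def can_evaluate(self, *, model_type, evaluator_config, **kwargs):
        # Assumed minimal implementation: accept only the two model types that the
        # `evaluate` method above knows how to score.
        return model_type in ("classifier", "regressor")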
def test_evaluate_with_multi_evaluators(multiclass_logistic_regressor_model_uri, iris_dataset):
    with mock.patch.object(
        _model_evaluation_registry,
        "_registry",
        {"test_evaluator1": FakeEvauator1, "test_evaluator2": FakeEvauator2},
    ):
        evaluator1_config = {"eval1_confg": 3}
        evaluator2_config = {"eval2_confg": 4}
        evaluator1_return_value = EvaluationResult(
            metrics={"m1": 5}, artifacts={"a1": FakeArtifact1(uri="uri1")}
        )
        evaluator2_return_value = EvaluationResult(
            metrics={"m2": 6}, artifacts={"a2": FakeArtifact2(uri="uri2")}
        )

        # evaluators=None means no evaluators were specified: all registered evaluators
        # should be fetched, and the evaluation results should equal those produced by
        # evaluators=["test_evaluator1", "test_evaluator2"].
        for evaluators in [None, ["test_evaluator1", "test_evaluator2"]]:
            with mock.patch.object(
                FakeEvauator1, "can_evaluate", return_value=True
            ) as mock_can_evaluate1, mock.patch.object(
                FakeEvauator1, "evaluate", return_value=evaluator1_return_value
            ) as mock_evaluate1, mock.patch.object(
                FakeEvauator2, "can_evaluate", return_value=True
            ) as mock_can_evaluate2, mock.patch.object(
                FakeEvauator2, "evaluate", return_value=evaluator2_return_value
            ) as mock_evaluate2:
                classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
                with mlflow.start_run() as run:
                    eval_result = evaluate(
                        classifier_model,
                        iris_dataset._constructor_args["data"],
                        model_type="classifier",
                        targets=iris_dataset._constructor_args["targets"],
                        dataset_name=iris_dataset.name,
                        evaluators=evaluators,
                        evaluator_config={
                            "test_evaluator1": evaluator1_config,
                            "test_evaluator2": evaluator2_config,
                        },
                    )
                    assert eval_result.metrics == {
                        **evaluator1_return_value.metrics,
                        **evaluator2_return_value.metrics,
                    }
                    assert eval_result.artifacts == {
                        **evaluator1_return_value.artifacts,
                        **evaluator2_return_value.artifacts,
                    }
                    mock_can_evaluate1.assert_called_once_with(
                        model_type="classifier", evaluator_config=evaluator1_config
                    )
                    mock_evaluate1.assert_called_once_with(
                        model=classifier_model,
                        model_type="classifier",
                        dataset=iris_dataset,
                        run_id=run.info.run_id,
                        evaluator_config=evaluator1_config,
                    )
                    mock_can_evaluate2.assert_called_once_with(
                        model_type="classifier", evaluator_config=evaluator2_config
                    )
                    mock_evaluate2.assert_called_once_with(
                        model=classifier_model,
                        model_type="classifier",
                        dataset=iris_dataset,
                        run_id=run.info.run_id,
                        evaluator_config=evaluator2_config,
                    )
def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_dataset):
    with mock.patch.object(
        _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvauator1}
    ):
        evaluator1_config = {"eval1_confg_a": 3, "eval1_confg_b": 4}
        evaluator1_return_value = EvaluationResult(
            metrics={"m1": 5, "m2": 6},
            artifacts={"a1": FakeArtifact1(uri="uri1"), "a2": FakeArtifact2(uri="uri2")},
        )
        with mock.patch.object(
            FakeEvauator1, "can_evaluate", return_value=False
        ) as mock_can_evaluate, mock.patch.object(
            FakeEvauator1, "evaluate", return_value=evaluator1_return_value
        ) as mock_evaluate:
            with mlflow.start_run():
                with pytest.raises(
                    ValueError,
                    match="The model could not be evaluated by any of the registered evaluators",
                ):
                    evaluate(
                        multiclass_logistic_regressor_model_uri,
                        data=iris_dataset._constructor_args["data"],
                        model_type="classifier",
                        targets=iris_dataset._constructor_args["targets"],
                        dataset_name=iris_dataset.name,
                        evaluators="test_evaluator1",
                        evaluator_config=evaluator1_config,
                    )
                mock_can_evaluate.assert_called_once_with(
                    model_type="classifier", evaluator_config=evaluator1_config
                )
                mock_evaluate.assert_not_called()
        with mock.patch.object(
            FakeEvauator1, "can_evaluate", return_value=True
        ) as mock_can_evaluate, mock.patch.object(
            FakeEvauator1, "evaluate", return_value=evaluator1_return_value
        ) as mock_evaluate:
            classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
            with mlflow.start_run() as run:
                eval1_result = evaluate(
                    classifier_model,
                    iris_dataset._constructor_args["data"],
                    model_type="classifier",
                    targets=iris_dataset._constructor_args["targets"],
                    dataset_name=iris_dataset.name,
                    evaluators="test_evaluator1",
                    evaluator_config=evaluator1_config,
                )
                assert eval1_result.metrics == evaluator1_return_value.metrics
                assert eval1_result.artifacts == evaluator1_return_value.artifacts
                mock_can_evaluate.assert_called_once_with(
                    model_type="classifier", evaluator_config=evaluator1_config
                )
                mock_evaluate.assert_called_once_with(
                    model=classifier_model,
                    model_type="classifier",
                    dataset=iris_dataset,
                    run_id=run.info.run_id,
                    evaluator_config=evaluator1_config,
                )
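The `FakeEvauator*` / `FakeArtifact*` names used above are test doubles whose methods are always replaced via `mock.patch.object`, so only their types (and, for the artifacts, the `uri=` constructor) matter. A plausible minimal definition is sketched here; the `raise RuntimeError()` bodies and the exact `EvaluationArtifact` hooks overridden are assumptions, and `FakeEvauator2` / `FakeArtifact2` would be defined analogously.

from mlflow.models.evaluation import EvaluationArtifact, ModelEvaluator  # import path assumed


class FakeEvauator1(ModelEvaluator):
    def can_evaluate(self, *, model_type, evaluator_config, **kwargs):
        raise RuntimeError()  # always patched out in the tests above

    def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs):
        raise RuntimeError()  # always patched out in the tests above


class FakeArtifact1(EvaluationArtifact):
    def _save(self, output_artifact_path):
        raise RuntimeError()  # never exercised in these tests

    def _load_content_from_file(self, local_artifact_path):
        raise RuntimeError()  # hook name assumed from the EvaluationArtifact interface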
def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset):
    y_true = iris_dataset.labels_data
    classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
    y_pred = classifier_model.predict(iris_dataset.features_data)
    expected_accuracy_score = accuracy_score(y_true, y_pred)
    expected_metrics = {
        "accuracy_score": expected_accuracy_score,
    }
    expected_saved_metrics = {
        "accuracy_score_on_iris_dataset": expected_accuracy_score,
    }
    expected_artifact = confusion_matrix(y_true, y_pred)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            classifier_model,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            dataset_name=iris_dataset.name,
            evaluators="dummy_evaluator",
        )

    artifact_name = "confusion_matrix_on_iris_dataset.csv"
    saved_artifact_path = get_local_artifact_path(run.info.run_id, artifact_name)

    _, saved_metrics, _, saved_artifacts = get_run_data(run.info.run_id)
    assert saved_metrics == expected_saved_metrics
    assert saved_artifacts == [artifact_name]

    assert eval_result.metrics == expected_metrics
    confusion_matrix_artifact = eval_result.artifacts[artifact_name]
    assert np.array_equal(confusion_matrix_artifact.content, expected_artifact)
    assert confusion_matrix_artifact.uri == get_artifact_uri(run.info.run_id, artifact_name)
    assert np.array_equal(confusion_matrix_artifact._load(saved_artifact_path), expected_artifact)

    with TempDir() as temp_dir:
        temp_dir_path = temp_dir.path()
        eval_result.save(temp_dir_path)

        with open(temp_dir.path("metrics.json"), "r") as fp:
            assert json.load(fp) == eval_result.metrics

        with open(temp_dir.path("artifacts_metadata.json"), "r") as fp:
            assert json.load(fp) == {
                "confusion_matrix_on_iris_dataset.csv": {
                    "uri": confusion_matrix_artifact.uri,
                    "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact",
                }
            }

        assert os.listdir(temp_dir.path("artifacts")) == ["confusion_matrix_on_iris_dataset.csv"]

        loaded_eval_result = EvaluationResult.load(temp_dir_path)
        assert loaded_eval_result.metrics == eval_result.metrics
        loaded_confusion_matrix_artifact = loaded_eval_result.artifacts[artifact_name]
        assert confusion_matrix_artifact.uri == loaded_confusion_matrix_artifact.uri
        assert np.array_equal(
            confusion_matrix_artifact.content,
            loaded_confusion_matrix_artifact.content,
        )

        new_confusion_matrix_artifact = Array2DEvaluationArtifact(uri=confusion_matrix_artifact.uri)
        new_confusion_matrix_artifact._load()
        assert np.array_equal(
            confusion_matrix_artifact.content,
            new_confusion_matrix_artifact.content,
        )
def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset):
    y_true = iris_dataset.labels_data
    classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
    y_pred = classifier_model.predict(iris_dataset.features_data)
    expected_accuracy_score = accuracy_score(y_true, y_pred)
    expected_metrics = {
        "accuracy_score": expected_accuracy_score,
    }
    expected_saved_metrics = {
        "accuracy_score_on_iris_dataset": expected_accuracy_score,
    }

    expected_csv_artifact = confusion_matrix(y_true, y_pred)
    cm_figure = sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_true, y_pred).figure_
    img_buf = io.BytesIO()
    cm_figure.savefig(img_buf)
    img_buf.seek(0)
    expected_image_artifact = Image.open(img_buf)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            classifier_model,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            dataset_name=iris_dataset.name,
            evaluators="dummy_evaluator",
        )

    csv_artifact_name = "confusion_matrix_on_iris_dataset"
    saved_csv_artifact_path = get_local_artifact_path(run.info.run_id, csv_artifact_name + ".csv")

    png_artifact_name = "confusion_matrix_image_on_iris_dataset"
    saved_png_artifact_path = get_local_artifact_path(run.info.run_id, png_artifact_name) + ".png"

    _, saved_metrics, _, saved_artifacts = get_run_data(run.info.run_id)
    assert saved_metrics == expected_saved_metrics
    assert set(saved_artifacts) == {csv_artifact_name + ".csv", png_artifact_name + ".png"}

    assert eval_result.metrics == expected_metrics

    confusion_matrix_artifact = eval_result.artifacts[csv_artifact_name]
    assert np.array_equal(confusion_matrix_artifact.content, expected_csv_artifact)
    assert confusion_matrix_artifact.uri == get_artifact_uri(
        run.info.run_id, csv_artifact_name + ".csv"
    )
    assert np.array_equal(
        confusion_matrix_artifact._load(saved_csv_artifact_path), expected_csv_artifact
    )

    confusion_matrix_image_artifact = eval_result.artifacts[png_artifact_name]
    assert (
        ImageChops.difference(
            confusion_matrix_image_artifact.content, expected_image_artifact
        ).getbbox()
        is None
    )
    assert confusion_matrix_image_artifact.uri == get_artifact_uri(
        run.info.run_id, png_artifact_name + ".png"
    )
    assert (
        ImageChops.difference(
            confusion_matrix_image_artifact._load(saved_png_artifact_path),
            expected_image_artifact,
        ).getbbox()
        is None
    )

    with TempDir() as temp_dir:
        temp_dir_path = temp_dir.path()
        eval_result.save(temp_dir_path)

        with open(temp_dir.path("metrics.json"), "r") as fp:
            assert json.load(fp) == eval_result.metrics

        with open(temp_dir.path("artifacts_metadata.json"), "r") as fp:
            json_dict = json.load(fp)

            assert "confusion_matrix_on_iris_dataset" in json_dict
            assert json_dict["confusion_matrix_on_iris_dataset"] == {
                "uri": confusion_matrix_artifact.uri,
                "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact",
            }

            assert "confusion_matrix_image_on_iris_dataset" in json_dict
            assert json_dict["confusion_matrix_image_on_iris_dataset"] == {
                "uri": confusion_matrix_image_artifact.uri,
                "class_name": "mlflow.models.evaluation.artifacts.ImageEvaluationArtifact",
            }

        assert set(os.listdir(temp_dir.path("artifacts"))) == {
            "confusion_matrix_on_iris_dataset.csv",
            "confusion_matrix_image_on_iris_dataset.png",
        }

        loaded_eval_result = EvaluationResult.load(temp_dir_path)
        assert loaded_eval_result.metrics == eval_result.metrics

        loaded_confusion_matrix_artifact = loaded_eval_result.artifacts[csv_artifact_name]
        assert confusion_matrix_artifact.uri == loaded_confusion_matrix_artifact.uri
        assert np.array_equal(
            confusion_matrix_artifact.content,
            loaded_confusion_matrix_artifact.content,
        )

        loaded_confusion_matrix_image_artifact = loaded_eval_result.artifacts[png_artifact_name]
        assert confusion_matrix_image_artifact.uri == loaded_confusion_matrix_image_artifact.uri
        assert (
            ImageChops.difference(
                confusion_matrix_image_artifact.content,
                loaded_confusion_matrix_image_artifact.content,
            ).getbbox()
            is None
        )

        new_confusion_matrix_artifact = Array2DEvaluationArtifact(uri=confusion_matrix_artifact.uri)
        new_confusion_matrix_artifact._load()
        assert np.array_equal(
            confusion_matrix_artifact.content,
            new_confusion_matrix_artifact.content,
        )

        new_confusion_matrix_image_artifact = ImageEvaluationArtifact(
            uri=confusion_matrix_image_artifact.uri
        )
        new_confusion_matrix_image_artifact._load()
        assert np.array_equal(
            confusion_matrix_image_artifact.content,
            new_confusion_matrix_image_artifact.content,
        )
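For reference, the save/load round trip asserted above implies this on-disk layout for a saved EvaluationResult directory: metrics.json, artifacts_metadata.json (mapping each artifact name to its uri and artifact class), and an artifacts/ folder holding the serialized payloads. The short snippet below is illustrative only; the directory path is hypothetical and the import path for EvaluationResult is assumed.

import json
import os

from mlflow.models.evaluation import EvaluationResult  # import path assumed

result_dir = "/tmp/eval_result"  # hypothetical: wherever eval_result.save(...) wrote to
loaded = EvaluationResult.load(result_dir)

print(loaded.metrics)  # e.g. {"accuracy_score": ...}
with open(os.path.join(result_dir, "artifacts_metadata.json")) as fp:
    # Each entry records the artifact's uri and the class used to deserialize it.
    print(json.dumps(json.load(fp), indent=2))
print(os.listdir(os.path.join(result_dir, "artifacts")))  # serialized artifact files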