Example #1
    def evaluate(self,
                 model,
                 model_type,
                 dataset,
                 run_id,
                 evaluator_config=None,
                 **kwargs) -> EvaluationResult:
        client = mlflow.tracking.MlflowClient()
        X, y = dataset._extract_features_and_labels()
        y_pred = model.predict(X)
        if model_type == "classifier":
            accuracy_score = sk_metrics.accuracy_score(y, y_pred)

            metrics = EvaluationMetrics(accuracy_score=accuracy_score)
            self._log_metrics(run_id, metrics, dataset.name)
            confusion_matrix = sk_metrics.confusion_matrix(y, y_pred)
            confusion_matrix_artifact_name = f"confusion_matrix_on_{dataset.name}.csv"
            confusion_matrix_artifact = Array2DEvaluationArtifact(
                uri=get_artifact_uri(run_id, confusion_matrix_artifact_name),
                content=confusion_matrix,
            )
            confusion_matrix_csv_buff = io.StringIO()
            confusion_matrix_artifact.save(confusion_matrix_csv_buff)
            client.log_text(run_id, confusion_matrix_csv_buff.getvalue(),
                            confusion_matrix_artifact_name)
            artifacts = {
                confusion_matrix_artifact_name: confusion_matrix_artifact
            }
        elif model_type == "regressor":
            mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred)
            mean_squared_error = sk_metrics.mean_squared_error(y, y_pred)
            metrics = EvaluationMetrics(
                mean_absolute_error=mean_absolute_error,
                mean_squared_error=mean_squared_error)
            self._log_metrics(run_id, metrics, dataset.name)
            artifacts = {}
        else:
            raise ValueError(f"Unsupported model type {model_type}")

        return EvaluationResult(metrics=metrics, artifacts=artifacts)
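
The evaluate method above calls a self._log_metrics(run_id, metrics, dataset.name) helper that is not shown in this listing. A minimal sketch of what such a helper might look like, assuming the "<metric name>_on_<dataset name>" key convention that the tests later in this listing expect (the body below is illustrative, not the actual implementation):

    def _log_metrics(self, run_id, metrics, dataset_name):
        # Illustrative sketch: log each metric under the run, suffixing the key
        # with the dataset name, e.g. "accuracy_score_on_iris_dataset".
        client = mlflow.tracking.MlflowClient()
        for key, value in metrics.items():
            client.log_metric(run_id, f"{key}_on_{dataset_name}", value)
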
Example #2
    def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config,
                 **kwargs) -> EvaluationResult:
        client = mlflow.tracking.MlflowClient()
        X = dataset.features_data
        y = dataset.labels_data
        y_pred = model.predict(X)
        if model_type == "classifier":
            accuracy_score = sk_metrics.accuracy_score(y, y_pred)

            metrics = {"accuracy_score": accuracy_score}
            self._log_metrics(run_id, metrics, dataset.name)
            confusion_matrix = sk_metrics.confusion_matrix(y, y_pred)
            confusion_matrix_artifact_name = f"confusion_matrix_on_{dataset.name}"
            confusion_matrix_artifact = Array2DEvaluationArtifact(
                uri=get_artifact_uri(run_id,
                                     confusion_matrix_artifact_name + ".csv"),
                content=confusion_matrix,
            )
            confusion_matrix_csv_buff = io.StringIO()
            confusion_matrix_artifact._save(confusion_matrix_csv_buff)
            client.log_text(
                run_id,
                confusion_matrix_csv_buff.getvalue(),
                confusion_matrix_artifact_name + ".csv",
            )

            confusion_matrix_figure = sk_metrics.ConfusionMatrixDisplay.from_predictions(
                y, y_pred).figure_
            img_buf = io.BytesIO()
            confusion_matrix_figure.savefig(img_buf)
            img_buf.seek(0)
            confusion_matrix_image = Image.open(img_buf)

            confusion_matrix_image_artifact_name = f"confusion_matrix_image_on_{dataset.name}"
            confusion_matrix_image_artifact = ImageEvaluationArtifact(
                uri=get_artifact_uri(
                    run_id, confusion_matrix_image_artifact_name + ".png"),
                content=confusion_matrix_image,
            )
            confusion_matrix_image_artifact._save(
                confusion_matrix_image_artifact_name + ".png")
            client.log_image(run_id, confusion_matrix_image,
                             confusion_matrix_image_artifact_name + ".png")

            artifacts = {
                confusion_matrix_artifact_name: confusion_matrix_artifact,
                confusion_matrix_image_artifact_name: confusion_matrix_image_artifact,
            }
        elif model_type == "regressor":
            mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred)
            mean_squared_error = sk_metrics.mean_squared_error(y, y_pred)
            metrics = {
                "mean_absolute_error": mean_absolute_error,
                "mean_squared_error": mean_squared_error,
            }
            self._log_metrics(run_id, metrics, dataset.name)
            artifacts = {}
        else:
            raise ValueError(f"Unsupported model type {model_type}")

        return EvaluationResult(metrics=metrics, artifacts=artifacts)
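
Before evaluate is invoked, the evaluator is first asked whether it can handle the model via can_evaluate (the tests below mock exactly that call). A possible companion implementation for this dummy evaluator, sketched under the assumption of the same keyword-only calling convention:

    def can_evaluate(self, *, model_type, evaluator_config, **kwargs):
        # Illustrative sketch: this evaluator only handles the two model types
        # that evaluate() branches on.
        return model_type in ("classifier", "regressor")
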
Example #3
def test_evaluate_with_multi_evaluators(
        multiclass_logistic_regressor_model_uri, iris_dataset):
    with mock.patch.object(
        _model_evaluation_registry,
        "_registry",
        {"test_evaluator1": FakeEvauator1, "test_evaluator2": FakeEvauator2},
    ):
        evaluator1_config = {"eval1_confg": 3}
        evaluator2_config = {"eval2_confg": 4}
        evaluator1_return_value = EvaluationResult(
            metrics={"m1": 5}, artifacts={"a1": FakeArtifact1(uri="uri1")})
        evaluator2_return_value = EvaluationResult(
            metrics={"m2": 6}, artifacts={"a2": FakeArtifact2(uri="uri2")})

        # evaluators=None is the case where no evaluators are specified; it should fetch all
        # registered evaluators, and the evaluation results should equal those of
        # evaluators=["test_evaluator1", "test_evaluator2"]
        for evaluators in [None, ["test_evaluator1", "test_evaluator2"]]:
            with mock.patch.object(
                    FakeEvauator1, "can_evaluate", return_value=True
            ) as mock_can_evaluate1, mock.patch.object(
                    FakeEvauator1,
                    "evaluate",
                    return_value=evaluator1_return_value
            ) as mock_evaluate1, mock.patch.object(
                    FakeEvauator2, "can_evaluate", return_value=True
            ) as mock_can_evaluate2, mock.patch.object(
                    FakeEvauator2,
                    "evaluate",
                    return_value=evaluator2_return_value) as mock_evaluate2:
                classifier_model = mlflow.pyfunc.load_model(
                    multiclass_logistic_regressor_model_uri)
                with mlflow.start_run() as run:
                    eval_result = evaluate(
                        classifier_model,
                        iris_dataset._constructor_args["data"],
                        model_type="classifier",
                        targets=iris_dataset._constructor_args["targets"],
                        dataset_name=iris_dataset.name,
                        evaluators=evaluators,
                        evaluator_config={
                            "test_evaluator1": evaluator1_config,
                            "test_evaluator2": evaluator2_config,
                        },
                    )
                    assert eval_result.metrics == {
                        **evaluator1_return_value.metrics,
                        **evaluator2_return_value.metrics,
                    }
                    assert eval_result.artifacts == {
                        **evaluator1_return_value.artifacts,
                        **evaluator2_return_value.artifacts,
                    }
                    mock_can_evaluate1.assert_called_once_with(
                        model_type="classifier",
                        evaluator_config=evaluator1_config)
                    mock_evaluate1.assert_called_once_with(
                        model=classifier_model,
                        model_type="classifier",
                        dataset=iris_dataset,
                        run_id=run.info.run_id,
                        evaluator_config=evaluator1_config,
                    )
                    mock_can_evaluate2.assert_called_once_with(
                        model_type="classifier",
                        evaluator_config=evaluator2_config,
                    )
                    mock_evaluate2.assert_called_once_with(
                        model=classifier_model,
                        model_type="classifier",
                        dataset=iris_dataset,
                        run_id=run.info.run_id,
                        evaluator_config=evaluator2_config,
                    )
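
The assertions above pin down the dispatch contract of evaluate(): each requested (or, with evaluators=None, each registered) evaluator is asked can_evaluate(model_type=..., evaluator_config=...), every willing evaluator's evaluate(...) is called exactly once, and the per-evaluator metrics and artifacts are merged into a single EvaluationResult. A rough, illustrative sketch of that loop (not the actual MLflow implementation; resolve_evaluators and resolve_evaluator_config are assumed helpers):

def _dispatch_evaluation(model, model_type, dataset, run_id, evaluators, evaluator_config):
    # Illustrative only: merge the metrics and artifacts of every evaluator
    # that reports it can handle this model type.
    merged_metrics, merged_artifacts = {}, {}
    some_evaluator_ran = False
    for name, evaluator in resolve_evaluators(evaluators):  # assumed helper
        config = resolve_evaluator_config(name, evaluator_config)  # assumed helper
        if not evaluator.can_evaluate(model_type=model_type, evaluator_config=config):
            continue
        result = evaluator.evaluate(
            model=model,
            model_type=model_type,
            dataset=dataset,
            run_id=run_id,
            evaluator_config=config,
        )
        merged_metrics.update(result.metrics)
        merged_artifacts.update(result.artifacts)
        some_evaluator_ran = True
    if not some_evaluator_ran:
        raise ValueError("The model could not be evaluated by any of the registered evaluators")
    return EvaluationResult(metrics=merged_metrics, artifacts=merged_artifacts)
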
Example #4
def test_evaluator_interface(multiclass_logistic_regressor_model_uri,
                             iris_dataset):
    with mock.patch.object(_model_evaluation_registry, "_registry",
                           {"test_evaluator1": FakeEvauator1}):
        evaluator1_config = {"eval1_confg_a": 3, "eval1_confg_b": 4}
        evaluator1_return_value = EvaluationResult(
            metrics={
                "m1": 5,
                "m2": 6
            },
            artifacts={
                "a1": FakeArtifact1(uri="uri1"),
                "a2": FakeArtifact2(uri="uri2")
            },
        )
        with mock.patch.object(
                FakeEvauator1, "can_evaluate",
                return_value=False) as mock_can_evaluate, mock.patch.object(
                    FakeEvauator1,
                    "evaluate",
                    return_value=evaluator1_return_value) as mock_evaluate:
            with mlflow.start_run():
                with pytest.raises(
                    ValueError,
                    match="The model could not be evaluated by any of the registered evaluators",
                ):
                    evaluate(
                        multiclass_logistic_regressor_model_uri,
                        data=iris_dataset._constructor_args["data"],
                        model_type="classifier",
                        targets=iris_dataset._constructor_args["targets"],
                        dataset_name=iris_dataset.name,
                        evaluators="test_evaluator1",
                        evaluator_config=evaluator1_config,
                    )
                mock_can_evaluate.assert_called_once_with(
                    model_type="classifier",
                    evaluator_config=evaluator1_config)
                mock_evaluate.assert_not_called()
        with mock.patch.object(
                FakeEvauator1, "can_evaluate",
                return_value=True) as mock_can_evaluate, mock.patch.object(
                    FakeEvauator1,
                    "evaluate",
                    return_value=evaluator1_return_value) as mock_evaluate:
            classifier_model = mlflow.pyfunc.load_model(
                multiclass_logistic_regressor_model_uri)
            with mlflow.start_run() as run:
                eval1_result = evaluate(
                    classifier_model,
                    iris_dataset._constructor_args["data"],
                    model_type="classifier",
                    targets=iris_dataset._constructor_args["targets"],
                    dataset_name=iris_dataset.name,
                    evaluators="test_evaluator1",
                    evaluator_config=evaluator1_config,
                )
                assert eval1_result.metrics == evaluator1_return_value.metrics
                assert eval1_result.artifacts == evaluator1_return_value.artifacts

                mock_can_evaluate.assert_called_once_with(
                    model_type="classifier",
                    evaluator_config=evaluator1_config)
                mock_evaluate.assert_called_once_with(
                    model=classifier_model,
                    model_type="classifier",
                    dataset=iris_dataset,
                    run_id=run.info.run_id,
                    evaluator_config=evaluator1_config,
                )
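
FakeEvauator1, FakeEvauator2, FakeArtifact1, and FakeArtifact2 are referenced throughout these tests but their definitions are not part of this listing. A minimal sketch of how such test doubles could be declared, assuming they subclass the ModelEvaluator and EvaluationArtifact base classes from mlflow.models.evaluation (the bodies never run, because the tests always patch them):

from mlflow.models.evaluation import EvaluationArtifact, ModelEvaluator


class FakeEvauator1(ModelEvaluator):
    # Always patched via mock.patch.object in the tests above.
    def can_evaluate(self, *, model_type, evaluator_config, **kwargs):
        raise RuntimeError("should be mocked")

    def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs):
        raise RuntimeError("should be mocked")


class FakeArtifact1(EvaluationArtifact):
    def _save(self, output_artifact_path):
        raise RuntimeError("should not be called")

    def _load_content_from_file(self, local_artifact_path):
        raise RuntimeError("should not be called")


# FakeEvauator2 and FakeArtifact2 would be analogous.
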
Example #5
def test_classifier_evaluate(multiclass_logistic_regressor_model_uri,
                             iris_dataset):
    y_true = iris_dataset.labels_data
    classifier_model = mlflow.pyfunc.load_model(
        multiclass_logistic_regressor_model_uri)
    y_pred = classifier_model.predict(iris_dataset.features_data)
    expected_accuracy_score = accuracy_score(y_true, y_pred)
    expected_metrics = {
        "accuracy_score": expected_accuracy_score,
    }
    expected_saved_metrics = {
        "accuracy_score_on_iris_dataset": expected_accuracy_score,
    }

    expected_artifact = confusion_matrix(y_true, y_pred)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            classifier_model,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            dataset_name=iris_dataset.name,
            evaluators="dummy_evaluator",
        )

    artifact_name = "confusion_matrix_on_iris_dataset.csv"
    saved_artifact_path = get_local_artifact_path(run.info.run_id,
                                                  artifact_name)

    _, saved_metrics, _, saved_artifacts = get_run_data(run.info.run_id)
    assert saved_metrics == expected_saved_metrics
    assert saved_artifacts == [artifact_name]

    assert eval_result.metrics == expected_metrics
    confusion_matrix_artifact = eval_result.artifacts[artifact_name]
    assert np.array_equal(confusion_matrix_artifact.content, expected_artifact)
    assert confusion_matrix_artifact.uri == get_artifact_uri(
        run.info.run_id, artifact_name)
    assert np.array_equal(confusion_matrix_artifact._load(saved_artifact_path),
                          expected_artifact)

    with TempDir() as temp_dir:
        temp_dir_path = temp_dir.path()
        eval_result.save(temp_dir_path)

        with open(temp_dir.path("metrics.json"), "r") as fp:
            assert json.load(fp) == eval_result.metrics

        with open(temp_dir.path("artifacts_metadata.json"), "r") as fp:
            assert json.load(fp) == {
                "confusion_matrix_on_iris_dataset.csv": {
                    "uri": confusion_matrix_artifact.uri,
                    "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact",
                }
            }

        assert os.listdir(temp_dir.path("artifacts")) == [
            "confusion_matrix_on_iris_dataset.csv"
        ]

        loaded_eval_result = EvaluationResult.load(temp_dir_path)
        assert loaded_eval_result.metrics == eval_result.metrics
        loaded_confusion_matrix_artifact = loaded_eval_result.artifacts[
            artifact_name]
        assert confusion_matrix_artifact.uri == loaded_confusion_matrix_artifact.uri
        assert np.array_equal(
            confusion_matrix_artifact.content,
            loaded_confusion_matrix_artifact.content,
        )

        new_confusion_matrix_artifact = Array2DEvaluationArtifact(
            uri=confusion_matrix_artifact.uri)
        new_confusion_matrix_artifact._load()
        assert np.array_equal(
            confusion_matrix_artifact.content,
            new_confusion_matrix_artifact.content,
        )
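
get_run_data and get_local_artifact_path are test helpers whose definitions are not included here. Based on how their return values are unpacked above, they might look roughly like the following (illustrative sketch only; the real helpers may differ):

import mlflow
from mlflow.tracking.artifact_utils import get_artifact_uri


def get_run_data(run_id):
    # Return the run's params, metrics, tags, and the names of its logged
    # artifacts, matching the 4-tuple unpacked in the test above.
    client = mlflow.tracking.MlflowClient()
    run = client.get_run(run_id)
    artifact_names = [f.path for f in client.list_artifacts(run_id)]
    return run.data.params, run.data.metrics, run.data.tags, artifact_names


def get_local_artifact_path(run_id, artifact_path):
    # Assumes a local "file:" tracking backend, so the artifact URI can be
    # mapped directly onto a filesystem path.
    return get_artifact_uri(run_id, artifact_path).replace("file://", "")
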
Example #6
def test_classifier_evaluate(multiclass_logistic_regressor_model_uri,
                             iris_dataset):
    y_true = iris_dataset.labels_data
    classifier_model = mlflow.pyfunc.load_model(
        multiclass_logistic_regressor_model_uri)
    y_pred = classifier_model.predict(iris_dataset.features_data)
    expected_accuracy_score = accuracy_score(y_true, y_pred)
    expected_metrics = {
        "accuracy_score": expected_accuracy_score,
    }
    expected_saved_metrics = {
        "accuracy_score_on_iris_dataset": expected_accuracy_score,
    }

    expected_csv_artifact = confusion_matrix(y_true, y_pred)
    cm_figure = sklearn.metrics.ConfusionMatrixDisplay.from_predictions(
        y_true, y_pred).figure_
    img_buf = io.BytesIO()
    cm_figure.savefig(img_buf)
    img_buf.seek(0)
    expected_image_artifact = Image.open(img_buf)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            classifier_model,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            dataset_name=iris_dataset.name,
            evaluators="dummy_evaluator",
        )

    csv_artifact_name = "confusion_matrix_on_iris_dataset"
    saved_csv_artifact_path = get_local_artifact_path(
        run.info.run_id, csv_artifact_name + ".csv")

    png_artifact_name = "confusion_matrix_image_on_iris_dataset"
    saved_png_artifact_path = get_local_artifact_path(
        run.info.run_id, png_artifact_name) + ".png"

    _, saved_metrics, _, saved_artifacts = get_run_data(run.info.run_id)
    assert saved_metrics == expected_saved_metrics
    assert set(saved_artifacts) == {
        csv_artifact_name + ".csv", png_artifact_name + ".png"
    }

    assert eval_result.metrics == expected_metrics
    confusion_matrix_artifact = eval_result.artifacts[csv_artifact_name]
    assert np.array_equal(confusion_matrix_artifact.content,
                          expected_csv_artifact)
    assert confusion_matrix_artifact.uri == get_artifact_uri(
        run.info.run_id, csv_artifact_name + ".csv")
    assert np.array_equal(
        confusion_matrix_artifact._load(saved_csv_artifact_path),
        expected_csv_artifact)
    confusion_matrix_image_artifact = eval_result.artifacts[png_artifact_name]
    assert (ImageChops.difference(confusion_matrix_image_artifact.content,
                                  expected_image_artifact).getbbox() is None)
    assert confusion_matrix_image_artifact.uri == get_artifact_uri(
        run.info.run_id, png_artifact_name + ".png")
    assert (ImageChops.difference(
        confusion_matrix_image_artifact._load(saved_png_artifact_path),
        expected_image_artifact,
    ).getbbox() is None)

    with TempDir() as temp_dir:
        temp_dir_path = temp_dir.path()
        eval_result.save(temp_dir_path)

        with open(temp_dir.path("metrics.json"), "r") as fp:
            assert json.load(fp) == eval_result.metrics

        with open(temp_dir.path("artifacts_metadata.json"), "r") as fp:
            json_dict = json.load(fp)
            assert "confusion_matrix_on_iris_dataset" in json_dict
            assert json_dict["confusion_matrix_on_iris_dataset"] == {
                "uri":
                confusion_matrix_artifact.uri,
                "class_name":
                "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact",
            }

            assert "confusion_matrix_image_on_iris_dataset" in json_dict
            assert json_dict["confusion_matrix_image_on_iris_dataset"] == {
                "uri":
                confusion_matrix_image_artifact.uri,
                "class_name":
                "mlflow.models.evaluation.artifacts.ImageEvaluationArtifact",
            }

        assert set(os.listdir(temp_dir.path("artifacts"))) == {
            "confusion_matrix_on_iris_dataset.csv",
            "confusion_matrix_image_on_iris_dataset.png",
        }

        loaded_eval_result = EvaluationResult.load(temp_dir_path)
        assert loaded_eval_result.metrics == eval_result.metrics
        loaded_confusion_matrix_artifact = loaded_eval_result.artifacts[
            csv_artifact_name]
        assert confusion_matrix_artifact.uri == loaded_confusion_matrix_artifact.uri
        assert np.array_equal(
            confusion_matrix_artifact.content,
            loaded_confusion_matrix_artifact.content,
        )
        loaded_confusion_matrix_image_artifact = loaded_eval_result.artifacts[
            png_artifact_name]
        assert confusion_matrix_image_artifact.uri == loaded_confusion_matrix_image_artifact.uri
        assert (ImageChops.difference(
            confusion_matrix_image_artifact.content,
            loaded_confusion_matrix_image_artifact.content,
        ).getbbox() is None)

        new_confusion_matrix_artifact = Array2DEvaluationArtifact(
            uri=confusion_matrix_artifact.uri)
        new_confusion_matrix_artifact._load()
        assert np.array_equal(
            confusion_matrix_artifact.content,
            new_confusion_matrix_artifact.content,
        )
        new_confusion_matrix_image_artifact = ImageEvaluationArtifact(
            uri=confusion_matrix_image_artifact.uri)
        new_confusion_matrix_image_artifact._load()
        assert np.array_equal(
            confusion_matrix_image_artifact.content,
            new_confusion_matrix_image_artifact.content,
        )
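
Array2DEvaluationArtifact and ImageEvaluationArtifact wrap the saved confusion-matrix CSV and PNG. The image artifact ships with MLflow (mlflow.models.evaluation.artifacts.ImageEvaluationArtifact, per the class_name asserted above), while the 2D-array artifact comes from the mlflow_test_plugin.dummy_evaluator test plugin and is not reproduced in this listing. A plausible sketch of its save/load behavior, assuming it stores its content as CSV (the exact implementation may differ):

import pandas as pd

from mlflow.models.evaluation import EvaluationArtifact


class Array2DEvaluationArtifact(EvaluationArtifact):
    # Illustrative sketch: persist the wrapped 2D array as CSV and read it back
    # as a numpy array, so a save/load round trip preserves the np.array_equal
    # equality asserted in the tests above.
    def _save(self, output_artifact_path):
        pd.DataFrame(self._content).to_csv(output_artifact_path, index=False)

    def _load_content_from_file(self, local_artifact_path):
        self._content = pd.read_csv(local_artifact_path).to_numpy()
        return self._content
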