def test_mlflow_metrics_dataset_saved_and_logged(tmp_path, tracking_uri, data,
                                                 prefix):
    """Save metrics inside an active run, then load them back by run id.

    While the run is active, the logged metrics are verified with
    ``assert_are_metrics_logged``. Once the run is closed, a second
    ``MlflowMetricsDataSet`` bound to the same run id must load as many
    entries as were saved, with matching values.
    """
    uri = tracking_uri.as_uri()
    mlflow.set_tracking_uri(uri)
    client = MlflowClient(tracking_uri=uri)
    writer = MlflowMetricsDataSet(prefix=prefix)

    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        writer.save(data)

        # Check that the metrics were logged correctly in MLflow.
        assert_are_metrics_logged(data, client, run_id, prefix)

    # No run is active any more, so the reader needs an explicit run id.
    reader = MlflowMetricsDataSet(prefix=prefix, run_id=run_id)
    loaded = reader.load()

    assert len(loaded) == len(data)
    for logged_key, logged_value in loaded.items():
        # When a prefix is used, the last dot-separated component of the
        # loaded key is taken as the original metric name.
        if prefix is None:
            data_key = logged_key
        else:
            data_key = logged_key.split(".")[-1]
        assert data[data_key] == logged_value
def dummy_catalog(tmp_path):
    """Build a ``DataCatalog`` mixing memory, pickle and MLflow metrics datasets.

    The pickle dataset is stored under ``tmp_path``. One metrics dataset is
    created without a prefix ("my_metrics") and one with the explicit
    prefix "foo" ("another_metrics").
    """
    model_path = (tmp_path / "model.csv").as_posix()
    datasets = {
        "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet(model_path),
        "my_metrics": MlflowMetricsDataSet(),
        "another_metrics": MlflowMetricsDataSet(prefix="foo"),
    }
    return DataCatalog(datasets)
Пример #3
0
    def after_catalog_created(
        self,
        catalog: DataCatalog,
        conf_catalog: Dict[str, Any],
        conf_creds: Dict[str, Any],
        feed_dict: Dict[str, Any],
        save_version: str,
        load_versions: str,
    ):
        """Name unnamed MLflow metric datasets after their catalog entry.

        Every ``MlflowMetricsDataSet`` whose ``_prefix`` is ``None`` is replaced
        by a new one using the catalog entry name as prefix. Every
        ``MlflowMetricDataSet`` / ``MlflowMetricHistoryDataSet`` whose ``key``
        is ``None`` is replaced by a new one using the entry name as key and
        the same load/save arguments. A ``_run_id`` set on the original
        dataset is carried over to its replacement.

        Only ``catalog`` is used; the other arguments are accepted but ignored
        by this implementation.
        """
        # This hook rewrites the metric datasets so that the metric name
        # logged in MLflow is consistent with the name used in the catalog.
        for name, dataset in catalog._data_sets.items():

            if isinstance(dataset, MlflowMetricsDataSet) and dataset._prefix is None:
                new_kwargs = {"prefix": name}
                # Forward run_id only when one was set, so the constructor's
                # own default applies otherwise.
                if dataset._run_id is not None:
                    new_kwargs["run_id"] = dataset._run_id
                catalog._data_sets[name] = MlflowMetricsDataSet(**new_kwargs)

            # The single-metric and metric-history datasets are handled the
            # same way; each class is still checked independently.
            for metric_cls in (MlflowMetricDataSet, MlflowMetricHistoryDataSet):
                if isinstance(dataset, metric_cls) and dataset.key is None:
                    new_kwargs = {
                        "key": name,
                        "load_args": dataset._load_args,
                        "save_args": dataset._save_args,
                    }
                    if dataset._run_id is not None:
                        new_kwargs["run_id"] = dataset._run_id
                    catalog._data_sets[name] = metric_cls(**new_kwargs)
Пример #4
0
 def after_catalog_created(
     self,
     catalog: DataCatalog,
     conf_catalog: Dict[str, Any],
     conf_creds: Dict[str, Any],
     feed_dict: Dict[str, Any],
     save_version: str,
     load_versions: str,
     run_id: str,
 ):
     """Give prefix-less ``MlflowMetricsDataSet`` entries their catalog name as prefix.

     Each ``MlflowMetricsDataSet`` whose ``_prefix`` is ``None`` is replaced in
     the catalog by a new instance whose prefix is the entry name. A
     ``_run_id`` set on the original dataset is carried over.

     Only ``catalog`` is used; the other arguments (including ``run_id``) are
     accepted but ignored by this implementation.
     """
     for name, dataset in catalog._data_sets.items():
         if not isinstance(dataset, MlflowMetricsDataSet):
             continue
         if dataset._prefix is not None:
             # An explicit prefix was configured: leave the dataset as is.
             continue

         new_kwargs = {"prefix": name}
         # Forward run_id only when one was set on the original dataset.
         if dataset._run_id is not None:
             new_kwargs["run_id"] = dataset._run_id
         catalog._data_sets[name] = MlflowMetricsDataSet(**new_kwargs)
def test_mlflow_metrics_dataset_exists(tmp_path, tracking_uri, metrics3):
    """Check that ``exists()`` is true once the metrics have been saved."""
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    metrics_dataset = MlflowMetricsDataSet(prefix="test_metric")

    # No run id is given here; the test relies on ``save`` creating an
    # MLflow run on its own.
    metrics_dataset.save(metrics3)

    assert metrics_dataset.exists()
def test_mlflow_metrics_dataset_fails_with_invalid_metric(
        tmp_path, tracking_uri, metrics3):
    """Check that saving a bare ``{key: value}`` metric raises ``DataSetError``."""
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    metrics_dataset = MlflowMetricsDataSet(prefix="test_metric")

    # A plain ``key: value`` entry is not valid: each metric must be given
    # as {key: {value, step}}.
    invalid_metrics = {"metric1": 1}
    expected_message = "Unexpected metric value. Should be of type"

    with pytest.raises(DataSetError, match=expected_message):
        metrics_dataset.save(invalid_metrics)
def test_mlflow_metrics_dataset_does_not_exist(tmp_path, tracking_uri,
                                               metrics3):
    """Check that ``exists()`` is false when the metrics were never saved.

    A run is opened and immediately closed only to obtain a valid run id for
    the dataset to search in; nothing is logged into it.
    """
    mlflow.set_tracking_uri(tracking_uri.as_uri())

    # Use the context manager so the run is always closed, even if reading
    # the run id fails; a leaked active run would leak into other tests.
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id

    mlflow_metrics_dataset = MlflowMetricsDataSet(prefix="test_metric",
                                                  run_id=run_id)

    # The run exists, but no metric was ever saved under this prefix.
    assert not mlflow_metrics_dataset.exists()
def test_mlflow_metrics_dataset_saved_without_run_id(tmp_path, tracking_uri,
                                                     metrics3):
    """Check that metrics saved without an explicit run id are logged in MLflow.

    The dataset is created without ``run_id`` and saved outside any
    ``start_run`` block; the metrics are then looked up in whichever run is
    active after the save.
    """
    uri = tracking_uri.as_uri()
    mlflow.set_tracking_uri(uri)
    client = MlflowClient(tracking_uri=uri)

    metric_prefix = "test_metric"
    metrics_dataset = MlflowMetricsDataSet(prefix=metric_prefix)

    # No run id is given; the test relies on ``save`` leaving a run active.
    metrics_dataset.save(metrics3)
    active_run_id = mlflow.active_run().info.run_id

    assert_are_metrics_logged(metrics3, client, active_run_id, metric_prefix)
def test_mlflow_metrics_logging_deactivation(tracking_uri, metrics):
    """Check that saving with ``_logging_activated = False`` creates no run.

    The set of run ids across all listed experiments is captured before and
    after ``save`` and must be unchanged.
    """
    mlflow_metrics_dataset = MlflowMetricsDataSet(prefix="hello")

    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())

    mlflow_metrics_dataset._logging_activated = False

    def list_all_run_ids():
        # Use each experiment's real id rather than assuming the ids are
        # exactly 0..n-1, which does not hold once an experiment is deleted
        # or ids are not sequential.
        return {
            run.run_id
            for experiment in mlflow_client.list_experiments()
            for run in mlflow_client.list_run_infos(
                experiment_id=experiment.experiment_id)
        }

    all_runs_id_beginning = list_all_run_ids()

    mlflow_metrics_dataset.save(metrics)

    all_runs_id_end = list_all_run_ids()

    assert all_runs_id_beginning == all_runs_id_end
def test_mlflow_metrics_logging_deactivation_is_bool():
    """Check that assigning a non-boolean ``_logging_activated`` raises ``ValueError``."""
    metrics_dataset = MlflowMetricsDataSet(prefix="hello")
    expected_message = "_logging_activated must be a boolean"

    with pytest.raises(ValueError, match=expected_message):
        metrics_dataset._logging_activated = "hello"
Пример #11
0
def test_mlflow_hook_metrics_dataset_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline, dummy_run_params
):
    """Check that metric datasets with an explicit run id log into that run.

    A run is created and closed up front, and every MLflow metric dataset in
    the catalog is bound to it. After running the pipeline through
    ``MlflowHook``, the metrics must be found in that pre-existing run, under
    the catalog entry name when no prefix/key was given and under the
    explicit prefix/key otherwise.
    """
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()

        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        model_path = (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix()
        datasets = {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet(model_path),
            "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
            "another_metrics": MlflowMetricsDataSet(
                run_id=existing_run_id, prefix="foo"
            ),
            "my_metric": MlflowMetricDataSet(run_id=existing_run_id),
            "another_metric": MlflowMetricDataSet(run_id=existing_run_id, key="foo"),
            "my_metric_history": MlflowMetricHistoryDataSet(run_id=existing_run_id),
            "another_metric_history": MlflowMetricHistoryDataSet(
                run_id=existing_run_id, key="bar"
            ),
        }
        catalog_with_run_id = DataCatalog(datasets)

        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        mlflow_hook.after_context_created(context)
        # `after_catalog_created` does not use any argument other than the
        # catalog, so the rest are set to empty values.
        mlflow_hook.after_catalog_created(
            catalog=catalog_with_run_id,
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=catalog_with_run_id,
        )
        runner.run(dummy_pipeline, catalog_with_run_id, session._hook_manager)

        # Must be read before `after_pipeline_run` is called.
        current_run_id = mlflow.active_run().info.run_id

        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=catalog_with_run_id,
        )

        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        # The first run is created in the Default experiment (id 0), but the
        # one initialised in before_pipeline_run is created in the
        # kedro_project experiment (id 1).
        all_runs_id = {
            run.run_id
            for experiment_index in range(2)
            for run in mlflow_client.list_run_infos(
                experiment_id=f"{experiment_index}"
            )
        }

        # The metrics are supposed to have been logged inside existing_run_id.
        run_data = mlflow_client.get_run(existing_run_id).data

        assert all_runs_id == {current_run_id, existing_run_id}

        # Metric names are the catalog entry name unless a prefix/key was
        # given. For the history datasets the whole list is stored, but only
        # the last value is retrieved here.
        expected_metrics = {
            "my_metrics.metric_key": 1.1,
            "foo.metric_key": 1.1,
            "my_metric": 1.1,
            "foo": 1.1,
            "my_metric_history": 0.2,
            "bar": 0.2,
        }
        for metric_name, expected_value in expected_metrics.items():
            assert run_data.metrics[metric_name] == expected_value
def test_mlflow_pipeline_hook_metrics_with_run_id(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    dummy_pipeline_ml,
    dummy_run_params,
    dummy_mlflow_conf,
):
    """Check that metrics datasets with an explicit run id log into that run.

    A run is created and closed up front, and both metrics datasets are bound
    to it. After running the pipeline through ``MlflowPipelineHook``, the
    metrics must be found in that pre-existing run, prefixed by the catalog
    entry name ("my_metrics") or by the explicit prefix ("foo").
    """
    # Make the temporary directory look like a kedro project and work from it.
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)

    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

    with mlflow.start_run():
        existing_run_id = mlflow.active_run().info.run_id

    model_path = (tmp_path / "model.csv").as_posix()
    datasets = {
        "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet(model_path),
        "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
        "another_metrics": MlflowMetricsDataSet(run_id=existing_run_id,
                                                prefix="foo"),
    }
    catalog_with_run_id = DataCatalog(datasets)

    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()

    # `after_catalog_created` does not use any argument other than the
    # catalog, so the rest are set to empty values.
    pipeline_hook.after_catalog_created(
        catalog=catalog_with_run_id,
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params,
        pipeline=dummy_pipeline_ml,
        catalog=catalog_with_run_id,
    )
    runner.run(dummy_pipeline_ml, catalog_with_run_id)

    # Must be read before `after_pipeline_run` is called.
    current_run_id = mlflow.active_run().info.run_id

    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params,
        pipeline=dummy_pipeline_ml,
        catalog=catalog_with_run_id,
    )

    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_infos = mlflow_client.list_run_infos(experiment_id="0")
    all_runs_id = {run.run_id for run in run_infos}

    # The metrics are supposed to have been logged inside existing_run_id.
    run_data = mlflow_client.get_run(existing_run_id).data

    assert all_runs_id == {current_run_id, existing_run_id}

    # Metric names are prefixed with the dataset's catalog name, or with the
    # explicit prefix when one was given.
    expected_metrics = {
        "my_metrics.metric_key": 1.1,
        "foo.metric_key": 1.1,
    }
    for metric_name, expected_value in expected_metrics.items():
        assert run_data.metrics[metric_name] == expected_value
Пример #13
0
def test_mlflow_pipeline_hook_metrics_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline_ml, dummy_run_params
):
    """Check that metrics datasets with an explicit run id log into that run.

    Inside a ``KedroSession``, a run is created and closed up front and both
    metrics datasets are bound to it. After running the pipeline through
    ``MlflowPipelineHook``, the metrics must be found in that pre-existing
    run, prefixed by the catalog entry name ("my_metrics") or by the explicit
    prefix ("foo").
    """
    project_path = kedro_project_with_mlflow_conf
    project_metadata = _get_project_metadata(project_path)
    _add_src_to_path(project_metadata.source_dir, project_path)
    configure_project(project_metadata.package_name)

    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=project_path,
    ):
        mlflow_conf = get_mlflow_config()
        mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)

        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        model_path = (project_path / "data" / "model.csv").as_posix()
        datasets = {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet(model_path),
            "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
            "another_metrics": MlflowMetricsDataSet(
                run_id=existing_run_id, prefix="foo"
            ),
        }
        catalog_with_run_id = DataCatalog(datasets)

        pipeline_hook = MlflowPipelineHook()
        runner = SequentialRunner()

        # `after_catalog_created` does not use any argument other than the
        # catalog, so the rest are set to empty values.
        pipeline_hook.after_catalog_created(
            catalog=catalog_with_run_id,
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=catalog_with_run_id,
        )
        runner.run(dummy_pipeline_ml, catalog_with_run_id)

        # Must be read before `after_pipeline_run` is called.
        current_run_id = mlflow.active_run().info.run_id

        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline_ml,
            catalog=catalog_with_run_id,
        )

        mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
        # The first run is created in the Default experiment (id 0), but the
        # one initialised in before_pipeline_run is created in the
        # kedro_project experiment (id 1).
        all_runs_id = {
            run.run_id
            for experiment_index in range(2)
            for run in mlflow_client.list_run_infos(
                experiment_id=f"{experiment_index}"
            )
        }

        # The metrics are supposed to have been logged inside existing_run_id.
        run_data = mlflow_client.get_run(existing_run_id).data

        assert all_runs_id == {current_run_id, existing_run_id}

        # Metric names are prefixed with the dataset's catalog name, or with
        # the explicit prefix when one was given.
        expected_metrics = {
            "my_metrics.metric_key": 1.1,
            "foo.metric_key": 1.1,
        }
        for metric_name, expected_value in expected_metrics.items():
            assert run_data.metrics[metric_name] == expected_value