Example #1
def test_pipeline_run_hook_getting_configs(
    kedro_project,
    dummy_run_params,
    dummy_pipeline,
    dummy_catalog,
):
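    # The node hook should pick up flatten_dict_params / recursive / sep from the
    # project's local mlflow.yml once before_pipeline_run fires inside a KedroSession.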

    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        dict(
            hooks=dict(
                node=dict(flatten_dict_params=True, recursive=False, sep="-")
            )
        ),
    )

    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
            package_name=project_metadata.package_name,
            project_path=kedro_project,
    ):
        mlflow_node_hook = MlflowNodeHook()
        mlflow_node_hook.before_pipeline_run(run_params=dummy_run_params,
                                             pipeline=dummy_pipeline,
                                             catalog=dummy_catalog)

        assert (
            mlflow_node_hook.flatten,
            mlflow_node_hook.recursive,
            mlflow_node_hook.sep,
        ) == (True, False, "-")
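
The snippets on this page omit their imports and the local _write_yaml test helper. A plausible preamble for the hook tests, assuming kedro 0.16/0.17-era module paths, is sketched below; the exact location of the private kedro helpers (_get_project_metadata, _add_src_to_path) varies between releases.

# Assumed imports for these test snippets; module paths are a best guess and may
# need adjusting for your kedro / kedro-mlflow versions.
import mlflow
import pytest
from mlflow.tracking import MlflowClient
from mlflow.utils.validation import MAX_PARAM_VAL_LENGTH

from kedro.framework.project import configure_project
from kedro.framework.session import KedroSession
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node

from kedro_mlflow.framework.hooks import MlflowNodeHook, MlflowPipelineHook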
Example #2
def test_node_hook(tmp_path):
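    # Checks that the "parameters" dict is flattened with the configured separator
    # when logged to MLflow (e.g. "parameters-param1"), alongside plain params.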
    mlflow_node_hook = MlflowNodeHook(flatten_dict_params=True,
                                      recursive=True,
                                      sep="-")

    def fake_fun(arg1, arg2, arg3):
        return None

    node_test = node(
        func=fake_fun,
        inputs={
            "arg1": "params:param1",
            "arg2": "foo",
            "arg3": "parameters"
        },
        outputs="out",
    )
    catalog = DataCatalog({
        "params:param1": 1,
        "foo": MemoryDataSet(),
        "bar": MemoryDataSet(),
        "parameters": {
            "param1": 1,
            "param2": 2
        },
    })
    node_inputs = {
        v: catalog._data_sets.get(v) for v in node_test._inputs.values()
    }

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow_node_hook.before_node_run(
            node=node_test,
            catalog=catalog,
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == {
        "param1": "1",
        "parameters-param1": "1",
        "parameters-param2": "2",
    }
Example #3
def test_pipeline_run_hook_getting_configs(tmp_path, config_dir, monkeypatch,
                                           dummy_run_params, dummy_pipeline,
                                           dummy_catalog):
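    # Same check as Example #1, but written for an older project layout: the test
    # chdirs into a temporary project instead of opening a KedroSession.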

    monkeypatch.chdir(tmp_path)

    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(
            hooks=dict(
                node=dict(flatten_dict_params=True, recursive=False, sep="-")
            )
        ),
    )

    mlflow_node_hook = MlflowNodeHook()
    mlflow_node_hook.before_pipeline_run(run_params=dummy_run_params,
                                         pipeline=dummy_pipeline,
                                         catalog=dummy_catalog)

    assert (
        mlflow_node_hook.flatten,
        mlflow_node_hook.recursive,
        mlflow_node_hook.sep,
    ) == (True, False, "-")
Example #4
def test_node_hook_logging(
    tmp_path,
    mocker,
    monkeypatch,
    dummy_run_params,
    dummy_catalog,
    dummy_pipeline,
    dummy_node,
    config_dir,
    flatten_dict_params,
    expected,
):
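    # Parametrized variant: "flatten_dict_params" and "expected" are fixture
    # parameters, so the same body covers both flattened and unflattened logging.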

    mocker.patch("logging.config.dictConfig")
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    # config = KedroMlflowConfig(
    #     project_path=tmp_path,
    #     node_hook_opts={"flatten_dict_params": flatten_dict_params, "sep": "-"},
    # )
    # # the function is imported inside the other file and this is the file to patch
    # # see https://stackoverflow.com/questions/30987973/python-mock-patch-doesnt-work-as-expected-for-public-method
    # mocker.patch(
    #     "kedro_mlflow.framework.hooks.node_hook.get_mlflow_config", return_value=config
    # )

    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(
            hooks=dict(
                node=dict(
                    flatten_dict_params=flatten_dict_params,
                    recursive=False,
                    sep="-",
                )
            )
        ),
    )

    mlflow_node_hook = MlflowNodeHook()

    node_inputs = {
        v: dummy_catalog._data_sets.get(v) for v in dummy_node._inputs.values()
    }

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow_node_hook.before_pipeline_run(run_params=dummy_run_params,
                                             pipeline=dummy_pipeline,
                                             catalog=dummy_catalog)
        mlflow_node_hook.before_node_run(
            node=dummy_node,
            catalog=dummy_catalog,
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == expected
Example #5
def test_node_hook_logging_above_limit_tag_strategy(kedro_project,
                                                    dummy_run_params,
                                                    param_length):
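    # With long_parameters_strategy: "tag", parameter values above the MLflow
    # length limit are logged as run tags instead of params (asserted below).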

    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        dict(hooks=dict(node=dict(long_parameters_strategy="tag"))),
    )

    mlflow_tracking_uri = (kedro_project / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    mlflow_node_hook = MlflowNodeHook()

    param_value = param_length * "a"
    node_inputs = {"params:my_param": param_value}

    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
            package_name=project_metadata.package_name,
            project_path=kedro_project,
    ):
        with mlflow.start_run():
            mlflow_node_hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=Pipeline([]),
                catalog=DataCatalog(),
            )

            # IMPORTANT: exceeding the parameter length limit should raise an
            # error on every mlflow backend, but it does not on the FileStore
            # backend:
            # https://github.com/mlflow/mlflow/issues/2814#issuecomment-628284425
            # These tests use FileStore for simplicity, so logging still succeeds;
            # kedro-mlflow enforces the failure itself, which differs slightly
            # from mlflow's own behaviour.
            mlflow_node_hook.before_node_run(
                node=node(func=lambda x: x, inputs=dict(x="a"), outputs=None),
                catalog=DataCatalog(),  # can be empty
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
            run_id = mlflow.active_run().info.run_id

        mlflow_client = MlflowClient(mlflow_tracking_uri)
        current_run = mlflow_client.get_run(run_id)
        assert current_run.data.params == {}
        assert {
            k: v
            for k, v in current_run.data.tags.items()
            if not k.startswith("mlflow")
        } == {
            "my_param": param_value
        }
Example #6
class ProjectContext(KedroContext):

    project_name = "nyc-taxi"
    # `project_version` is the version of kedro used to generate the project
    project_version = "0.16.4"
    package_name = "nyc-taxi"
    hooks = (
        MlflowNodeHook(flatten_dict_params=False),
        MlflowPipelineHook(
            model_name="nyc-taxi",
            conda_env="/home/dwarszawski/Workspace/personal/dssconf2020/dssconf2020/ml-pipeline/src/requirements.txt",
        ),
    )

    def _get_pipelines(self) -> Dict[str, Pipeline]:
        return create_pipelines()
Example #7
class ProjectContext(KedroContext):
    """Users can override the remaining methods from the parent class here,
    or create new ones (e.g. as required by plugins)
    """

    project_name = "{{ cookiecutter.project_name }}"
    # `project_version` is the version of kedro used to generate the project
    project_version = "{{ cookiecutter.kedro_version }}"
    package_name = "{{ cookiecutter.python_package }}"
    hooks = (
        MlflowNodeHook(flatten_dict_params=False),
        MlflowPipelineHook(
            model_name="{{ cookiecutter.python_package }}", conda_env="src/requirements.txt",
        ),
    )

    def _get_pipelines(self) -> Dict[str, Pipeline]:
        return create_pipelines()
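
Examples #6 and #7 register the hooks through the hooks tuple on a KedroContext subclass, which is the kedro 0.16.x convention; newer kedro versions register hooks in settings.py instead, so adapt the registration point to your kedro version.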
Example #8
def test_node_hook_logging_above_limit_truncate_strategy(
        kedro_project, dummy_run_params, param_length):
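    # With long_parameters_strategy: "truncate", an over-length parameter value is
    # cut down to MAX_PARAM_VAL_LENGTH characters before being logged.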

    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        dict(hooks=dict(node=dict(long_parameters_strategy="truncate"))),
    )

    mlflow_tracking_uri = (kedro_project / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    mlflow_node_hook = MlflowNodeHook()

    param_value = param_length * "a"
    node_inputs = {"params:my_param": param_value}

    project_metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(project_metadata.source_dir, kedro_project)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
            package_name=project_metadata.package_name,
            project_path=kedro_project,
    ):
        with mlflow.start_run():
            mlflow_node_hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=Pipeline([]),
                catalog=DataCatalog(),
            )
            mlflow_node_hook.before_node_run(
                node=node(func=lambda x: x, inputs=dict(x="a"), outputs=None),
                catalog=DataCatalog(),  # can be empty
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
            run_id = mlflow.active_run().info.run_id

        mlflow_client = MlflowClient(mlflow_tracking_uri)
        current_run = mlflow_client.get_run(run_id)
        assert current_run.data.params == {
            "my_param": param_value[0:MAX_PARAM_VAL_LENGTH]
        }
Example #9
def test_node_hook_logging_above_limit_fail_strategy(tmp_path, config_dir,
                                                     dummy_run_params,
                                                     dummy_node, param_length):
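    # With long_parameters_strategy: "fail", logging a parameter longer than the
    # MLflow limit is expected to raise a ValueError (see pytest.raises below).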

    # mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)

    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(hooks=dict(node=dict(long_parameters_strategy="fail"))),
    )

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    mlflow_node_hook = MlflowNodeHook()

    param_value = param_length * "a"
    node_inputs = {"params:my_param": param_value}

    with mlflow.start_run():
        mlflow_node_hook.before_pipeline_run(run_params=dummy_run_params,
                                             pipeline=Pipeline([]),
                                             catalog=DataCatalog())

        # IMPORTANT: exceeding the parameter length limit should raise an
        # error on every mlflow backend, but it does not on the FileStore
        # backend:
        # https://github.com/mlflow/mlflow/issues/2814#issuecomment-628284425
        # These tests use FileStore for simplicity, so logging still succeeds;
        # kedro-mlflow enforces the failure itself, which differs slightly
        # from mlflow's own behaviour.
        with pytest.raises(
                ValueError,
                match=f"Parameter 'my_param' length is {param_length}"):
            mlflow_node_hook.before_node_run(
                node=node(func=lambda x: x, inputs=dict(x="a"), outputs=None),
                catalog=DataCatalog(),  # can be empty
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
Example #10
def test_node_hook_logging_above_limit_truncate_strategy(
        tmp_path, config_dir, dummy_run_params, dummy_node, param_length):
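    # Older variant of Example #8: same "truncate" strategy, but set up with a
    # temporary config directory rather than a KedroSession.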

    # mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)

    _write_yaml(
        tmp_path / "conf" / "base" / "mlflow.yml",
        dict(hooks=dict(node=dict(long_parameters_strategy="truncate"))),
    )

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    mlflow_node_hook = MlflowNodeHook()

    param_value = param_length * "a"
    node_inputs = {"params:my_param": param_value}

    with mlflow.start_run():
        mlflow_node_hook.before_pipeline_run(run_params=dummy_run_params,
                                             pipeline=Pipeline([]),
                                             catalog=DataCatalog())
        mlflow_node_hook.before_node_run(
            node=node(func=lambda x: x, inputs=dict(x="a"), outputs=None),
            catalog=DataCatalog(),  # can be empty
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == {
        "my_param": param_value[0:MAX_PARAM_VAL_LENGTH]
    }
Example #11
def mock_settings_with_mlflow_hooks(mocker):
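    # Helper (presumably a pytest fixture) returning mocked project settings whose
    # registered hooks include both kedro-mlflow hooks next to DummyProjectHooks.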

    return _mock_settings_with_hooks(mocker,
                                     hooks=(DummyProjectHooks(),
                                            MlflowPipelineHook(),
                                            MlflowNodeHook()))