def pipeline_ml_with_inputs_artifacts():
    full_pipeline = Pipeline(
        [
            node(
                func=remove_stopwords,
                inputs=dict(data="data", stopwords="stopwords_from_nltk"),
                outputs="cleaned_data",
                tags=["training", "inference"],
            ),
            node(
                func=train_fun,
                inputs="cleaned_data",
                outputs="model",
                tags=["training"],
            ),
            node(
                func=predict_fun,
                inputs=["model", "cleaned_data"],
                outputs="predictions",
                tags=["inference"],
            ),
        ]
    )
    pipeline_ml_with_inputs_artifacts = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="data",
    )
    return pipeline_ml_with_inputs_artifacts
def test_kedro_pipeline_ml_loading_deepcopiable_catalog(tmp_path, tmp_folder):
    # create pipeline and catalog. The training will not be triggered
    def fit_fun(data):
        pass

    def predict_fun(model, data):
        return model.predict(data)

    training_pipeline = Pipeline([node(func=fit_fun, inputs="data", outputs="model")])
    inference_pipeline = Pipeline(
        [node(func=predict_fun, inputs=["model", "data"], outputs="predictions")]
    )
    ml_pipeline = pipeline_ml_factory(
        training=training_pipeline,
        inference=inference_pipeline,
        input_name="data",
    )

    # emulate training by creating the model manually
    model_dataset = MlflowModelSaverDataSet(
        filepath=(tmp_path / "model.pkl").resolve().as_posix(),
        flavor="mlflow.sklearn",
    )
    data = pd.DataFrame(
        data=[[1, 2], [3, 4]],
        columns=["a", "b"],
    )
    labels = [4, 6]
    linreg = LinearRegression()
    linreg.fit(data, labels)
    model_dataset.save(linreg)

    # check that mlflow loading is ok
    catalog = DataCatalog({"data": MemoryDataSet(), "model": model_dataset})
    kedro_model = KedroPipelineModel(
        pipeline=ml_pipeline, catalog=catalog, input_name=ml_pipeline.input_name
    )
    artifacts = kedro_model.extract_pipeline_artifacts(tmp_folder)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model", python_model=kedro_model, artifacts=artifacts
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(
        model_uri=(Path(r"runs:/") / run_id / "model").as_posix()
    )
    # the comparison must be asserted, otherwise its result is silently discarded
    assert all(loaded_model.predict(data) == [4.0, 6.0])
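# A minimal sketch (not part of the test above) of how the model logged there
# could be consumed from another process; "<run_id>" is a placeholder for a
# real run id, and the column names reuse the DataFrame schema from the test.
import mlflow
import pandas as pd

loaded = mlflow.pyfunc.load_model("runs:/<run_id>/model")
batch = pd.DataFrame(data=[[5, 6]], columns=["a", "b"])
predictions = loaded.predict(batch)  # runs the full inference pipeline on the batch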
def test_mlflow_pipeline_hook_with_copy_mode(
    kedro_project_with_mlflow_conf,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    copy_mode,
    expected,
):
    # config_with_base_mlflow_conf is a conftest fixture
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        pipeline_hook = MlflowPipelineHook()
        runner = SequentialRunner()

        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` does not use any of the arguments below,
            # so we set them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline_ml.training,
            inference=dummy_pipeline_ml.inference,
            input_name=dummy_pipeline_ml.input_name,
            conda_env={},
            model_name=dummy_pipeline_ml.model_name,
            copy_mode=copy_mode,
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog)
        run_id = mlflow.active_run().info.run_id
        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        mlflow_tracking_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()
        mlflow.set_tracking_uri(mlflow_tracking_uri)

        loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")
        actual_copy_mode = {
            name: ds._copy_mode
            for name, ds in loaded_model._model_impl.python_model.loaded_catalog._data_sets.items()
        }
        assert actual_copy_mode == expected
def pipeline_ml_obj():
    def preprocess_fun(data):
        return data

    def fit_fun(data):
        return 2

    def predict_fun(model, data):
        return data * model

    full_pipeline = Pipeline(
        [
            node(
                func=preprocess_fun,
                inputs="raw_data",
                outputs="data",
                tags=["inference", "training"],
            ),
            node(func=fit_fun, inputs="data", outputs="model", tags=["training"]),
            node(
                func=predict_fun,
                inputs=["data", "model"],
                outputs="predictions",
                tags=["inference"],
            ),
        ]
    )
    pipeline_ml_obj = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="raw_data",
    )
    return pipeline_ml_obj
def test_mlflow_pipeline_hook_with_copy_mode(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
    copy_mode,
    expected,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()

    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` does not use any of the arguments below,
        # so we set them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )

    pipeline_to_run = pipeline_ml_factory(
        training=dummy_pipeline_ml.training,
        inference=dummy_pipeline_ml.inference,
        input_name=dummy_pipeline_ml.input_name,
        conda_env={},
        model_name=dummy_pipeline_ml.model_name,
        copy_mode=copy_mode,
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")
    actual_copy_mode = {
        name: ds._copy_mode
        for name, ds in loaded_model._model_impl.python_model.loaded_catalog._data_sets.items()
    }
    assert actual_copy_mode == expected
def test_wrong_pipeline_ml_signature_type(pipeline_with_tag):
    with pytest.raises(
        ValueError,
        match="model_signature must be one of 'None', 'auto', or a 'ModelSignature'",
    ):
        pipeline_ml_factory(
            training=pipeline_with_tag,
            inference=Pipeline(
                [
                    node(
                        func=predict_fun,
                        inputs=["model", "data"],
                        outputs="predictions",
                    )
                ]
            ),
            input_name="data",
            model_signature="wrong_type",
        )
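# For contrast with the failing case above, a minimal sketch of a valid
# `model_signature` value built with mlflow's public API; the column names
# "a" and "b" are hypothetical and used only for illustration.
from mlflow.models import ModelSignature
from mlflow.types.schema import ColSpec, Schema

valid_signature = ModelSignature(
    inputs=Schema([ColSpec("double", "a"), ColSpec("double", "b")])
)
# Passing `model_signature=valid_signature` (or None, or the string "auto")
# to pipeline_ml_factory satisfies the type check exercised above.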
def test_mlflow_hook_save_pipeline_ml_with_artifact_path(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    artifact_path,
    expected_artifact_path,
):
    # config_with_base_mlflow_conf is a conftest fixture
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        log_model_kwargs = {
            "conda_env": env_from_dict,
        }
        if artifact_path is not None:
            # we need to test what happens if the key is NOT present
            log_model_kwargs["artifact_path"] = artifact_path

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline.only_nodes_with_tags("training"),
            inference=dummy_pipeline.only_nodes_with_tags("inference"),
            input_name="raw_data",
            log_model_kwargs=log_model_kwargs,
        )

        context = session.load_context()
        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` does not use any of the arguments below,
            # so we set them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        # test: the model should have been logged under the expected artifact path
        trained_model = mlflow.pyfunc.load_model(
            f"runs:/{run_id}/{expected_artifact_path}"
        )
        # the real test is that the model is loaded without error
        assert trained_model is not None
def dummy_pipeline_ml(dummy_pipeline, env_from_dict):
    dummy_pipeline_ml = pipeline_ml_factory(
        training=dummy_pipeline.only_nodes_with_tags("training"),
        inference=dummy_pipeline.only_nodes_with_tags("inference"),
        input_name="raw_data",
        conda_env=env_from_dict,
        model_name="model",
    )
    return dummy_pipeline_ml
def pipeline_ml_with_parameters():
    def remove_stopwords(data, stopwords):
        return data

    def train_fun_hyperparam(data, hyperparam):
        return 2

    def predict_fun(model, data):
        return data * model

    def convert_probs_to_pred(data, threshold):
        return (data > threshold) * 1

    full_pipeline = Pipeline(
        [
            # almost the same as previously, but the stopwords are now parameters.
            # This is a parameter shared between inference and training.
            node(
                func=remove_stopwords,
                inputs=dict(data="data", stopwords="params:stopwords"),
                outputs="cleaned_data",
                tags=["training", "inference"],
            ),
            # a parameter used only in the training pipeline; it should not be persisted
            node(
                func=train_fun_hyperparam,
                inputs=["cleaned_data", "params:penalty"],
                outputs="model",
                tags=["training"],
            ),
            node(
                func=predict_fun,
                inputs=["model", "cleaned_data"],
                outputs="predicted_probs",
                tags=["inference"],
            ),
            # this time, there is a parameter used only in the inference pipeline
            node(
                func=convert_probs_to_pred,
                inputs=["predicted_probs", "params:threshold"],
                outputs="predictions",
                tags=["inference"],
            ),
        ]
    )
    pipeline_ml_with_parameters = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="data",
        log_model_kwargs={
            "conda_env": {
                "python": "3.7.0",
                "dependencies": ["kedro==0.16.5"],
            },
        },
    )
    return pipeline_ml_with_parameters
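# A minimal sketch (hypothetical, not part of the fixture above) of how the
# shared and inference-only parameters become plain catalog entries when the
# PipelineML is wrapped for serving; it mirrors the KedroPipelineModel usage in
# test_kedro_pipeline_ml_loading_deepcopiable_catalog above, and all data
# values are made up.
catalog_for_serving = DataCatalog(
    {
        "data": MemoryDataSet(),
        "model": MemoryDataSet(data=2),
        "params:stopwords": MemoryDataSet(data=["the", "a"]),
        "params:threshold": MemoryDataSet(data=0.5),
    }
)
kedro_model_sketch = KedroPipelineModel(
    pipeline=pipeline_ml_with_parameters(),
    catalog=catalog_for_serving,
    input_name="data",
)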
def test_too_many_free_inputs():
    with pytest.raises(
        KedroMlflowPipelineMLInputsError, match="No free input is allowed"
    ):
        pipeline_ml_factory(
            training=Pipeline(
                [
                    node(
                        func=preprocess_fun,
                        inputs="raw_data",
                        outputs="neither_data_nor_model",
                    )
                ]
            ),
            inference=Pipeline(
                [
                    node(
                        func=predict_fun,
                        inputs=["model", "data"],
                        outputs="predictions",
                    )
                ]
            ),
            input_name="data",
        )
def pipeline_ml_with_tag(pipeline_with_tag):
    pipeline_ml_with_tag = pipeline_ml_factory(
        training=pipeline_with_tag,
        inference=Pipeline(
            [node(func=predict_fun, inputs=["model", "data"], outputs="predictions")]
        ),
        input_name="data",
    )
    return pipeline_ml_with_tag
def dummy_pipeline_ml(dummy_pipeline, env_from_dict):
    dummy_pipeline_ml = pipeline_ml_factory(
        training=dummy_pipeline.only_nodes_with_tags("training"),
        inference=dummy_pipeline.only_nodes_with_tags("inference"),
        input_name="raw_data",
        log_model_kwargs={
            "conda_env": env_from_dict,
            "artifact_path": "model",
        },
    )
    return dummy_pipeline_ml
def test_too_many_inference_outputs():
    with pytest.raises(
        KedroMlflowPipelineMLOutputsError,
        match="The inference pipeline must have one and only one output",
    ):
        pipeline_ml_factory(
            training=Pipeline([node(func=train_fun, inputs="data", outputs="model")]),
            inference=Pipeline(
                [
                    node(
                        func=predict_fun_with_metric,
                        inputs=["model", "data"],
                        outputs=["predictions", "metric"],
                    )
                ]
            ),
            input_name="data",
        )
def test_not_enough_inference_outputs():
    with pytest.raises(
        KedroMlflowPipelineMLOutputsError,
        match="The inference pipeline must have one and only one output",
    ):
        pipeline_ml_factory(
            training=Pipeline([node(func=train_fun, inputs="data", outputs="model")]),
            inference=Pipeline(
                [
                    node(
                        func=predict_fun_return_nothing,
                        inputs=["model", "data"],
                        outputs=None,
                    )
                ]
            ),
            input_name="data",
        )
def test_mlflow_pipeline_hook_with_pipeline_ml_signature(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    model_signature,
    expected_signature,
):
    # config_with_base_mlflow_conf is a conftest fixture
    project_metadata = _get_project_metadata(kedro_project_with_mlflow_conf)
    _add_src_to_path(project_metadata.source_dir, kedro_project_with_mlflow_conf)
    configure_project(project_metadata.package_name)
    with KedroSession.create(
        package_name=project_metadata.package_name,
        project_path=kedro_project_with_mlflow_conf,
    ):
        pipeline_hook = MlflowPipelineHook()
        runner = SequentialRunner()

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline.only_nodes_with_tags("training"),
            inference=dummy_pipeline.only_nodes_with_tags("inference"),
            input_name="raw_data",
            conda_env=env_from_dict,
            model_name="model",
            model_signature=model_signature,
        )

        pipeline_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` does not use any of the arguments below,
            # so we set them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
            run_id=dummy_run_params["run_id"],
        )
        pipeline_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog)
        run_id = mlflow.active_run().info.run_id
        pipeline_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        # test: the model signature should have been logged
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature == expected_signature
def test_mlflow_pipeline_hook_with_pipeline_ml_signature(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
    model_signature,
    expected_signature,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()

    pipeline_to_run = pipeline_ml_factory(
        training=dummy_pipeline.only_nodes_with_tags("training"),
        inference=dummy_pipeline.only_nodes_with_tags("inference"),
        input_name="raw_data",
        conda_env=env_from_dict,
        model_name="model",
        model_signature=model_signature,
    )

    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` does not use any of the arguments below,
        # so we set them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )

    # test: the model signature should have been logged
    trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
    assert trained_model.metadata.signature == expected_signature
def test_mlflow_hook_save_pipeline_ml_with_signature(
    kedro_project_with_mlflow_conf,
    env_from_dict,
    dummy_pipeline,
    dummy_catalog,
    dummy_run_params,
    model_signature,
    expected_signature,
):
    # config_with_base_mlflow_conf is a conftest fixture
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline.only_nodes_with_tags("training"),
            inference=dummy_pipeline.only_nodes_with_tags("inference"),
            input_name="raw_data",
            log_model_kwargs={
                "conda_env": env_from_dict,
                "signature": model_signature,
            },
        )

        context = session.load_context()
        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` does not use any of the arguments below,
            # so we set them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        # test: the model signature should have been logged
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature == expected_signature
def pipeline_ml_with_intermediary_artifacts():
    full_pipeline = Pipeline(
        [
            node(
                func=preprocess_fun,
                inputs="raw_data",
                outputs="data",
                tags=["training"],
            ),
            node(
                func=fit_encoder_fun,
                inputs="data",
                outputs="encoder",
                tags=["training"],
            ),
            node(
                func=apply_encoder_fun,
                inputs=["encoder", "data"],
                outputs="encoded_data",
                tags=["training", "inference"],
            ),
            node(
                func=train_fun,
                inputs="encoded_data",
                outputs="model",
                tags=["training"],
            ),
            node(
                func=predict_fun,
                inputs=["model", "encoded_data"],
                outputs="predictions",
                tags=["inference"],
            ),
        ]
    )
    pipeline_ml_with_intermediary_artifacts = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="data",
    )
    return pipeline_ml_with_intermediary_artifacts
def register_pipelines(self) -> Dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from a pipeline name to a ``Pipeline`` object.
    """
    etl_pipeline = create_etl_pipeline()
    etl_instances_pipeline = etl_pipeline.only_nodes_with_tags("etl_instances")
    etl_labels_pipeline = etl_pipeline.only_nodes_with_tags("etl_labels")

    ml_pipeline = create_ml_pipeline()
    inference_pipeline = ml_pipeline.only_nodes_with_tags("inference")
    training_pipeline_ml = pipeline_ml_factory(
        training=ml_pipeline.only_nodes_with_tags("training"),
        inference=inference_pipeline,
        input_name="instances",
        model_name="kedro_mlflow_tutorial",
        conda_env={
            "python": "3.7",
            "pip": [f"kedro_mlflow_tutorial=={PROJECT_VERSION}"],
        },
        model_signature="auto",
    )

    user_app_pipeline = create_user_app_pipeline()

    return {
        "etl_instances": etl_instances_pipeline,
        "etl_labels": etl_labels_pipeline,
        "training": training_pipeline_ml,
        "inference": inference_pipeline,
        "user_app": user_app_pipeline,
        "__default__": etl_instances_pipeline
        + etl_labels_pipeline
        + inference_pipeline
        + user_app_pipeline,
    }
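# A plausible way to exercise the pipelines registered above from the command
# line, assuming (as in the tests above) that `model_name` is used as the
# mlflow artifact path; "<run_id>" is a placeholder:
#
#   kedro run --pipeline=training
#   mlflow models serve -m "runs:/<run_id>/kedro_mlflow_tutorial"
#
# The training run logs the inference pipeline as an mlflow model through the
# project hooks, and `mlflow models serve` then exposes it as a REST endpoint.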
def pipeline_ml_with_parameters():
    full_pipeline = Pipeline(
        [
            # almost the same as previously, but the stopwords are now parameters.
            # This is a parameter shared between inference and training.
            node(
                func=remove_stopwords,
                inputs=dict(data="data", stopwords="params:stopwords"),
                outputs="cleaned_data",
                tags=["training", "inference"],
            ),
            # a parameter used only in the training pipeline; it should not be persisted
            node(
                func=train_fun_hyperparam,
                inputs=["cleaned_data", "params:penalty"],
                outputs="model",
                tags=["training"],
            ),
            node(
                func=predict_fun,
                inputs=["model", "cleaned_data"],
                outputs="predicted_probs",
                tags=["inference"],
            ),
            # this time, there is a parameter used only in the inference pipeline
            node(
                func=convert_probs_to_pred,
                inputs=["predicted_probs", "params:threshold"],
                outputs="predictions",
                tags=["inference"],
            ),
        ]
    )
    pipeline_ml_with_parameters = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="data",
    )
    return pipeline_ml_with_parameters
def test_mlflow_hook_save_pipeline_ml_with_copy_mode(
    kedro_project_with_mlflow_conf,
    dummy_pipeline_ml,
    dummy_catalog,
    dummy_run_params,
    copy_mode,
    expected,
):
    # config_with_base_mlflow_conf is a conftest fixture
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()
        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` does not use any of the arguments below,
            # so we set them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )

        pipeline_to_run = pipeline_ml_factory(
            training=dummy_pipeline_ml.training,
            inference=dummy_pipeline_ml.inference,
            input_name=dummy_pipeline_ml.input_name,
            log_model_kwargs={
                "artifact_path": dummy_pipeline_ml.log_model_kwargs["artifact_path"],
                "conda_env": {
                    "python": "3.7.0",
                    "dependencies": ["kedro==0.16.5"],
                },
            },
            kpm_kwargs={
                "copy_mode": copy_mode,
            },
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        mlflow_tracking_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()
        mlflow.set_tracking_uri(mlflow_tracking_uri)

        loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")
        actual_copy_mode = {
            name: ds._copy_mode
            for name, ds in loaded_model._model_impl.python_model.loaded_catalog._data_sets.items()
        }
        assert actual_copy_mode == expected