def test_model_packaging(tmp_path, pipeline_ml_obj):
    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(
            filepath=(tmp_path / "model.pkl").resolve().as_posix()
        ),
    })

    catalog._data_sets["model"].save(2)  # emulate model fitting

    artifacts = pipeline_ml_obj.extract_pipeline_artifacts(catalog)

    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj, catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(
        model_uri=(Path(r"runs:/") / run_id / "model").as_posix()
    )
    assert loaded_model.predict(1) == 2

def run(self): """ Run the workflow : run each config """ # data data_catalog = DataCatalog({ 'config': MemoryDataSet(), 'log': MemoryDataSet(), 'base_directory': MemoryDataSet() }) data_catalog.save('config', self.config) data_catalog.save('log', self.log) data_catalog.save('base_directory', self.base_directory) expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node() multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node( ) # Assemble nodes into a pipeline pipeline = Pipeline([expand_config_node, multiple_learning_node]) # Create a runner to run the pipeline runner = SequentialRunner() # Run the pipeline result = runner.run(pipeline, data_catalog) if len(result) == 0: self.terminate()
def _run_one_task(self, config_filename):
    # create nodes from the tasks
    load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
    prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
    split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
    learn_data_node = mls.sl.workflows.tasks.LearnTask.get_node()

    config, log = self._init_config_log(config_filename,
                                        self.base_directory,
                                        self.config_directory)

    # Prepare a data catalog
    data_catalog = DataCatalog({
        'config': MemoryDataSet(),
        'log': MemoryDataSet(),
        'base_directory': MemoryDataSet(),
    })
    data_catalog.save('config', config)
    data_catalog.save('log', log)
    data_catalog.save('base_directory', self.base_directory)

    # Assemble nodes into a pipeline
    pipeline = Pipeline([
        load_data_node, prepare_data_node, split_data_node, learn_data_node
    ])

    # Create a runner to run the pipeline
    runner = SequentialRunner()

    # Run the pipeline
    runner.run(pipeline, data_catalog)
    return log, config, data_catalog

def test_model_packaging_missing_artifacts(tmp_path, pipeline_ml_obj):
    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(
            filepath=(tmp_path / "model.pkl").resolve().as_posix()
        ),
    })

    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj, catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=None,  # no artifacts provided
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    with pytest.raises(
        ValueError, match="Provided artifacts do not match catalog entries"
    ):
        mlflow.pyfunc.load_model(
            model_uri=(Path(r"runs:/") / run_id / "model").as_posix()
        )

def __init__(self, layers):
    self._data_sets = {
        "cat": PickleDataSet(filepath=str(tmp_path)),
        "parameters": MemoryDataSet({"name": "value"}),
        "params:rabbit": MemoryDataSet("value"),
    }
    self.layers = layers

def run(self): """ Run all tasks """ # data data_catalog = DataCatalog({ 'config': MemoryDataSet(), 'log': MemoryDataSet(), 'base_directory': MemoryDataSet(), 'dataset': MemoryDataSet(), 'data': MemoryDataSet() }) data_catalog.save('config', self.config) data_catalog.save('log', self.log) data_catalog.save('base_directory', self.base_directory) load_data_node = mls.workflows.tasks.LoadDataTask.get_node() prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node() split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node() learn_node = mls.sl.workflows.tasks.LearnTask.get_node() evaluate_node = mls.sl.workflows.tasks.EvaluateTask.get_node() # Assemble nodes into a pipeline pipeline = Pipeline([ load_data_node, prepare_data_node, split_data_node, learn_node, evaluate_node ]) # Create a runner to run the pipeline runner = SequentialRunner() # Run the pipeline runner.run(pipeline, data_catalog) self.terminate()
def dummy_catalog():
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
    return dummy_catalog

def catalog_with_encoder():
    return DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "encoder": CSVDataSet("fake/path/to/encoder.csv"),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })

def test_no_param_datasets_in_response(self, fake_cli_invoke, fake_load_context, mocker):
    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
    mocked_context = fake_load_context.return_value
    catalog_data_sets = {
        "iris_data": CSVDataSet("test.csv"),
        "parameters": MemoryDataSet(),
        "params:data_ratio": MemoryDataSet(),
        "intermediate": MemoryDataSet(),
        "not_used": CSVDataSet("test2.csv"),
    }
    pl_obj_data_sets = catalog_data_sets.keys() - {"not_used"}

    mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
    mocked_context.pipelines.keys.return_value = (self.PIPELINE_NAME,)
    mocked_pl_obj = mocked_context.pipelines.get.return_value
    mocked_pl_obj.data_sets.return_value = pl_obj_data_sets

    result = fake_cli_invoke(["catalog", "list"])

    assert not result.exit_code
    # 'parameters' and 'params:data_ratio' should not appear in the response
    expected_dict = {
        "DataSets in 'pipeline' pipeline": {
            "Datasets mentioned in pipeline": {
                "CSVDataSet": ["iris_data"],
                "MemoryDataSet": ["intermediate"],
            },
            "Datasets not mentioned in pipeline": {"CSVDataSet": ["not_used"]},
        }
    }
    yaml_dump_mock.assert_called_once_with(expected_dict)

def set_catalog(context, key1, key2, key3, key4):
    ds1 = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]})
    ds2 = pd.DataFrame({"col1": [9, 8], "col2": [7, 6], "col3": [5, 4]})
    context.catalog = DataCatalog({
        key1: MemoryDataSet(ds1),
        key2: MemoryDataSet(),
        key3: MemoryDataSet(ds2),
        key4: MemoryDataSet(),
    })

def dummy_catalog(tmp_path): dummy_catalog = DataCatalog({ "raw_data": MemoryDataSet(), "data": MemoryDataSet(), "model": PickleDataSet(filepath=(tmp_path / "model.pkl").resolve().as_posix()), }) return dummy_catalog
def test_catalog_extraction_missing_inference_input(pipeline_ml_with_tag):
    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
    })
    with pytest.raises(
        KedroMlflowPipelineMLDatasetsError,
        match="since it is an input for inference pipeline",
    ):
        pipeline_ml_with_tag.extract_pipeline_catalog(catalog)

def dummy_catalog(tmp_path): dummy_catalog = DataCatalog({ "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])), "params:unused_param": MemoryDataSet("blah"), "data": MemoryDataSet(), "model": PickleDataSet((tmp_path / "model.csv").as_posix()), }) return dummy_catalog
def catalog_with_stopwords():
    catalog_with_stopwords = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "stopwords_from_nltk": CSVDataSet("fake/path/to/stopwords.csv"),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
    return catalog_with_stopwords

def dummy_catalog(tmp_path): dummy_catalog = DataCatalog({ "raw_data": MemoryDataSet(1), "params:unused_param": MemoryDataSet("blah"), "data": MemoryDataSet(), "model": PickleDataSet((tmp_path / "model.csv").as_posix()), }) return dummy_catalog
def catalog_with_stopwords(tmp_path):
    catalog_with_stopwords = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "stopwords_from_nltk": PickleDataSet(
            (tmp_path / "stopwords.pkl").resolve().as_posix()
        ),
        "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
    })
    return catalog_with_stopwords

def catalog_with_encoder(tmp_path):
    catalog_with_encoder = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "encoder": PickleDataSet((tmp_path / "encoder.pkl").resolve().as_posix()),
        "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
    })
    return catalog_with_encoder

def test_catalog_extraction_unpersisted_inference_input(pipeline_ml_with_tag):
    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": MemoryDataSet(),
    })
    with pytest.raises(
        KedroMlflowPipelineMLDatasetsError,
        match="The datasets of the training pipeline must be persisted locally",
    ):
        pipeline_ml_with_tag.extract_pipeline_catalog(catalog)

def dummy_catalog():
    catalog = DataCatalog({
        "params:param1": 1,
        "foo": MemoryDataSet(),
        "bar": MemoryDataSet(),
        "parameters": {"param1": 1, "param2": 2},
    })
    return catalog

def dummy_catalog(tmp_path): dummy_catalog = DataCatalog({ "raw_data": MemoryDataSet(), "data": MemoryDataSet(), "model": PickleDataSet(filepath=(tmp_path / "data" / "06_models" / "model.pkl").resolve().as_posix()), }) dummy_catalog._data_sets["model"].save(2) # emulate model fitting return dummy_catalog
def all_catalog(dataframex, dataframey, dataframey_bad):
    # Build the DataFrames as they sit in Kedro once loaded into memory
    # https://kedro.readthedocs.io/en/stable/05_data/02_kedro_io.html
    from kedro.io import DataCatalog, MemoryDataSet
    catalog = DataCatalog({
        "dataframex": MemoryDataSet(),
        "dataframey": MemoryDataSet(),
        "dataframey_bad": MemoryDataSet(),
    })
    catalog.save("dataframex", dataframex)
    catalog.save("dataframey", dataframey)
    catalog.save("dataframey_bad", dataframey_bad)
    return catalog

def test_node_hook(tmp_path):
    mlflow_node_hook = MlflowNodeHook(
        flatten_dict_params=True, recursive=True, sep="-"
    )

    def fake_fun(arg1, arg2, arg3):
        return None

    node_test = node(
        func=fake_fun,
        inputs={"arg1": "params:param1", "arg2": "foo", "arg3": "parameters"},
        outputs="out",
    )
    catalog = DataCatalog({
        "params:param1": 1,
        "foo": MemoryDataSet(),
        "bar": MemoryDataSet(),
        "parameters": {"param1": 1, "param2": 2},
    })
    node_inputs = {
        v: catalog._data_sets.get(v) for k, v in node_test._inputs.items()
    }

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow_node_hook.before_node_run(
            node=node_test,
            catalog=catalog,
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == {
        "param1": "1",
        "parameters-param1": "1",
        "parameters-param2": "2",
    }

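# A hedged sketch, not the hook's actual implementation: the expected params in the
# test above suggest nested parameter dictionaries are flattened with the configured
# separator before being logged to MLflow. A generic helper of that shape could look
# like this; the name `flatten_dict` and its exact semantics are illustrative only.
def flatten_dict(d, recursive=True, sep="-", parent_key=""):
    items = {}
    for key, value in d.items():
        new_key = f"{parent_key}{sep}{key}" if parent_key else str(key)
        if isinstance(value, dict) and recursive:
            items.update(
                flatten_dict(value, recursive=recursive, sep=sep, parent_key=new_key)
            )
        else:
            items[new_key] = value
    return items

# flatten_dict({"parameters": {"param1": 1, "param2": 2}}, sep="-")
# -> {"parameters-param1": 1, "parameters-param2": 2}
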
def test_exists(self, new_data):
    """Test `exists` method invocation."""
    data_set = MemoryDataSet()
    assert not data_set.exists()

    data_set.save(new_data)
    assert data_set.exists()

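# A small companion sketch, assuming kedro.io.MemoryDataSet: `copy_mode` controls
# whether load() hands back a copy of the stored object (the default for plain
# Python objects is a deep copy) or the original object itself ("assign").
# The sample value below is illustrative only.
from kedro.io import MemoryDataSet

original = {"a": [1, 2, 3]}

deep = MemoryDataSet(original)                         # default: deep copy on load
assert deep.load() == original and deep.load() is not original

shared = MemoryDataSet(original, copy_mode="assign")   # no copy at all
assert shared.load() is original
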
def catalog_with_parameters():
    catalog_with_parameters = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
        "params:penalty": MemoryDataSet(0.1),
        "model": CSVDataSet("fake/path/to/model.csv"),
        "params:threshold": MemoryDataSet(0.5),
    })
    return catalog_with_parameters

def test_memory_data_set_input(self, is_async, fan_out_fan_in):
    pipeline = Pipeline([fan_out_fan_in])
    catalog = DataCatalog({"A": MemoryDataSet("42")})
    result = ParallelRunner(is_async=is_async).run(pipeline, catalog)
    assert "Z" in result
    assert len(result["Z"]) == 3
    assert result["Z"] == ("42", "42", "42")

def extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:
    sub_catalog = DataCatalog()
    for data_set_name in self.inference.inputs():
        if data_set_name == self.input_name:
            # there is no obligation that this dataset is persisted,
            # thus it is allowed to be an empty memory dataset
            data_set = catalog._data_sets.get(data_set_name) or MemoryDataSet()
            sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
        else:
            try:
                data_set = catalog._data_sets[data_set_name]
                if isinstance(data_set, MemoryDataSet):
                    raise KedroMlflowPipelineMLDatasetsError(
                        """
                        The datasets of the training pipeline must be persisted locally
                        to be used by the inference pipeline. You must enforce them as
                        non 'MemoryDataSet' in the 'catalog.yml'.
                        Dataset '{data_set_name}' is not persisted currently.
                        """.format(data_set_name=data_set_name)
                    )
                sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
            except KeyError:
                raise KedroMlflowPipelineMLDatasetsError(
                    """
                    The provided catalog must contain '{data_set_name}' data_set
                    since it is an input for inference pipeline.
                    """.format(data_set_name=data_set_name)
                )
    return sub_catalog

def _init_dataset(self):
    if not getattr(self, "_ready", None):
        self._ready = True
        self.dataset_name = self.dataset_name or self._dataset_name

        _dataset = self.dataset
        if isinstance(self.dataset, str):
            dataset_dict = dataset_dicts.get(
                self.dataset, {"type": "pickle.PickleDataSet"}
            )
            dataset_dict["filepath"] = self.filepath = (
                self.filepath
                or tempfile.gettempdir() + "/" + self.dataset_name + "." + self.dataset
            )
            _dataset = dataset_dict

        if isinstance(_dataset, dict):
            self._dataset = AbstractDataSet.from_config(self._dataset_name, _dataset)
        elif isinstance(_dataset, AbstractDataSet):
            self._dataset = _dataset
        else:
            raise ValueError(
                "The argument type of `dataset` should be either a dict/YAML "
                "representation of the dataset, or the actual dataset object."
            )

        _filepath = getattr(self._dataset, "_filepath", None)
        if _filepath:
            self.filepath = str(_filepath)

        if self.caching and (not self._running_parallel):
            self._cache = MemoryDataSet(copy_mode=self.copy_mode)

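# A hedged sketch of what `_init_dataset` builds when `dataset` is given as a dict:
# the config is resolved into a concrete dataset class via kedro's
# AbstractDataSet.from_config. The dataset name and filepath below are illustrative.
import tempfile
from kedro.io import AbstractDataSet

config = {
    "type": "pickle.PickleDataSet",  # same default type used above
    "filepath": tempfile.gettempdir() + "/my_dataset.pickle",
}
data_set = AbstractDataSet.from_config("my_dataset", config)
data_set.save({"fitted": True})
assert data_set.load() == {"fitted": True}
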
def create_master_table(
    shuttles: pd.DataFrame, companies: pd.DataFrame, reviews: pd.DataFrame
) -> Tuple[pd.DataFrame, MemoryDataSet]:
    """Combines all data to create a master table.

    Args:
        shuttles: Preprocessed data for shuttles.
        companies: Preprocessed data for companies.
        reviews: Source data for reviews.
    Returns:
        The master table and an in-memory 'ready' flag for the downstream ML input.
    """
    rated_shuttles = shuttles.merge(reviews, left_on="id", right_on="shuttle_id")
    with_companies = rated_shuttles.merge(companies, left_on="company_id", right_on="id")
    master_table = with_companies.drop(["shuttle_id", "company_id"], axis=1)
    master_table = master_table.dropna()

    input_ml_data = pd.DataFrame({"state": "ready"}, index=[0])
    input_ml = MemoryDataSet(data=input_ml_data)
    return master_table, input_ml

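# A hedged sketch of how `create_master_table` might be wired into a pipeline,
# assuming the standard kedro `node`/`Pipeline` API; the dataset names and the
# node name below are illustrative only.
from kedro.pipeline import Pipeline, node

master_table_pipeline = Pipeline([
    node(
        create_master_table,
        inputs=["preprocessed_shuttles", "preprocessed_companies", "reviews"],
        outputs=["master_table", "input_ml"],
        name="create_master_table_node",
    ),
])
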
def __init__(
    self,
    propensity_model_filename="../data/06_models/propensity_model.pickle",
    uplift_models_filename="../data/06_models/uplift_models_dict.pickle",
    df_filename="../data/07_model_output/df.csv",
    treated_sim_eval_filename="../data/08_reporting/treated__sim_eval_df.csv",
    untreated_sim_eval_filename="../data/08_reporting/untreated__sim_eval_df.csv",
    estimated_effect_filename="../data/08_reporting/estimated_effect_df.csv",
    args_raw=MemoryDataSet({}).load(),
):
    self.propensity_model = PickleLocalDataSet(
        filepath=propensity_model_filename, version=None
    )
    self.uplift_models_dict = PickleLocalDataSet(
        filepath=uplift_models_filename, version=None
    )
    self.df_03 = CSVLocalDataSet(
        filepath=df_filename,
        load_args=dict(index_col=["partition", "index"], float_precision="high"),
        save_args=dict(index=True, float_format="%.16e"),
        version=None,
    )
    self.treated__sim_eval_df = CSVLocalDataSet(
        filepath=treated_sim_eval_filename, version=None
    )
    self.untreated__sim_eval_df = CSVLocalDataSet(
        filepath=untreated_sim_eval_filename, version=None
    )
    self.estimated_effect_df = CSVLocalDataSet(
        filepath=estimated_effect_filename, version=None
    )
    self.args_raw = args_raw

def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
        "my_metrics": MlflowMetricsDataSet(),
        "another_metrics": MlflowMetricsDataSet(prefix="foo"),
        "my_metric": MlflowMetricDataSet(),
        "another_metric": MlflowMetricDataSet(key="foo"),
        "my_metric_history": MlflowMetricHistoryDataSet(),
        "another_metric_history": MlflowMetricHistoryDataSet(key="bar"),
    })
    return dummy_catalog