def _extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:
    sub_catalog = DataCatalog()
    for data_set_name in self.pipeline.inputs():
        if data_set_name == self.input_name:
            # there is no obligation that this dataset is persisted
            # and even if it is, we keep only an empty memory dataset to avoid
            # extra unnecessary dependencies: this dataset will be replaced at
            # inference time and we do not need to know the original type, see
            # https://github.com/Galileo-Galilei/kedro-mlflow/issues/273
            sub_catalog.add(data_set_name=data_set_name, data_set=MemoryDataSet())
        else:
            try:
                data_set = catalog._data_sets[data_set_name]
                if isinstance(
                    data_set, MemoryDataSet
                ) and not data_set_name.startswith("params:"):
                    raise KedroPipelineModelError(
                        """
                        The datasets of the training pipeline must be persisted locally
                        to be used by the inference pipeline. You must enforce them as
                        non 'MemoryDataSet' in the 'catalog.yml'.
                        Dataset '{data_set_name}' is not persisted currently.
                        """.format(data_set_name=data_set_name)
                    )
                self._logger.info(
                    f"The data_set '{data_set_name}' is added to the Pipeline catalog."
                )
                sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
            except KeyError:
                raise KedroPipelineModelError(
                    f"The provided catalog must contain '{data_set_name}' data_set "
                    "since it is the input of the pipeline."
                )
    return sub_catalog
def extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:
    sub_catalog = DataCatalog()
    for data_set_name in self.inference.inputs():
        if data_set_name == self.input_name:
            # there is no obligation that this dataset is persisted
            # thus it is allowed to be an empty memory dataset
            data_set = catalog._data_sets.get(data_set_name) or MemoryDataSet()
            sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
        else:
            try:
                data_set = catalog._data_sets[data_set_name]
                if isinstance(data_set, MemoryDataSet):
                    raise KedroMlflowPipelineMLDatasetsError(
                        """
                        The datasets of the training pipeline must be persisted locally
                        to be used by the inference pipeline. You must enforce them as
                        non 'MemoryDataSet' in the 'catalog.yml'.
                        Dataset '{data_set_name}' is not persisted currently.
                        """.format(data_set_name=data_set_name)
                    )
                sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
            except KeyError:
                raise KedroMlflowPipelineMLDatasetsError(
                    """
                    The provided catalog must contain '{data_set_name}' data_set
                    since it is an input for the inference pipeline.
                    """.format(data_set_name=data_set_name)
                )
    return sub_catalog
def run(self, pipeline: Pipeline, catalog: DataCatalog) -> Dict[str, Any]:
    """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``
    and save results back to the same objects.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.
    """
    catalog = catalog.shallow_copy()
    unsatisfied = pipeline.inputs() - set(catalog.list())
    if unsatisfied:
        raise ValueError(
            "Pipeline input(s) {} not found in the DataCatalog".format(unsatisfied)
        )

    free_outputs = pipeline.outputs() - set(catalog.list())
    unregistered_ds = pipeline.data_sets() - set(catalog.list())
    for ds_name in unregistered_ds:
        catalog.add(ds_name, self.create_default_data_set(ds_name))

    self._run(pipeline, catalog)

    self._logger.info("Pipeline execution completed successfully.")

    return {ds_name: catalog.load(ds_name) for ds_name in free_outputs}
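# A minimal sketch of exercising run() above through kedro's SequentialRunner;
# the node function and the dataset names "x" and "y" are illustrative only
# and not part of the original snippet.
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner


def double(x):
    return x * 2


# "y" is not registered in the catalog, so it is a free output and
# run() returns it in the result dictionary.
pipeline = Pipeline([node(double, inputs="x", outputs="y")])
catalog = DataCatalog(data_sets={"x": MemoryDataSet(data=21)})
assert SequentialRunner().run(pipeline, catalog) == {"y": 42}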
def test_add_save_and_load(self, data_set, dummy_dataframe):
    """Test adding and then saving and reloading the data set."""
    catalog = DataCatalog(data_sets={})
    catalog.add("test", data_set)
    catalog.save("test", dummy_dataframe)
    reloaded_df = catalog.load("test")
    assert_frame_equal(reloaded_df, dummy_dataframe)
def test_all_before_adding(self, fake_data_set, fake_transformer):
    catalog = DataCatalog()
    catalog.add_transformer(fake_transformer)
    catalog.add("test", fake_data_set)

    catalog.save("test", 42)
    assert catalog.load("test") == 44
    assert fake_data_set.log == [("save", 43), ("load", 43)]
    assert fake_transformer.log == [("save", 42), ("load", 43)]
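# A plausible sketch of the `fake_transformer` fixture used above (an
# assumption; the real fixture is defined elsewhere). It logs the value it
# receives and adds 1 on both save and load, which explains the chain:
# 42 logged by the transformer, 43 stored and logged by the dataset, and
# 44 returned by catalog.load(). Transformers exist only in older kedro
# versions (<=0.17) that this snippet appears to target.
from kedro.io import AbstractTransformer


class FakePlusOneTransformer(AbstractTransformer):
    def __init__(self):
        self.log = []

    def save(self, data_set_name, save, data):
        self.log.append(("save", data))  # logs 42
        save(data + 1)  # dataset receives 43

    def load(self, data_set_name, load):
        data = load()  # dataset returns 43
        self.log.append(("load", data))
        return data + 1  # caller sees 44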
class KedroPipelineModel(PythonModel):
    def __init__(self, pipeline_ml: PipelineML, catalog: DataCatalog):
        self.pipeline_ml = pipeline_ml
        self.initial_catalog = pipeline_ml._extract_pipeline_catalog(catalog)
        self.loaded_catalog = DataCatalog()
        # we have the guarantee that there is only one output in inference
        self.output_name = list(pipeline_ml.inference.outputs())[0]

    def load_context(self, context):
        # a consistency check is made when loading the model;
        # it would be better to check when saving the model, but we rely
        # on an mlflow function for saving, and it is unaware of the
        # kedro pipeline structure
        mlflow_artifacts_keys = set(context.artifacts.keys())
        kedro_artifacts_keys = set(
            self.pipeline_ml.inference.inputs() - {self.pipeline_ml.input_name}
        )
        if mlflow_artifacts_keys != kedro_artifacts_keys:
            in_artifacts_but_not_inference = (
                mlflow_artifacts_keys - kedro_artifacts_keys
            )
            in_inference_but_not_artifacts = (
                kedro_artifacts_keys - mlflow_artifacts_keys
            )
            raise ValueError(
                (
                    "Provided artifacts do not match catalog entries:"
                    f"\n - 'artifacts - inference.inputs()' = {in_artifacts_but_not_inference}"
                    f"\n - 'inference.inputs() - artifacts' = {in_inference_but_not_artifacts}"
                )
            )

        self.loaded_catalog = deepcopy(self.initial_catalog)
        for name, uri in context.artifacts.items():
            self.loaded_catalog._data_sets[name]._filepath = Path(uri)

    def predict(self, context, model_input):
        # TODO: check out how to pass extra args in predict,
        # for instance to enable parallelization
        self.loaded_catalog.add(
            data_set_name=self.pipeline_ml.input_name,
            data_set=MemoryDataSet(model_input),
            replace=True,
        )
        runner = SequentialRunner()
        run_outputs = runner.run(
            pipeline=self.pipeline_ml.inference, catalog=self.loaded_catalog
        )
        # unpack the result to avoid messing up the json output
        return run_outputs[self.output_name]
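# A hedged sketch of logging the model above with mlflow. `pipeline_ml` and
# `training_catalog` are assumed to exist; the artifact URIs are derived from
# the file-based datasets of the extracted catalog. This mirrors what a
# kedro-mlflow-style integration does, but is not a definitive implementation.
from pathlib import Path

import mlflow

kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml, catalog=training_catalog)
artifacts = {
    name: Path(data_set._filepath).resolve().as_uri()
    for name, data_set in kedro_model.initial_catalog._data_sets.items()
    if hasattr(data_set, "_filepath")  # skip in-memory entries such as the input
}
with mlflow.start_run():
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=kedro_model,
        artifacts=artifacts,
    )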
def before_pipeline_run(
    self, run_params: Dict, pipeline: Pipeline, catalog: DataCatalog
):
    if not self._enabled:
        return
    logger.info("KedroWings is Enabled")
    all_dataset_names = {
        ds
        for node in pipeline.nodes
        for ds in list(node.inputs) + list(node.outputs)
    }
    catalog_entries = self._create_catalog_entries(all_dataset_names)
    existing_catalog_names = set(catalog.list())
    for catalog_name, catalog_dataset in catalog_entries.items():
        if catalog_name in existing_catalog_names:
            continue
        catalog.add(catalog_name, catalog_dataset)
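# A hedged sketch of manually registering a hook like the one above in a
# kedro 0.17-style settings.py. The import path is hypothetical, and a plugin
# such as KedroWings may also auto-register itself via entry points.
from kedro_wings import KedroWings  # hypothetical import path

HOOKS = (KedroWings(),)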
def _extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:
    # check that the pipeline is consistent in case its attributes have been
    # modified manually
    self._check_consistency()

    sub_catalog = DataCatalog()
    for data_set_name in self.inference.inputs():
        if data_set_name == self.input_name:
            # there is no obligation that this dataset is persisted
            # thus it is allowed to be an empty memory dataset
            data_set = catalog._data_sets.get(data_set_name) or MemoryDataSet()
            sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
        else:
            try:
                data_set = catalog._data_sets[data_set_name]
                if isinstance(
                    data_set, MemoryDataSet
                ) and not data_set_name.startswith("params:"):
                    raise KedroMlflowPipelineMLDatasetsError(
                        """
                        The datasets of the training pipeline must be persisted locally
                        to be used by the inference pipeline. You must enforce them as
                        non 'MemoryDataSet' in the 'catalog.yml'.
                        Dataset '{data_set_name}' is not persisted currently.
                        """.format(data_set_name=data_set_name)
                    )
                self._logger.info(
                    f"The data_set '{data_set_name}' is added to the PipelineML catalog."
                )
                sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
            except KeyError:
                raise KedroMlflowPipelineMLDatasetsError(
                    """
                    The provided catalog must contain '{data_set_name}' data_set
                    since it is an input for the inference pipeline.
                    """.format(data_set_name=data_set_name)
                )
    return sub_catalog
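# A sketch of what the guard above enforces, with illustrative names only:
# assume a PipelineML instance `pipeline_ml` whose input_name is "data" and
# whose inference pipeline also consumes "model". The input may stay in
# memory, but "model" must be persisted, so extraction fails on this catalog.
from kedro.io import DataCatalog, MemoryDataSet

catalog = DataCatalog(
    data_sets={
        "data": MemoryDataSet(),   # the input_name: allowed in memory
        "model": MemoryDataSet(),  # other inference input: must be persisted
    }
)
try:
    pipeline_ml._extract_pipeline_catalog(catalog)
except KedroMlflowPipelineMLDatasetsError as err:
    print(err)  # asks to declare 'model' as a non-MemoryDataSet in catalog.yml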
def sample_data_catalog_train(sample_data: pd.DataFrame) -> DataCatalog:
    """Generate data catalog for end to end feature engineering pipeline test.

    Args:
        sample_data (pd.DataFrame): Some sample training data.

    Returns:
        DataCatalog: Data catalog with sample training data.
    """
    catalog = DataCatalog()
    catalog.add("iris", MemoryDataSet(data=sample_data))
    catalog.add("params:target", MemoryDataSet(data="species"))
    catalog.add("params:test_fraction", MemoryDataSet(data=0.25))
    catalog.add("params:seed", MemoryDataSet(data=42))
    return catalog
def _make_catalog(
    existent=None, non_existent=None, no_exists_method=None, feed_dict=None
):
    """Creates a catalog of existent and non-existent DataSets."""
    existent = [] if existent is None else existent
    non_existent = [] if non_existent is None else non_existent
    no_exists_method = [] if no_exists_method is None else no_exists_method

    catalog = DataCatalog(feed_dict=feed_dict)
    for source in existent:
        catalog.add(source, LambdaDataSet(None, None, lambda: True))
    for source in non_existent:
        catalog.add(source, LambdaDataSet(None, None, lambda: False))
    # Some LambdaDataSets do not have an exists() method
    for source in no_exists_method:
        catalog.add(source, LambdaDataSet(None, None))
    return catalog
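# Example use of the helper above: DataCatalog.exists() delegates to the
# LambdaDataSet's third argument, so these assertions hold by construction.
# The dataset names are illustrative only.
catalog = _make_catalog(existent=["cars"], non_existent=["planes"])
assert catalog.exists("cars")
assert not catalog.exists("planes")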