def _extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:
    sub_catalog = DataCatalog()
    for data_set_name in self.pipeline.inputs():
        if data_set_name == self.input_name:
            # there is no obligation that this dataset is persisted
            # and even if it is, we keep only an empty memory dataset to avoid
            # extra unnecessary dependencies: this dataset will be replaced at
            # inference time and we do not need to know the original type, see
            # https://github.com/Galileo-Galilei/kedro-mlflow/issues/273
            sub_catalog.add(data_set_name=data_set_name, data_set=MemoryDataSet())
        else:
            try:
                data_set = catalog._data_sets[data_set_name]
                if isinstance(
                    data_set, MemoryDataSet
                ) and not data_set_name.startswith("params:"):
                    raise KedroPipelineModelError(
                        """
                        The datasets of the training pipeline must be persisted locally
                        to be used by the inference pipeline. You must enforce them as
                        non 'MemoryDataSet' in the 'catalog.yml'.
                        Dataset '{data_set_name}' is not persisted currently.
                        """.format(data_set_name=data_set_name)
                    )
                self._logger.info(
                    f"The data_set '{data_set_name}' is added to the Pipeline catalog."
                )
                sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
            except KeyError:
                raise KedroPipelineModelError(
                    f"The provided catalog must contain '{data_set_name}' data_set "
                    "since it is the input of the pipeline."
                )
    return sub_catalog
def extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:
    sub_catalog = DataCatalog()
    for data_set_name in self.inference.inputs():
        if data_set_name == self.input_name:
            # there is no obligation that this dataset is persisted
            # thus it is allowed to be an empty memory dataset
            data_set = catalog._data_sets.get(data_set_name) or MemoryDataSet()
            sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
        else:
            try:
                data_set = catalog._data_sets[data_set_name]
                if isinstance(data_set, MemoryDataSet):
                    raise KedroMlflowPipelineMLDatasetsError(
                        """
                        The datasets of the training pipeline must be persisted locally
                        to be used by the inference pipeline. You must enforce them as
                        non 'MemoryDataSet' in the 'catalog.yml'.
                        Dataset '{data_set_name}' is not persisted currently.
                        """.format(data_set_name=data_set_name)
                    )
                sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
            except KeyError:
                raise KedroMlflowPipelineMLDatasetsError(
                    """
                    The provided catalog must contain '{data_set_name}' data_set
                    since it is an input for the inference pipeline.
                    """.format(data_set_name=data_set_name)
                )
    return sub_catalog
def run(self, pipeline: Pipeline, catalog: DataCatalog) -> Dict[str, Any]:
    """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``
    and save results back to the same objects.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.
    """
    catalog = catalog.shallow_copy()
    unsatisfied = pipeline.inputs() - set(catalog.list())
    if unsatisfied:
        raise ValueError(
            "Pipeline input(s) {} not found in the DataCatalog".format(unsatisfied)
        )

    free_outputs = pipeline.outputs() - set(catalog.list())
    unregistered_ds = pipeline.data_sets() - set(catalog.list())
    for ds_name in unregistered_ds:
        catalog.add(ds_name, self.create_default_data_set(ds_name))

    self._run(pipeline, catalog)

    self._logger.info("Pipeline execution completed successfully.")

    return {ds_name: catalog.load(ds_name) for ds_name in free_outputs}
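# A minimal sketch of exercising run() above through kedro's SequentialRunner;
# the node function and the dataset names "x" and "y" are illustrative only
# and not part of the original snippet.
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner


def double(x):
    return x * 2


# "y" is not registered in the catalog, so it is a free output and
# run() returns it in the result dictionary.
pipeline = Pipeline([node(double, inputs="x", outputs="y")])
catalog = DataCatalog(data_sets={"x": MemoryDataSet(data=21)})
assert SequentialRunner().run(pipeline, catalog) == {"y": 42}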
def test_add_save_and_load(self, data_set, dummy_dataframe):
    """Test adding and then saving and reloading the data set."""
    catalog = DataCatalog(data_sets={})
    catalog.add("test", data_set)
    catalog.save("test", dummy_dataframe)
    reloaded_df = catalog.load("test")
    assert_frame_equal(reloaded_df, dummy_dataframe)
def test_all_before_adding(self, fake_data_set, fake_transformer):
    catalog = DataCatalog()
    catalog.add_transformer(fake_transformer)
    catalog.add("test", fake_data_set)

    catalog.save("test", 42)
    assert catalog.load("test") == 44
    assert fake_data_set.log == [("save", 43), ("load", 43)]
    assert fake_transformer.log == [("save", 42), ("load", 43)]
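# A plausible sketch of the `fake_transformer` fixture used above (an
# assumption; the real fixture is defined elsewhere). It logs the value it
# receives and adds 1 on both save and load, which explains the chain:
# 42 logged by the transformer, 43 stored and logged by the dataset, and
# 44 returned by catalog.load(). Transformers exist only in older kedro
# versions (<=0.17) that this snippet appears to target.
from kedro.io import AbstractTransformer


class FakePlusOneTransformer(AbstractTransformer):
    def __init__(self):
        self.log = []

    def save(self, data_set_name, save, data):
        self.log.append(("save", data))  # logs 42
        save(data + 1)  # dataset receives 43

    def load(self, data_set_name, load):
        data = load()  # dataset returns 43
        self.log.append(("load", data))
        return data + 1  # caller sees 44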
class KedroPipelineModel(PythonModel):
    def __init__(self, pipeline_ml: PipelineML, catalog: DataCatalog):
        self.pipeline_ml = pipeline_ml
        self.initial_catalog = pipeline_ml._extract_pipeline_catalog(catalog)
        self.loaded_catalog = DataCatalog()
        # we have the guarantee that there is only one output in inference
        self.output_name = list(pipeline_ml.inference.outputs())[0]

    def load_context(self, context):
        # a consistency check is made when loading the model;
        # it would be better to check when saving the model, but we rely
        # on an mlflow function for saving, and it is unaware of the
        # kedro pipeline structure
        mlflow_artifacts_keys = set(context.artifacts.keys())
        kedro_artifacts_keys = set(
            self.pipeline_ml.inference.inputs() - {self.pipeline_ml.input_name}
        )
        if mlflow_artifacts_keys != kedro_artifacts_keys:
            in_artifacts_but_not_inference = (
                mlflow_artifacts_keys - kedro_artifacts_keys
            )
            in_inference_but_not_artifacts = (
                kedro_artifacts_keys - mlflow_artifacts_keys
            )
            raise ValueError(
                (
                    "Provided artifacts do not match catalog entries:"
                    f"\n - 'artifacts - inference.inputs()' = {in_artifacts_but_not_inference}"
                    f"\n - 'inference.inputs() - artifacts' = {in_inference_but_not_artifacts}"
                )
            )

        self.loaded_catalog = deepcopy(self.initial_catalog)
        for name, uri in context.artifacts.items():
            self.loaded_catalog._data_sets[name]._filepath = Path(uri)

    def predict(self, context, model_input):
        # TODO: check out how to pass extra args in predict,
        # for instance to enable parallelization
        self.loaded_catalog.add(
            data_set_name=self.pipeline_ml.input_name,
            data_set=MemoryDataSet(model_input),
            replace=True,
        )
        runner = SequentialRunner()
        run_outputs = runner.run(
            pipeline=self.pipeline_ml.inference, catalog=self.loaded_catalog
        )
        # unpack the result to avoid messing up the json output
        return run_outputs[self.output_name]
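# A hedged sketch of logging the model above with mlflow. `pipeline_ml` and
# `training_catalog` are assumed to exist; the artifact URIs are derived from
# the file-based datasets of the extracted catalog. This mirrors what a
# kedro-mlflow-style integration does, but is not a definitive implementation.
from pathlib import Path

import mlflow

kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml, catalog=training_catalog)
artifacts = {
    name: Path(data_set._filepath).resolve().as_uri()
    for name, data_set in kedro_model.initial_catalog._data_sets.items()
    if hasattr(data_set, "_filepath")  # skip in-memory entries such as the input
}
with mlflow.start_run():
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=kedro_model,
        artifacts=artifacts,
    )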
def before_pipeline_run(
    self, run_params: Dict, pipeline: Pipeline, catalog: DataCatalog
):
    if not self._enabled:
        return
    logger.info("KedroWings is Enabled")
    all_dataset_names = {
        ds
        for node in pipeline.nodes
        for ds in list(node.inputs) + list(node.outputs)
    }
    catalog_entries = self._create_catalog_entries(all_dataset_names)
    existing_catalog_names = set(catalog.list())
    for catalog_name, catalog_dataset in catalog_entries.items():
        if catalog_name in existing_catalog_names:
            continue
        catalog.add(catalog_name, catalog_dataset)
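# A hedged sketch of manually registering a hook like the one above in a
# kedro 0.17-style settings.py. The import path is hypothetical, and a plugin
# such as KedroWings may also auto-register itself via entry points.
from kedro_wings import KedroWings  # hypothetical import path

HOOKS = (KedroWings(),)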
def _extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:
    # check that the pipeline is consistent in case its attributes have been
    # modified manually
    self._check_consistency()

    sub_catalog = DataCatalog()
    for data_set_name in self.inference.inputs():
        if data_set_name == self.input_name:
            # there is no obligation that this dataset is persisted
            # thus it is allowed to be an empty memory dataset
            data_set = catalog._data_sets.get(data_set_name) or MemoryDataSet()
            sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
        else:
            try:
                data_set = catalog._data_sets[data_set_name]
                if isinstance(
                    data_set, MemoryDataSet
                ) and not data_set_name.startswith("params:"):
                    raise KedroMlflowPipelineMLDatasetsError(
                        """
                        The datasets of the training pipeline must be persisted locally
                        to be used by the inference pipeline. You must enforce them as
                        non 'MemoryDataSet' in the 'catalog.yml'.
                        Dataset '{data_set_name}' is not persisted currently.
                        """.format(data_set_name=data_set_name)
                    )
                self._logger.info(
                    f"The data_set '{data_set_name}' is added to the PipelineML catalog."
                )
                sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
            except KeyError:
                raise KedroMlflowPipelineMLDatasetsError(
                    """
                    The provided catalog must contain '{data_set_name}' data_set
                    since it is an input for the inference pipeline.
                    """.format(data_set_name=data_set_name)
                )
    return sub_catalog
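# A sketch of what the guard above enforces, with illustrative names only:
# assume a PipelineML instance `pipeline_ml` whose input_name is "data" and
# whose inference pipeline also consumes "model". The input may stay in
# memory, but "model" must be persisted, so extraction fails on this catalog.
from kedro.io import DataCatalog, MemoryDataSet

catalog = DataCatalog(
    data_sets={
        "data": MemoryDataSet(),   # the input_name: allowed in memory
        "model": MemoryDataSet(),  # other inference input: must be persisted
    }
)
try:
    pipeline_ml._extract_pipeline_catalog(catalog)
except KedroMlflowPipelineMLDatasetsError as err:
    print(err)  # asks to declare 'model' as a non-MemoryDataSet in catalog.yml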
def sample_data_catalog_train(sample_data: pd.DataFrame) -> DataCatalog:
    """Generate data catalog for end to end feature engineering pipeline test.

    Args:
        sample_data (pd.DataFrame): Some sample training data.

    Returns:
        DataCatalog: Data catalog with sample training data.
    """
    catalog = DataCatalog()
    catalog.add("iris", MemoryDataSet(data=sample_data))
    catalog.add("params:target", MemoryDataSet(data="species"))
    catalog.add("params:test_fraction", MemoryDataSet(data=0.25))
    catalog.add("params:seed", MemoryDataSet(data=42))
    return catalog
def _make_catalog(
    existent=None, non_existent=None, no_exists_method=None, feed_dict=None
):
    """Creates a catalog of existent and non-existent DataSets."""
    existent = [] if existent is None else existent
    non_existent = [] if non_existent is None else non_existent
    no_exists_method = [] if no_exists_method is None else no_exists_method

    catalog = DataCatalog(feed_dict=feed_dict)
    for source in existent:
        catalog.add(source, LambdaDataSet(None, None, lambda: True))
    for source in non_existent:
        catalog.add(source, LambdaDataSet(None, None, lambda: False))
    # Some LambdaDataSets do not have an exists() method
    for source in no_exists_method:
        catalog.add(source, LambdaDataSet(None, None))
    return catalog
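# Example use of the helper above: DataCatalog.exists() delegates to the
# LambdaDataSet's third argument, so these assertions hold by construction.
# The dataset names are illustrative only.
catalog = _make_catalog(existent=["cars"], non_existent=["planes"])
assert catalog.exists("cars")
assert not catalog.exists("planes")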