Example #1
    def _from_config(config, version):
        # Versioning must be declared on the ``CachedDataSet`` wrapper itself,
        # so reject configs that try to version the wrapped dataset directly.
        if VERSIONED_FLAG_KEY in config:
            raise ValueError(
                "Cached datasets should specify that they are versioned in the "
                "`CachedDataSet`, not in the wrapped dataset.")
        if version:
            # Propagate the wrapper's version into the wrapped dataset config.
            config[VERSIONED_FLAG_KEY] = True
            return AbstractDataSet.from_config("_cached", config, version.load,
                                               version.save)
        return AbstractDataSet.from_config("_cached", config)
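A minimal usage sketch of the rule enforced above, assuming a kedro version where ``CachedDataSet`` and ``Version`` are importable from ``kedro.io``: versioning is requested on the wrapper, never inside the wrapped dataset's config.

from kedro.io import CachedDataSet, Version

# OK: the wrapped config carries no `versioned` flag; versioning is
# requested through the wrapper's `version` argument instead.
cached = CachedDataSet(
    dataset={"type": "pandas.CSVDataSet", "filepath": "cars.csv"},
    version=Version(load=None, save=None),  # latest load, generated save version
)

# Raises ValueError: the wrapped config must not set `versioned` itself.
broken = CachedDataSet(
    dataset={"type": "pandas.CSVDataSet", "filepath": "cars.csv",
             "versioned": True},
)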
Example #2
    def _init_dataset(self):
        # Lazy, idempotent initialisation: the ``_ready`` guard ensures the
        # wrapped dataset is resolved only once per instance.
        if not getattr(self, "_ready", None):
            self._ready = True
            self.dataset_name = self.dataset_name or self._dataset_name
            _dataset = self.dataset

            # A string is shorthand for a known dataset type; unknown names
            # fall back to ``pickle.PickleDataSet``, and a filepath is derived
            # in the temp directory unless one was given explicitly.
            if isinstance(self.dataset, str):
                dataset_dict = dataset_dicts.get(
                    self.dataset, {"type": "pickle.PickleDataSet"})
                dataset_dict["filepath"] = self.filepath = (
                    self.filepath or tempfile.gettempdir() + "/" +
                    self.dataset_name + "." + self.dataset)
                _dataset = dataset_dict

            # A dict is treated as a YAML-style config; a ready-made
            # ``AbstractDataSet`` instance is used as-is.
            if isinstance(_dataset, dict):
                self._dataset = AbstractDataSet.from_config(
                    self._dataset_name, _dataset)
            elif isinstance(_dataset, AbstractDataSet):
                self._dataset = _dataset
            else:
                raise ValueError(
                    "The argument type of `dataset` should be either a dict/YAML "
                    "representation of the dataset, or the actual dataset object."
                )

            # Mirror the wrapped dataset's filepath, if it exposes one.
            _filepath = getattr(self._dataset, "_filepath", None)
            if _filepath:
                self.filepath = str(_filepath)

            # In-memory caching is only enabled outside parallel runs.
            if self.caching and (not self._running_parallel):
                self._cache = MemoryDataSet(copy_mode=self.copy_mode)
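The string-shorthand branch above derives both the dataset type and its filepath. A small runnable sketch of that expansion (the contents of the ``dataset_dicts`` registry here are hypothetical; the real mapping is defined elsewhere in the source module):

import tempfile

# Hypothetical shorthand registry standing in for the module's `dataset_dicts`:
dataset_dicts = {"csv": {"type": "pandas.CSVDataSet"}}

dataset_name, dataset = "my_data", "parquet"

# Unknown shorthands fall back to pickle, and the filepath is derived in the
# temp directory, mirroring `_init_dataset` above:
dataset_dict = dataset_dicts.get(dataset, {"type": "pickle.PickleDataSet"})
dataset_dict["filepath"] = tempfile.gettempdir() + "/" + dataset_name + "." + dataset
print(dataset_dict)
# e.g. {'type': 'pickle.PickleDataSet', 'filepath': '/tmp/my_data.parquet'} on Linux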
Example #3
    def from_config(
        cls: Type,
        catalog: Optional[Dict[str, Dict[str, Any]]],
        credentials: Dict[str, Dict[str, Any]] = None,
        load_versions: Dict[str, str] = None,
        save_version: str = None,
        journal: Journal = None,
    ) -> "DataCatalog":
        """Create a ``DataCatalog`` instance from configuration. This is a
        factory method used to provide developers with a way to instantiate
        ``DataCatalog`` with configuration parsed from configuration files.

        Args:
            catalog: A dictionary whose keys are the data set names and
                the values are dictionaries with the constructor arguments
                for classes implementing ``AbstractDataSet``. The data set
                class to be loaded is specified with the key ``type`` and its
                fully qualified class name. All ``kedro.io`` data sets can be
                specified by their class name only, i.e. their module name
                can be omitted.
            credentials: A dictionary containing credentials for different
                data sets. Use the ``credentials`` key in an ``AbstractDataSet``
                to refer to the appropriate credentials as shown in the example
                below.
            load_versions: A mapping between dataset names and versions
                to load. Has no effect on data sets without versioning enabled.
            save_version: Version string to be used for ``save`` operations
                by all data sets with enabled versioning. It must: a) be a
                case-insensitive string that conforms with operating system
                filename limitations, b) always return the latest version when
                sorted in lexicographical order.
            journal: Instance of Journal.

        Returns:
            An instantiated ``DataCatalog`` containing all specified
            data sets, created and ready to use.

        Raises:
            DataSetError: When the method fails to create any of the data
                sets from their config.
            DataSetNotFoundError: When `load_versions` refers to a dataset that doesn't
                exist in the catalog.

        Example:
        ::

            >>> config = {
            >>>     "cars": {
            >>>         "type": "pandas.CSVDataSet",
            >>>         "filepath": "cars.csv",
            >>>         "save_args": {
            >>>             "index": False
            >>>         }
            >>>     },
            >>>     "boats": {
            >>>         "type": "pandas.CSVDataSet",
            >>>         "filepath": "s3://aws-bucket-name/boats.csv",
            >>>         "credentials": "boats_credentials"
            >>>         "save_args": {
            >>>             "index": False
            >>>         }
            >>>     }
            >>> }
            >>>
            >>> credentials = {
            >>>     "boats_credentials": {
            >>>         "client_kwargs": {
            >>>             "aws_access_key_id": "<your key id>",
            >>>             "aws_secret_access_key": "<your secret>"
            >>>         }
            >>>      }
            >>> }
            >>>
            >>> catalog = DataCatalog.from_config(config, credentials)
            >>>
            >>> df = catalog.load("cars")
            >>> catalog.save("boats", df)
        """
        data_sets = {}
        catalog = copy.deepcopy(catalog) or {}
        credentials = copy.deepcopy(credentials) or {}
        run_id = journal.run_id if journal else None
        save_version = save_version or run_id or generate_timestamp()
        load_versions = copy.deepcopy(load_versions) or {}

        missing_keys = load_versions.keys() - catalog.keys()
        if missing_keys:
            raise DataSetNotFoundError(
                f"`load_versions` keys [{', '.join(sorted(missing_keys))}] "
                f"are not found in the catalog.")

        layers = defaultdict(set)  # type: Dict[str, Set[str]]
        for ds_name, ds_config in catalog.items():
            # ``layer`` is catalog metadata, not a constructor argument, so it
            # is popped before the dataset config is resolved.
            ds_layer = ds_config.pop("layer", None)
            if ds_layer is not None:
                layers[ds_layer].add(ds_name)

            ds_config = _resolve_credentials(ds_config, credentials)
            data_sets[ds_name] = AbstractDataSet.from_config(
                ds_name, ds_config, load_versions.get(ds_name), save_version)

        dataset_layers = layers or None
        return cls(data_sets=data_sets, journal=journal, layers=dataset_layers)
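A short sketch of the ``load_versions`` validation above, assuming ``DataCatalog`` and ``DataSetNotFoundError`` are importable from ``kedro.io``: a version key that names a dataset absent from the catalog fails fast.

from kedro.io import DataCatalog, DataSetNotFoundError

try:
    DataCatalog.from_config(
        catalog={"cars": {"type": "pandas.CSVDataSet", "filepath": "cars.csv"}},
        load_versions={"boats": "2021-01-01T00.00.00.000Z"},  # "boats" is not in the catalog
    )
except DataSetNotFoundError as err:
    print(err)  # `load_versions` keys [boats] are not found in the catalog.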
Example #4
    def from_config(
        cls: Type,
        catalog: Optional[Dict[str, Dict[str, Any]]],
        credentials: Dict[str, Dict[str, Any]] = None,
        load_versions: Dict[str, str] = None,
        save_version: str = None,
    ) -> "DataCatalog":
        """Create a ``DataCatalog`` instance from configuration. This is a
        factory method used to provide developers with a way to instantiate
        ``DataCatalog`` with configuration parsed from configuration files.

        Args:
            catalog: A dictionary whose keys are the data set names and
                the values are dictionaries with the constructor arguments
                for classes implementing ``AbstractDataSet``. The data set
                class to be loaded is specified with the key ``type`` and its
                fully qualified class name. All ``kedro.io`` data sets can be
                specified by their class name only, i.e. their module name
                can be omitted.
            credentials: A dictionary containing credentials for different
                data sets. Use the ``credentials`` key in an ``AbstractDataSet``
                to refer to the appropriate credentials as shown in the example
                below.
            load_versions: A mapping between dataset names and versions
                to load. Has no effect on data sets without versioning enabled.
            save_version: Version string to be used for ``save`` operations
                by all data sets with enabled versioning. It must: a) be a
                case-insensitive string that conforms with operating system
                filename limitations, b) always return the latest version when
                sorted in lexicographical order.

        Returns:
            An instantiated ``DataCatalog`` containing all specified
            data sets, created and ready to use.

        Raises:
            DataSetError: When the method fails to create any of the data
                sets from their config.

        Example:
        ::

            >>> config = {
            >>>     "cars": {
            >>>         "type": "CSVLocalDataSet",
            >>>         "filepath": "cars.csv",
            >>>         "save_args": {
            >>>             "index": False
            >>>         }
            >>>     },
            >>>     "boats": {
            >>>         "type": "CSVS3DataSet",
            >>>         "filepath": "boats.csv",
            >>>         "bucket_name": "mck-147789798-bucket",
            >>>         "credentials": "boats_credentials"
            >>>         "save_args": {
            >>>             "index": False
            >>>         }
            >>>     }
            >>> }
            >>>
            >>> credentials = {
            >>>     "boats_credentials": {
            >>>         "aws_access_key_id": "<your key id>",
            >>>         "aws_secret_access_key": "<your secret>"
            >>>      }
            >>> }
            >>>
            >>> catalog = DataCatalog.from_config(config, credentials)
            >>>
            >>> df = catalog.load("cars")
            >>> catalog.save("boats", df)
        """
        data_sets = {}
        catalog = copy.deepcopy(catalog) or {}
        credentials = copy.deepcopy(credentials) or {}
        save_version = save_version or generate_current_version()
        load_versions = copy.deepcopy(load_versions) or {}

        for ds_name, ds_config in catalog.items():
            if "type" not in ds_config:
                raise DataSetError("`type` is missing from DataSet '{}' "
                                   "catalog configuration".format(ds_name))
            if CREDENTIALS_KEY in ds_config:
                ds_config[CREDENTIALS_KEY] = _get_credentials(
                    ds_config.pop(CREDENTIALS_KEY),  # credentials name
                    credentials,  # top-level credentials mapping
                )
            data_sets[ds_name] = AbstractDataSet.from_config(
                ds_name, ds_config, load_versions.get(ds_name), save_version)
        return cls(data_sets=data_sets)
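A minimal sketch of what the ``_get_credentials`` lookup above amounts to (error handling omitted; the helper itself is internal to the module): the credentials name stored under the dataset's ``credentials`` key is replaced by the matching entry from the top-level credentials mapping.

ds_config = {"type": "CSVS3DataSet", "credentials": "boats_credentials"}
credentials = {"boats_credentials": {"aws_access_key_id": "<your key id>",
                                     "aws_secret_access_key": "<your secret>"}}

# Equivalent of the lookup performed by `_get_credentials`, minus error handling:
ds_config["credentials"] = credentials[ds_config["credentials"]]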