def test_load_exact(self, mocker):
    ts = generate_timestamp()
    versioned_hdfs = SparkDataSet(
        filepath="hdfs://{}".format(HDFS_PREFIX), version=Version(ts, None)
    )
    get_spark = mocker.patch.object(versioned_hdfs, "_get_spark")

    versioned_hdfs.load()

    get_spark.return_value.read.load.assert_called_once_with(
        "hdfs://{fn}/{f}/{v}/{f}".format(fn=FOLDER_NAME, f=FILENAME, v=ts),
        "parquet",
    )
def test_run_load_versions(self, tmp_path, dummy_context, dummy_dataframe, mocker):
    class DummyContext(KedroContext):
        project_name = "bob"
        package_name = "bob"
        project_version = kedro_version

        def _get_pipelines(self) -> Dict[str, Pipeline]:
            return {
                "__default__": Pipeline([node(identity, "cars", "boats")])
            }

    mocker.patch("logging.config.dictConfig")
    dummy_context = DummyContext(str(tmp_path))
    filepath = (dummy_context.project_path / "cars.csv").as_posix()

    old_save_version = generate_timestamp()
    old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
    old_csv_data_set = CSVDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, old_save_version),
    )
    old_csv_data_set.save(old_df)

    sleep(0.5)
    new_save_version = generate_timestamp()
    new_csv_data_set = CSVDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, new_save_version),
    )
    new_csv_data_set.save(dummy_dataframe)

    load_versions = {"cars": old_save_version}
    dummy_context.run(load_versions=load_versions)

    assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
    assert dummy_context.catalog.load("boats").equals(old_df)
def test_load_exact(self, mocker):
    ts = generate_timestamp()
    ds_s3 = SparkDataSet(
        filepath="s3a://{}/{}".format(BUCKET_NAME, FILENAME),
        version=Version(ts, None),
        credentials=AWS_CREDENTIALS,
    )
    get_spark = mocker.patch.object(ds_s3, "_get_spark")

    ds_s3.load()

    get_spark.return_value.read.load.assert_called_once_with(
        "s3a://{b}/{f}/{v}/{f}".format(b=BUCKET_NAME, f=FILENAME, v=ts), "parquet"
    )
def test_from_sane_config_versioned(self, sane_config, dummy_dataframe):
    """Test load and save of versioned data sets from config"""
    sane_config["catalog"]["boats"]["versioned"] = True
    version = generate_timestamp()
    journal = Journal({"run_id": "fake-id", "project_path": "fake-path"})
    catalog = DataCatalog.from_config(
        **sane_config,
        load_versions={"boats": version},
        save_version=version,
        journal=journal,
    )
    assert catalog._journal == journal  # pylint: disable=protected-access

    catalog.save("boats", dummy_dataframe)
    path = Path(sane_config["catalog"]["boats"]["filepath"])
    path = path / version / path.name
    assert path.is_file()

    reloaded_df = catalog.load("boats")
    assert_frame_equal(reloaded_df, dummy_dataframe)
def test_multiple_loads(self, versioned_image_dataset, image_object, filepath_png):
    """Test that if a new version is created mid-run, by an external system,
    it won't be loaded in the current run."""
    versioned_image_dataset.save(image_object)
    v1 = versioned_image_dataset.resolve_load_version()

    # force-drop a newer version into the same location
    v_new = generate_timestamp()
    ImageDataSet(filepath=filepath_png, version=Version(v_new, v_new)).save(
        image_object
    )

    v2 = versioned_image_dataset.resolve_load_version()
    assert v2 == v1  # v2 should not be v_new!

    ds_new = ImageDataSet(filepath=filepath_png, version=Version(None, None))
    assert (
        ds_new.resolve_load_version() == v_new
    )  # new version is discoverable by a new instance
def test_multiple_loads(self, versioned_csv_data_set, dummy_dataframe, filepath_csv):
    """Test that if a new version is created mid-run, by an external system,
    it won't be loaded in the current run."""
    versioned_csv_data_set.save(dummy_dataframe)
    versioned_csv_data_set.load()
    v1 = versioned_csv_data_set.resolve_load_version()

    # force-drop a newer version into the same location
    v_new = generate_timestamp()
    CSVDataSet(filepath=filepath_csv, version=Version(v_new, v_new)).save(
        dummy_dataframe
    )

    versioned_csv_data_set.load()
    v2 = versioned_csv_data_set.resolve_load_version()
    assert v2 == v1  # v2 should not be v_new!

    ds_new = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
    assert (
        ds_new.resolve_load_version() == v_new
    )  # new version is discoverable by a new instance
def create(
    cls,
    project_path: Union[Path, str] = None,
    save_on_close: bool = True,
    env: str = None,
) -> "KedroSession":
    """Create a new instance of ``KedroSession``.

    Args:
        project_path: Path to the project root directory.
        save_on_close: Whether or not to save the session when it's closed.
        env: Environment for the KedroContext.

    Returns:
        A new ``KedroSession`` instance.
    """
    # pylint: disable=protected-access
    session = cls(
        project_path=project_path,
        session_id=generate_timestamp(),
        save_on_close=save_on_close,
    )

    session_data = get_static_project_data(session._project_path)
    session_data["project_path"] = session._project_path
    session_data["session_id"] = session.session_id
    session_data.update(_describe_git(session._project_path))

    ctx = click.get_current_context(silent=True)
    if ctx:
        session_data["cli"] = _jsonify_cli_context(ctx)

    if env:
        session_data["env"] = env

    session._store.update(session_data)
    return session
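# Illustrative usage sketch, not part of the original source: every session
# created through this classmethod receives a generate_timestamp() value as
# its session_id, which ends up in the session store. The import path and the
# project path below are assumptions about the surrounding Kedro release.
from kedro.framework.session import KedroSession

session = KedroSession.create(project_path="path/to/project", env="local")
print(session.session_id)  # e.g. "2021-01-01T12.00.00.000Z"
session.close()            # persists the store when save_on_close=True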
def from_config(
    cls: Type,
    catalog: Optional[Dict[str, Dict[str, Any]]],
    credentials: Dict[str, Dict[str, Any]] = None,
    load_versions: Dict[str, str] = None,
    save_version: str = None,
    journal: Journal = None,
) -> "DataCatalog":
    """Create a ``DataCatalog`` instance from configuration. This is a
    factory method used to provide developers with a way to instantiate
    ``DataCatalog`` with configuration parsed from configuration files.

    Args:
        catalog: A dictionary whose keys are the data set names and
            the values are dictionaries with the constructor arguments
            for classes implementing ``AbstractDataSet``. The data set
            class to be loaded is specified with the key ``type`` and its
            fully qualified class name. All ``kedro.io`` data sets can be
            specified by their class name only, i.e. their module name
            can be omitted.
        credentials: A dictionary containing credentials for different
            data sets. Use the ``credentials`` key in an ``AbstractDataSet``
            to refer to the appropriate credentials as shown in the example
            below.
        load_versions: A mapping between dataset names and versions
            to load. Has no effect on data sets without enabled versioning.
        save_version: Version string to be used for ``save`` operations
            by all data sets with enabled versioning. It must: a) be a
            case-insensitive string that conforms with operating system
            filename limitations, b) always return the latest version when
            sorted in lexicographical order.
        journal: Instance of Journal.

    Returns:
        An instantiated ``DataCatalog`` containing all specified
        data sets, created and ready to use.

    Raises:
        DataSetError: When the method fails to create any of the data
            sets from their config.
        DataSetNotFoundError: When `load_versions` refers to a dataset that
            doesn't exist in the catalog.

    Example:
    ::

        >>> config = {
        >>>     "cars": {
        >>>         "type": "pandas.CSVDataSet",
        >>>         "filepath": "cars.csv",
        >>>         "save_args": {
        >>>             "index": False
        >>>         }
        >>>     },
        >>>     "boats": {
        >>>         "type": "pandas.CSVDataSet",
        >>>         "filepath": "s3://aws-bucket-name/boats.csv",
        >>>         "credentials": "boats_credentials",
        >>>         "save_args": {
        >>>             "index": False
        >>>         }
        >>>     }
        >>> }
        >>>
        >>> credentials = {
        >>>     "boats_credentials": {
        >>>         "client_kwargs": {
        >>>             "aws_access_key_id": "<your key id>",
        >>>             "aws_secret_access_key": "<your secret>"
        >>>         }
        >>>     }
        >>> }
        >>>
        >>> catalog = DataCatalog.from_config(config, credentials)
        >>>
        >>> df = catalog.load("cars")
        >>> catalog.save("boats", df)
    """
    data_sets = {}
    catalog = copy.deepcopy(catalog) or {}
    credentials = copy.deepcopy(credentials) or {}
    run_id = journal.run_id if journal else None
    save_version = save_version or run_id or generate_timestamp()
    load_versions = copy.deepcopy(load_versions) or {}

    missing_keys = load_versions.keys() - catalog.keys()
    if missing_keys:
        raise DataSetNotFoundError(
            f"`load_versions` keys [{', '.join(sorted(missing_keys))}] "
            f"are not found in the catalog."
        )

    layers = defaultdict(set)  # type: Dict[str, Set[str]]
    for ds_name, ds_config in catalog.items():
        ds_layer = ds_config.pop("layer", None)
        if ds_layer is not None:
            layers[ds_layer].add(ds_name)

        ds_config = _resolve_credentials(ds_config, credentials)
        data_sets[ds_name] = AbstractDataSet.from_config(
            ds_name, ds_config, load_versions.get(ds_name), save_version
        )

    dataset_layers = layers or None
    return cls(data_sets=data_sets, journal=journal, layers=dataset_layers)
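# Illustrative sketch, not part of the original source: combining from_config
# with load_versions/save_version. The catalog entry and the pinned version
# string are hypothetical; generate_timestamp() yields the lexicographically
# sortable format that save_version falls back to by default.
from kedro.io import DataCatalog
from kedro.io.core import generate_timestamp

config = {
    "boats": {
        "type": "pandas.CSVDataSet",
        "filepath": "data/boats.csv",
        "versioned": True,
    }
}
catalog = DataCatalog.from_config(
    config,
    load_versions={"boats": "2021-01-01T00.00.00.000Z"},  # pin an existing version
    save_version=generate_timestamp(),                    # freeze the save version
)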
def save_version(request):
    return request.param or generate_timestamp()
def run(  # pylint: disable=too-many-arguments,too-many-locals
    self,
    tags: Iterable[str] = None,
    runner: AbstractRunner = None,
    node_names: Iterable[str] = None,
    from_nodes: Iterable[str] = None,
    to_nodes: Iterable[str] = None,
    from_inputs: Iterable[str] = None,
    load_versions: Dict[str, str] = None,
    pipeline_name: str = None,
) -> Dict[str, Any]:
    """Runs the pipeline with a specified runner.

    Args:
        tags: An optional list of node tags which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the
            nodes containing *any* of these tags will be run.
        runner: An optional parameter specifying the runner that you want
            to run the pipeline with.
        node_names: An optional list of node names which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the
            nodes with these names will be run.
        from_nodes: An optional list of node names which should be used as
            a starting point of the new ``Pipeline``.
        to_nodes: An optional list of node names which should be used as an
            end point of the new ``Pipeline``.
        from_inputs: An optional list of input datasets which should be
            used as a starting point of the new ``Pipeline``.
        load_versions: An optional mapping of dataset names to specific
            version timestamps to load.
        pipeline_name: Name of the ``Pipeline`` to execute.
            Defaults to "__default__".

    Raises:
        KedroContextError: If the resulting ``Pipeline`` is empty
            or incorrect tags are provided.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.
    """
    # Report project name
    logging.info("** Kedro project %s", self.project_path.name)

    try:
        pipeline = self._get_pipeline(name=pipeline_name)
    except NotImplementedError:
        common_migration_message = (
            "`ProjectContext._get_pipeline(self, name)` method is expected. "
            "Please refer to the 'Modular Pipelines' section of the documentation."
        )
        if pipeline_name:
            raise KedroContextError(
                "The project is not fully migrated to use multiple pipelines. "
                + common_migration_message
            )

        warn(
            "You are using the deprecated pipeline construction mechanism. "
            + common_migration_message,
            DeprecationWarning,
        )
        pipeline = self.pipeline

    filtered_pipeline = self._filter_pipeline(
        pipeline=pipeline,
        tags=tags,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        node_names=node_names,
        from_inputs=from_inputs,
    )

    run_id = generate_timestamp()

    record_data = {
        "run_id": run_id,
        "project_path": str(self.project_path),
        "env": self.env,
        "kedro_version": self.project_version,
        "tags": tags,
        "from_nodes": from_nodes,
        "to_nodes": to_nodes,
        "node_names": node_names,
        "from_inputs": from_inputs,
        "load_versions": load_versions,
        "pipeline_name": pipeline_name,
    }
    journal = Journal(record_data)

    catalog = self._get_catalog(
        save_version=run_id, journal=journal, load_versions=load_versions
    )

    # Run the runner
    runner = runner or SequentialRunner()
    return runner.run(filtered_pipeline, catalog)
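# Illustrative sketch, not part of the original source: pinning the version of
# one input dataset for a single run, as exercised by test_run_load_versions
# above. `context` stands for an already-instantiated project KedroContext
# subclass, and the timestamp string is hypothetical.
pinned = {"cars": "2021-01-01T00.00.00.000Z"}
context.run(load_versions=pinned)
# The run_id produced by generate_timestamp() inside run() also becomes the
# save_version for every versioned dataset written during this run.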
def create(  # pylint: disable=too-many-arguments
    cls,
    package_name: str = None,
    project_path: Union[Path, str] = None,
    save_on_close: bool = True,
    env: str = None,
    extra_params: Dict[str, Any] = None,
) -> "KedroSession":
    """Create a new instance of ``KedroSession`` with the session data.

    Args:
        package_name: Package name for the Kedro project the session is
            created for.
        project_path: Path to the project root directory. Default is
            current working directory Path.cwd().
        save_on_close: Whether or not to save the session when it's closed.
        env: Environment for the KedroContext.
        extra_params: Optional dictionary containing extra project parameters
            for the underlying KedroContext. If specified, will update (and
            therefore take precedence over) the parameters retrieved from
            the project configuration.

    Returns:
        A new ``KedroSession`` instance.
    """
    # This is to make sure that for workflows that manually create a session
    # without going through one of our known entrypoints, e.g. some plugins
    # like kedro-airflow, the project is still properly configured. This is
    # for backward compatibility and should be removed in 0.18.
    if package_name is not None:
        configure_project(package_name)

    session = cls(
        package_name=package_name,
        project_path=project_path,
        session_id=generate_timestamp(),
        save_on_close=save_on_close,
    )

    # Have to explicitly type session_data, otherwise mypy will complain.
    # Possibly related to this: https://github.com/python/mypy/issues/1430
    session_data: Dict[str, Any] = {
        "package_name": session._package_name,
        "project_path": session._project_path,
        "session_id": session.session_id,
        **_describe_git(session._project_path),
    }

    ctx = click.get_current_context(silent=True)
    if ctx:
        session_data["cli"] = _jsonify_cli_context(ctx)

    env = env or os.getenv("KEDRO_ENV")
    if env:
        session_data["env"] = env

    if extra_params:
        session_data["extra_params"] = extra_params

    session._store.update(session_data)

    # We need a ConfigLoader registered in order to be able to set up logging.
    session._setup_logging()
    return session
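# Illustrative sketch, not part of the original source: the entrypoint flow this
# classmethod supports in a 0.17-era release (context-manager use and
# session.run() are assumptions about that API). "my_package" and the parameter
# override are hypothetical.
from kedro.framework.session import KedroSession

with KedroSession.create(
    package_name="my_package",
    project_path=".",
    env="local",
    extra_params={"model_seed": 42},
) as session:
    session.run()  # reuses the store populated above, including the timestamped session_id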
def version():
    load_version = None  # use latest
    save_version = generate_timestamp()  # freeze save version
    return Version(load_version, save_version)
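# Illustrative sketch, not part of the original source: how a
# Version(None, save_version) pair is typically consumed by a versioned
# dataset in these tests. The import paths and "test.csv" are assumptions.
from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io.core import Version, generate_timestamp

ds = CSVDataSet(filepath="test.csv", version=Version(None, generate_timestamp()))
# load_version=None   -> resolve the latest available version at load time
# frozen save_version -> every save from this instance lands under one timestamp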
def from_config(
    cls: Type,
    catalog: Optional[Dict[str, Dict[str, Any]]],
    credentials: Dict[str, Dict[str, Any]] = None,
    load_versions: Dict[str, str] = None,
    save_version: str = None,
    journal: Journal = None,
) -> "DataCatalog":
    """Create a ``DataCatalog`` instance from configuration. This is a
    factory method used to provide developers with a way to instantiate
    ``DataCatalog`` with configuration parsed from configuration files.

    Args:
        catalog: A dictionary whose keys are the data set names and
            the values are dictionaries with the constructor arguments
            for classes implementing ``AbstractDataSet``. The data set
            class to be loaded is specified with the key ``type`` and its
            fully qualified class name. All ``kedro.io`` data sets can be
            specified by their class name only, i.e. their module name
            can be omitted.
        credentials: A dictionary containing credentials for different
            data sets. Use the ``credentials`` key in an ``AbstractDataSet``
            to refer to the appropriate credentials as shown in the example
            below.
        load_versions: A mapping between dataset names and versions
            to load. Has no effect on data sets without enabled versioning.
        save_version: Version string to be used for ``save`` operations
            by all data sets with enabled versioning. It must: a) be a
            case-insensitive string that conforms with operating system
            filename limitations, b) always return the latest version when
            sorted in lexicographical order.
        journal: Instance of Journal.

    Returns:
        An instantiated ``DataCatalog`` containing all specified
        data sets, created and ready to use.

    Raises:
        DataSetError: When the method fails to create any of the data
            sets from their config.

    Example:
    ::

        >>> config = {
        >>>     "cars": {
        >>>         "type": "CSVLocalDataSet",
        >>>         "filepath": "cars.csv",
        >>>         "save_args": {
        >>>             "index": False
        >>>         }
        >>>     },
        >>>     "boats": {
        >>>         "type": "CSVS3DataSet",
        >>>         "filepath": "boats.csv",
        >>>         "bucket_name": "mck-147789798-bucket",
        >>>         "credentials": "boats_credentials",
        >>>         "save_args": {
        >>>             "index": False
        >>>         }
        >>>     }
        >>> }
        >>>
        >>> credentials = {
        >>>     "boats_credentials": {
        >>>         "aws_access_key_id": "<your key id>",
        >>>         "aws_secret_access_key": "<your secret>"
        >>>     }
        >>> }
        >>>
        >>> catalog = DataCatalog.from_config(config, credentials)
        >>>
        >>> df = catalog.load("cars")
        >>> catalog.save("boats", df)
    """
    data_sets = {}
    catalog = copy.deepcopy(catalog) or {}
    credentials = copy.deepcopy(credentials) or {}
    run_id = journal.run_id if journal else None
    save_version = save_version or run_id or generate_timestamp()
    load_versions = copy.deepcopy(load_versions) or {}

    missing_keys = load_versions.keys() - catalog.keys()
    if missing_keys:
        warn(
            "`load_versions` keys [{}] are not found in the catalog.".format(
                ", ".join(sorted(missing_keys))
            )
        )

    for ds_name, ds_config in catalog.items():
        if "type" not in ds_config:
            raise DataSetError(
                "`type` is missing from DataSet '{}' "
                "catalog configuration".format(ds_name)
            )
        if CREDENTIALS_KEY in ds_config:
            ds_config[CREDENTIALS_KEY] = _get_credentials(
                ds_config.pop(CREDENTIALS_KEY), credentials  # credentials name
            )
        data_sets[ds_name] = AbstractDataSet.from_config(
            ds_name, ds_config, load_versions.get(ds_name), save_version
        )

    return cls(data_sets=data_sets, journal=journal)