def test_config_missing_type(self, sane_config):
    """Check the error if type attribute is missing for some data set(s)
    in the config"""
    del sane_config["catalog"]["boats"]["type"]
    expected = (
        r"`type` is missing from DataSet \'boats\' "
        r"catalog configuration"
    )
    with pytest.raises(DataSetError, match=expected):
        DataCatalog.from_config(**sane_config)
def test_from_sane_config_load_versions_warn(self, sane_config):
    """Warn when ``load_versions`` names a data set missing from the catalog."""
    sane_config["catalog"]["boats"]["versioned"] = True
    load_version = {"non-boart": generate_timestamp()}
    warning_pattern = (
        r"\`load_versions\` keys \[non-boart\] are not found in the catalog\."
    )
    with pytest.warns(UserWarning, match=warning_pattern):
        DataCatalog.from_config(**sane_config, load_versions=load_version)
def test_config_invalid_arguments(self, sane_config):
    """Check the error if the data set config contains invalid arguments"""
    sane_config["catalog"]["boats"]["save_and_load_args"] = False
    err_pattern = (
        r"DataSet 'boats' must only contain arguments valid for "
        r"the constructor of `.*CSVDataSet`"
    )
    with pytest.raises(DataSetError, match=err_pattern):
        DataCatalog.from_config(**sane_config)
def test_config_invalid_data_set(self, sane_config):
    """Check the error if the type points to invalid class"""
    sane_config["catalog"]["boats"]["type"] = "DataCatalog"
    err_pattern = (
        r"DataSet 'boats' type `.*DataCatalog` is invalid: all "
        r"data set types must extend `AbstractDataSet`"
    )
    with pytest.raises(DataSetError, match=err_pattern):
        DataCatalog.from_config(**sane_config)
def test_config_relative_import(self, sane_config):
    """Check the error if the type points to a relative import"""
    sane_config["catalog"]["boats"]["type"] = ".CSVDataSetInvalid"
    message = "`type` class path does not support relative paths"
    with pytest.raises(DataSetError, match=re.escape(message)):
        DataCatalog.from_config(**sane_config)
def test_config_invalid_module(self, sane_config):
    """Check the error if the type points to nonexistent module"""
    bad_type = "kedro.invalid_module_name.io.CSVDataSet"
    sane_config["catalog"]["boats"]["type"] = bad_type
    expected = "Class `kedro.invalid_module_name.io.CSVDataSet` not found"
    with pytest.raises(DataSetError, match=re.escape(expected)):
        DataCatalog.from_config(**sane_config)
def test_config_missing_type(self, sane_config):
    """Check the error if type attribute is missing for some data set(s)
    in the config"""
    del sane_config["catalog"]["boats"]["type"]
    expected = (
        "An exception occurred when parsing config for DataSet `boats`:\n"
        "`type` is missing from DataSet catalog configuration"
    )
    with pytest.raises(DataSetError, match=re.escape(expected)):
        DataCatalog.from_config(**sane_config)
def test_config_missing_class(self, sane_config):
    """Check the error if the type points to nonexistent class"""
    sane_config["catalog"]["boats"]["type"] = "kedro.io.CSVDataSetInvalid"
    expected = (
        "An exception occurred when parsing config for DataSet `boats`:\n"
        "Class `kedro.io.CSVDataSetInvalid` not found"
    )
    with pytest.raises(DataSetError, match=re.escape(expected)):
        DataCatalog.from_config(**sane_config)
def test_config_invalid_data_set(self, sane_config):
    """Check the error if the type points to invalid class"""
    sane_config["catalog"]["boats"]["type"] = "DataCatalog"
    expected = (
        "An exception occurred when parsing config for DataSet `boats`:\n"
        "DataSet type `kedro.io.data_catalog.DataCatalog` is invalid: "
        "all data set types must extend `AbstractDataSet`"
    )
    with pytest.raises(DataSetError, match=re.escape(expected)):
        DataCatalog.from_config(**sane_config)
def test_config_missing_class(self, sane_config):
    """Check the error if the type points to nonexistent class"""
    sane_config["catalog"]["boats"]["type"] = "kedro.io.CSVLocalDataSetInvalid"
    expected = (
        r"Class `kedro.io.CSVLocalDataSetInvalid` for DataSet `boats` not found."
    )
    with pytest.raises(DataSetError, match=expected):
        DataCatalog.from_config(**sane_config)
def test_link_credentials(self, sane_config, mocker):
    """Test credentials being linked to the relevant data set"""
    mocked_fsspec = mocker.patch("kedro.extras.datasets.pandas.csv_dataset.fsspec")
    config = deepcopy(sane_config)
    # Drop the local CSV entry so only the S3-backed data set remains.
    del config["catalog"]["boats"]
    DataCatalog.from_config(**config)
    mocked_fsspec.filesystem.assert_called_with(
        "s3", **sane_config["credentials"]["s3_credentials"]
    )
def test_missing_dependency(self, sane_config, mocker):
    """An ImportError from loading a data set class surfaces as DataSetError."""
    message = "dependency issue"
    err = ImportError(message)
    err.name = message  # import_error.name cannot be None
    mocker.patch("kedro.io.core.load_obj", side_effect=err)
    with pytest.raises(DataSetError, match=message):
        DataCatalog.from_config(**sane_config)
def test_from_sane_config_versioned_warn(self, caplog, sane_config, versioned):
    """Check the warning if `version` attribute was added
    to the data set config"""
    sane_config["catalog"]["boats"]["versioned"] = versioned
    sane_config["catalog"]["boats"]["version"] = True
    DataCatalog.from_config(**sane_config)
    record = caplog.records[0]
    assert record.levelname == "WARNING"
    expected_msg = (
        "`version` attribute removed from `boats` data set "
        "configuration since it is a reserved word and cannot be "
        "directly specified"
    )
    assert expected_msg in record.message
def test_nested_credentials(self, sane_config_with_nested_creds, mocker):
    """Nested credential dictionaries reach the S3 client unflattened."""
    mocked_s3 = mocker.patch("kedro.io.csv_s3.S3FileSystem")
    DataCatalog.from_config(**sane_config_with_nested_creds)
    expected_kwargs = {
        "nested": {
            "credentials": {
                "aws_access_key_id": "OTHER_FAKE_ACCESS_KEY",
                "aws_secret_access_key": "OTHER_FAKE_SECRET_KEY",
            }
        },
        "key": "secret",
    }
    mocked_s3.assert_called_once_with(client_kwargs=expected_kwargs)
def test_missing_dependency(self, sane_config, mocker):
    """An AttributeError raised while resolving the data set class is reported."""
    message = "dependency issue"

    # pylint: disable=unused-argument,inconsistent-return-statements
    def fake_load(obj_path, *args, **kwargs):
        # Simulate the data set class itself failing to import while the
        # package's __all__ still advertises it.
        if obj_path == "kedro.extras.datasets.pandas.CSVDataSet":
            raise AttributeError(message)
        if obj_path == "kedro.extras.datasets.pandas.__all__":
            return ["CSVDataSet"]

    mocker.patch("kedro.io.core.load_obj", side_effect=fake_load)
    with pytest.raises(DataSetError, match=message):
        DataCatalog.from_config(**sane_config)
def test_link_credentials(self, sane_config, mocker):
    """Test credentials being linked to the relevant data set"""
    mocked_s3 = mocker.patch("kedro.io.csv_s3.S3FileSystem")
    DataCatalog.from_config(**sane_config)
    creds = sane_config["credentials"]["s3_credentials"]
    expected_kwargs = {
        "aws_access_key_id": creds["aws_access_key_id"],
        "aws_secret_access_key": creds["aws_secret_access_key"],
    }
    mocked_s3.assert_called_once_with(client_kwargs=expected_kwargs)
def kedro_catalog():
    """Build a ``DataCatalog`` from the project's base and local catalog config."""
    from kedro.config import ConfigLoader
    from kedro.io import DataCatalog

    loader = ConfigLoader(['conf/base', 'conf/local'])
    catalog_config = loader.get('catalog*', 'catalog*/**')
    return DataCatalog.from_config(catalog_config)
def _create_catalog(  # pylint: disable=no-self-use,too-many-arguments
    self,
    conf_catalog: Dict[str, Any],
    conf_creds: Dict[str, Any],
    save_version: str = None,
    journal: Journal = None,
    load_versions: Dict[str, str] = None,
) -> DataCatalog:
    """A factory method for the DataCatalog instantiation.

    Returns:
        DataCatalog defined in `catalog.yml`.
    """
    hook_manager = get_hook_manager()
    # Let registered hooks construct the catalog first.
    catalog = hook_manager.hook.register_catalog(  # pylint: disable=no-member
        catalog=conf_catalog,
        credentials=conf_creds,
        load_versions=load_versions,
        save_version=save_version,
        journal=journal,
    )
    if catalog:
        return catalog
    # for backwards compatibility
    return DataCatalog.from_config(
        conf_catalog, conf_creds, load_versions, save_version, journal
    )
def create_catalog(config: ConfigLoader, **kwargs) -> DataCatalog:
    """Loads Kedro's ``DataCatalog``.

    Args:
        config: ConfigLoader which can be queried to access the project config.
        kwargs: Ignore any additional arguments added in the future.

    Returns:
        DataCatalog defined in `catalog.yml`.
    """
    conf_logging = config.get("logging*", "logging*/**")
    # Configure logging once, before any other step emits log records.
    # (Fix: the original called dictConfig a second time further down,
    # which was redundant.)
    logging.config.dictConfig(conf_logging)

    conf_catalog = config.get("catalog*", "catalog*/**")
    try:
        conf_creds = config.get("credentials*", "credentials*/**")
    except MissingConfigException:
        # Best-effort: a missing credentials file is tolerated with a warning.
        warn("Your Kedro project is missing a credentials file!")
        conf_creds = None
    conf_params = config.get("parameters*", "parameters*/**")

    catalog = DataCatalog.from_config(conf_catalog, conf_creds)
    # Expose parameters through the catalog under the reserved key.
    catalog.add_feed_dict({"parameters": conf_params})
    return catalog
def test_config_bad_version(self):
    """Versioning must be declared on CachedDataSet, not on the wrapped set."""
    bad_config = yaml.safe_load(StringIO(YML_CONFIG_VERSIONED_BAD))
    expected = (
        r"Cached datasets should specify that they are "
        r"versioned in the `CachedDataSet`, not in the "
        r"wrapped dataset"
    )
    with pytest.raises(DataSetError, match=expected):
        _ = DataCatalog.from_config(bad_config, load_versions={"test_ds": "42"})
def test_nested_credentials(self, sane_config_with_nested_creds, mocker):
    """Nested credential dictionaries are resolved and forwarded to fsspec."""
    mocked_fsspec = mocker.patch("kedro.extras.datasets.pandas.csv_dataset.fsspec")
    config = deepcopy(sane_config_with_nested_creds)
    # Drop the local CSV entry so only the S3-backed data set remains.
    del config["catalog"]["boats"]
    DataCatalog.from_config(**config)
    expected_kwargs = {
        "client_kwargs": {
            "credentials": {
                "client_kwargs": {
                    "aws_access_key_id": "OTHER_FAKE_ACCESS_KEY",
                    "aws_secret_access_key": "OTHER_FAKE_SECRET_KEY",
                }
            }
        },
        "key": "secret",
    }
    mocked_fsspec.filesystem.assert_called_once_with("s3", **expected_kwargs)
def register_catalog(
    self,
    catalog: Optional[Dict[str, Dict[str, Any]]],
    credentials: Dict[str, Dict[str, Any]],
    load_versions: Dict[str, str],
    save_version: str,
) -> DataCatalog:
    # Hook implementation: build the project's DataCatalog from the parsed
    # `catalog.yml` entries plus credentials and version information.
    # Parameter names/order form the hook contract — do not rename them.
    return DataCatalog.from_config(catalog, credentials, load_versions, save_version)
def test_from_sane_config_default(sane_config, dummy_dataframe, tmpdir):
    """A catalog-with-default falls back to CSV for unregistered data sets."""
    base_catalog = DataCatalog.from_config(
        sane_config["catalog"], sane_config["credentials"]
    )
    catalog = DataCatalogWithDefault.from_data_catalog(base_catalog, default_csv)
    path = str(tmpdir.mkdir("sub").join("missing.csv"))
    catalog.save(path, dummy_dataframe)
    reloaded = catalog.load(path)
    assert dummy_dataframe.equals(reloaded)
def _create_catalog(  # pylint: disable=no-self-use
        self, conf_catalog: Dict[str, Any],
        conf_creds: Dict[str, Any]) -> DataCatalog:
    """A hook for changing the creation of the DataCatalog instance.

    Args:
        conf_catalog: Parsed contents of `catalog.yml`.
        conf_creds: Parsed contents of the credentials config.

    Returns:
        DataCatalog defined in `catalog.yml`.
    """
    return DataCatalog.from_config(conf_catalog, conf_creds)
def _create_catalog(  # pylint: disable=no-self-use,too-many-arguments
    self,
    conf_catalog: Dict[str, Any],
    conf_creds: Dict[str, Any],
    save_version: str = None,
    journal: Journal = None,
    load_versions: Dict[str, str] = None,
) -> DataCatalog:
    """Instantiate the project ``DataCatalog`` from parsed configuration.

    Args:
        conf_catalog: Parsed contents of `catalog.yml`.
        conf_creds: Parsed contents of the credentials config.
        save_version: Version string used when saving versioned data sets.
        journal: Journal instance recording the run, if any.
        load_versions: Mapping of data set name to the version to load.

    Returns:
        DataCatalog defined in `catalog.yml`.
    """
    # Note: `from_config` takes load_versions before save_version —
    # the positional order below is deliberate.
    return DataCatalog.from_config(conf_catalog, conf_creds,
                                   load_versions, save_version, journal)
def test_LV3_to_decimalWSG84_2(self):
    """LV03 station coordinates convert to the catalog's WSG84 lon/lat."""
    loader = ConfigLoader(['conf/base'])
    catalog = DataCatalog.from_config(loader.get('catalog*', 'catalog/**'))
    stations = catalog.load("foehn_stations")
    lon_fun, lat_fun = LV3_to_decimalWSG84(
        x=stations["x_LV03"], y=stations["y_LV03"]
    )
    # There are some deviations in the data from MeteoSwiss, thus higher atol
    assert_allclose(lon_fun, stations["longitude"], atol=0.01)
    assert_allclose(lat_fun, stations["latitude"], atol=0.01)
def test_fill_missing_coordinates(self):
    """After cleansing, no location-related column contains nulls."""
    loader = ConfigLoader(['conf/base'])
    catalog = DataCatalog.from_config(loader.get('catalog*', 'catalog/**'))
    df = catalog.load("fire_data_cleansed")
    location_cols = [
        "coordinates_x",
        "coordinates_y",
        "longitude",
        "latitude",
        "municipality",
    ]
    assert df[location_cols].isnull().sum().sum() == 0
def test_load_version(self, sane_config, dummy_dataframe, mocker):
    """Test load versioned data sets from config"""
    sane_config["catalog"]["boats"]["versioned"] = True
    mocker.patch(
        "kedro.io.data_catalog.generate_timestamp", side_effect=["first", "second"]
    )
    second_dataframe = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})

    # save first version of the dataset
    catalog = DataCatalog.from_config(**sane_config)
    catalog.save("boats", dummy_dataframe)

    # save second version of the dataset
    catalog = DataCatalog.from_config(**sane_config)
    catalog.save("boats", second_dataframe)

    assert_frame_equal(catalog.load("boats", version="first"), dummy_dataframe)
    assert_frame_equal(catalog.load("boats", version="second"), second_dataframe)
    # With no explicit version the latest save wins.
    assert_frame_equal(catalog.load("boats"), second_dataframe)
def test_load_version_on_unversioned_dataset(
    self, sane_config, dummy_dataframe, mocker
):
    """Requesting a version from an unversioned data set raises DataSetError."""
    mocker.patch("kedro.io.data_catalog.generate_timestamp", return_value="first")
    catalog = DataCatalog.from_config(**sane_config)
    catalog.save("boats", dummy_dataframe)

    with pytest.raises(DataSetError):
        catalog.load("boats", version="first")
def _create_catalog(self, conf_catalog, conf_creds) -> DataCatalog:
    """Build the DataCatalog, tagging the save version with the pai run id."""
    version = generate_current_version()
    run_id = pai.current_run_uuid()
    if run_id:
        # Make versions from the same timestamp distinguishable per run.
        version = version + "-" + run_id
    return DataCatalog.from_config(conf_catalog, conf_creds, save_version=version)