Example #1
    def test_multiple_loads(self, versioned_csv_data_set, dummy_dataframe,
                            filepath_csv):
        """Test that if a new version is created mid-run, by an
        external system, it won't be loaded in the current run."""
        versioned_csv_data_set.save(dummy_dataframe)
        versioned_csv_data_set.load()
        v1 = versioned_csv_data_set.resolve_load_version()

        sleep(0.5)
        # force-drop a newer version into the same location
        v_new = generate_timestamp()
        GenericDataSet(
            filepath=filepath_csv.as_posix(),
            file_format="csv",
            version=Version(v_new, v_new),
        ).save(dummy_dataframe)

        versioned_csv_data_set.load()
        v2 = versioned_csv_data_set.resolve_load_version()

        assert v2 == v1  # v2 should not be v_new!
        ds_new = GenericDataSet(
            filepath=filepath_csv.as_posix(),
            file_format="csv",
            version=Version(None, None),
        )
        # the new version is discoverable by a fresh instance
        assert ds_new.resolve_load_version() == v_new
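The snippets in this listing omit their imports. A minimal header that would make Example #1 run might look like the sketch below; the import path for GenericDataSet is an assumption, since older Kedro releases ship it as kedro.extras.datasets.pandas.GenericDataSet and newer ones as kedro_datasets.pandas.GenericDataSet.

from time import sleep

import pandas as pd
import pytest

from kedro.io.core import DataSetError, Version, generate_timestamp
from kedro.extras.datasets.pandas import GenericDataSet  # or: from kedro_datasets.pandas import GenericDataSet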
Example #2
    def test_catalog_release(self, mocker):
        fs_mock = mocker.patch("fsspec.filesystem").return_value
        filepath = "test.csv"
        data_set = GenericDataSet(filepath=filepath, file_format="sas")
        assert data_set._version_cache.currsize == 0  # no cache if unversioned
        data_set.release()
        fs_mock.invalidate_cache.assert_called_once_with(filepath)
        assert data_set._version_cache.currsize == 0  # cache remains empty after release
Example #3
    def test_generic_no_filepaths(self, file_format):
        error = (
            "Cannot create a dataset of file_format "
            f"`{file_format}` as it does not support a filepath target/source")

        with pytest.raises(DataSetError, match=error):
            _ = GenericDataSet(filepath="/file/thing.file",
                               file_format=file_format).load()
        with pytest.raises(DataSetError, match=error):
            GenericDataSet(filepath="/file/thing.file",
                           file_format=file_format).save(pd.DataFrame([1]))
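The pytest.mark.parametrize decorator driving file_format in this example is not part of the snippet. A hedged sketch, assuming formats whose pandas readers and writers take no file path; the concrete list in the real test may differ:

    # Illustrative only -- the real test may parameterize over a different set of formats.
    @pytest.mark.parametrize("file_format", ["clipboard", "sql_table", "records"])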
Example #4
@pytest.fixture
def versioned_csv_data_set(filepath_csv, load_version, save_version):
    return GenericDataSet(
        filepath=filepath_csv.as_posix(),
        file_format="csv",
        version=Version(load_version, save_version),
        save_args={"index": False},
    )
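Several fixtures that these examples rely on (filepath_csv, dummy_dataframe, load_version, save_version) are defined elsewhere in the suite. A minimal conftest-style sketch that would satisfy them, with illustrative values only:

@pytest.fixture
def filepath_csv(tmp_path):
    return tmp_path / "test.csv"


@pytest.fixture
def dummy_dataframe():
    return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})


@pytest.fixture(params=[None])
def load_version(request):
    return request.param


@pytest.fixture(params=[None])
def save_version(request):
    return request.param or generate_timestamp()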
Example #5
@pytest.fixture
def sas_data_set_bad_config(filepath_sas, fs_args):
    return GenericDataSet(
        filepath=filepath_sas.as_posix(),
        file_format="sas",
        load_args={},  # SAS reader requires a 'format' argument (see Example #7)
        fs_args=fs_args,
    )
Example #6
@pytest.fixture
def html_data_set(filepath_html, fs_args):
    return GenericDataSet(
        filepath=filepath_html.as_posix(),
        file_format="html",
        fs_args=fs_args,
        save_args={"index": False},
    )
Example #7
@pytest.fixture
def sas_data_set(filepath_sas, fs_args):
    return GenericDataSet(
        filepath=filepath_sas.as_posix(),
        file_format="sas",
        load_args={"format": "sas7bdat"},
        fs_args=fs_args,
    )
Example #8
    def test_bad_file_format_argument(self):
        ds = GenericDataSet(filepath="test.kedro", file_format="kedro")

        pattern = (
            "Unable to retrieve `pandas.read_kedro` method, please ensure that your 'file_format' "
            "parameter has been defined correctly as per the Pandas API "
            "https://pandas.pydata.org/docs/reference/io.html")

        with pytest.raises(DataSetError, match=pattern):
            _ = ds.load()

        pattern2 = (
            "Unable to retrieve `pandas.DataFrame.to_kedro` method, please ensure that your 'file_format' "
            "parameter has been defined correctly as per the Pandas API "
            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html"
        )
        with pytest.raises(DataSetError, match=pattern2):
            ds.save(pd.DataFrame([1]))
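The two patterns matched above reflect how GenericDataSet resolves its reader and writer dynamically from the pandas namespace. A rough, hypothetical sketch of that lookup (not the actual implementation):

def _lookup_pandas_io(file_format):
    # Load path: pandas.read_<file_format>; save path: DataFrame.to_<file_format>.
    # When either attribute does not exist (e.g. file_format="kedro"),
    # GenericDataSet raises DataSetError with the messages matched above.
    reader = getattr(pd, f"read_{file_format}", None)
    writer = getattr(pd.DataFrame, f"to_{file_format}", None)
    return reader, writer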
Example #9
    def test_version_str_repr(self, filepath_csv, load_version, save_version):
        """Test that version is in string representation of the class instance
        when applicable."""
        filepath = filepath_csv.as_posix()
        ds = GenericDataSet(filepath=filepath, file_format="csv")
        ds_versioned = GenericDataSet(
            filepath=filepath,
            file_format="csv",
            version=Version(load_version, save_version),
        )
        assert filepath in str(ds)
        assert filepath in str(ds_versioned)
        ver_str = f"version=Version(load={load_version}, save='{save_version}')"
        assert ver_str in str(ds_versioned)
        assert "GenericDataSet" in str(ds_versioned)
        assert "GenericDataSet" in str(ds)
        assert "protocol" in str(ds_versioned)
        assert "protocol" in str(ds)
Example #10
    def test_protocol_usage(self, filepath, instance_type, credentials):
        data_set = GenericDataSet(filepath=filepath,
                                  file_format="sas",
                                  credentials=credentials)
        assert isinstance(data_set._fs, instance_type)

        path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]

        assert str(data_set._filepath) == path
        assert isinstance(data_set._filepath, PurePosixPath)
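test_protocol_usage is parameterized over filepath, instance_type and credentials, and PROTOCOL_DELIMITER is the "://" separator importable from kedro.io.core. An illustrative parametrization, assuming the optional fsspec backends (s3fs, gcsfs) are installed; the real test may cover other protocols and credential combinations:

    # Assumed imports for this sketch:
    #   from fsspec.implementations.local import LocalFileSystem
    #   from fsspec.implementations.http import HTTPFileSystem
    #   from s3fs import S3FileSystem
    #   from gcsfs import GCSFileSystem
    @pytest.mark.parametrize(
        "filepath,instance_type,credentials",
        [
            ("/tmp/test.sas7bdat", LocalFileSystem, {}),
            ("file:///tmp/test.sas7bdat", LocalFileSystem, {}),
            ("s3://bucket/file.sas7bdat", S3FileSystem, {}),
            ("gcs://bucket/file.sas7bdat", GCSFileSystem, {}),
            ("https://example.com/file.sas7bdat", HTTPFileSystem, {}),
        ],
    )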
Example #11
@pytest.fixture
def csv_data_set(filepath_csv):
    return GenericDataSet(
        filepath=filepath_csv.as_posix(),
        file_format="csv",
        save_args={"index": False},
    )
Example #12
    def test_release_instance_cache(self, dummy_dataframe, filepath_csv):
        """Test that cache invalidation does not affect other instances"""
        ds_a = GenericDataSet(
            filepath=filepath_csv.as_posix(),
            file_format="csv",
            version=Version(None, None),
        )
        assert ds_a._version_cache.currsize == 0
        ds_a.save(dummy_dataframe)  # create a version
        assert ds_a._version_cache.currsize == 2

        ds_b = GenericDataSet(
            filepath=filepath_csv.as_posix(),
            file_format="csv",
            version=Version(None, None),
        )
        assert ds_b._version_cache.currsize == 0
        ds_b.resolve_save_version()
        assert ds_b._version_cache.currsize == 1
        ds_b.resolve_load_version()
        assert ds_b._version_cache.currsize == 2

        ds_a.release()

        # dataset A cache is cleared
        assert ds_a._version_cache.currsize == 0

        # dataset B cache is unaffected
        assert ds_b._version_cache.currsize == 2
Example #13
    def test_multiple_saves(self, dummy_dataframe, filepath_csv):
        """Test multiple cycles of save followed by load for the same dataset"""
        ds_versioned = GenericDataSet(
            filepath=filepath_csv.as_posix(),
            file_format="csv",
            version=Version(None, None),
        )

        # first save
        ds_versioned.save(dummy_dataframe)
        first_save_version = ds_versioned.resolve_save_version()
        first_load_version = ds_versioned.resolve_load_version()
        assert first_load_version == first_save_version

        # second save
        sleep(0.5)
        ds_versioned.save(dummy_dataframe)
        second_save_version = ds_versioned.resolve_save_version()
        second_load_version = ds_versioned.resolve_load_version()
        assert second_load_version == second_save_version
        assert second_load_version > first_load_version

        # another dataset
        ds_new = GenericDataSet(
            filepath=filepath_csv.as_posix(),
            file_format="csv",
            version=Version(None, None),
        )
        assert ds_new.resolve_load_version() == second_load_version
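Taken together, these examples exercise GenericDataSet's load, save, versioning and cache behaviour. A self-contained sketch of the same versioned round trip outside pytest (import path as noted under Example #1; the file path here is illustrative):

import pandas as pd

from kedro.io.core import Version
from kedro.extras.datasets.pandas import GenericDataSet  # or kedro_datasets.pandas

ds = GenericDataSet(
    filepath="data/cars.csv",
    file_format="csv",
    version=Version(None, None),  # load the latest version, save under a new timestamp
    save_args={"index": False},
)
ds.save(pd.DataFrame({"col1": [1, 2]}))
reloaded = ds.load()              # reads back the version that was just written
print(ds.resolve_load_version())  # e.g. "2023-01-01T00.00.00.000Z"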