예제 #1
0
    def test_run_load_versions(self, dummy_context, dummy_dataframe):
        """Run the "simple" pipeline with ``cars`` pinned to an older version.

        Two versions of ``cars.csv`` are saved under the project path. The run
        is asked to load the older one, and the ``boats`` dataset loaded
        afterwards must equal the older frame rather than the newer one.
        """
        csv_path = (dummy_context.project_path / "cars.csv").as_posix()

        def _write_version(frame, save_version):
            # Save ``frame`` to the cars file under the given save version.
            data_set = CSVDataSet(
                filepath=csv_path,
                save_args={"sep": ","},
                version=Version(None, save_version),
            )
            data_set.save(frame)

        # Older version: an all-zero frame.
        stale_version = generate_timestamp()
        zeros = pd.DataFrame({name: [0, 0] for name in ("col1", "col2", "col3")})
        _write_version(zeros, stale_version)

        # Pause before generating the timestamp for the newer version.
        sleep(0.5)
        fresh_version = generate_timestamp()
        _write_version(dummy_dataframe, fresh_version)

        dummy_context.run(
            load_versions={"cars": stale_version}, pipeline_name="simple"
        )

        boats_first = dummy_context.catalog.load("boats")
        assert not boats_first.equals(dummy_dataframe)
        boats_second = dummy_context.catalog.load("boats")
        assert boats_second.equals(zeros)
예제 #2
0
    def test_run_load_versions(self, tmp_path, dummy_context, dummy_dataframe, mocker):
        """Run the default pipeline with ``cars`` pinned to an older version.

        A minimal ``KedroContext`` subclass is created on ``tmp_path`` whose
        default pipeline passes ``cars`` through ``identity`` into ``boats``.
        Two versions of ``cars.csv`` are saved; the run is asked to load the
        older one, and ``boats`` must then equal the older frame rather than
        the newer one.
        """
        # Local import keeps this block self-contained.
        from time import sleep

        class DummyContext(KedroContext):
            project_name = "bob"
            package_name = "bob"
            project_version = kedro_version

            def _get_pipelines(self) -> Dict[str, Pipeline]:
                return {"__default__": Pipeline([node(identity, "cars", "boats")])}

        mocker.patch("logging.config.dictConfig")
        # The ``dummy_context`` fixture is still requested but no longer rebound.
        # NOTE(review): presumably it is needed for the project setup it performs
        # under ``tmp_path`` - confirm against the fixture definition.
        context = DummyContext(str(tmp_path))
        filepath = str(context.project_path / "cars.csv")

        old_save_version = generate_timestamp()
        old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
        old_csv_data_set = CSVDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, old_save_version),
        )
        old_csv_data_set.save(old_df)

        # Pause so the second timestamp cannot coincide with the first on a
        # coarse clock; identical values would make both saves use one version.
        sleep(0.5)
        new_save_version = generate_timestamp()
        new_csv_data_set = CSVDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, new_save_version),
        )
        new_csv_data_set.save(dummy_dataframe)

        load_versions = {"cars": old_save_version}
        context.run(load_versions=load_versions)

        # Load once and check both expectations against the same result.
        boats = context.catalog.load("boats")
        assert not boats.equals(dummy_dataframe)
        assert boats.equals(old_df)
예제 #3
0
 def test_load_options_csv(self, tmp_path, sample_pandas_df):
     """Save a frame as CSV, then load it through Spark with ``header=True``.

     The loaded Spark frame must contain exactly one row whose ``Name``
     column equals ``"Alex"``.
     """
     data_path = str(tmp_path / "data")
     CSVDataSet(filepath=data_path).save(sample_pandas_df)

     reader = SparkDataSet(
         filepath=data_path, file_format="csv", load_args={"header": True}
     )
     alex_rows = reader.load().filter(col("Name") == "Alex")
     assert alex_rows.count() == 1
예제 #4
0
    def test_release_instance_cache(self, dummy_dataframe, filepath_csv):
        """Releasing one dataset must not clear another instance's version cache."""

        def cached_entries(data_set):
            # Number of entries currently held in the dataset's version cache.
            return data_set._version_cache.currsize

        first = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
        assert cached_entries(first) == 0
        # Saving creates a version; the cache holds two entries afterwards.
        first.save(dummy_dataframe)
        assert cached_entries(first) == 2

        second = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
        assert cached_entries(second) == 0
        # Each resolve call adds one entry to the second instance's cache.
        second.resolve_save_version()
        assert cached_entries(second) == 1
        second.resolve_load_version()
        assert cached_entries(second) == 2

        first.release()

        # Only the released instance's cache is emptied.
        assert cached_entries(first) == 0
        assert cached_entries(second) == 2
예제 #5
0
    def test_multiple_saves(self, dummy_dataframe, filepath_csv):
        """Save the same versioned dataset twice and check version resolution.

        After each save the resolved load version must equal the resolved save
        version. The second load version must compare greater than the first,
        and a newly created dataset on the same path must resolve to it.
        """
        versioned = CSVDataSet(filepath=filepath_csv, version=Version(None, None))

        # Round one.
        versioned.save(dummy_dataframe)
        save_v1 = versioned.resolve_save_version()
        load_v1 = versioned.resolve_load_version()
        assert load_v1 == save_v1

        # Round two, on the same instance.
        versioned.save(dummy_dataframe)
        save_v2 = versioned.resolve_save_version()
        load_v2 = versioned.resolve_load_version()
        assert load_v2 == save_v2
        assert load_v2 > load_v1

        # A separate instance pointing at the same file.
        other = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
        assert other.resolve_load_version() == load_v2
예제 #6
0
# Copyright 2018-2019 QuantumBlack Visual Analytics Limited