def test_run_load_versions(self, dummy_context, dummy_dataframe):
    """Running with an explicit ``load_versions`` must read the earlier snapshot,
    not the most recently saved one."""
    csv_path = (dummy_context.project_path / "cars.csv").as_posix()

    # Persist an initial (older) version of the "cars" data set.
    earlier_version = generate_timestamp()
    earlier_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
    earlier_data_set = CSVDataSet(
        filepath=csv_path,
        save_args={"sep": ","},
        version=Version(None, earlier_version),
    )
    earlier_data_set.save(earlier_df)

    # Guarantee the second timestamp is strictly later than the first.
    sleep(0.5)

    # Persist a newer version with different content.
    later_version = generate_timestamp()
    later_data_set = CSVDataSet(
        filepath=csv_path,
        save_args={"sep": ","},
        version=Version(None, later_version),
    )
    later_data_set.save(dummy_dataframe)

    # Pin "cars" to the earlier version and run the pipeline.
    dummy_context.run(
        load_versions={"cars": earlier_version}, pipeline_name="simple"
    )

    # The output must reflect the pinned (older) input, not the latest save.
    assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
    assert dummy_context.catalog.load("boats").equals(earlier_df)
def test_run_load_versions(self, tmp_path, dummy_context, dummy_dataframe, mocker):
    """Build a minimal context with an identity pipeline and verify that
    ``run(load_versions=...)`` reads the older version of "cars"."""

    class DummyContext(KedroContext):
        # Minimal concrete KedroContext: one identity node "cars" -> "boats".
        project_name = "bob"
        package_name = "bob"
        project_version = kedro_version

        def _get_pipelines(self) -> Dict[str, Pipeline]:
            return {"__default__": Pipeline([node(identity, "cars", "boats")])}

    # Silence logging configuration during context creation.
    mocker.patch("logging.config.dictConfig")
    dummy_context = DummyContext(str(tmp_path))
    csv_path = str(dummy_context.project_path / "cars.csv")

    # First save: the older snapshot we will pin the run to.
    earlier_version = generate_timestamp()
    earlier_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
    earlier_data_set = CSVDataSet(
        filepath=csv_path,
        save_args={"sep": ","},
        version=Version(None, earlier_version),
    )
    earlier_data_set.save(earlier_df)

    # Second save: newer content under a fresh timestamp.
    later_version = generate_timestamp()
    later_data_set = CSVDataSet(
        filepath=csv_path,
        save_args={"sep": ","},
        version=Version(None, later_version),
    )
    later_data_set.save(dummy_dataframe)

    # Run with "cars" pinned to the earlier version.
    dummy_context.run(load_versions={"cars": earlier_version})

    # The pipeline output must match the pinned input, not the latest save.
    assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
    assert dummy_context.catalog.load("boats").equals(earlier_df)
def test_load_options_csv(self, tmp_path, sample_pandas_df):
    """A CSV written via pandas must be readable by Spark when the
    ``header`` load option is enabled."""
    target = str(tmp_path / "data")

    # Write the fixture data frame to disk as CSV via the pandas data set.
    pandas_data_set = CSVDataSet(filepath=target)
    pandas_data_set.save(sample_pandas_df)

    # Read it back with Spark, treating the first row as a header.
    spark_data_set = SparkDataSet(
        filepath=target, file_format="csv", load_args={"header": True}
    )
    loaded = spark_data_set.load()

    # Exactly one row should match the known fixture value.
    assert loaded.filter(col("Name") == "Alex").count() == 1
def test_release_instance_cache(self, dummy_dataframe, filepath_csv):
    """Test that cache invalidation does not affect other instances"""
    first = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
    assert first._version_cache.currsize == 0

    # Saving resolves both a save and a load version -> two cache entries.
    first.save(dummy_dataframe)  # create a version
    assert first._version_cache.currsize == 2

    # A second instance starts with its own empty cache.
    second = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
    assert second._version_cache.currsize == 0

    # Each resolution populates exactly one entry in the second cache.
    second.resolve_save_version()
    assert second._version_cache.currsize == 1
    second.resolve_load_version()
    assert second._version_cache.currsize == 2

    # Releasing the first instance clears only its own cache.
    first.release()
    assert first._version_cache.currsize == 0
    # The second instance's cache must be untouched.
    assert second._version_cache.currsize == 2
def test_multiple_saves(self, dummy_dataframe, filepath_csv):
    """Test multiple cycles of save followed by load for the same dataset"""
    data_set = CSVDataSet(filepath=filepath_csv, version=Version(None, None))

    # First cycle: save then check save/load versions agree.
    data_set.save(dummy_dataframe)
    save_v1 = data_set.resolve_save_version()
    load_v1 = data_set.resolve_load_version()
    assert load_v1 == save_v1

    # Second cycle: a new save must produce a strictly newer version.
    data_set.save(dummy_dataframe)
    save_v2 = data_set.resolve_save_version()
    load_v2 = data_set.resolve_load_version()
    assert load_v2 == save_v2
    assert load_v2 > load_v1

    # A fresh instance must resolve to the latest saved version.
    fresh = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
    assert fresh.resolve_load_version() == load_v2
# Copyright 2018-2019 QuantumBlack Visual Analytics Limited