def test_save_invalidates_cache(self, dataset, local_csvs):
    """Test that saving a new partition invalidates a previously loaded result."""
    pds = PartitionedDataSet(str(local_csvs), dataset)
    first_load = pds.load()

    data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]})
    part_id = "new/data.csv"
    pds.save({part_id: data})

    assert part_id not in first_load
    assert part_id in pds.load()
def test_overwrite(self, local_csvs, overwrite, expected_num_parts):
    """Test the effect of the `overwrite` flag on the number of partitions."""
    pds = PartitionedDataSet(
        str(local_csvs), "pandas.CSVDataSet", overwrite=overwrite
    )
    original_data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]})
    part_id = "new/data"
    pds.save({part_id: original_data})
    loaded_partitions = pds.load()

    assert part_id in loaded_partitions
    assert len(loaded_partitions.keys()) == expected_num_parts
def test_save(self, dataset, mocked_csvs_in_s3):
    """Test saving a new partition to an S3-backed dataset and reloading it."""
    pds = PartitionedDataSet(mocked_csvs_in_s3, dataset)
    original_data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]})
    part_id = "new/data.csv"
    pds.save({part_id: original_data})

    s3 = s3fs.S3FileSystem()
    assert s3.exists("/".join([mocked_csvs_in_s3, part_id]))

    loaded_partitions = pds.load()
    assert part_id in loaded_partitions
    reloaded_data = loaded_partitions[part_id]()
    assert_frame_equal(reloaded_data, original_data)
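# The S3 tests above depend on a `mocked_csvs_in_s3` fixture defined elsewhere
# in the suite. The sketch below is a minimal, hypothetical version of such a
# fixture built on moto's in-memory S3 (the bucket name, prefix, partition keys
# and data are assumptions, not the suite's actual definitions; on moto >= 5
# the context manager is `mock_aws` rather than `mock_s3`). It fakes a bucket,
# uploads a couple of CSV partitions, and yields the "s3://bucket/prefix" path
# that the PartitionedDataSet is pointed at.
import boto3
import pandas as pd
import pytest
from moto import mock_s3

BUCKET_NAME = "fake-bucket"


@pytest.fixture
def mocked_csvs_in_s3():
    with mock_s3():
        client = boto3.client(
            "s3",
            aws_access_key_id="testing",
            aws_secret_access_key="testing",
            region_name="us-east-1",
        )
        client.create_bucket(Bucket=BUCKET_NAME)
        prefix = "csvs"
        for key in ("p1/data1.csv", "p2.csv"):
            body = pd.DataFrame({"part": key, "counter": [1, 2]}).to_csv(index=False)
            client.put_object(
                Bucket=BUCKET_NAME, Key="{}/{}".format(prefix, key), Body=body
            )
        yield "s3://{}/{}".format(BUCKET_NAME, prefix)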
def test_save(self, dataset, local_csvs, suffix):
    """Test saving a new partition locally with an optional filename suffix."""
    pds = PartitionedDataSet(str(local_csvs), dataset, filename_suffix=suffix)
    original_data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]})
    part_id = "new/data"
    pds.save({part_id: original_data})

    assert (local_csvs / "new" / ("data" + suffix)).is_file()
    loaded_partitions = pds.load()
    assert part_id in loaded_partitions
    reloaded_data = loaded_partitions[part_id]()
    assert_frame_equal(reloaded_data, original_data)
def test_save_s3a(self, mocked_csvs_in_s3, mocker):
    """Test that save works with the s3a protocol."""
    s3a_path = "s3a://{}".format(mocked_csvs_in_s3.split("://", 1)[1])
    # any dataset type is fine as long as it passes the isinstance check,
    # since _dataset_type is mocked later anyway
    pds = PartitionedDataSet(s3a_path, "pandas.CSVDataSet", filename_suffix=".csv")
    assert pds._protocol == "s3a"

    mocked_ds = mocker.patch.object(pds, "_dataset_type")
    mocked_ds.__name__ = "mocked"
    new_partition = "new/data"
    data = "data"

    pds.save({new_partition: data})
    mocked_ds.assert_called_once_with(
        filepath="{}/{}.csv".format(s3a_path, new_partition)
    )
    mocked_ds.return_value.save.assert_called_once_with(data)
def test_save_invalidates_cache(self, local_csvs, mocker):
    """Test that save invalidates the partition cache."""
    pds = PartitionedDataSet(str(local_csvs), "pandas.CSVDataSet")
    mocked_fs_invalidate = mocker.patch.object(pds._filesystem, "invalidate_cache")
    first_load = pds.load()
    assert pds._partition_cache.currsize == 1
    mocked_fs_invalidate.assert_not_called()

    # save clears the cache
    data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]})
    new_partition = "new/data.csv"
    pds.save({new_partition: data})
    assert pds._partition_cache.currsize == 0
    # `_filesystem.invalidate_cache` appears to call itself internally,
    # resulting in two mock calls rather than one,
    # hence `assert_any_call` is used instead of `assert_called_once_with`
    mocked_fs_invalidate.assert_any_call(pds._normalized_path)

    # a new load returns the new partition too
    second_load = pds.load()
    assert new_partition not in first_load
    assert new_partition in second_load
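# The local-filesystem tests above rely on a `local_csvs` fixture defined
# elsewhere in the suite. The sketch below is a minimal, hypothetical version
# of it (the helper fixture name `partitioned_data_pandas`, the partition ids,
# and the data shapes are assumptions): it writes a handful of CSV partitions
# into a temporary directory so that
# `PartitionedDataSet(str(local_csvs), "pandas.CSVDataSet")` can discover them.
import pandas as pd
import pytest


@pytest.fixture
def partitioned_data_pandas():
    # One small DataFrame per partition id; ids double as relative file paths.
    keys = ("p1/data1.csv", "p2.csv", "p04/data.csv", "abc.csv")
    return {
        key: pd.DataFrame({"part": key, "counter": list(range(counter))})
        for counter, key in enumerate(keys, 1)
    }


@pytest.fixture
def local_csvs(tmp_path, partitioned_data_pandas):
    # Materialise each partition as a CSV file under a temporary directory.
    local_dir = tmp_path / "csvs"
    local_dir.mkdir()
    for key, data in partitioned_data_pandas.items():
        path = local_dir / key
        path.parent.mkdir(parents=True, exist_ok=True)
        data.to_csv(str(path), index=False)
    return local_dir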