def test_save_invalidates_cache(self, dataset, local_csvs):
        """Check that a partition saved after a load appears on the next load.

        The mapping returned by the first ``load`` is kept so the test can
        confirm it does not contain the new partition id, while a fresh
        ``load`` made after ``save`` does.
        """
        partitioned = PartitionedDataSet(str(local_csvs), dataset)
        partitions_before_save = partitioned.load()

        new_part = "new/data.csv"
        frame = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]})
        partitioned.save({new_part: frame})

        # The earlier result must not have picked up the new partition.
        assert new_part not in partitions_before_save
        partitions_after_save = partitioned.load()
        assert new_part in partitions_after_save
    def test_overwrite(self, local_csvs, overwrite, expected_num_parts):
        """Save one partition with the given ``overwrite`` flag and count parts.

        After the save, the new partition id must be present in the loaded
        partitions and the number of loaded partition ids must equal
        ``expected_num_parts``.
        """
        partitioned = PartitionedDataSet(
            str(local_csvs), "pandas.CSVDataSet", overwrite=overwrite
        )

        new_part = "new/data"
        frame = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]})
        partitioned.save({new_part: frame})

        partitions = partitioned.load()
        assert new_part in partitions

        partition_ids = partitions.keys()
        assert len(partition_ids) == expected_num_parts
示例#3
0
    def test_save(self, dataset, mocked_csvs_in_s3):
        """Save a new partition under ``mocked_csvs_in_s3`` and read it back.

        Verifies that the object exists at the dataset path joined with the
        partition id, and that the reloaded frame equals the saved one.
        """
        partitioned = PartitionedDataSet(mocked_csvs_in_s3, dataset)

        new_part = "new/data.csv"
        expected = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]})
        partitioned.save({new_part: expected})

        # The partition id is appended to the dataset path to form the key.
        saved_key = "/".join((mocked_csvs_in_s3, new_part))
        filesystem = s3fs.S3FileSystem()
        assert filesystem.exists(saved_key)

        partitions = partitioned.load()
        assert new_part in partitions
        # Each loaded value is a callable that returns the partition's data.
        load_partition = partitions[new_part]
        assert_frame_equal(load_partition(), expected)
示例#4
0
    def test_save(self, dataset, local_csvs, suffix):
        """Save a new partition locally with ``filename_suffix`` and reload it.

        Verifies that the file on disk carries the suffix, that the partition
        is listed under its suffix-free id, and that the reloaded frame equals
        the saved one.
        """
        partitioned = PartitionedDataSet(
            str(local_csvs), dataset, filename_suffix=suffix
        )

        new_part = "new/data"
        expected = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]})
        partitioned.save({new_part: expected})

        # The suffix is part of the file name but not of the partition id.
        saved_file = local_csvs / "new" / ("data" + suffix)
        assert saved_file.is_file()

        partitions = partitioned.load()
        assert new_part in partitions
        # Each loaded value is a callable that returns the partition's data.
        load_partition = partitions[new_part]
        assert_frame_equal(load_partition(), expected)
    def test_save_s3a(self, mocked_csvs_in_s3, mocker):
        """Check that ``save`` builds the right filepath for an ``s3a://`` path.

        The underlying dataset type is mocked, so the test only inspects the
        ``filepath`` it is constructed with and the data passed to its ``save``.
        """
        # Swap whatever protocol the fixture path has for ``s3a``.
        path_without_protocol = mocked_csvs_in_s3.split("://", 1)[1]
        s3a_path = "s3a://{}".format(path_without_protocol)

        # The dataset type given here only needs to be accepted by the
        # constructor; `_dataset_type` is replaced with a mock further down.
        partitioned = PartitionedDataSet(
            s3a_path, "pandas.CSVDataSet", filename_suffix=".csv"
        )
        assert partitioned._protocol == "s3a"

        dataset_type_mock = mocker.patch.object(partitioned, "_dataset_type")
        dataset_type_mock.__name__ = "mocked"

        partition_id = "new/data"
        payload = "data"
        expected_filepath = "{}/{}.csv".format(s3a_path, partition_id)

        partitioned.save({partition_id: payload})
        dataset_type_mock.assert_called_once_with(filepath=expected_filepath)
        dataset_type_mock.return_value.save.assert_called_once_with(payload)
示例#6
0
    def test_save_invalidates_cache(self, local_csvs, mocker):
        """Check that ``save`` clears the partition cache and the fs cache.

        A first ``load`` populates the partition cache without touching the
        filesystem's ``invalidate_cache``; a ``save`` must then empty the
        partition cache and call ``invalidate_cache`` with the normalized
        path, so that a second ``load`` sees the new partition.
        """
        partitioned = PartitionedDataSet(str(local_csvs), "pandas.CSVDataSet")
        invalidate_mock = mocker.patch.object(
            partitioned._filesystem, "invalidate_cache"
        )

        partitions_before_save = partitioned.load()
        assert partitioned._partition_cache.currsize == 1
        invalidate_mock.assert_not_called()

        # Saving a partition must empty the partition cache.
        added_partition = "new/data.csv"
        frame = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]})
        partitioned.save({added_partition: frame})
        assert partitioned._partition_cache.currsize == 0
        # `_filesystem.invalidate_cache` seems to call itself internally, so
        # the mock records two calls instead of one; `assert_any_call` is used
        # in place of `assert_called_once_with` for that reason.
        invalidate_mock.assert_any_call(partitioned._normalized_path)

        # A fresh load includes the new partition; the earlier result does not.
        partitions_after_save = partitioned.load()
        assert added_partition not in partitions_before_save
        assert added_partition in partitions_after_save