Example #1
    def test_filename_suffix(self, filename_suffix, expected_partitions, local_csvs):
        """Test how specifying filename_suffix affects the available
        partitions and their names"""
        pds = IncrementalDataSet(
            str(local_csvs), DATASET, filename_suffix=filename_suffix
        )
        loaded = pds.load()
        assert loaded.keys() == expected_partitions
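
The filename_suffix and expected_partitions arguments come from a pytest.mark.parametrize decorator that sits directly above this method inside the test class and is not part of the excerpt. A plausible sketch of that parametrization, assuming the fixtures create the five partitions p00/data.csv through p04/data.csv (the exact cases in the original suite may differ):

    @pytest.mark.parametrize(
        "filename_suffix,expected_partitions",
        [
            # no suffix: partition ids keep the relative file paths as-is
            ("", {f"p{i:02d}/data.csv" for i in range(5)}),
            # a matching suffix is stripped from the partition ids
            (".csv", {f"p{i:02d}/data" for i in range(5)}),
            # a suffix that matches no file yields no partitions
            (".fake", set()),
        ],
    )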
Example #2
    def test_comparison_func(self, comparison_func, expected_partitions, local_csvs):
        """Test that specifying a custom function for comparing the checkpoint value
        to a partition id results in expected partitions being returned on load"""
        checkpoint_config = {
            "force_checkpoint": "p02/data.csv",
            "comparison_func": comparison_func,
        }
        pds = IncrementalDataSet(str(local_csvs), DATASET, checkpoint=checkpoint_config)
        assert pds.load().keys() == expected_partitions
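
Here comparison_func decides which partition ids count as newer than the forced checkpoint p02/data.csv. A sketch of a possible parametrization, assuming the function can be given as a dotted import path (operator.gt being the default comparison) and assuming the usual p00..p04 partitions; the concrete cases in the original suite may differ:

    @pytest.mark.parametrize(
        "comparison_func,expected_partitions",
        [
            # partition ids strictly greater than the checkpoint (default behaviour)
            ("operator.gt", {"p03/data.csv", "p04/data.csv"}),
            # greater or equal: the checkpointed partition itself is included
            ("operator.ge", {"p02/data.csv", "p03/data.csv", "p04/data.csv"}),
            # less than: only partitions preceding the checkpoint are returned
            ("operator.lt", {"p00/data.csv", "p01/data.csv"}),
        ],
    )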
Example #3
    def test_force_checkpoint_no_partitions(self, forced_checkpoint, local_csvs):
        """Test that forcing the checkpoint to certain values results in no
        partitions being returned"""
        pds = IncrementalDataSet(str(local_csvs), DATASET, checkpoint=forced_checkpoint)
        loaded = pds.load()
        assert loaded == {}

        confirm_path = local_csvs / pds.DEFAULT_CHECKPOINT_FILENAME
        assert not confirm_path.exists()
        pds.confirm()
        # confirming with no partitions available must have no effect
        assert not confirm_path.exists()
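
forced_checkpoint is parametrized with checkpoint values at or beyond the newest partition the fixtures create, so no partition id compares greater and load() returns nothing. An illustrative parametrization (the actual values may differ); a bare string passed as the checkpoint argument is treated as the force_checkpoint value, so both forms below are equivalent:

    @pytest.mark.parametrize(
        "forced_checkpoint",
        [
            # equal to the newest existing partition: nothing is newer
            "p04/data.csv",
            # beyond every partition the fixtures create
            "p10/data.csv",
            # the dict form spells out the same configuration explicitly
            {"force_checkpoint": "p100/data.csv"},
        ],
    )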
Example #4
    def test_force_checkpoint_no_partitions(self, forced_checkpoint, mocked_csvs_in_s3):
        """Test that forcing the checkpoint to certain values results in no
        partitions returned from S3"""
        pds = IncrementalDataSet(
            mocked_csvs_in_s3, DATASET, checkpoint=forced_checkpoint
        )
        loaded = pds.load()
        assert loaded == {}

        assert not pds._checkpoint.exists()
        pds.confirm()
        # confirming with no partitions available must have no effect
        assert not pds._checkpoint.exists()
Example #5
    def test_force_checkpoint_checkpoint_file_exists(
        self, forced_checkpoint, expected_partitions, local_csvs
    ):
        """Test how forcing checkpoint value affects the available partitions
        if the checkpoint file exists"""
        IncrementalDataSet(str(local_csvs), DATASET).confirm()
        checkpoint = local_csvs / IncrementalDataSet.DEFAULT_CHECKPOINT_FILENAME
        assert checkpoint.read_text() == "p04/data.csv"

        pds = IncrementalDataSet(str(local_csvs), DATASET, checkpoint=forced_checkpoint)
        assert pds._checkpoint.exists()
        loaded = pds.load()
        assert loaded.keys() == expected_partitions
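
The forced_checkpoint/expected_partitions pairing used here (and in Examples #6, #7 and #11) maps a forced checkpoint value to the partitions that should still load, i.e. those whose ids compare greater than the checkpoint. A sketch of such a parametrization under the usual p00..p04 fixture layout (illustrative; the original cases may differ):

    @pytest.mark.parametrize(
        "forced_checkpoint,expected_partitions",
        [
            # only partitions strictly greater than the forced value load
            ("p00/data.csv", {f"p{i:02d}/data.csv" for i in range(1, 5)}),
            ("p02/data.csv", {"p03/data.csv", "p04/data.csv"}),
            ("p03/data.csv", {"p04/data.csv"}),
        ],
    )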
Example #6
    def test_force_checkpoint_no_checkpoint_file(
        self, forced_checkpoint, expected_partitions, local_csvs
    ):
        """Test how forcing checkpoint value affects the available partitions
        if the checkpoint file does not exist"""
        pds = IncrementalDataSet(str(local_csvs), DATASET, checkpoint=forced_checkpoint)
        loaded = pds.load()
        assert loaded.keys() == expected_partitions

        confirm_path = local_csvs / pds.DEFAULT_CHECKPOINT_FILENAME
        assert not confirm_path.exists()
        pds.confirm()
        assert confirm_path.is_file()
        assert confirm_path.read_text() == max(expected_partitions)
Example #7
    def test_force_checkpoint_no_checkpoint_file(
        self, forced_checkpoint, expected_partitions, mocked_csvs_in_s3
    ):
        """Test how forcing checkpoint value affects the available partitions
        in S3 if the checkpoint file does not exist"""
        pds = IncrementalDataSet(
            mocked_csvs_in_s3, DATASET, checkpoint=forced_checkpoint
        )
        loaded = pds.load()
        assert loaded.keys() == expected_partitions

        assert not pds._checkpoint.exists()
        pds.confirm()
        assert pds._checkpoint.exists()
        assert pds._checkpoint.load() == max(expected_partitions)
Example #8
    def test_credentials(self, pds_config, fs_creds, dataset_creds, checkpoint_creds):
        """Test correctness of credentials propagation into the dataset and
        checkpoint constructors"""
        pds = IncrementalDataSet(str(Path.cwd()), **pds_config)
        assert pds._credentials == fs_creds
        assert pds._dataset_config[CREDENTIALS_KEY] == dataset_creds
        assert pds._checkpoint_config[CREDENTIALS_KEY] == checkpoint_creds
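
pds_config is the keyword-argument dict for the IncrementalDataSet constructor, and the other three fixtures are the credential dicts expected on the filesystem, the underlying dataset config and the checkpoint config. The precise propagation rules depend on the Kedro version; as a minimal, hedged sketch of the simplest case, top-level credentials are assumed to be copied into both the dataset and the checkpoint configuration:

    @pytest.mark.parametrize(
        "pds_config,fs_creds,dataset_creds,checkpoint_creds",
        [
            (
                # a single top-level credentials dict ...
                {"dataset": DATASET, "credentials": {"cred": "common"}},
                # ... is used for the filesystem ...
                {"cred": "common"},
                # ... propagated into the dataset config ...
                {"cred": "common"},
                # ... and into the checkpoint config
                {"cred": "common"},
            ),
        ],
    )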
Example #9
    def test_version_not_allowed(self, tmp_path, checkpoint_config, error_pattern):
        """Test that invalid checkpoint configurations raise expected errors"""
        with pytest.raises(DataSetError, match=re.escape(error_pattern)):
            IncrementalDataSet(
                str(tmp_path), DATASET, checkpoint=checkpoint_config
            )
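
checkpoint_config and error_pattern would also be parametrized; the cases are configurations IncrementalDataSet rejects, such as attempts to version the checkpoint dataset. Both the config keys and the matched message fragments below are placeholders for whatever the installed Kedro version actually raises:

    @pytest.mark.parametrize(
        "checkpoint_config,error_pattern",
        [
            # trying to version the checkpoint dataset is rejected on construction
            ({"versioned": True}, "versioning"),
            ({"version": "2021-01-01T00.00.00.000Z"}, "versioning"),
        ],
    )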
Example #10
    def test_checkpoint_type(
        self, tmp_path, checkpoint_config, expected_checkpoint_class
    ):
        """Test configuring a different checkpoint dataset type"""
        pds = IncrementalDataSet(str(tmp_path), DATASET, checkpoint=checkpoint_config)
        assert isinstance(pds._checkpoint, expected_checkpoint_class)
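
The checkpoint config may carry a type key that overrides the default checkpoint dataset class (a plain-text dataset, as Example #11 below also suggests by reading the checkpoint back with TextDataSet). A sketch, assuming TextDataSet and PickleDataSet are importable from kedro.extras.datasets; the classes used in the original suite may differ:

    @pytest.mark.parametrize(
        "checkpoint_config,expected_checkpoint_class",
        [
            # no checkpoint config at all: the default text-based checkpoint is used
            (None, TextDataSet),
            # an explicit type in the checkpoint config overrides the default
            ({"type": "pickle.PickleDataSet"}, PickleDataSet),
        ],
    )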
Example #11
    def test_force_checkpoint_checkpoint_file_exists(self, forced_checkpoint,
                                                     expected_partitions,
                                                     mocked_csvs_in_s3):
        """Test how forcing checkpoint value affects the available partitions
        in S3 if the checkpoint file exists"""
        # create checkpoint and assert that it exists
        IncrementalDataSet(mocked_csvs_in_s3, DATASET).confirm()
        checkpoint_path = "{}/{}".format(
            mocked_csvs_in_s3, IncrementalDataSet.DEFAULT_CHECKPOINT_FILENAME)
        checkpoint_value = TextDataSet(checkpoint_path).load()
        assert checkpoint_value == "p04/data.csv"

        pds = IncrementalDataSet(mocked_csvs_in_s3,
                                 DATASET,
                                 checkpoint=forced_checkpoint)
        assert pds._checkpoint.exists()
        loaded = pds.load()
        assert loaded.keys() == expected_partitions
Example #12
    def test_checkpoint_path(self, local_csvs, partitioned_data_pandas):
        """Test configuring a different checkpoint path"""
        checkpoint_path = local_csvs / "checkpoint_folder" / "checkpoint_file"
        assert not checkpoint_path.exists()

        IncrementalDataSet(
            str(local_csvs), DATASET, checkpoint={"filepath": str(checkpoint_path)}
        ).confirm()
        assert checkpoint_path.is_file()
        assert checkpoint_path.read_text() == max(partitioned_data_pandas)
Example #13
    def test_load_and_confirm(self, local_csvs, partitioned_data_pandas):
        """Test the standard flow for loading, confirming and reloading
        an IncrementalDataSet"""
        pds = IncrementalDataSet(str(local_csvs), DATASET)
        loaded = pds.load()
        assert loaded.keys() == partitioned_data_pandas.keys()
        for partition_id, data in loaded.items():
            assert_frame_equal(data, partitioned_data_pandas[partition_id])

        checkpoint_path = local_csvs / pds.DEFAULT_CHECKPOINT_FILENAME
        assert not checkpoint_path.exists()
        pds.confirm()
        assert checkpoint_path.is_file()
        assert checkpoint_path.read_text() == pds._read_checkpoint() == "p04/data.csv"

        reloaded = pds.load()
        assert reloaded.keys() == loaded.keys()

        pds.release()
        reloaded_after_release = pds.load()
        assert reloaded_after_release == {}
Example #14
    def test_load_and_confirm(self, mocked_csvs_in_s3, partitioned_data_pandas):
        """Test the standard flow for loading, confirming and reloading
        an IncrementalDataSet in S3"""
        pds = IncrementalDataSet(mocked_csvs_in_s3, DATASET)
        assert pds._checkpoint._protocol == "s3"
        loaded = pds.load()
        assert loaded.keys() == partitioned_data_pandas.keys()
        for partition_id, data in loaded.items():
            assert_frame_equal(data, partitioned_data_pandas[partition_id])

        assert not pds._checkpoint.exists()
        assert pds._read_checkpoint() is None
        pds.confirm()
        assert pds._checkpoint.exists()
        assert pds._read_checkpoint() == max(partitioned_data_pandas)
Example #15
    def test_load_and_confirm_s3a(self, mocked_csvs_in_s3,
                                  partitioned_data_pandas, mocker):
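        """Test the standard load and confirm flow when the dataset path uses
        the s3a protocol while the checkpoint is handled via plain s3"""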
        s3a_path = f"s3a://{mocked_csvs_in_s3.split('://', 1)[1]}"
        pds = IncrementalDataSet(s3a_path, DATASET)
        assert pds._protocol == "s3a"
        assert pds._checkpoint._protocol == "s3"

        mocked_ds = mocker.patch.object(pds, "_dataset_type")
        mocked_ds.__name__ = "mocked"
        loaded = pds.load()

        assert loaded.keys() == partitioned_data_pandas.keys()
        assert not pds._checkpoint.exists()
        assert pds._read_checkpoint() is None
        pds.confirm()
        assert pds._checkpoint.exists()
        assert pds._read_checkpoint() == max(partitioned_data_pandas)
Example #16
    def test_save(self, local_csvs):
        """Test saving a new partition into an IncrementalDataSet"""
        df = pd.DataFrame({"dummy": [1, 2, 3]})
        new_partition_key = "p05/data.csv"
        new_partition_path = local_csvs / new_partition_key
        pds = IncrementalDataSet(str(local_csvs), DATASET)

        assert not new_partition_path.exists()
        assert new_partition_key not in pds.load()

        pds.save({new_partition_key: df})
        assert new_partition_path.exists()
        loaded = pds.load()
        assert_frame_equal(loaded[new_partition_key], df)
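
All of the examples above are methods excerpted from a larger pytest test class and rely on shared scaffolding that the excerpts do not show: module-level imports, the DATASET constant and fixtures such as local_csvs and partitioned_data_pandas. The sketch below reconstructs a minimal version of that scaffolding from how the tests use it; the names and values are assumptions rather than the original source, and the mocked_csvs_in_s3 fixture (which needs an S3 mock such as moto) is omitted:

import re
from pathlib import Path

import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

from kedro.extras.datasets.text import TextDataSet
from kedro.io import DataSetError, IncrementalDataSet

# mirrors the constant the credentials test uses to index the configs
CREDENTIALS_KEY = "credentials"

# shorthand dataset type Kedro resolves for the underlying partitions
DATASET = "pandas.CSVDataSet"


@pytest.fixture
def partitioned_data_pandas():
    # five partitions named p00/data.csv ... p04/data.csv, matching the
    # "p04/data.csv" checkpoint value asserted in the examples above
    return {
        f"p{i:02d}/data.csv": pd.DataFrame({"part": i, "col": list(range(i + 1))})
        for i in range(5)
    }


@pytest.fixture
def local_csvs(tmp_path, partitioned_data_pandas):
    # materialise the partitions as CSV files under a temporary directory
    local_dir = tmp_path / "csvs"
    for partition, df in partitioned_data_pandas.items():
        path = local_dir / partition
        path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(path, index=False)
    return local_dir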