def test_load_parquet(self, tmp_path, sample_pandas_df):
    """Parquet written by ``ParquetDataSet`` should be readable via ``SparkDataSet``."""
    parquet_path = str(tmp_path / "data")
    # Persist the pandas frame as parquet first ...
    ParquetDataSet(filepath=parquet_path).save(sample_pandas_df)
    # ... then read the same location back through the Spark-backed data set.
    loaded_with_spark = SparkDataSet(filepath=parquet_path).load()
    assert loaded_with_spark.count() == 4
def test_save_and_load_non_existing_dir(self, tmp_path, dummy_dataframe):
    """Test saving and reloading the data set to non-existing directory."""
    target = tmp_path / "non-existing" / FILENAME
    data_set = ParquetDataSet(filepath=target.as_posix())
    # Saving must create the missing intermediate directory on the fly.
    data_set.save(dummy_dataframe)
    # Round-trip: what is read back must equal what was written.
    assert_frame_equal(dummy_dataframe, data_set.load())
def test_save_and_load(self, tmp_path, dummy_dataframe):
    """Test saving and reloading the data set."""
    data_set = ParquetDataSet(filepath=(tmp_path / FILENAME).as_posix())
    data_set.save(dummy_dataframe)
    # Round-trip: what is read back must equal what was written.
    assert_frame_equal(dummy_dataframe, data_set.load())
    # No load-time filesystem open arguments were configured by default.
    assert data_set._fs_open_args_load == {}
    # The save produced a single regular file (no directory layout).
    entries = list(tmp_path.iterdir())
    assert all(entry.is_file() for entry in entries)
    assert len(entries) == 1
def test_write_to_dir(self, dummy_dataframe, tmp_path):
    """Saving a ``ParquetDataSet`` pointed at a directory must be rejected."""
    pattern = "Saving ParquetDataSet to a directory is not supported"
    dir_data_set = ParquetDataSet(filepath=tmp_path.as_posix())
    with pytest.raises(DataSetError, match=pattern):
        dir_data_set.save(dummy_dataframe)