def test_read_all_from_parquet_file_pattern(self):
    """Read everything matched by a single file pattern.

    Feeds the pattern returned by ``self._write_pattern(5)`` through both
    ``ReadAllFromParquet`` and ``ReadAllFromParquetBatched`` and compares
    each output with five copies of the expected data.
    """
    # Presumably the argument is the number of files written under the
    # returned pattern -- confirm against ``_write_pattern``.
    num_files = 5
    pattern = self._write_pattern(num_files)

    # Non-batched transform: expect ``self.RECORDS`` repeated num_files times.
    with TestPipeline() as pipeline:
        rows = pipeline | Create([pattern]) | ReadAllFromParquet()
        assert_that(rows, equal_to(self.RECORDS * num_files))

    # Batched transform: expect num_files copies of the Arrow-form records.
    with TestPipeline() as pipeline:
        batches = pipeline | Create([pattern]) | ReadAllFromParquetBatched()
        expected_batches = [self._records_as_arrow()] * num_files
        assert_that(batches, equal_to(expected_batches))
def test_read_all_from_parquet_single_file(self):
    """Read a single path produced by ``self._write_data()``.

    Runs the path through both ``ReadAllFromParquet`` and
    ``ReadAllFromParquetBatched`` and compares each output with exactly one
    copy of the expected data.
    """
    file_path = self._write_data()

    # Non-batched transform: output must equal ``self.RECORDS``.
    with TestPipeline() as pipeline:
        rows = pipeline | Create([file_path]) | ReadAllFromParquet()
        assert_that(rows, equal_to(self.RECORDS))

    # Batched transform: output must equal one Arrow-form copy of the records.
    with TestPipeline() as pipeline:
        batches = pipeline | Create([file_path]) | ReadAllFromParquetBatched()
        expected_batches = [self._records_as_arrow()]
        assert_that(batches, equal_to(expected_batches))
def test_read_all_from_parquet_many_file_patterns(self):
    """Read everything matched by several file patterns at once.

    Three patterns are created via ``self._write_pattern`` with arguments
    5, 2 and 3. Both ``ReadAllFromParquet`` and ``ReadAllFromParquetBatched``
    are expected to produce ten (5 + 2 + 3) copies of the expected data.
    """
    # Presumably each value is the number of files written for that pattern
    # -- confirm against ``_write_pattern``.
    files_per_pattern = (5, 2, 3)
    patterns = [self._write_pattern(count) for count in files_per_pattern]
    total_files = sum(files_per_pattern)

    # Non-batched transform: expect ``self.RECORDS`` repeated total_files times.
    with TestPipeline() as pipeline:
        rows = pipeline | Create(patterns) | ReadAllFromParquet()
        assert_that(rows, equal_to(self.RECORDS * total_files))

    # Batched transform: expect total_files copies of the Arrow-form records.
    with TestPipeline() as pipeline:
        batches = pipeline | Create(patterns) | ReadAllFromParquetBatched()
        expected_batches = [self._records_as_arrow()] * total_files
        assert_that(batches, equal_to(expected_batches))
def test_read_all_from_parquet_many_single_files(self):
    """Read several individually written paths at once.

    Calls ``self._write_data()`` three times and feeds the resulting paths
    through both ``ReadAllFromParquet`` and ``ReadAllFromParquetBatched``,
    expecting three copies of the expected data from each.
    """
    num_paths = 3
    paths = [self._write_data() for _ in range(num_paths)]

    # Non-batched transform: expect ``self.RECORDS`` repeated num_paths times.
    with TestPipeline() as pipeline:
        rows = pipeline | Create(paths) | ReadAllFromParquet()
        assert_that(rows, equal_to(self.RECORDS * num_paths))

    # Batched transform: expect num_paths copies of the Arrow-form records.
    with TestPipeline() as pipeline:
        batches = pipeline | Create(paths) | ReadAllFromParquetBatched()
        expected_batches = [self._records_as_arrow()] * num_paths
        assert_that(batches, equal_to(expected_batches))