Example #1
 def test_read_all_from_parquet_many_file_patterns(self):
     # Write three glob patterns covering 5, 2 and 3 Parquet files.
     file_pattern1 = self._write_pattern(5)
     file_pattern2 = self._write_pattern(2)
     file_pattern3 = self._write_pattern(3)
     # ReadAllFromParquet expands each pattern, so all 10 files are read
     # and every record appears once.
     with TestPipeline() as p:
         assert_that(
             p
             | Create([file_pattern1, file_pattern2, file_pattern3])
             | ReadAllFromParquet(),
             equal_to(self.RECORDS * 10))
     # The batched variant emits one pyarrow.Table per row group
     # (here, one per file) instead of individual records.
     with TestPipeline() as p:
         assert_that(
             p
             | Create([file_pattern1, file_pattern2, file_pattern3])
             | ReadAllFromParquetBatched(),
             equal_to([self._records_as_arrow()] * 10))
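These assertions lean on helpers defined elsewhere in the test class. A minimal sketch of what RECORDS, _records_as_arrow, _write_data and _write_pattern could look like, assuming a two-column schema and pyarrow for the writes (the real Beam test's schema and file layout may differ):

    import os
    import tempfile

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Hypothetical record set; the actual test's columns may differ.
    RECORDS = [{'name': 'a', 'number': 1}, {'name': 'b', 'number': 2}]

    def _records_as_arrow(self):
        # One pyarrow.Table holding all of RECORDS, matching what
        # ReadAllFromParquetBatched emits for a single-row-group file.
        return pa.Table.from_pydict({
            'name': [r['name'] for r in self.RECORDS],
            'number': [r['number'] for r in self.RECORDS]})

    def _write_data(self):
        # Write RECORDS to a single Parquet file and return its path.
        fd, path = tempfile.mkstemp(suffix='.parquet')
        os.close(fd)
        pq.write_table(self._records_as_arrow(), path)
        return path

    def _write_pattern(self, num_files):
        # Write RECORDS into num_files Parquet files in a fresh temp
        # directory and return a glob pattern matching all of them.
        tmp_dir = tempfile.mkdtemp()
        for i in range(num_files):
            pq.write_table(
                self._records_as_arrow(),
                os.path.join(tmp_dir, 'part-%d.parquet' % i))
        return os.path.join(tmp_dir, 'part-*.parquet')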
Example #2
 def test_read_all_from_parquet_many_single_files(self):
     # Write the same records to three separate Parquet files.
     path1 = self._write_data()
     path2 = self._write_data()
     path3 = self._write_data()
     # Each path is read exactly once, so every record appears three times.
     with TestPipeline() as p:
         assert_that(
             p
             | Create([path1, path2, path3])
             | ReadAllFromParquet(),
             equal_to(self.RECORDS * 3))
     # The batched variant emits one pyarrow.Table per file.
     with TestPipeline() as p:
         assert_that(
             p
             | Create([path1, path2, path3])
             | ReadAllFromParquetBatched(),
             equal_to([self._records_as_arrow()] * 3))
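Outside the test harness, the same pattern applies to any pipeline: unlike ReadFromParquet, which takes its file pattern at construction time, ReadAllFromParquet consumes a PCollection of paths or patterns, so the file list can be computed at runtime. A minimal sketch with placeholder paths:

    import apache_beam as beam
    from apache_beam.io.parquetio import ReadAllFromParquet

    with beam.Pipeline() as p:
        _ = (
            p
            # Placeholder paths; any mix of file names and glob patterns works.
            | beam.Create(['/tmp/a.parquet', '/tmp/data/*.parquet'])
            | ReadAllFromParquet()  # emits one dict per Parquet record
            | beam.Map(print))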
Example #3
 def _verify_data(self, pcol, init_size, data_size):
   # pcol holds the Parquet file paths produced by the write stage.
   read = pcol | 'read' >> ReadAllFromParquet()
   # Validate the global sum of the 'number' column.
   v1 = (
       read
       | 'get_number' >> Map(lambda x: x['number'])
       | 'sum_globally' >> CombineGlobally(sum)
       | 'validate_number' >>
       FlatMap(lambda x: TestParquetIT._sum_verifier(init_size, data_size, x)))
   # Validate how often each 'name' key occurs.
   v2 = (
       read
       | 'make_pair' >> Map(lambda x: (x['name'], x['number']))
       | 'count_per_key' >> Count.PerKey()
       | 'validate_name' >> FlatMap(
           lambda x: TestParquetIT._count_verifier(init_size, data_size, x)))
   # Merge the verifier outputs (empty on success) with the file paths and
   # delete each file; Reshuffle prevents fusion, so the deletes run in a
   # separate stage after validation.
   _ = ((v1, v2, pcol)
        | 'flatten' >> Flatten()
        | 'reshuffle' >> Reshuffle()
        | 'cleanup' >> Map(lambda x: FileSystems.delete([x])))
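The _sum_verifier and _count_verifier static methods belong to the surrounding TestParquetIT class and are not shown. A plausible sketch, assuming each verifier yields nothing on success (so only file paths reach the cleanup step) and yields an error string on mismatch; the expected-value formulas below are placeholders, not the real test's logic:

    @staticmethod
    def _sum_verifier(init_size, data_size, x):
      # x is the global sum of the 'number' column.
      expected = sum(range(init_size)) * data_size  # placeholder formula
      if x != expected:
        yield 'ERROR: sum was %d, expected %d' % (x, expected)

    @staticmethod
    def _count_verifier(init_size, data_size, x):
      # x is a (name, count) pair from Count.PerKey().
      name, count = x
      expected = data_size // init_size  # placeholder formula
      if count != expected:
        yield 'ERROR: %r seen %d times, expected %d' % (name, count, expected)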