def _verify_data(self, pcol, init_size, data_size):
    read = pcol | 'read' >> ReadAllFromParquet()

    # Check that the 'number' fields across all records sum to the
    # expected total.
    v1 = (
        read
        | 'get_number' >> Map(lambda x: x['number'])
        | 'sum_globally' >> CombineGlobally(sum)
        | 'validate_number' >> FlatMap(
            lambda x: TestParquetIT._sum_verifier(init_size, data_size, x)))

    # Check that each 'name' key appears the expected number of times.
    v2 = (
        read
        | 'make_pair' >> Map(lambda x: (x['name'], x['number']))
        | 'count_per_key' >> Count.PerKey()
        | 'validate_name' >> FlatMap(
            lambda x: TestParquetIT._count_verifier(init_size, data_size, x)))

    # Once both validations have run, reshuffle and delete the
    # temporary files produced by the test.
    _ = ((v1, v2, pcol)
         | 'flatten' >> Flatten()
         | 'reshuffle' >> Reshuffle()
         | 'cleanup' >> Map(lambda x: FileSystems.delete([x])))
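The same sum-globally and count-per-key validation pattern can be expressed with Beam's public testing utilities instead of the test class's private _sum_verifier/_count_verifier helpers. The sketch below is illustrative, not part of the test above: it reuses the {'name': ..., 'number': ...} record shape but substitutes assert_that/equal_to for the custom verifiers and an in-memory Create for the Parquet read.

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
    # Stand-in for the records read back from Parquet.
    records = p | 'create' >> beam.Create([
        {'name': 'a', 'number': 1},
        {'name': 'b', 'number': 2},
        {'name': 'a', 'number': 1},
    ])

    # Global sum over the 'number' field (cf. v1 above).
    total = (
        records
        | 'get_number' >> beam.Map(lambda x: x['number'])
        | 'sum_globally' >> beam.CombineGlobally(sum))
    assert_that(total, equal_to([4]), label='check_sum')

    # Occurrence count per 'name' key (cf. v2 above).
    counts = (
        records
        | 'make_pair' >> beam.Map(lambda x: (x['name'], x['number']))
        | 'count_per_key' >> beam.combiners.Count.PerKey())
    assert_that(counts, equal_to([('a', 2), ('b', 1)]), label='check_counts')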
def expand(self, pcoll):
    # Wrap incoming mutations into groups, then split them into a main
    # 'batchable' output and a tagged side output for groups that
    # already exceed the batch limits on their own.
    filter_batchable_mutations = (
        pcoll
        | 'Making mutation groups' >> ParDo(_MakeMutationGroupsFn())
        | 'Filtering Batchable Mutations' >> ParDo(
            _BatchableFilterFn(
                max_batch_size_bytes=self._max_batch_size_bytes,
                max_number_rows=self._max_number_rows,
                max_number_cells=self._max_number_cells)).with_outputs(
                    _BatchableFilterFn.OUTPUT_TAG_UNBATCHABLE,
                    main='batchable'))

    # Pack the batchable groups into batches bounded by byte size, row
    # count, and cell count.
    batching_batchables = (
        filter_batchable_mutations['batchable']
        | ParDo(
            _BatchFn(
                max_batch_size_bytes=self._max_batch_size_bytes,
                max_number_rows=self._max_number_rows,
                max_number_cells=self._max_number_cells)))

    # Merge the batched groups and the unbatchable ones back into a
    # single output PCollection.
    return ((
        batching_batchables,
        filter_batchable_mutations[
            _BatchableFilterFn.OUTPUT_TAG_UNBATCHABLE])
        | 'Merging batchable and unbatchable' >> Flatten())
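For context, this expand() matches the batching stage inside Beam's experimental Cloud Spanner connector (apache_beam.io.gcp.experimental.spannerio). A hedged usage sketch follows; the project/instance/database IDs are placeholders, the batch-limit values are assumptions shown only to tie them to the parameters above, and the pipeline needs a real Cloud Spanner database to actually run.

import apache_beam as beam
from apache_beam.io.gcp.experimental.spannerio import (
    WriteMutation, WriteToSpanner)

with beam.Pipeline() as p:
    _ = (
        p
        | 'create mutations' >> beam.Create([
            WriteMutation.insert(
                table='roles',
                columns=('key', 'rolename'),
                values=[('9001', 'scientist')]),
        ])
        # Internally, mutations are grouped, split into batchable and
        # unbatchable outputs, batched, and flattened back together,
        # as in the expand() above.
        | 'write to spanner' >> WriteToSpanner(
            project_id='my-project',      # placeholder
            instance_id='my-instance',    # placeholder
            database_id='my-database',    # placeholder
            max_batch_size_bytes=1048576,
            max_number_rows=50,
            max_number_cells=500))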