def run_split_pipeline(self, split_manager, elements, element_counter=None):
  """Runs an identity pipeline under *split_manager* and verifies its output.

  Args:
    split_manager: installed via fn_api_runner.split_manager for the
      transform labeled 'Identity'; drives bundle splitting during the run.
    elements: the values fed through the pipeline and expected back out.
    element_counter: optional object with an increment() method, called once
      per element; when None, counting is skipped.
  """
  with fn_api_runner.split_manager('Identity', split_manager):
    with self.create_pipeline() as p:
      res = (
          p
          | beam.Create(elements)
          | beam.Reshuffle()
          | 'Identity' >> beam.Map(lambda x: x)
          # Fix: the original unconditionally called increment(), which
          # raised AttributeError when element_counter was left as the
          # default None. Guarding keeps the declared default usable.
          | beam.Map(
              lambda x: x if element_counter is None else
              (element_counter.increment() or x)))
      assert_that(res, equal_to(elements))
def run_split_pipeline(self, split_manager, elements, element_counter=None):
  """Runs an identity pipeline under *split_manager* and checks its output.

  NOTE(review): this re-defines run_split_pipeline and shadows an earlier,
  identical definition in this file — one of the two should be removed.
  """
  with fn_api_runner.split_manager('Identity', split_manager):
    with self.create_pipeline() as p:
      counted = (
          p
          | beam.Create(elements)
          | beam.Reshuffle()
          | 'Identity' >> beam.Map(lambda x: x)
          | beam.Map(lambda x: element_counter.increment() or x))
      assert_that(counted, equal_to(elements))
def run_sdf_split_pipeline(
    self, split_manager, elements, element_counter, expected_groups=None):
  """Runs a splittable-DoFn pipeline under the given split manager.

  Args:
    split_manager: installed via fn_api_runner.split_manager for the
      transform labeled 'SDF'; drives bundle splitting during execution.
    elements: input values; each x is expanded to (x, k) for k in range(x).
    element_counter: object with an increment() method, called once per
      claimed element.
    expected_groups: if given, the exact per-bundle batches expected from
      the SDF output (checked in addition to the flattened output).
  """
  # Define an SDF that for each input x produces [(x, k) for k in range(x)].
  class EnumerateProvider(beam.transforms.core.RestrictionProvider):
    def initial_restriction(self, element):
      # Restriction for x covers the offset range [0, x).
      return (0, element)

    def create_tracker(self, restriction):
      return restriction_trackers.OffsetRestrictionTracker(*restriction)

    def split(self, element, restriction):
      # Don't do any initial splitting to simplify test.
      return [restriction]

    def restriction_size(self, element, restriction):
      return restriction[1] - restriction[0]

  class EnumerateSdf(beam.DoFn):
    # NOTE(review): the RestrictionProvider instance used as the parameter
    # default appears to be how this codebase declares an SDF's provider —
    # confirm against the Beam version in use.
    def process(self, element, restriction_tracker=EnumerateProvider()):
      to_emit = []
      for k in range(*restriction_tracker.current_restriction()):
        if restriction_tracker.try_claim(k):
          to_emit.append((element, k))
          element_counter.increment()
        else:
          # Claim failed: the restriction was split away; stop here.
          break
      # Emitting in batches for tighter testing.
      yield to_emit

  expected = [(e, k) for e in elements for k in range(e)]
  with fn_api_runner.split_manager('SDF', split_manager):
    with self.create_pipeline() as p:
      grouped = (
          p
          | beam.Create(elements)
          | 'SDF' >> beam.ParDo(EnumerateSdf()))
      flat = grouped | beam.FlatMap(lambda x: x)
      assert_that(flat, equal_to(expected))
      if expected_groups:
        assert_that(grouped, equal_to(expected_groups), label='CheckGrouped')
def run_sdf_split_pipeline(
    self, split_manager, elements, element_counter, expected_groups=None):
  """Runs an SDF pipeline under *split_manager* and checks its output.

  The SDF expands every input x into the pairs (x, k) for k in range(x),
  emitting each bundle's claimed pairs as a single batch. The flattened
  output is always checked; the raw batches are checked only when
  expected_groups is supplied.
  """

  class CountingProvider(beam.transforms.core.RestrictionProvider):
    # The restriction over an element x is the offset range [0, x).
    def initial_restriction(self, element):
      return (0, element)

    def create_tracker(self, restriction):
      start, stop = restriction
      return restriction_trackers.OffsetRestrictionTracker(start, stop)

    def split(self, element, restriction):
      # No initial splitting, to keep the test simple.
      return [restriction]

    def restriction_size(self, element, restriction):
      start, stop = restriction
      return stop - start

  class CountingSdf(beam.DoFn):
    def process(self, element, restriction_tracker=CountingProvider()):
      batch = []
      start, stop = restriction_tracker.current_restriction()
      k = start
      while k < stop:
        if not restriction_tracker.try_claim(k):
          break
        batch.append((element, k))
        element_counter.increment()
        k += 1
      # Emit the bundle's pairs as one batch for tighter testing.
      yield batch

  expected = [(e, k) for e in elements for k in range(e)]
  with fn_api_runner.split_manager('SDF', split_manager):
    with self.create_pipeline() as p:
      batches = (
          p
          | beam.Create(elements)
          | 'SDF' >> beam.ParDo(CountingSdf()))
      flattened = batches | beam.FlatMap(lambda x: x)
      assert_that(flattened, equal_to(expected))
      if expected_groups:
        assert_that(batches, equal_to(expected_groups), label='CheckGrouped')