def test_optimize_multiple_combine_globally_with_annotation(self):
    """Packs sibling global combines nested under a PTransform that opts in
    to combiner packing via the APPLY_COMBINER_PACKING annotation.

    NOTE(review): this method was previously also named
    test_optimize_multiple_combine_globally, identical to a later method in
    this class; the later definition shadowed this one, so this test never
    ran. Renamed so both tests are discovered and executed.
    """
    class MultipleCombines(beam.PTransform):
        def annotations(self):
            # Empty value opts this subtree in to combiner packing.
            return {python_urns.APPLY_COMBINER_PACKING: b''}

        def expand(self, pcoll):
            _ = pcoll | 'mean-globally' >> combiners.Mean.Globally()
            _ = pcoll | 'count-globally' >> combiners.Count.Globally()
            _ = pcoll | 'largest-globally' >> core.CombineGlobally(
                combiners.Largest(1))

    pipeline = beam.Pipeline()
    vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    _ = pipeline | Create(vals) | MultipleCombines()
    pipeline_proto = pipeline.to_runner_api()
    optimized_pipeline_proto = translations.optimize_pipeline(
        pipeline_proto, [
            translations.pack_combiners,
        ],
        known_runner_urns=frozenset(),
        partial=True)
    # Tests that Pipeline.from_runner_api() does not throw an exception.
    runner = runners.DirectRunner()
    beam.Pipeline.from_runner_api(
        optimized_pipeline_proto, runner, pipeline_options.PipelineOptions())
def test_pipeline_from_sorted_stages_is_toplogically_ordered(self):
    """After sort_stages, each transform's inputs precede it in the proto."""
    # NOTE(review): "toplogically" in the method name is a typo for
    # "topologically"; kept as-is to avoid renaming the test method.
    pipeline = beam.Pipeline()
    side = pipeline | 'side' >> Create([3, 4])

    class CreateAndMultiplyBySide(beam.PTransform):
        def expand(self, pcoll):
            return (
                pcoll | 'main' >> Create([1, 2])
                | 'compute' >> beam.FlatMap(
                    lambda x, s: [x * y for y in s], beam.pvalue.AsIter(side)))

    _ = pipeline | 'create-and-multiply-by-side' >> CreateAndMultiplyBySide()
    pipeline_proto = pipeline.to_runner_api()
    # Deliberately reverse the stages first so sort_stages has real work to
    # do before we verify the ordering.
    optimized_pipeline_proto = translations.optimize_pipeline(
        pipeline_proto, [
            (lambda stages, _: reversed(list(stages))),
            translations.sort_stages,
        ],
        known_runner_urns=frozenset(),
        partial=True)

    components = optimized_pipeline_proto.components

    def check_topologically_sorted(transform_id, produced_pcolls):
        # A transform may only consume PCollections already produced by
        # transforms visited before it.
        transform = components.transforms[transform_id]
        self.assertTrue(
            set(transform.inputs.values()).issubset(produced_pcolls))
        produced_pcolls.update(transform.outputs.values())
        for child_id in transform.subtransforms:
            check_topologically_sorted(child_id, produced_pcolls)

    self.assertEqual(len(optimized_pipeline_proto.root_transform_ids), 1)
    check_topologically_sorted(
        optimized_pipeline_proto.root_transform_ids[0], set())
def test_optimize_empty_pipeline(self):
    """optimize_pipeline on a transform-free pipeline still round-trips."""
    empty_pipeline = beam.Pipeline()
    proto = empty_pipeline.to_runner_api()
    optimized = translations.optimize_pipeline(
        proto, [], known_runner_urns=frozenset(), partial=True)
    # Tests that Pipeline.from_runner_api() does not throw an exception.
    beam.Pipeline.from_runner_api(
        optimized, runners.DirectRunner(), pipeline_options.PipelineOptions())
def test_optimize_single_combine_globally(self):
    """pack_combiners on a single global combine leaves a valid pipeline."""
    pipeline = beam.Pipeline()
    source = pipeline | Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
    _ = source | combiners.Count.Globally()
    proto = pipeline.to_runner_api()
    optimized = translations.optimize_pipeline(
        proto, [
            translations.pack_combiners,
        ],
        known_runner_urns=frozenset(),
        partial=True)
    # Tests that Pipeline.from_runner_api() does not throw an exception.
    beam.Pipeline.from_runner_api(
        optimized, runners.DirectRunner(), pipeline_options.PipelineOptions())
def test_conditionally_packed_combiners(self):
    """Combiner packing applies only below the subtree whose annotation
    opts in; ancestors and deeper descendants outside it are untouched."""
    class RecursiveCombine(beam.PTransform):
        def __init__(self, labels):
            self._labels = labels

        def expand(self, pcoll):
            base = pcoll | 'Sum' >> beam.CombineGlobally(sum)
            if not self._labels:
                return base
            rest = pcoll | self._labels[0] >> RecursiveCombine(
                self._labels[1:])
            return (base, rest) | beam.Flatten()

        def annotations(self):
            # Only the node with exactly two remaining labels opts in to
            # combiner packing; every other level is left alone.
            if len(self._labels) == 2:
                return {python_urns.APPLY_COMBINER_PACKING: b''}
            return {}

    # Verify the results are as expected.
    with TestPipeline() as pipeline:
        result = (
            pipeline | beam.Create([1, 2, 3]) | RecursiveCombine('ABCD'))
        assert_that(result, equal_to([6, 6, 6, 6, 6]))

    # Verify the optimization is as expected.
    proto = pipeline.to_runner_api(
        default_environment=environments.EmbeddedPythonEnvironment(
            capabilities=environments.python_sdk_capabilities()))
    optimized = translations.optimize_pipeline(
        proto,
        phases=[translations.pack_combiners],
        known_runner_urns=frozenset(),
        partial=True)
    optimized_stage_names = sorted(
        t.unique_name for t in optimized.components.transforms.values())
    # Levels above the annotated node keep their individual combines...
    self.assertIn('RecursiveCombine/Sum/CombinePerKey', optimized_stage_names)
    self.assertIn(
        'RecursiveCombine/A/Sum/CombinePerKey', optimized_stage_names)
    # ...while the annotated subtree has its combines packed together.
    self.assertNotIn(
        'RecursiveCombine/A/B/Sum/CombinePerKey', optimized_stage_names)
    self.assertIn(
        'RecursiveCombine/A/B/Packed[Sum_CombinePerKey, '
        'C_Sum_CombinePerKey, C_D_Sum_CombinePerKey]/Pack',
        optimized_stage_names)
def test_optimize_multiple_combine_globally(self):
    """pack_combiners merges sibling global combines off one PCollection."""
    pipeline = beam.Pipeline()
    source = pipeline | Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
    _ = source | 'mean-globally' >> combiners.Mean.Globally()
    _ = source | 'count-globally' >> combiners.Count.Globally()
    _ = source | 'largest-globally' >> core.CombineGlobally(
        combiners.Largest(1))
    proto = pipeline.to_runner_api()
    optimized = translations.optimize_pipeline(
        proto, [
            translations.pack_combiners,
        ],
        known_runner_urns=frozenset(),
        partial=True)
    # Tests that Pipeline.from_runner_api() does not throw an exception.
    beam.Pipeline.from_runner_api(
        optimized, runners.DirectRunner(), pipeline_options.PipelineOptions())
def test_run_packable_combine_limit(self):
    """The APPLY_COMBINER_PACKING annotation value caps how many combiners
    may be fused into a single packed combiner."""
    class MultipleLargeCombines(beam.PTransform):
        def annotations(self):
            # Limit to at most 2 combiners per packed combiner.
            return {python_urns.APPLY_COMBINER_PACKING: b'2'}

        def expand(self, pcoll):
            for tag in ('min-1-globally', 'min-2-globally', 'min-3-globally'):
                assert_that(
                    pcoll | tag >> core.CombineGlobally(min),
                    equal_to([-1]),
                    label='assert-' + tag)

    class MultipleSmallCombines(beam.PTransform):
        def annotations(self):
            # Limit to at most 4 combiners per packed combiner.
            return {python_urns.APPLY_COMBINER_PACKING: b'4'}

        def expand(self, pcoll):
            for tag in ('min-4-globally', 'min-5-globally'):
                assert_that(
                    pcoll | tag >> core.CombineGlobally(min),
                    equal_to([-1]),
                    label='assert-' + tag)

    with TestPipeline() as pipeline:
        pcoll = pipeline | Create([6, 3, 1, -1, 9, 1, 5, 2, 0, 6])
        _ = pcoll | 'multiple-large-combines' >> MultipleLargeCombines()
        _ = pcoll | 'multiple-small-combines' >> MultipleSmallCombines()

    proto = pipeline.to_runner_api(
        default_environment=environments.EmbeddedPythonEnvironment(
            capabilities=environments.python_sdk_capabilities()))
    optimized = translations.optimize_pipeline(
        proto,
        phases=[translations.pack_combiners],
        known_runner_urns=frozenset(),
        partial=True)
    stage_names = [
        t.unique_name for t in optimized.components.transforms.values()
    ]
    # The first two large combines pack together (limit 2); the overflow
    # third one ends up packed alongside the fourth, per the assertion below.
    self.assertIn(
        'multiple-large-combines/Packed[min-1-globally_CombinePerKey, '
        'min-2-globally_CombinePerKey]/Pack',
        stage_names)
    self.assertIn(
        'Packed[multiple-large-combines_min-3-globally_CombinePerKey, '
        'multiple-small-combines_min-4-globally_CombinePerKey]/Pack',
        stage_names)
    # The leftover fifth combine remains unpacked.
    self.assertIn(
        'multiple-small-combines/min-5-globally/CombinePerKey', stage_names)
    # None of the packed combines survive as standalone stages.
    for packed_name in (
        'multiple-large-combines/min-1-globally/CombinePerKey',
        'multiple-large-combines/min-2-globally/CombinePerKey',
        'multiple-large-combines/min-3-globally/CombinePerKey',
        'multiple-small-combines/min-4-globally/CombinePerKey'):
        self.assertNotIn(packed_name, stage_names)