def test_default_capabilities(self):
  """The default Docker environment proto carries the full SDK capability set."""
  env = DockerEnvironment.from_options(
      PortableOptions(sdk_location='container'))
  ctx = pipeline_context.PipelineContext()
  env_proto = env.to_runner_api(ctx)
  # Compare as sets: capability ordering in the proto is not significant here.
  expected = set(environments.python_sdk_capabilities())
  self.assertEqual(set(env_proto.capabilities), expected)
def test_sdk_capabilities(self):
  """Spot-checks well-known URNs in the declared SDK capability list."""
  sdk_capabilities = environments.python_sdk_capabilities()
  expected_urns = (
      common_urns.coders.LENGTH_PREFIX.urn,
      common_urns.protocols.WORKER_STATUS.urn,
      common_urns.sdf_components.TRUNCATE_SIZED_RESTRICTION.urn,
  )
  for urn in expected_urns:
    self.assertIn(urn, sdk_capabilities)
def test_conditionally_packed_combiners(self):
  """Combiner packing applies only in subtrees that opt in via annotations.

  Builds a recursive composite transform in which only the node whose
  remaining label list has length 2 (i.e. the 'B' node of 'ABCD') carries the
  APPLY_COMBINER_PACKING annotation, then checks that the pack_combiners
  optimization phase packs combiners under that node and leaves the
  outer combiners untouched.
  """
  class RecursiveCombine(beam.PTransform):
    def __init__(self, labels):
      # Remaining single-character labels; each recursion consumes one.
      self._labels = labels

    def expand(self, pcoll):
      base = pcoll | 'Sum' >> beam.CombineGlobally(sum)
      if self._labels:
        # Recurse with the first label as the sub-transform's name.
        rest = pcoll | self._labels[0] >> RecursiveCombine(self._labels[1:])
        return (base, rest) | beam.Flatten()
      else:
        return base

    def annotations(self):
      # Opt in to combiner packing only at the node with exactly two labels
      # left ('CD', i.e. the 'B' node); an empty payload means no size limit.
      if len(self._labels) == 2:
        return {python_urns.APPLY_COMBINER_PACKING: b''}
      else:
        return {}

  # Verify the results are as expected.
  with TestPipeline() as pipeline:
    # Each of the 5 nesting levels contributes one global sum of [1, 2, 3].
    result = pipeline | beam.Create([1, 2, 3]) | RecursiveCombine('ABCD')
    assert_that(result, equal_to([6, 6, 6, 6, 6]))

  # Verify the optimization is as expected.
  proto = pipeline.to_runner_api(
      default_environment=environments.EmbeddedPythonEnvironment(
          capabilities=environments.python_sdk_capabilities()))
  optimized = translations.optimize_pipeline(
      proto,
      phases=[translations.pack_combiners],
      known_runner_urns=frozenset(),
      partial=True)
  optimized_stage_names = sorted(
      t.unique_name for t in optimized.components.transforms.values())
  # Combiners outside the annotated subtree survive unpacked.
  self.assertIn('RecursiveCombine/Sum/CombinePerKey', optimized_stage_names)
  self.assertIn(
      'RecursiveCombine/A/Sum/CombinePerKey', optimized_stage_names)
  # Combiners at and below the annotated 'B' node are replaced by one
  # packed combiner.
  self.assertNotIn(
      'RecursiveCombine/A/B/Sum/CombinePerKey', optimized_stage_names)
  self.assertIn(
      'RecursiveCombine/A/B/Packed[Sum_CombinePerKey, '
      'C_Sum_CombinePerKey, C_D_Sum_CombinePerKey]/Pack',
      optimized_stage_names)
def test_environment_override_translation(self):
  """A worker_harness_container_image override shows up in the Docker env proto."""
  self.default_properties.append('--experiments=beam_fn_api')
  self.default_properties.append('--worker_harness_container_image=FOO')
  remote_runner = DataflowRunner()
  options = PipelineOptions(self.default_properties)
  with Pipeline(remote_runner, options=options) as p:
    # pylint: disable=expression-not-assigned
    (p
     | ptransform.Create([1, 2, 3])
     | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
     | ptransform.GroupByKey())
  # The single environment must be a Docker environment using the
  # overridden image and advertising the SDK capabilities.
  expected_environment = beam_runner_api_pb2.Environment(
      urn=common_urns.environments.DOCKER.urn,
      payload=beam_runner_api_pb2.DockerPayload(
          container_image='FOO').SerializeToString(),
      capabilities=environments.python_sdk_capabilities())
  actual_environments = list(
      remote_runner.proto_pipeline.components.environments.values())
  self.assertEqual(actual_environments, [expected_environment])
def test_run_packable_combine_limit(self):
  """pack_combiners honors per-composite packing-size limit annotations.

  Two composites annotate APPLY_COMBINER_PACKING with different numeric
  payloads (max combiners per pack). The assertions below check how the
  optimizer groups the five CombineGlobally transforms under those limits.
  """
  class MultipleLargeCombines(beam.PTransform):
    def annotations(self):
      # Limit to at most 2 combiners per packed combiner.
      return {python_urns.APPLY_COMBINER_PACKING: b'2'}

    def expand(self, pcoll):
      assert_that(
          pcoll | 'min-1-globally' >> core.CombineGlobally(min),
          equal_to([-1]),
          label='assert-min-1-globally')
      assert_that(
          pcoll | 'min-2-globally' >> core.CombineGlobally(min),
          equal_to([-1]),
          label='assert-min-2-globally')
      assert_that(
          pcoll | 'min-3-globally' >> core.CombineGlobally(min),
          equal_to([-1]),
          label='assert-min-3-globally')

  class MultipleSmallCombines(beam.PTransform):
    def annotations(self):
      # Limit to at most 4 combiners per packed combiner.
      return {python_urns.APPLY_COMBINER_PACKING: b'4'}

    def expand(self, pcoll):
      assert_that(
          pcoll | 'min-4-globally' >> core.CombineGlobally(min),
          equal_to([-1]),
          label='assert-min-4-globally')
      assert_that(
          pcoll | 'min-5-globally' >> core.CombineGlobally(min),
          equal_to([-1]),
          label='assert-min-5-globally')

  with TestPipeline() as pipeline:
    vals = [6, 3, 1, -1, 9, 1, 5, 2, 0, 6]
    pcoll = pipeline | Create(vals)
    _ = pcoll | 'multiple-large-combines' >> MultipleLargeCombines()
    _ = pcoll | 'multiple-small-combines' >> MultipleSmallCombines()

  proto = pipeline.to_runner_api(
      default_environment=environments.EmbeddedPythonEnvironment(
          capabilities=environments.python_sdk_capabilities()))
  optimized = translations.optimize_pipeline(
      proto,
      phases=[translations.pack_combiners],
      known_runner_urns=frozenset(),
      partial=True)
  optimized_stage_names = [
      t.unique_name for t in optimized.components.transforms.values()
  ]
  # min-1 and min-2 fit within the limit of 2 and are packed together.
  self.assertIn(
      'multiple-large-combines/Packed[min-1-globally_CombinePerKey, '
      'min-2-globally_CombinePerKey]/Pack',
      optimized_stage_names)
  # The leftover min-3 ends up packed with min-4 from the other composite
  # (NOTE(review): presumably at the enclosing scope — the asserted name has
  # no composite prefix; verify against pack_combiners' grouping rules).
  self.assertIn(
      'Packed[multiple-large-combines_min-3-globally_CombinePerKey, '
      'multiple-small-combines_min-4-globally_CombinePerKey]/Pack',
      optimized_stage_names)
  # min-5 is left unpacked.
  self.assertIn(
      'multiple-small-combines/min-5-globally/CombinePerKey',
      optimized_stage_names)
  # None of the packed combiners survive under their original names.
  self.assertNotIn(
      'multiple-large-combines/min-1-globally/CombinePerKey',
      optimized_stage_names)
  self.assertNotIn(
      'multiple-large-combines/min-2-globally/CombinePerKey',
      optimized_stage_names)
  self.assertNotIn(
      'multiple-large-combines/min-3-globally/CombinePerKey',
      optimized_stage_names)
  self.assertNotIn(
      'multiple-small-combines/min-4-globally/CombinePerKey',
      optimized_stage_names)
def test_sdk_capabilities(self):
  """Checks that basic coder and protocol URNs are advertised by the SDK."""
  caps = environments.python_sdk_capabilities()
  for expected_urn in (common_urns.coders.LENGTH_PREFIX.urn,
                       common_urns.protocols.WORKER_STATUS.urn):
    self.assertIn(expected_urn, caps)