def __init__(
    self,
    default_environment=None,  # type: Optional[environments.Environment]
    bundle_repeat=0,
    use_state_iterables=False,
    provision_info=None,  # type: Optional[ExtendedProvisionInfo]
    progress_request_frequency=None):
  # type: (...) -> None

  """Creates a new Fn API Runner.

  Args:
    default_environment: the default environment to use for UserFns.
    bundle_repeat: replay every bundle this many extra times, for profiling
        and debugging.
    use_state_iterables: intentionally split GBK iterables over the state API
        (for testing).
    provision_info: provisioning info to make available to workers, or None.
    progress_request_frequency: the frequency (in seconds) that the runner
        waits before requesting progress from the SDK.
  """
  super(FnApiRunner, self).__init__()
  self._default_environment = (
      default_environment or environments.EmbeddedPythonEnvironment())
  self._bundle_repeat = bundle_repeat
  self._num_workers = 1
  self._progress_frequency = progress_request_frequency
  self._profiler_factory = None  # type: Optional[Callable[..., profiler.Profile]]
  self._use_state_iterables = use_state_iterables
  self._provision_info = provision_info or ExtendedProvisionInfo(
      beam_provision_api_pb2.ProvisionInfo(
          retrieval_token='unused-retrieval-token'))
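# A minimal usage sketch (not part of the original file; the Pipeline wiring
# below is an assumption for illustration). Per the docstring above,
# `bundle_repeat=1` replays each bundle one extra time for profiling:
#
#   runner = FnApiRunner(
#       default_environment=environments.EmbeddedPythonEnvironment(),
#       bundle_repeat=1)
#   with beam.Pipeline(runner=runner) as p:
#     _ = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * x)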
def test_conditionally_packed_combiners(self):
  class RecursiveCombine(beam.PTransform):
    def __init__(self, labels):
      self._labels = labels

    def expand(self, pcoll):
      base = pcoll | 'Sum' >> beam.CombineGlobally(sum)
      if self._labels:
        rest = pcoll | self._labels[0] >> RecursiveCombine(self._labels[1:])
        return (base, rest) | beam.Flatten()
      else:
        return base

    def annotations(self):
      if len(self._labels) == 2:
        return {python_urns.APPLY_COMBINER_PACKING: b''}
      else:
        return {}

  # Verify the results are as expected.
  with TestPipeline() as pipeline:
    result = pipeline | beam.Create([1, 2, 3]) | RecursiveCombine('ABCD')
    assert_that(result, equal_to([6, 6, 6, 6, 6]))

  # Verify the optimization is as expected.
  proto = pipeline.to_runner_api(
      default_environment=environments.EmbeddedPythonEnvironment(
          capabilities=environments.python_sdk_capabilities()))
  optimized = translations.optimize_pipeline(
      proto,
      phases=[translations.pack_combiners],
      known_runner_urns=frozenset(),
      partial=True)
  optimized_stage_names = sorted(
      t.unique_name for t in optimized.components.transforms.values())
  self.assertIn('RecursiveCombine/Sum/CombinePerKey', optimized_stage_names)
  self.assertIn('RecursiveCombine/A/Sum/CombinePerKey', optimized_stage_names)
  self.assertNotIn(
      'RecursiveCombine/A/B/Sum/CombinePerKey', optimized_stage_names)
  self.assertIn(
      'RecursiveCombine/A/B/Packed[Sum_CombinePerKey, '
      'C_Sum_CombinePerKey, C_D_Sum_CombinePerKey]/Pack',
      optimized_stage_names)
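# A smaller sketch (hypothetical `PackedCombines` transform; names are assumed,
# not from the original tests) of the opt-in mechanism the test above
# exercises: an empty annotation value requests combiner packing for the
# composite with no per-pack limit.
#
#   class PackedCombines(beam.PTransform):
#     def annotations(self):
#       return {python_urns.APPLY_COMBINER_PACKING: b''}
#
#     def expand(self, pcoll):
#       return (
#           pcoll | 'min' >> beam.CombineGlobally(min),
#           pcoll | 'max' >> beam.CombineGlobally(max)) | beam.Flatten()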
def test_run_packable_combine_limit(self):
  class MultipleLargeCombines(beam.PTransform):
    def annotations(self):
      # Limit to at most 2 combiners per packed combiner.
      return {python_urns.APPLY_COMBINER_PACKING: b'2'}

    def expand(self, pcoll):
      assert_that(
          pcoll | 'min-1-globally' >> core.CombineGlobally(min),
          equal_to([-1]),
          label='assert-min-1-globally')
      assert_that(
          pcoll | 'min-2-globally' >> core.CombineGlobally(min),
          equal_to([-1]),
          label='assert-min-2-globally')
      assert_that(
          pcoll | 'min-3-globally' >> core.CombineGlobally(min),
          equal_to([-1]),
          label='assert-min-3-globally')

  class MultipleSmallCombines(beam.PTransform):
    def annotations(self):
      # Limit to at most 4 combiners per packed combiner.
      return {python_urns.APPLY_COMBINER_PACKING: b'4'}

    def expand(self, pcoll):
      assert_that(
          pcoll | 'min-4-globally' >> core.CombineGlobally(min),
          equal_to([-1]),
          label='assert-min-4-globally')
      assert_that(
          pcoll | 'min-5-globally' >> core.CombineGlobally(min),
          equal_to([-1]),
          label='assert-min-5-globally')

  with TestPipeline() as pipeline:
    vals = [6, 3, 1, -1, 9, 1, 5, 2, 0, 6]
    pcoll = pipeline | Create(vals)
    _ = pcoll | 'multiple-large-combines' >> MultipleLargeCombines()
    _ = pcoll | 'multiple-small-combines' >> MultipleSmallCombines()

  proto = pipeline.to_runner_api(
      default_environment=environments.EmbeddedPythonEnvironment(
          capabilities=environments.python_sdk_capabilities()))
  optimized = translations.optimize_pipeline(
      proto,
      phases=[translations.pack_combiners],
      known_runner_urns=frozenset(),
      partial=True)
  optimized_stage_names = [
      t.unique_name for t in optimized.components.transforms.values()
  ]
  self.assertIn(
      'multiple-large-combines/Packed[min-1-globally_CombinePerKey, '
      'min-2-globally_CombinePerKey]/Pack',
      optimized_stage_names)
  self.assertIn(
      'Packed[multiple-large-combines_min-3-globally_CombinePerKey, '
      'multiple-small-combines_min-4-globally_CombinePerKey]/Pack',
      optimized_stage_names)
  self.assertIn(
      'multiple-small-combines/min-5-globally/CombinePerKey',
      optimized_stage_names)
  self.assertNotIn(
      'multiple-large-combines/min-1-globally/CombinePerKey',
      optimized_stage_names)
  self.assertNotIn(
      'multiple-large-combines/min-2-globally/CombinePerKey',
      optimized_stage_names)
  self.assertNotIn(
      'multiple-large-combines/min-3-globally/CombinePerKey',
      optimized_stage_names)
  self.assertNotIn(
      'multiple-small-combines/min-4-globally/CombinePerKey',
      optimized_stage_names)
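# Reading of the assertions above (a summary, not authoritative documentation):
# with APPLY_COMBINER_PACKING set to b'N', at most N combiners are fused into
# each Packed[...] stage. Under the limit-2 annotation, min-1 and min-2 fuse;
# the leftover min-3 is then packed with min-4 at the enclosing scope, while
# min-5 remains an unpacked CombinePerKey.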