def matches(self, applied_ptransform):
  """Return True iff this override should replace *applied_ptransform*.

  The override only applies to ``GroupIntoBatches.WithShardedKey`` on
  streaming, Streaming-Engine, Runner-V2 jobs that opted into the
  ``enable_streaming_auto_sharding`` experiment.  As a side effect of a
  successful match, the transform's output PCollection is registered with
  the runner for auto-sharding.
  """
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam import util
  if not isinstance(
      applied_ptransform.transform, util.GroupIntoBatches.WithShardedKey):
    return False
  # The replacement is only valid for portable Streaming Engine jobs with
  # runner v2.
  if not self.options.view_as(StandardOptions).streaming:
    return False
  if not self.options.view_as(GoogleCloudOptions).enable_streaming_engine:
    return False
  from apache_beam.runners.dataflow.internal import apiclient
  if not apiclient._use_unified_worker(self.options):
    return False
  if 'enable_streaming_auto_sharding' not in (
      self.options.view_as(DebugOptions).experiments or []):
    return False
  # Matched: record this PCollection so the runner can auto-shard it.
  self.dataflow_runner.add_pcoll_with_auto_sharding(applied_ptransform)
  return True
def matches(self, applied_ptransform):
  """Return True iff this override should replace *applied_ptransform*.

  Applies only to ``GroupIntoBatches.WithShardedKey`` on streaming jobs.
  Unlike a plain non-match, a sharded-key GroupIntoBatches on a job that
  lacks Streaming Engine or Runner V2 is a configuration error and raises
  ``ValueError`` with remediation flags.  On a successful match the output
  PCollection is registered with the runner for auto-sharding.
  """
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam import util
  transform = applied_ptransform.transform
  if not isinstance(transform, util.GroupIntoBatches.WithShardedKey):
    return False
  # The replacement is only valid for portable Streaming Engine jobs with
  # runner v2.
  if not self.options.view_as(StandardOptions).streaming:
    return False
  google_cloud_options = self.options.view_as(GoogleCloudOptions)
  if not google_cloud_options.enable_streaming_engine:
    # Sharded-key batching requested without Streaming Engine: fail loudly
    # rather than silently skipping the replacement.
    raise ValueError(
        'Runner determined sharding not available in Dataflow for '
        'GroupIntoBatches for non-Streaming-Engine jobs. In order to use '
        'runner determined sharding, please use '
        '--streaming --enable_streaming_engine --experiments=use_runner_v2')
  from apache_beam.runners.dataflow.internal import apiclient
  if not apiclient._use_unified_worker(self.options):
    # Same reasoning for jobs that are not on Runner V2.
    raise ValueError(
        'Runner determined sharding not available in Dataflow for '
        'GroupIntoBatches for jobs not using Runner V2. In order to use '
        'runner determined sharding, please use '
        '--streaming --enable_streaming_engine --experiments=use_runner_v2')
  # Matched: record this PCollection so the runner can auto-shard it.
  self.dataflow_runner.add_pcoll_with_auto_sharding(applied_ptransform)
  return True
def test_use_unified_worker(self):
  """_use_unified_worker is on iff use_unified_worker or use_runner_v2 is set.

  The bare beam_fn_api experiment alone must NOT enable the unified worker.
  """
  cases = [
      ([], False),
      (['--experiments=beam_fn_api'], False),
      (['--experiments=use_unified_worker'], True),
      (['--experiments=use_unified_worker', '--experiments=beam_fn_api'],
       True),
      (['--experiments=use_runner_v2', '--experiments=beam_fn_api'], True),
      ([
          '--experiments=use_unified_worker',
          '--experiments=use_runner_v2',
          '--experiments=beam_fn_api'
      ], True),
  ]
  for flags, expected in cases:
    self.assertEqual(
        apiclient._use_unified_worker(PipelineOptions(flags)), expected)