def test_environment_encoding(self):
  for environment in (DockerEnvironment(),
                      DockerEnvironment(container_image='img'),
                      ProcessEnvironment('run.sh'),
                      ProcessEnvironment(
                          'run.sh', os='linux', arch='amd64',
                          env={'k1': 'v1'}),
                      ExternalEnvironment('localhost:8080'),
                      ExternalEnvironment(
                          'localhost:8080', params={'k1': 'v1'}),
                      EmbeddedPythonEnvironment(),
                      EmbeddedPythonGrpcEnvironment(),
                      EmbeddedPythonGrpcEnvironment(
                          num_workers=2, state_cache_size=0),
                      SubprocessSDKEnvironment(command_string=u'foö')):
    context = pipeline_context.PipelineContext()
    self.assertEqual(
        environment,
        Environment.from_runner_api(
            environment.to_runner_api(context), context))

  with self.assertRaises(ValueError) as ctx:
    EmbeddedPythonGrpcEnvironment(num_workers=2).to_runner_api(
        pipeline_context.PipelineContext())
  self.assertIn('Must provide worker num and state cache size.',
                ctx.exception.args)
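# Hypothetical sketch (not from the original file) of the validation the
# ValueError assertion above exercises: num_workers and state_cache_size are
# expected to be provided together. The real check inside
# EmbeddedPythonGrpcEnvironment.to_runner_api may differ in detail.
def _check_grpc_env_config(num_workers, state_cache_size):
  if (num_workers is None) != (state_cache_size is None):
    raise ValueError('Must provide worker num and state cache size.')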
def test_default_capabilities(self):
  environment = DockerEnvironment.from_options(
      PortableOptions(sdk_location='container'))
  context = pipeline_context.PipelineContext()
  proto = environment.to_runner_api(context)
  self.assertEqual(
      set(proto.capabilities), set(environments.python_sdk_capabilities()))
def test_sdk_harness_container_image_overrides(self):
  test_environment = DockerEnvironment(
      container_image='dummy_container_image')
  proto_pipeline, _ = Pipeline().to_runner_api(
      return_context=True, default_environment=test_environment)
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp'
  ])

  # Accessing non-public method for testing.
  apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
      proto_pipeline, {'.*dummy.*': 'new_dummy_container_image'},
      pipeline_options)

  self.assertEqual(1, len(proto_pipeline.components.environments))
  env = list(proto_pipeline.components.environments.values())[0]

  from apache_beam.utils import proto_utils
  docker_payload = proto_utils.parse_Bytes(
      env.payload, beam_runner_api_pb2.DockerPayload)

  # Container image should be overridden by the given override.
  self.assertEqual(
      docker_payload.container_image, 'new_dummy_container_image')
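# A minimal sketch, assuming _apply_sdk_environment_overrides treats the
# override map as regex-pattern -> replacement-image; this helper and its
# name are illustrative, not Beam API.
import re

def _override_image(image, overrides):
  for pattern, replacement in overrides.items():
    if re.match(pattern, image):
      return replacement
  return image

# _override_image('dummy_container_image',
#                 {'.*dummy.*': 'new_dummy_container_image'})
# -> 'new_dummy_container_image'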
def test_pipeline_sdk_not_overridden(self):
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp',
      '--sdk_container_image=dummy_prefix/dummy_name:dummy_tag'
  ])

  pipeline = Pipeline(options=pipeline_options)
  pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

  dummy_env = DockerEnvironment(
      container_image='dummy_prefix/dummy_name:dummy_tag')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=dummy_env)

  # Accessing non-public method for testing.
  apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
      proto_pipeline, {}, pipeline_options)

  self.assertEqual(2, len(proto_pipeline.components.environments))

  from apache_beam.utils import proto_utils
  found_override = False
  for env in proto_pipeline.components.environments.values():
    docker_payload = proto_utils.parse_Bytes(
        env.payload, beam_runner_api_pb2.DockerPayload)
    if docker_payload.container_image.startswith(
        names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
      found_override = True

  self.assertFalse(found_override)
def test_translate_portable_job_step_name(self):
  mock_client, mock_job_result = self.setup_mock_client_result(
      self.ONLY_COUNTERS_LIST)

  pipeline_options = PipelineOptions([
      '--experiments=use_runner_v2',
      '--experiments=use_portable_job_submission',
      '--temp_location=gs://any-location/temp',
      '--project=dummy_project',
  ])

  pipeline = Pipeline(options=pipeline_options)
  pipeline | Create([1, 2, 3]) | 'MyTestParDo' >> ParDo(DoFn())  # pylint:disable=expression-not-assigned

  test_environment = DockerEnvironment(
      container_image='test_default_image')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=test_environment)

  job = apiclient.Job(pipeline_options, proto_pipeline)
  dm = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result, job)
  self.assertEqual(
      'MyTestParDo',
      dm._translate_step_name('ref_AppliedPTransform_MyTestParDo_14'))
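# Hypothetical sketch of the mapping the assertion above expects: strip the
# 'ref_AppliedPTransform_' prefix and the trailing numeric suffix. The real
# DataflowMetrics._translate_step_name consults the job's proto pipeline and
# may behave differently; this regex is an assumption for illustration.
import re

def _strip_portable_step_name(step_name):
  match = re.match(r'^ref_AppliedPTransform_(.*)_\d+$', step_name)
  return match.group(1) if match else step_name

# _strip_portable_step_name('ref_AppliedPTransform_MyTestParDo_14')
# -> 'MyTestParDo'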
def test_sdk_harness_container_images_get_set(self):
  if 'sdkHarnessContainerImages' not in dataflow.WorkerPool.__dict__:
    _LOGGER.warning(
        'Skipping test \'test_sdk_harness_container_images_get_set\' since '
        'Dataflow API WorkerPool does not have attribute '
        '\'sdkHarnessContainerImages\'')
    return

  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp'
  ])

  pipeline = Pipeline(options=pipeline_options)

  test_environment = DockerEnvironment(
      container_image='dummy_container_image')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=test_environment)

  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL,
      proto_pipeline)
  worker_pool = env.proto.workerPools[0]
  self.assertEqual(1, len(worker_pool.sdkHarnessContainerImages))

  # Container image should be overridden by a Dataflow specific URL.
  self.assertTrue(
      worker_pool.sdkHarnessContainerImages[0].containerImage.startswith(
          'gcr.io/cloud-dataflow/v1beta3/python'))
def test_environment_encoding(self):
  for environment in (DockerEnvironment(),
                      DockerEnvironment(container_image='img'),
                      ProcessEnvironment('run.sh'),
                      ProcessEnvironment(
                          'run.sh', os='linux', arch='amd64',
                          env={'k1': 'v1'}),
                      ExternalEnvironment('localhost:8080'),
                      ExternalEnvironment(
                          'localhost:8080', params={'k1': 'v1'}),
                      EmbeddedPythonEnvironment(),
                      EmbeddedPythonGrpcEnvironment(),
                      EmbeddedPythonGrpcEnvironment(state_cache_size=0),
                      SubprocessSDKEnvironment(command_string=u'foö')):
    context = pipeline_context.PipelineContext()
    self.assertEqual(
        environment,
        Environment.from_runner_api(
            environment.to_runner_api(context), context))
def test_environment_encoding(self):
  for environment in (DockerEnvironment(),
                      DockerEnvironment(container_image='img'),
                      DockerEnvironment(capabilities=['x, y, z']),
                      ProcessEnvironment('run.sh'),
                      ProcessEnvironment(
                          'run.sh', os='linux', arch='amd64',
                          env={'k1': 'v1'}),
                      ExternalEnvironment('localhost:8080'),
                      ExternalEnvironment(
                          'localhost:8080', params={'k1': 'v1'}),
                      EmbeddedPythonEnvironment(),
                      EmbeddedPythonGrpcEnvironment(),
                      EmbeddedPythonGrpcEnvironment(
                          state_cache_size=0, data_buffer_time_limit_ms=0),
                      SubprocessSDKEnvironment(command_string=u'foö')):
    context = pipeline_context.PipelineContext()
    proto = environment.to_runner_api(context)
    reconstructed = Environment.from_runner_api(proto, context)
    self.assertEqual(environment, reconstructed)
    self.assertEqual(proto, reconstructed.to_runner_api(context))
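# A small helper sketch (not part of the original suite) capturing the proto
# round-trip pattern shared by the test_environment_encoding variants above;
# it assumes the same module-level imports those tests use.
def _assert_environment_roundtrips(self, environment):
  context = pipeline_context.PipelineContext()
  proto = environment.to_runner_api(context)
  reconstructed = Environment.from_runner_api(proto, context)
  # Decoding then re-encoding should be lossless in both directions.
  self.assertEqual(environment, reconstructed)
  self.assertEqual(proto, reconstructed.to_runner_api(context))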
def test_sdk_harness_container_images_get_set(self):
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp'
  ])

  pipeline = Pipeline(options=pipeline_options)
  pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

  test_environment = DockerEnvironment(
      container_image='test_default_image')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=test_environment)

  # We have to manually add environments since Dataflow only sets
  # 'sdkHarnessContainerImages' when there are at least two environments.
  dummy_env = beam_runner_api_pb2.Environment(
      urn=common_urns.environments.DOCKER.urn,
      payload=(beam_runner_api_pb2.DockerPayload(
          container_image='dummy_image')).SerializeToString())
  proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

  dummy_transform = beam_runner_api_pb2.PTransform(
      environment_id='dummy_env_id')
  proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
      dummy_transform)

  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL,
      proto_pipeline,
      _sdk_image_overrides={
          '.*dummy.*': 'dummy_image', '.*test.*': 'test_default_image'
      })
  worker_pool = env.proto.workerPools[0]

  # For this test, a third environment gets added since the actual default
  # container image for Dataflow differs from the 'test_default_image'
  # provided above.
  self.assertEqual(3, len(worker_pool.sdkHarnessContainerImages))

  # Container image should be overridden by a Dataflow specific URL.
  self.assertTrue(
      worker_pool.sdkHarnessContainerImages[0].containerImage.startswith(
          'gcr.io/cloud-dataflow/v1beta3/python'))
def test_default_environment_get_set(self):
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp'
  ])

  pipeline = Pipeline(options=pipeline_options)
  pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

  test_environment = DockerEnvironment(container_image='test_default_image')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=test_environment)

  dummy_env = beam_runner_api_pb2.Environment(
      urn=common_urns.environments.DOCKER.urn,
      payload=(beam_runner_api_pb2.DockerPayload(
          container_image='dummy_image')).SerializeToString())
  proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

  dummy_transform = beam_runner_api_pb2.PTransform(
      environment_id='dummy_env_id')
  proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
      dummy_transform)

  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL,
      proto_pipeline,
      _sdk_image_overrides={
          '.*dummy.*': 'dummy_image', '.*test.*': 'test_default_image'
      })
  worker_pool = env.proto.workerPools[0]
  self.assertEqual(2, len(worker_pool.sdkHarnessContainerImages))

  images_from_proto = [
      sdk_info.containerImage
      for sdk_info in worker_pool.sdkHarnessContainerImages
  ]
  self.assertIn('test_default_image', images_from_proto)
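# Sketch of a factored-out helper (an assumption, not present in the original
# file) for the dummy-environment injection repeated in
# test_sdk_harness_container_images_get_set and
# test_default_environment_get_set above.
def _add_dummy_docker_env(proto_pipeline, env_id, container_image):
  dummy_env = beam_runner_api_pb2.Environment(
      urn=common_urns.environments.DOCKER.urn,
      payload=beam_runner_api_pb2.DockerPayload(
          container_image=container_image).SerializeToString())
  proto_pipeline.components.environments[env_id].CopyFrom(dummy_env)
  # A transform must reference the environment for it to count as in use.
  dummy_transform = beam_runner_api_pb2.PTransform(environment_id=env_id)
  proto_pipeline.components.transforms[env_id + '_transform'].CopyFrom(
      dummy_transform)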
def test_dataflow_container_image_override(self):
  test_environment = DockerEnvironment(
      container_image='apache/beam_java11_sdk:x.yz.0')
  proto_pipeline, _ = Pipeline().to_runner_api(
      return_context=True, default_environment=test_environment)

  # Accessing non-public method for testing.
  apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
      proto_pipeline, dict())

  self.assertEqual(1, len(proto_pipeline.components.environments))
  env = list(proto_pipeline.components.environments.values())[0]

  from apache_beam.utils import proto_utils
  docker_payload = proto_utils.parse_Bytes(
      env.payload, beam_runner_api_pb2.DockerPayload)

  # Even without an explicit override, Apache Beam images should be remapped
  # to the Dataflow container image repository.
  self.assertTrue(
      docker_payload.container_image.startswith(
          names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY))
def test_sdk_harness_container_image_overrides(self):
  if 'sdkHarnessContainerImages' not in dataflow.WorkerPool.__dict__:
    _LOGGER.warning(
        'Skipping test \'test_sdk_harness_container_image_overrides\' since '
        'Dataflow API WorkerPool does not have attribute '
        '\'sdkHarnessContainerImages\'')
    return

  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp',
      '--project',
      'dummy_project',
      '--sdk_harness_container_image_overrides',
      '.*dummy.*,new_dummy_container_image',
  ])

  pipeline = Pipeline(options=pipeline_options)

  test_environment = DockerEnvironment(
      container_image='dummy_container_image')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=test_environment)

  dataflow_client = apiclient.DataflowApplicationClient(pipeline_options)

  # Accessing non-public method for testing.
  dataflow_client._apply_sdk_environment_overrides(proto_pipeline)

  self.assertEqual(1, len(proto_pipeline.components.environments))
  env = list(proto_pipeline.components.environments.values())[0]

  from apache_beam.utils import proto_utils
  docker_payload = proto_utils.parse_Bytes(
      env.payload, beam_runner_api_pb2.DockerPayload)

  # Container image should be overridden by the given override.
  self.assertEqual(
      docker_payload.container_image, 'new_dummy_container_image')