Example #1
    def test_environment_encoding(self):
        for environment in (DockerEnvironment(),
                            DockerEnvironment(container_image='img'),
                            ProcessEnvironment('run.sh'),
                            ProcessEnvironment('run.sh',
                                               os='linux',
                                               arch='amd64',
                                               env={'k1': 'v1'}),
                            ExternalEnvironment('localhost:8080'),
                            ExternalEnvironment('localhost:8080',
                                                params={'k1': 'v1'}),
                            EmbeddedPythonEnvironment(),
                            EmbeddedPythonGrpcEnvironment(),
                            EmbeddedPythonGrpcEnvironment(num_workers=2,
                                                          state_cache_size=0),
                            SubprocessSDKEnvironment(command_string=u'foö')):
            context = pipeline_context.PipelineContext()
            self.assertEqual(
                environment,
                Environment.from_runner_api(environment.to_runner_api(context),
                                            context))

        with self.assertRaises(ValueError) as ctx:
            EmbeddedPythonGrpcEnvironment(num_workers=2).to_runner_api(
                pipeline_context.PipelineContext())
        self.assertIn('Must provide worker num and state cache size.',
                      ctx.exception.args)
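
A minimal sketch of the round trip this test exercises, outside the test harness. Module paths below are the ones this test module appears to use (apache_beam.transforms.environments and apache_beam.runners.pipeline_context):

    from apache_beam.runners import pipeline_context
    from apache_beam.transforms.environments import DockerEnvironment, Environment

    context = pipeline_context.PipelineContext()
    env = DockerEnvironment(container_image='img')
    proto = env.to_runner_api(context)  # serialize to a runner API proto
    assert Environment.from_runner_api(proto, context) == env  # lossless round trip
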
Example #2
def test_default_capabilities(self):
    environment = DockerEnvironment.from_options(
        PortableOptions(sdk_location='container'))
    context = pipeline_context.PipelineContext()
    proto = environment.to_runner_api(context)
    self.assertEqual(set(proto.capabilities),
                     set(environments.python_sdk_capabilities()))
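
The assertion compares sets, so ordering differences between the proto's repeated capabilities field and the SDK's declared list don't matter. A quick illustrative check (same imports as the test; the isinstance assertion is just a sketch):

    caps = set(environments.python_sdk_capabilities())
    assert all(isinstance(c, str) for c in caps)  # capability entries are URN strings
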
Example #3
  def test_sdk_harness_container_image_overrides(self):
    test_environment = DockerEnvironment(
        container_image='dummy_container_image')
    proto_pipeline, _ = Pipeline().to_runner_api(
        return_context=True, default_environment=test_environment)

    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp'
    ])

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, {'.*dummy.*': 'new_dummy_container_image'},
        pipeline_options)

    self.assertEqual(1, len(proto_pipeline.components.environments))
    env = list(proto_pipeline.components.environments.values())[0]

    from apache_beam.utils import proto_utils
    docker_payload = proto_utils.parse_Bytes(
        env.payload, beam_runner_api_pb2.DockerPayload)

    # Container image should be overridden by the given override.
    self.assertEqual(
        docker_payload.container_image, 'new_dummy_container_image')
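
The override map passed above is keyed by regular expressions matched against the current container image name. A hypothetical standalone illustration of that matching semantics (override_image is not a Beam API, just a sketch; the real logic lives in DataflowApplicationClient._apply_sdk_environment_overrides):

    import re

    def override_image(image, overrides):
        # Return the replacement for the first pattern that matches,
        # else the original image name.
        for pattern, replacement in overrides.items():
            if re.match(pattern, image):
                return replacement
        return image

    assert override_image(
        'dummy_container_image',
        {'.*dummy.*': 'new_dummy_container_image'}) == 'new_dummy_container_image'
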
Example #4
  def test_pipeline_sdk_not_overridden(self):
    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp',
        '--sdk_container_image=dummy_prefix/dummy_name:dummy_tag'
    ])

    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    dummy_env = DockerEnvironment(
        container_image='dummy_prefix/dummy_name:dummy_tag')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=dummy_env)

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, {}, pipeline_options)

    self.assertEqual(2, len(proto_pipeline.components.environments))

    from apache_beam.utils import proto_utils
    found_override = False
    for env in proto_pipeline.components.environments.values():
      docker_payload = proto_utils.parse_Bytes(
          env.payload, beam_runner_api_pb2.DockerPayload)
      if docker_payload.container_image.startswith(
          names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
        found_override = True

    self.assertFalse(found_override)
Example #5
    def test_translate_portable_job_step_name(self):
        mock_client, mock_job_result = self.setup_mock_client_result(
            self.ONLY_COUNTERS_LIST)

        pipeline_options = PipelineOptions([
            '--experiments=use_runner_v2',
            '--experiments=use_portable_job_submission',
            '--temp_location=gs://any-location/temp',
            '--project=dummy_project',
        ])

        pipeline = Pipeline(options=pipeline_options)
        pipeline | Create([1, 2, 3]) | 'MyTestParDo' >> ParDo(DoFn())  # pylint:disable=expression-not-assigned

        test_environment = DockerEnvironment(
            container_image='test_default_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)

        job = apiclient.Job(pipeline_options, proto_pipeline)
        dm = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result,
                                              job)
        self.assertEqual(
            'MyTestParDo',
            dm._translate_step_name('ref_AppliedPTransform_MyTestParDo_14'))
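
The expected translation strips the 'ref_AppliedPTransform_' prefix and the trailing numeric suffix from the portable step reference. A hypothetical regex sketch of that mapping (step_name_from_ref is not a Beam API, just an illustration of the input/output pair asserted above):

    import re

    def step_name_from_ref(ref):
        # 'ref_AppliedPTransform_MyTestParDo_14' -> 'MyTestParDo'
        m = re.match(r'ref_AppliedPTransform_(.*)_\d+$', ref)
        return m.group(1) if m else ref

    assert step_name_from_ref('ref_AppliedPTransform_MyTestParDo_14') == 'MyTestParDo'
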
Example #6
    def test_sdk_harness_container_images_get_set(self):
        if 'sdkHarnessContainerImages' not in dataflow.WorkerPool.__dict__:
            _LOGGER.warning(
                'Skipping test \'test_sdk_harness_container_images_get_set\' since '
                'Dataflow API WorkerPool does not have attribute '
                '\'sdkHarnessContainerImages\'')
            return

        pipeline_options = PipelineOptions([
            '--experiments=beam_fn_api', '--experiments=use_unified_worker',
            '--temp_location', 'gs://any-location/temp'
        ])

        pipeline = Pipeline(options=pipeline_options)

        test_environment = DockerEnvironment(
            container_image='dummy_container_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)

        env = apiclient.Environment(
            [],  # packages
            pipeline_options,
            '2.0.0',  # any environment version
            FAKE_PIPELINE_URL,
            proto_pipeline)
        worker_pool = env.proto.workerPools[0]
        self.assertEqual(1, len(worker_pool.sdkHarnessContainerImages))

        # Container image should be overridden by a Dataflow specific URL.
        self.assertTrue(
            worker_pool.sdkHarnessContainerImages[0].containerImage.startswith(
                'gcr.io/cloud-dataflow/v1beta3/python'))
Example #7
def test_environment_encoding(self):
    for environment in (DockerEnvironment(),
                        DockerEnvironment(container_image='img'),
                        ProcessEnvironment('run.sh'),
                        ProcessEnvironment('run.sh',
                                           os='linux',
                                           arch='amd64',
                                           env={'k1': 'v1'}),
                        ExternalEnvironment('localhost:8080'),
                        ExternalEnvironment('localhost:8080',
                                            params={'k1': 'v1'}),
                        EmbeddedPythonEnvironment(),
                        EmbeddedPythonGrpcEnvironment(),
                        EmbeddedPythonGrpcEnvironment(state_cache_size=0),
                        SubprocessSDKEnvironment(command_string=u'foö')):
        context = pipeline_context.PipelineContext()
        self.assertEqual(
            environment,
            Environment.from_runner_api(environment.to_runner_api(context),
                                        context))
Example #8
def test_environment_encoding(self):
  for environment in (DockerEnvironment(),
                      DockerEnvironment(container_image='img'),
                      DockerEnvironment(capabilities=['x, y, z']),
                      ProcessEnvironment('run.sh'),
                      ProcessEnvironment('run.sh',
                                         os='linux',
                                         arch='amd64',
                                         env={'k1': 'v1'}),
                      ExternalEnvironment('localhost:8080'),
                      ExternalEnvironment('localhost:8080',
                                          params={'k1': 'v1'}),
                      EmbeddedPythonEnvironment(),
                      EmbeddedPythonGrpcEnvironment(),
                      EmbeddedPythonGrpcEnvironment(
                          state_cache_size=0, data_buffer_time_limit_ms=0),
                      SubprocessSDKEnvironment(command_string=u'foö')):
    context = pipeline_context.PipelineContext()
    proto = environment.to_runner_api(context)
    reconstructed = Environment.from_runner_api(proto, context)
    self.assertEqual(environment, reconstructed)
    self.assertEqual(proto, reconstructed.to_runner_api(context))
Example #9
    def test_sdk_harness_container_images_get_set(self):

        pipeline_options = PipelineOptions([
            '--experiments=beam_fn_api', '--experiments=use_unified_worker',
            '--temp_location', 'gs://any-location/temp'
        ])

        pipeline = Pipeline(options=pipeline_options)
        pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

        test_environment = DockerEnvironment(
            container_image='test_default_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)

        # We have to manually add environments since Dataflow only sets
        # 'sdkHarnessContainerImages' when there are at least two environments.
        dummy_env = beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=(beam_runner_api_pb2.DockerPayload(
                container_image='dummy_image')).SerializeToString())
        proto_pipeline.components.environments['dummy_env_id'].CopyFrom(
            dummy_env)

        dummy_transform = beam_runner_api_pb2.PTransform(
            environment_id='dummy_env_id')
        proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
            dummy_transform)

        env = apiclient.Environment(
            [],  # packages
            pipeline_options,
            '2.0.0',  # any environment version
            FAKE_PIPELINE_URL,
            proto_pipeline,
            _sdk_image_overrides={
                '.*dummy.*': 'dummy_image',
                '.*test.*': 'test_default_image'
            })
        worker_pool = env.proto.workerPools[0]

        # For this test, a third environment gets added since the actual
        # default container image for Dataflow is different from the
        # 'test_default_image' we provided above.
        self.assertEqual(3, len(worker_pool.sdkHarnessContainerImages))

        # Container image should be overridden by a Dataflow specific URL.
        self.assertTrue(
            worker_pool.sdkHarnessContainerImages[0].containerImage.startswith(
                'gcr.io/cloud-dataflow/v1beta3/python'))
Example #10
  def test_default_environment_get_set(self):

    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp'
    ])

    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    test_environment = DockerEnvironment(container_image='test_default_image')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=test_environment)

    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=(
            beam_runner_api_pb2.DockerPayload(
                container_image='dummy_image')).SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    env = apiclient.Environment(
        [],  # packages
        pipeline_options,
        '2.0.0',  # any environment version
        FAKE_PIPELINE_URL,
        proto_pipeline,
        _sdk_image_overrides={
            '.*dummy.*': 'dummy_image', '.*test.*': 'test_default_image'
        })
    worker_pool = env.proto.workerPools[0]

    self.assertEqual(2, len(worker_pool.sdkHarnessContainerImages))

    images_from_proto = [
        sdk_info.containerImage
        for sdk_info in worker_pool.sdkHarnessContainerImages
    ]
    self.assertIn('test_default_image', images_from_proto)
Example #11
    def test_dataflow_container_image_override(self):
        test_environment = DockerEnvironment(
            container_image='apache/beam_java11_sdk:x.yz.0')
        proto_pipeline, _ = Pipeline().to_runner_api(
            return_context=True, default_environment=test_environment)

        # Accessing non-public method for testing.
        apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
            proto_pipeline, dict())

        self.assertEqual(1, len(proto_pipeline.components.environments))
        env = list(proto_pipeline.components.environments.values())[0]

        from apache_beam.utils import proto_utils
        docker_payload = proto_utils.parse_Bytes(
            env.payload, beam_runner_api_pb2.DockerPayload)

        # Container image should be overridden by a Dataflow-specific image.
        self.assertTrue(
            docker_payload.container_image.startswith(
                names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY))
Example #12
    def test_sdk_harness_container_image_overrides(self):
        if 'sdkHarnessContainerImages' not in dataflow.WorkerPool.__dict__:
            _LOGGER.warning(
                'Skipping test \'test_sdk_harness_container_image_overrides\' since '
                'Dataflow API WorkerPool does not have attribute '
                '\'sdkHarnessContainerImages\'')
            return
        pipeline_options = PipelineOptions([
            '--experiments=beam_fn_api',
            '--experiments=use_unified_worker',
            '--temp_location',
            'gs://any-location/temp',
            '--project',
            'dummy_project',
            '--sdk_harness_container_image_overrides',
            '.*dummy.*,new_dummy_container_image',
        ])

        pipeline = Pipeline(options=pipeline_options)

        test_environment = DockerEnvironment(
            container_image='dummy_container_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)
        dataflow_client = apiclient.DataflowApplicationClient(pipeline_options)

        # Accessing non-public method for testing.
        dataflow_client._apply_sdk_environment_overrides(proto_pipeline)

        self.assertEqual(1, len(proto_pipeline.components.environments))
        env = list(proto_pipeline.components.environments.values())[0]

        from apache_beam.utils import proto_utils
        docker_payload = proto_utils.parse_Bytes(
            env.payload, beam_runner_api_pb2.DockerPayload)

        # Container image should be overridden by the given override.
        self.assertEqual(docker_payload.container_image,
                         'new_dummy_container_image')
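
Judging from this test, the '--sdk_harness_container_image_overrides' flag value is a 'pattern,replacement' pair; splitting on the first comma yields the same regex-to-image mapping that Example #3 passes explicitly. A hypothetical sketch of that parse:

    raw = '.*dummy.*,new_dummy_container_image'
    pattern, replacement = raw.split(',', 1)
    assert {pattern: replacement} == {'.*dummy.*': 'new_dummy_container_image'}
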