Example #1
def read_to_impulse(stages, pipeline_context):
  """Translates Read operations into Impulse operations."""
  for stage in stages:
    # First map Reads, if any, to Impulse + triggered read op.
    for transform in list(stage.transforms):
      if transform.spec.urn == common_urns.deprecated_primitives.READ.urn:
        read_pc = only_element(transform.outputs.values())
        read_pc_proto = pipeline_context.components.pcollections[read_pc]
        impulse_pc = unique_name(
            pipeline_context.components.pcollections, 'Impulse')
        pipeline_context.components.pcollections[impulse_pc].CopyFrom(
            beam_runner_api_pb2.PCollection(
                unique_name=impulse_pc,
                coder_id=pipeline_context.bytes_coder_id,
                windowing_strategy_id=read_pc_proto.windowing_strategy_id,
                is_bounded=read_pc_proto.is_bounded))
        stage.transforms.remove(transform)
        # TODO(robertwb): If this goes multi-process before fn-api
        # read is default, expand into split + reshuffle + read.
        stage.transforms.append(
            beam_runner_api_pb2.PTransform(
                unique_name=transform.unique_name + '/Impulse',
                spec=beam_runner_api_pb2.FunctionSpec(
                    urn=common_urns.primitives.IMPULSE.urn),
                outputs={'out': impulse_pc}))
        stage.transforms.append(
            beam_runner_api_pb2.PTransform(
                unique_name=transform.unique_name,
                spec=beam_runner_api_pb2.FunctionSpec(
                    urn=python_urns.IMPULSE_READ_TRANSFORM,
                    payload=transform.spec.payload),
                inputs={'in': impulse_pc},
                outputs={'out': read_pc}))

    yield stage
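
The phase above rewrites Read transforms by appending hand-built protos. As a minimal standalone sketch (assuming only that apache_beam is installed; the transform and PCollection names below are placeholders, not values produced by the phase), an Impulse PTransform proto can be constructed and round-tripped like this:

from apache_beam.portability import common_urns
from apache_beam.portability.api import beam_runner_api_pb2

# Build an Impulse PTransform proto by hand (names are placeholders).
impulse = beam_runner_api_pb2.PTransform(
    unique_name='MyRead/Impulse',
    spec=beam_runner_api_pb2.FunctionSpec(
        urn=common_urns.primitives.IMPULSE.urn),
    outputs={'out': 'impulse_pcoll'})

# The proto round-trips losslessly through its wire format.
assert beam_runner_api_pb2.PTransform.FromString(
    impulse.SerializeToString()) == impulse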
Example #2
  def expand(self, pvalueish):
    # type: (pvalue.PCollection) -> pvalue.PCollection
    if isinstance(pvalueish, pvalue.PBegin):
      self._inputs = {}
    elif isinstance(pvalueish, (list, tuple)):
      self._inputs = {str(ix): pvalue for ix, pvalue in enumerate(pvalueish)}
    elif isinstance(pvalueish, dict):
      self._inputs = pvalueish
    else:
      self._inputs = {'input': pvalueish}
    pipeline = (
        next(iter(self._inputs.values())).pipeline
        if self._inputs
        else pvalueish.pipeline)
    context = pipeline_context.PipelineContext()
    transform_proto = beam_runner_api_pb2.PTransform(
        unique_name=pipeline._current_transform().full_label,
        spec=beam_runner_api_pb2.FunctionSpec(
            urn=self._urn, payload=self._payload))
    for tag, pcoll in self._inputs.items():
      transform_proto.inputs[tag] = context.pcollections.get_id(pcoll)
      # Conversion to/from proto assumes producers.
      # TODO: Possibly loosen this.
      context.transforms.put_proto(
          '%s_%s' % (self._IMPULSE_PREFIX, tag),
          beam_runner_api_pb2.PTransform(
              unique_name='%s_%s' % (self._IMPULSE_PREFIX, tag),
              spec=beam_runner_api_pb2.FunctionSpec(
                  urn=common_urns.primitives.IMPULSE.urn),
              outputs={'out': transform_proto.inputs[tag]}))
    components = context.to_runner_api()
    request = beam_expansion_api_pb2.ExpansionRequest(
        components=components,
        namespace=self._namespace,  # type: ignore  # mypy thinks self._namespace is threading.local
        transform=transform_proto)

    if isinstance(self._expansion_service, str):
      with grpc.insecure_channel(self._expansion_service) as channel:
        response = beam_expansion_api_pb2_grpc.ExpansionServiceStub(
            channel).Expand(request)
    else:
      response = self._expansion_service.Expand(request, None)

    if response.error:
      raise RuntimeError(response.error)
    self._expanded_components = response.components
    self._expanded_transform = response.transform
    result_context = pipeline_context.PipelineContext(response.components)

    def fix_output(pcoll, tag):
      pcoll.pipeline = pipeline
      pcoll.tag = tag
      return pcoll
    self._outputs = {
        tag: fix_output(result_context.pcollections.get_by_id(pcoll_id), tag)
        for tag, pcoll_id in self._expanded_transform.outputs.items()
    }

    return self._output_to_pvalueish(self._outputs)
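
The request sent to the expansion service above is an ordinary proto. A minimal sketch (the urn, namespace, and transform name are made-up placeholders, and the components are left empty, unlike a real request):

from apache_beam.portability.api import beam_expansion_api_pb2
from apache_beam.portability.api import beam_runner_api_pb2

# A bare ExpansionRequest; real requests also carry the pipeline components
# referenced by the transform's inputs.
request = beam_expansion_api_pb2.ExpansionRequest(
    components=beam_runner_api_pb2.Components(),
    namespace='external_1',
    transform=beam_runner_api_pb2.PTransform(
        unique_name='MyExternalTransform',
        spec=beam_runner_api_pb2.FunctionSpec(urn='my:hypothetical:urn:v1')))
assert request.transform.spec.urn == 'my:hypothetical:urn:v1'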
Example #3
    def test_java_sdk_harness_dedup(self):
        pipeline_options = PipelineOptions([
            '--experiments=beam_fn_api', '--experiments=use_unified_worker',
            '--temp_location', 'gs://any-location/temp'
        ])

        pipeline = Pipeline(options=pipeline_options)
        pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

        proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

        dummy_env_1 = beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=(beam_runner_api_pb2.DockerPayload(
                container_image='apache/beam_java:dummy_tag')
                     ).SerializeToString())
        proto_pipeline.components.environments['dummy_env_id_1'].CopyFrom(
            dummy_env_1)

        dummy_transform_1 = beam_runner_api_pb2.PTransform(
            environment_id='dummy_env_id_1')
        proto_pipeline.components.transforms['dummy_transform_id_1'].CopyFrom(
            dummy_transform_1)

        dummy_env_2 = beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=(beam_runner_api_pb2.DockerPayload(
                container_image='apache/beam_java:dummy_tag')
                     ).SerializeToString())
        proto_pipeline.components.environments['dummy_env_id_2'].CopyFrom(
            dummy_env_2)

        dummy_transform_2 = beam_runner_api_pb2.PTransform(
            environment_id='dummy_env_id_2')
        proto_pipeline.components.transforms['dummy_transform_id_2'].CopyFrom(
            dummy_transform_2)

        # Accessing non-public method for testing.
        apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
            proto_pipeline, dict(), pipeline_options)

        # Only one of 'dummy_env_id_1' or 'dummy_env_id_2' should be in the set of
        # environment IDs used by the proto after Java environment de-duping.
        env_ids_from_transforms = [
            proto_pipeline.components.transforms[transform_id].environment_id
            for transform_id in proto_pipeline.components.transforms
        ]
        if 'dummy_env_id_1' in env_ids_from_transforms:
            self.assertTrue('dummy_env_id_2' not in env_ids_from_transforms)
        else:
            self.assertTrue('dummy_env_id_2' in env_ids_from_transforms)
Example #4
def expand_gbk(stages, pipeline_context):
    """Transforms each GBK into a write followed by a read.
  """
    for stage in stages:
        assert len(stage.transforms) == 1
        transform = stage.transforms[0]
        if transform.spec.urn == common_urns.primitives.GROUP_BY_KEY.urn:
            for pcoll_id in transform.inputs.values():
                pipeline_context.length_prefix_pcoll_coders(pcoll_id)
            for pcoll_id in transform.outputs.values():
                if pipeline_context.use_state_iterables:
                    pipeline_context.components.pcollections[
                        pcoll_id].coder_id = pipeline_context.with_state_iterables(
                            pipeline_context.components.pcollections[pcoll_id].
                            coder_id)
                pipeline_context.length_prefix_pcoll_coders(pcoll_id)

            # This is used later to correlate the read and write.
            transform_id = stage.name
            if transform != pipeline_context.components.transforms.get(
                    transform_id):
                transform_id = unique_name(
                    pipeline_context.components.transforms, stage.name)
                pipeline_context.components.transforms[transform_id].CopyFrom(
                    transform)
            grouping_buffer = create_buffer_id(transform_id, kind='group')
            gbk_write = Stage(transform.unique_name + '/Write', [
                beam_runner_api_pb2.PTransform(
                    unique_name=transform.unique_name + '/Write',
                    inputs=transform.inputs,
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=bundle_processor.DATA_OUTPUT_URN,
                        payload=grouping_buffer))
            ],
                              downstream_side_inputs=frozenset(),
                              must_follow=stage.must_follow)
            yield gbk_write

            yield Stage(transform.unique_name + '/Read', [
                beam_runner_api_pb2.PTransform(
                    unique_name=transform.unique_name + '/Read',
                    outputs=transform.outputs,
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=bundle_processor.DATA_INPUT_URN,
                        payload=grouping_buffer))
            ],
                        downstream_side_inputs=stage.downstream_side_inputs,
                        must_follow=union(frozenset([gbk_write]),
                                          stage.must_follow))
        else:
            yield stage
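
Stage, unique_name, and create_buffer_id are internal helpers of the portable FnApiRunner and are not shown in this example. A hypothetical standalone equivalent of the buffer-id convention relied on above (an illustrative assumption, not the actual implementation) could look like:

def create_buffer_id(name, kind='group'):
    # Hypothetical sketch: pack the kind and transform id into one byte string
    # so the later /Read stage can locate the buffer written by /Write.
    return ('%s:%s' % (kind, name)).encode('utf-8')

assert create_buffer_id('GroupByKey_3') == b'group:GroupByKey_3'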
Example #5
    def to_runner_api(self, context):
        from apache_beam.portability.api import beam_runner_api_pb2

        def transform_to_runner_api(transform, context):
            if transform is None:
                return None
            else:
                return transform.to_runner_api(context)

        return beam_runner_api_pb2.PTransform(
            unique_name=self.full_label,
            spec=transform_to_runner_api(self.transform, context),
            subtransforms=[
                context.transforms.get_id(part, label=part.full_label)
                for part in self.parts
            ],
            # TODO(BEAM-115): Side inputs.
            inputs={
                tag: context.pcollections.get_id(pc)
                for tag, pc in self.named_inputs().items()
            },
            outputs={
                str(tag): context.pcollections.get_id(out)
                for tag, out in self.named_outputs().items()
            },
            # TODO(BEAM-115): display_data
            display_data=None)
Example #6
    def to_runner_api(self, context):
        # External transforms require more splicing than just setting the spec.
        from apache_beam.transforms import external
        if isinstance(self.transform, external.ExternalTransform):
            return self.transform.to_runner_api_transform(
                context, self.full_label)

        from apache_beam.portability.api import beam_runner_api_pb2

        def transform_to_runner_api(transform, context):
            if transform is None:
                return None
            else:
                return transform.to_runner_api(context,
                                               has_parts=bool(self.parts))

        # Iterate over inputs and outputs by sorted key order, so that ids are
        # consistently generated for multiple runs of the same pipeline.
        return beam_runner_api_pb2.PTransform(
            unique_name=self.full_label,
            spec=transform_to_runner_api(self.transform, context),
            subtransforms=[
                context.transforms.get_id(part, label=part.full_label)
                for part in self.parts
            ],
            inputs={
                tag: context.pcollections.get_id(pc)
                for tag, pc in sorted(self.named_inputs().items())
            },
            outputs={
                str(tag): context.pcollections.get_id(out)
                for tag, out in sorted(self.named_outputs().items())
            },
            # TODO(BEAM-115): display_data
            display_data=None)
Example #7
    def test_runner_api_transformation_with_subscription(
            self, unused_mock_pubsub):
        source = _PubSubSource(
            topic=None,
            subscription='projects/fakeprj/subscriptions/a_subscription',
            id_label='a_label',
            timestamp_attribute='b_label',
            with_attributes=True)
        transform = Read(source)

        context = pipeline_context.PipelineContext()
        proto_transform_spec = transform.to_runner_api(context)
        self.assertEqual(common_urns.composites.PUBSUB_READ.urn,
                         proto_transform_spec.urn)

        pubsub_read_payload = (proto_utils.parse_Bytes(
            proto_transform_spec.payload,
            beam_runner_api_pb2.PubSubReadPayload))
        self.assertEqual('projects/fakeprj/subscriptions/a_subscription',
                         pubsub_read_payload.subscription)
        self.assertEqual('a_label', pubsub_read_payload.id_attribute)
        self.assertEqual('b_label', pubsub_read_payload.timestamp_attribute)
        self.assertEqual('', pubsub_read_payload.topic)
        self.assertTrue(pubsub_read_payload.with_attributes)

        proto_transform = beam_runner_api_pb2.PTransform(
            unique_name="dummy_label", spec=proto_transform_spec)

        transform_from_proto = Read.from_runner_api_parameter(
            proto_transform, pubsub_read_payload, None)
        self.assertTrue(isinstance(transform_from_proto, Read))
        self.assertTrue(isinstance(transform_from_proto.source, _PubSubSource))
        self.assertTrue(transform_from_proto.source.with_attributes)
        self.assertEqual('projects/fakeprj/subscriptions/a_subscription',
                         transform_from_proto.source.full_subscription)
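
The payload round-trip exercised by this test can be reproduced on its own. A small sketch (assuming apache_beam is installed, reusing the same fake subscription string):

from apache_beam.portability import common_urns
from apache_beam.portability.api import beam_runner_api_pb2

payload = beam_runner_api_pb2.PubSubReadPayload(
    subscription='projects/fakeprj/subscriptions/a_subscription',
    with_attributes=True)
spec = beam_runner_api_pb2.FunctionSpec(
    urn=common_urns.composites.PUBSUB_READ.urn,
    payload=payload.SerializeToString())
# Decode the payload back out of the FunctionSpec.
decoded = beam_runner_api_pb2.PubSubReadPayload.FromString(spec.payload)
assert decoded.subscription == 'projects/fakeprj/subscriptions/a_subscription'
assert decoded.with_attributes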
Example #8
    def test_runner_api_transformation_properties_none(self,
                                                       unused_mock_pubsub):
        # Confirming that properties stay None after a runner API transformation.
        sink = _PubSubSink(
            topic='projects/fakeprj/topics/a_topic',
            id_label=None,
            with_attributes=True,
            # We expect encoded PubSub write transform to always return attributes.
            timestamp_attribute=None)
        transform = Write(sink)

        context = pipeline_context.PipelineContext()
        proto_transform_spec = transform.to_runner_api(context)
        self.assertEqual(common_urns.composites.PUBSUB_WRITE.urn,
                         proto_transform_spec.urn)

        pubsub_write_payload = (proto_utils.parse_Bytes(
            proto_transform_spec.payload,
            beam_runner_api_pb2.PubSubWritePayload))
        proto_transform = beam_runner_api_pb2.PTransform(
            unique_name="dummy_label", spec=proto_transform_spec)
        transform_from_proto = Write.from_runner_api_parameter(
            proto_transform, pubsub_write_payload, None)

        self.assertTrue(isinstance(transform_from_proto, Write))
        self.assertTrue(isinstance(transform_from_proto.sink, _PubSubSink))
        self.assertTrue(transform_from_proto.sink.with_attributes)
        self.assertIsNone(transform_from_proto.sink.id_label)
        self.assertIsNone(transform_from_proto.sink.timestamp_attribute)
Example #9
    def test_fn_registration(self):
        process_bundle_descriptors = [
            beam_fn_api_pb2.ProcessBundleDescriptor(
                id=str(100 + ix),
                transforms={
                    str(ix):
                    beam_runner_api_pb2.PTransform(unique_name=str(ix))
                }) for ix in range(4)
        ]

        test_controller = BeamFnControlServicer([
            beam_fn_api_pb2.InstructionRequest(
                register=beam_fn_api_pb2.RegisterRequest(
                    process_bundle_descriptor=process_bundle_descriptors))
        ])

        server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
        beam_fn_api_pb2.add_BeamFnControlServicer_to_server(
            test_controller, server)
        test_port = server.add_insecure_port("[::]:0")
        server.start()

        channel = grpc.insecure_channel("localhost:%s" % test_port)
        harness = sdk_worker.SdkHarness(channel)
        harness.run()
        self.assertEqual(
            harness.worker.fns,
            {item.id: item
             for item in process_bundle_descriptors})
Example #10
 def _get_process_bundles(self, prefix, size):
   return [
       beam_fn_api_pb2.ProcessBundleDescriptor(
           id=str(str(prefix) + "-" + str(ix)),
           transforms={
               str(ix): beam_runner_api_pb2.PTransform(unique_name=str(ix))
           }) for ix in range(size)
   ]
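
Both of the registration tests above build ProcessBundleDescriptor protos keyed by id. A minimal standalone sketch of that construction (the id and transform name are placeholders):

from apache_beam.portability.api import beam_fn_api_pb2
from apache_beam.portability.api import beam_runner_api_pb2

descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
    id='bundle-0',
    transforms={'0': beam_runner_api_pb2.PTransform(unique_name='0')})
# Map fields on the proto behave like dicts of sub-messages.
assert descriptor.transforms['0'].unique_name == '0'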
Example #11
def sink_flattens(stages, pipeline_context):
  """Sink flattens and remove them from the graph.

  A flatten that cannot be sunk/fused away becomes multiple writes (to the
  same logical sink) followed by a read.
  """
  # TODO(robertwb): Actually attempt to sink rather than always materialize.
  # TODO(robertwb): Possibly fuse this into one of the stages.
  for stage in fix_flatten_coders(stages, pipeline_context):
    transform = only_element(stage.transforms)
    if transform.spec.urn == common_urns.primitives.FLATTEN.urn:
      # This is used later to correlate the read and writes.
      buffer_id = create_buffer_id(transform.unique_name)
      flatten_writes = []
      for local_in, pcoll_in in transform.inputs.items():
        flatten_write = Stage(
            transform.unique_name + '/Write/' + local_in,
            [beam_runner_api_pb2.PTransform(
                unique_name=transform.unique_name + '/Write/' + local_in,
                inputs={local_in: pcoll_in},
                spec=beam_runner_api_pb2.FunctionSpec(
                    urn=bundle_processor.DATA_OUTPUT_URN,
                    payload=buffer_id))],
            downstream_side_inputs=frozenset(),
            must_follow=stage.must_follow)
        flatten_writes.append(flatten_write)
        yield flatten_write

      yield Stage(
          transform.unique_name + '/Read',
          [beam_runner_api_pb2.PTransform(
              unique_name=transform.unique_name + '/Read',
              outputs=transform.outputs,
              spec=beam_runner_api_pb2.FunctionSpec(
                  urn=bundle_processor.DATA_INPUT_URN,
                  payload=buffer_id))],
          downstream_side_inputs=stage.downstream_side_inputs,
          must_follow=union(frozenset(flatten_writes), stage.must_follow))

    else:
      yield stage
Example #12
    def test_sdk_harness_container_images_get_set(self):

        pipeline_options = PipelineOptions([
            '--experiments=beam_fn_api', '--experiments=use_unified_worker',
            '--temp_location', 'gs://any-location/temp'
        ])

        pipeline = Pipeline(options=pipeline_options)
        pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

        test_environment = DockerEnvironment(
            container_image='test_default_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)

        # We have to manually add environments since Dataflow only sets
        # 'sdkHarnessContainerImages' when there are at least two environments.
        dummy_env = beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=(beam_runner_api_pb2.DockerPayload(
                container_image='dummy_image')).SerializeToString())
        proto_pipeline.components.environments['dummy_env_id'].CopyFrom(
            dummy_env)

        dummy_transform = beam_runner_api_pb2.PTransform(
            environment_id='dummy_env_id')
        proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
            dummy_transform)

        env = apiclient.Environment(
            [],  # packages
            pipeline_options,
            '2.0.0',  # any environment version
            FAKE_PIPELINE_URL,
            proto_pipeline,
            _sdk_image_overrides={
                '.*dummy.*': 'dummy_image',
                '.*test.*': 'test_default_image'
            })
        worker_pool = env.proto.workerPools[0]

        # For the test, a third environment gets added since the actual default
        # container image for Dataflow is different from the 'test_default_image'
        # we've provided above.
        self.assertEqual(3, len(worker_pool.sdkHarnessContainerImages))

        # Container image should be overridden by a Dataflow specific URL.
        self.assertTrue(
            str.startswith(
                (worker_pool.sdkHarnessContainerImages[0]).containerImage,
                'gcr.io/cloud-dataflow/v1beta3/python'))
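
The dummy Docker environments added in tests like this one are ordinary Environment protos. A minimal sketch of building one and decoding its payload (the image name is a placeholder):

from apache_beam.portability import common_urns
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.utils import proto_utils

env = beam_runner_api_pb2.Environment(
    urn=common_urns.environments.DOCKER.urn,
    payload=beam_runner_api_pb2.DockerPayload(
        container_image='example/image:tag').SerializeToString())
# The opaque payload decodes back into a DockerPayload.
docker_payload = proto_utils.parse_Bytes(
    env.payload, beam_runner_api_pb2.DockerPayload)
assert docker_payload.container_image == 'example/image:tag'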
Example #13
        def expand_gbk(stages):
            """Transforms each GBK into a write followed by a read.
      """
            for stage in stages:
                assert len(stage.transforms) == 1
                transform = stage.transforms[0]
                if transform.spec.urn == urns.GROUP_BY_KEY_ONLY_TRANSFORM:
                    # This is used later to correlate the read and write.
                    param = str("group:%s" % stage.name)
                    gbk_write = Stage(transform.unique_name + '/Write', [
                        beam_runner_api_pb2.PTransform(
                            unique_name=transform.unique_name + '/Write',
                            inputs=transform.inputs,
                            spec=beam_runner_api_pb2.FunctionSpec(
                                urn=bundle_processor.DATA_OUTPUT_URN,
                                any_param=proto_utils.pack_Any(
                                    wrappers_pb2.BytesValue(value=param)),
                                payload=param))
                    ],
                                      downstream_side_inputs=frozenset(),
                                      must_follow=stage.must_follow)
                    yield gbk_write

                    yield Stage(transform.unique_name + '/Read', [
                        beam_runner_api_pb2.PTransform(
                            unique_name=transform.unique_name + '/Read',
                            outputs=transform.outputs,
                            spec=beam_runner_api_pb2.FunctionSpec(
                                urn=bundle_processor.DATA_INPUT_URN,
                                any_param=proto_utils.pack_Any(
                                    wrappers_pb2.BytesValue(value=param)),
                                payload=param))
                    ],
                                downstream_side_inputs=frozenset(),
                                must_follow=union(frozenset([gbk_write]),
                                                  stage.must_follow))
                else:
                    yield stage
Example #14
def impulse_to_input(stages, pipeline_context):
    """Translates Impulse operations into GRPC reads."""
    for stage in stages:
        for transform in list(stage.transforms):
            if transform.spec.urn == common_urns.primitives.IMPULSE.urn:
                stage.transforms.remove(transform)
                stage.transforms.append(
                    beam_runner_api_pb2.PTransform(
                        unique_name=transform.unique_name,
                        spec=beam_runner_api_pb2.FunctionSpec(
                            urn=bundle_processor.DATA_INPUT_URN,
                            payload=IMPULSE_BUFFER),
                        outputs=transform.outputs))
        yield stage
Example #15
File: pipeline.py  Project: scosenza/beam
    def to_runner_api(self, context):
        # type: (PipelineContext) -> beam_runner_api_pb2.PTransform
        # External transforms require more splicing than just setting the spec.
        from apache_beam.transforms import external
        if isinstance(self.transform, external.ExternalTransform):
            return self.transform.to_runner_api_transform(
                context, self.full_label)

        from apache_beam.portability.api import beam_runner_api_pb2

        def transform_to_runner_api(
            transform,  # type: Optional[ptransform.PTransform]
            context  # type: PipelineContext
        ):
            # type: (...) -> Optional[beam_runner_api_pb2.FunctionSpec]
            if transform is None:
                return None
            else:
                return transform.to_runner_api(context,
                                               has_parts=bool(self.parts))

        # Iterate over inputs and outputs by sorted key order, so that ids are
        # consistently generated for multiple runs of the same pipeline.
        transform_spec = transform_to_runner_api(self.transform, context)
        environment_id = self.environment_id
        transform_urn = transform_spec.urn if transform_spec else None
        if (not environment_id and transform_urn and
            (transform_urn in Pipeline.sdk_transforms_with_environment())):
            environment_id = context.default_environment_id()

        return beam_runner_api_pb2.PTransform(
            unique_name=self.full_label,
            spec=transform_spec,
            subtransforms=[
                context.transforms.get_id(part, label=part.full_label)
                for part in self.parts
            ],
            inputs={
                tag: context.pcollections.get_id(pc)
                for tag, pc in sorted(self.named_inputs().items())
            },
            outputs={
                str(tag): context.pcollections.get_id(out)
                for tag, out in sorted(self.named_outputs().items())
            },
            environment_id=environment_id,
            # TODO(BEAM-366): Add display_data.
            display_data=None)
Example #16
  def test_default_environment_get_set(self):

    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp'
    ])

    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    test_environment = DockerEnvironment(container_image='test_default_image')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=test_environment)

    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=(
            beam_runner_api_pb2.DockerPayload(
                container_image='dummy_image')).SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    env = apiclient.Environment(
        [],  # packages
        pipeline_options,
        '2.0.0',  # any environment version
        FAKE_PIPELINE_URL,
        proto_pipeline,
        _sdk_image_overrides={
            '.*dummy.*': 'dummy_image', '.*test.*': 'test_default_image'
        })
    worker_pool = env.proto.workerPools[0]

    self.assertEqual(2, len(worker_pool.sdkHarnessContainerImages))

    images_from_proto = [
        sdk_info.containerImage
        for sdk_info in worker_pool.sdkHarnessContainerImages
    ]
    self.assertIn('test_default_image', images_from_proto)
Example #17
  def test_pipeline_sdk_not_overridden(self):
    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp',
        '--worker_harness_container_image=dummy_prefix/dummy_name:dummy_tag'
    ])

    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=(
            beam_runner_api_pb2.DockerPayload(
                container_image='dummy_prefix/dummy_name:dummy_tag')
        ).SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, dict(), pipeline_options)

    self.assertEqual(2, len(proto_pipeline.components.environments))

    from apache_beam.utils import proto_utils
    found_override = False
    for env in proto_pipeline.components.environments.values():
      docker_payload = proto_utils.parse_Bytes(
          env.payload, beam_runner_api_pb2.DockerPayload)
      if docker_payload.container_image.startswith(
          names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
        found_override = True

    self.assertFalse(found_override)
Example #18
def fix_flatten_coders(stages, pipeline_context):
  """Ensures that the inputs of Flatten have the same coders as the output.
  """
  pcollections = pipeline_context.components.pcollections
  for stage in stages:
    transform = only_element(stage.transforms)
    if transform.spec.urn == common_urns.primitives.FLATTEN.urn:
      output_pcoll_id = only_element(transform.outputs.values())
      output_coder_id = pcollections[output_pcoll_id].coder_id
      for local_in, pcoll_in in list(transform.inputs.items()):
        if pcollections[pcoll_in].coder_id != output_coder_id:
          # Flatten requires that all its inputs be materialized with the
          # same coder as its output.  Add stages to transcode flatten
          # inputs that use different coders.
          transcoded_pcollection = unique_name(
              pcollections,
              transform.unique_name + '/Transcode/' + local_in + '/out')
          transcode_name = unique_name(
              pipeline_context.components.transforms,
              transform.unique_name + '/Transcode/' + local_in)
          yield Stage(
              transcode_name,
              [beam_runner_api_pb2.PTransform(
                  unique_name=transcode_name,
                  inputs={local_in: pcoll_in},
                  outputs={'out': transcoded_pcollection},
                  spec=beam_runner_api_pb2.FunctionSpec(
                      urn=bundle_processor.IDENTITY_DOFN_URN))],
              downstream_side_inputs=frozenset(),
              must_follow=stage.must_follow)
          pcollections[transcoded_pcollection].CopyFrom(
              pcollections[pcoll_in])
          pcollections[transcoded_pcollection].unique_name = (
              transcoded_pcollection)
          pcollections[transcoded_pcollection].coder_id = output_coder_id
          transform.inputs[local_in] = transcoded_pcollection

    yield stage
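
unique_name is another FnApiRunner-internal helper. A hypothetical standalone version matching the behavior the phases above rely on (an illustrative assumption, not Beam's code):

def unique_name(existing, prefix):
    # Hypothetical sketch: return the prefix if it is unused, otherwise append
    # an increasing counter until the name no longer collides.
    if prefix not in existing:
        return prefix
    counter = 1
    while '%s_%d' % (prefix, counter) in existing:
        counter += 1
    return '%s_%d' % (prefix, counter)

assert unique_name({'a', 'a_1'}, 'a') == 'a_2'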
Example #19
 def _producing_transforms(pcoll_id, leaf=False):
   """Returns PTransforms (and their names) that produces the given PColl."""
   derivation = pipeline_info.derivation(pcoll_id)
   if self._cache_manager.exists('full', derivation.cache_label()):
     if not leaf:
       caches_used.add(pcoll_id)
       yield ('Read' + derivation.cache_label(),
              beam_runner_api_pb2.PTransform(
                  unique_name='Read' + derivation.cache_label(),
                  spec=beam.io.Read(
                      beam.io.ReadFromText(
                          self._cache_manager.glob_path(
                              'full', derivation.cache_label()),
                          coder=SafeFastPrimitivesCoder())._source)
                  .to_runner_api(context),
                  outputs={'None': pcoll_id}))
   else:
     transform_id, _ = pipeline_info.producer(pcoll_id)
     transform_proto = pipeline_proto.components.transforms[transform_id]
     for input_id in transform_proto.inputs.values():
       for transform in _producing_transforms(input_id):
         yield transform
     yield transform_id, transform_proto
Example #20
    def test_runner_api_transformation_properties_none(self,
                                                       unused_mock_pubsub):
        # Confirming that properties stay None after a runner API transformation.
        source = _PubSubSource(topic='projects/fakeprj/topics/a_topic',
                               with_attributes=True)
        transform = Read(source)

        context = pipeline_context.PipelineContext()
        proto_transform_spec = transform.to_runner_api(context)
        self.assertEqual(common_urns.composites.PUBSUB_READ.urn,
                         proto_transform_spec.urn)

        pubsub_read_payload = (proto_utils.parse_Bytes(
            proto_transform_spec.payload,
            beam_runner_api_pb2.PubSubReadPayload))

        proto_transform = beam_runner_api_pb2.PTransform(
            unique_name="dummy_label", spec=proto_transform_spec)

        transform_from_proto = Read.from_runner_api_parameter(
            proto_transform, pubsub_read_payload, None)
        self.assertIsNone(transform_from_proto.source.full_subscription)
        self.assertIsNone(transform_from_proto.source.id_label)
        self.assertIsNone(transform_from_proto.source.timestamp_attribute)
Example #21
        def expand_gbk(stages):
            """Transforms each GBK into a write followed by a read.
      """
            good_coder_urns = set(beam.coders.Coder._known_urns.keys()) - set(
                [urns.PICKLED_CODER])
            coders = pipeline_components.coders

            for coder_id, coder_proto in coders.items():
                if coder_proto.spec.spec.urn == urns.BYTES_CODER:
                    bytes_coder_id = coder_id
                    break
            else:
                bytes_coder_id = unique_name(coders, 'bytes_coder')
                pipeline_components.coders[bytes_coder_id].CopyFrom(
                    beam.coders.BytesCoder().to_runner_api(None))

            coder_substitutions = {}

            def wrap_unknown_coders(coder_id, with_bytes):
                if (coder_id, with_bytes) not in coder_substitutions:
                    wrapped_coder_id = None
                    coder_proto = coders[coder_id]
                    if coder_proto.spec.spec.urn == urns.LENGTH_PREFIX_CODER:
                        coder_substitutions[coder_id, with_bytes] = (
                            bytes_coder_id if with_bytes else coder_id)
                    elif coder_proto.spec.spec.urn in good_coder_urns:
                        wrapped_components = [
                            wrap_unknown_coders(c, with_bytes)
                            for c in coder_proto.component_coder_ids
                        ]
                        if wrapped_components == list(
                                coder_proto.component_coder_ids):
                            # Use as is.
                            coder_substitutions[coder_id,
                                                with_bytes] = coder_id
                        else:
                            wrapped_coder_id = unique_name(
                                coders, coder_id +
                                ("_bytes" if with_bytes else "_len_prefix"))
                            coders[wrapped_coder_id].CopyFrom(coder_proto)
                            coders[wrapped_coder_id].component_coder_ids[:] = [
                                wrap_unknown_coders(c, with_bytes)
                                for c in coder_proto.component_coder_ids
                            ]
                            coder_substitutions[coder_id,
                                                with_bytes] = wrapped_coder_id
                    else:
                        # Not a known coder.
                        if with_bytes:
                            coder_substitutions[coder_id,
                                                with_bytes] = bytes_coder_id
                        else:
                            wrapped_coder_id = unique_name(
                                coders, coder_id + "_len_prefix")
                            len_prefix_coder_proto = beam_runner_api_pb2.Coder(
                                spec=beam_runner_api_pb2.SdkFunctionSpec(
                                    spec=beam_runner_api_pb2.FunctionSpec(
                                        urn=urns.LENGTH_PREFIX_CODER)),
                                component_coder_ids=[coder_id])
                            coders[wrapped_coder_id].CopyFrom(
                                len_prefix_coder_proto)
                            coder_substitutions[coder_id,
                                                with_bytes] = wrapped_coder_id
                    # This operation is idempotent.
                    if wrapped_coder_id:
                        coder_substitutions[wrapped_coder_id,
                                            with_bytes] = wrapped_coder_id
                return coder_substitutions[coder_id, with_bytes]

            def fix_pcoll_coder(pcoll):
                new_coder_id = wrap_unknown_coders(pcoll.coder_id, False)
                safe_coders[new_coder_id] = wrap_unknown_coders(
                    pcoll.coder_id, True)
                pcoll.coder_id = new_coder_id

            for stage in stages:
                assert len(stage.transforms) == 1
                transform = stage.transforms[0]
                if transform.spec.urn == urns.GROUP_BY_KEY_TRANSFORM:
                    for pcoll_id in transform.inputs.values():
                        fix_pcoll_coder(
                            pipeline_components.pcollections[pcoll_id])
                    for pcoll_id in transform.outputs.values():
                        fix_pcoll_coder(
                            pipeline_components.pcollections[pcoll_id])

                    # This is used later to correlate the read and write.
                    param = str("group:%s" % stage.name)
                    gbk_write = Stage(transform.unique_name + '/Write', [
                        beam_runner_api_pb2.PTransform(
                            unique_name=transform.unique_name + '/Write',
                            inputs=transform.inputs,
                            spec=beam_runner_api_pb2.FunctionSpec(
                                urn=bundle_processor.DATA_OUTPUT_URN,
                                payload=param))
                    ],
                                      downstream_side_inputs=frozenset(),
                                      must_follow=stage.must_follow)
                    yield gbk_write

                    yield Stage(transform.unique_name + '/Read', [
                        beam_runner_api_pb2.PTransform(
                            unique_name=transform.unique_name + '/Read',
                            outputs=transform.outputs,
                            spec=beam_runner_api_pb2.FunctionSpec(
                                urn=bundle_processor.DATA_INPUT_URN,
                                payload=param))
                    ],
                                downstream_side_inputs=frozenset(),
                                must_follow=union(frozenset([gbk_write]),
                                                  stage.must_follow))
                else:
                    yield stage
Example #22
  def _map_task_to_protos(self, map_task, data_operation_spec):
    input_data = {}
    side_input_data = {}
    runner_sinks = {}

    context = pipeline_context.PipelineContext()
    transform_protos = {}
    used_pcollections = {}

    def uniquify(*names):
      # An injective mapping from string* to string.
      return ':'.join("%s:%d" % (name, len(name)) for name in names)

    def pcollection_id(op_ix, out_ix):
      if (op_ix, out_ix) not in used_pcollections:
        used_pcollections[op_ix, out_ix] = uniquify(
            map_task[op_ix][0], 'out', str(out_ix))
      return used_pcollections[op_ix, out_ix]

    def get_inputs(op):
      if hasattr(op, 'inputs'):
        inputs = op.inputs
      elif hasattr(op, 'input'):
        inputs = [op.input]
      else:
        inputs = []
      return {'in%s' % ix: pcollection_id(*input)
              for ix, input in enumerate(inputs)}

    def get_outputs(op_ix):
      op = map_task[op_ix][1]
      return {tag: pcollection_id(op_ix, out_ix)
              for out_ix, tag in enumerate(getattr(op, 'output_tags', ['out']))}

    for op_ix, (stage_name, operation) in enumerate(map_task):
      transform_id = uniquify(stage_name)

      if isinstance(operation, operation_specs.WorkerInMemoryWrite):
        # Write this data back to the runner.
        target_name = only_element(get_inputs(operation).keys())
        runner_sinks[(transform_id, target_name)] = operation
        transform_spec = beam_runner_api_pb2.FunctionSpec(
            urn=bundle_processor.DATA_OUTPUT_URN,
            any_param=proto_utils.pack_Any(data_operation_spec),
            payload=data_operation_spec.SerializeToString() \
                if data_operation_spec is not None else None)

      elif isinstance(operation, operation_specs.WorkerRead):
        # A Read from an in-memory source is done over the data plane.
        if (isinstance(operation.source.source,
                       maptask_executor_runner.InMemorySource)
            and isinstance(operation.source.source.default_output_coder(),
                           WindowedValueCoder)):
          target_name = only_element(get_outputs(op_ix).keys())
          input_data[(transform_id, target_name)] = self._reencode_elements(
              operation.source.source.read(None),
              operation.source.source.default_output_coder())
          transform_spec = beam_runner_api_pb2.FunctionSpec(
              urn=bundle_processor.DATA_INPUT_URN,
              any_param=proto_utils.pack_Any(data_operation_spec),
              payload=data_operation_spec.SerializeToString() \
                  if data_operation_spec is not None else None)

        else:
          # Otherwise serialize the source and execute it there.
          # TODO: Use SDFs with an initial impulse.
          # The Dataflow runner harness strips the base64 encoding. Do the same
          # here until we get the same thing back that we sent in.
          source_bytes = base64.b64decode(
              pickler.dumps(operation.source.source))
          transform_spec = beam_runner_api_pb2.FunctionSpec(
              urn=bundle_processor.PYTHON_SOURCE_URN,
              any_param=proto_utils.pack_Any(
                  wrappers_pb2.BytesValue(
                      value=source_bytes)),
              payload=source_bytes)

      elif isinstance(operation, operation_specs.WorkerDoFn):
        # Record the contents of each side input for access via the state api.
        side_input_extras = []
        for si in operation.side_inputs:
          assert isinstance(si.source, iobase.BoundedSource)
          element_coder = si.source.default_output_coder()
          # TODO(robertwb): Actually flesh out the ViewFn API.
          side_input_extras.append((si.tag, element_coder))
          side_input_data[
              bundle_processor.side_input_tag(transform_id, si.tag)] = (
                  self._reencode_elements(
                      si.source.read(si.source.get_range_tracker(None, None)),
                      element_coder))
        augmented_serialized_fn = pickler.dumps(
            (operation.serialized_fn, side_input_extras))
        transform_spec = beam_runner_api_pb2.FunctionSpec(
            urn=bundle_processor.PYTHON_DOFN_URN,
            any_param=proto_utils.pack_Any(
                wrappers_pb2.BytesValue(value=augmented_serialized_fn)),
            payload=augmented_serialized_fn)

      elif isinstance(operation, operation_specs.WorkerFlatten):
        # Flatten is nice and simple.
        transform_spec = beam_runner_api_pb2.FunctionSpec(
            urn=bundle_processor.IDENTITY_DOFN_URN)

      else:
        raise NotImplementedError(operation)

      transform_protos[transform_id] = beam_runner_api_pb2.PTransform(
          unique_name=stage_name,
          spec=transform_spec,
          inputs=get_inputs(operation),
          outputs=get_outputs(op_ix))

    pcollection_protos = {
        name: beam_runner_api_pb2.PCollection(
            unique_name=name,
            coder_id=context.coders.get_id(
                map_task[op_id][1].output_coders[out_id]))
        for (op_id, out_id), name in used_pcollections.items()
    }
    # Must follow creation of pcollection_protos to capture used coders.
    context_proto = context.to_runner_api()
    process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
        id=self._next_uid(),
        transforms=transform_protos,
        pcollections=pcollection_protos,
        coders=dict(context_proto.coders.items()),
        windowing_strategies=dict(context_proto.windowing_strategies.items()),
        environments=dict(context_proto.environments.items()))
    return input_data, side_input_data, runner_sinks, process_bundle_descriptor
Example #23
    def to_runner_api_transform(self, context, full_label):
        pcoll_renames = {}
        renamed_tag_seen = False
        for tag, pcoll in self._inputs.items():
            if tag not in self._expanded_transform.inputs:
                if renamed_tag_seen:
                    raise RuntimeError(
                        'Ambiguity due to non-preserved tags: %s vs %s' %
                        (sorted(self._expanded_transform.inputs.keys()),
                         sorted(self._inputs.keys())))
                else:
                    renamed_tag_seen = True
                    tag, = self._expanded_transform.inputs.keys()
            pcoll_renames[self._expanded_transform.inputs[tag]] = (
                context.pcollections.get_id(pcoll))
        for tag, pcoll in self._outputs.items():
            pcoll_renames[self._expanded_transform.outputs[tag]] = (
                context.pcollections.get_id(pcoll))

        def _equivalent(coder1, coder2):
            return coder1 == coder2 or _normalize(coder1) == _normalize(coder2)

        def _normalize(coder_proto):
            normalized = copy.copy(coder_proto)
            normalized.spec.environment_id = ''
            # TODO(robertwb): Normalize components as well.
            return normalized

        for id, proto in self._expanded_components.coders.items():
            if id.startswith(self._namespace):
                context.coders.put_proto(id, proto)
            elif id in context.coders:
                if not _equivalent(context.coders._id_to_proto[id], proto):
                    raise RuntimeError(
                        'Re-used coder id: %s\n%s\n%s' %
                        (id, context.coders._id_to_proto[id], proto))
            else:
                context.coders.put_proto(id, proto)
        for id, proto in self._expanded_components.windowing_strategies.items(
        ):
            if id.startswith(self._namespace):
                context.windowing_strategies.put_proto(id, proto)
        for id, proto in self._expanded_components.environments.items():
            if id.startswith(self._namespace):
                context.environments.put_proto(id, proto)
        for id, proto in self._expanded_components.pcollections.items():
            id = pcoll_renames.get(id, id)
            if id not in context.pcollections._id_to_obj.keys():
                context.pcollections.put_proto(id, proto)

        for id, proto in self._expanded_components.transforms.items():
            if id.startswith(self._IMPULSE_PREFIX):
                # Our fake inputs.
                continue
            assert id.startswith(self._namespace), (id, self._namespace)
            new_proto = beam_runner_api_pb2.PTransform(
                unique_name=full_label +
                proto.unique_name[len(self._EXPANDED_TRANSFORM_UNIQUE_NAME):],
                spec=proto.spec,
                subtransforms=proto.subtransforms,
                inputs={
                    tag: pcoll_renames.get(pcoll, pcoll)
                    for tag, pcoll in proto.inputs.items()
                },
                outputs={
                    tag: pcoll_renames.get(pcoll, pcoll)
                    for tag, pcoll in proto.outputs.items()
                })
            context.transforms.put_proto(id, new_proto)

        return beam_runner_api_pb2.PTransform(
            unique_name=full_label,
            spec=self._expanded_transform.spec,
            subtransforms=self._expanded_transform.subtransforms,
            inputs=self._expanded_transform.inputs,
            outputs={
                tag: pcoll_renames.get(pcoll, pcoll)
                for tag, pcoll in self._expanded_transform.outputs.items()
            })
Example #24
    def make_process_bundle_descriptor(self, data_api_service_descriptor,
                                       state_api_service_descriptor):
        # type: (Optional[endpoints_pb2.ApiServiceDescriptor], Optional[endpoints_pb2.ApiServiceDescriptor]) -> beam_fn_api_pb2.ProcessBundleDescriptor
        """Creates a ProcessBundleDescriptor for invoking the WindowFn's
    merge operation.
    """
        def make_channel_payload(coder_id):
            # type: (str) -> bytes
            data_spec = beam_fn_api_pb2.RemoteGrpcPort(coder_id=coder_id)
            if data_api_service_descriptor:
                data_spec.api_service_descriptor.url = (
                    data_api_service_descriptor.url)
            return data_spec.SerializeToString()

        pipeline_context = self._execution_context_ref().pipeline_context
        global_windowing_strategy_id = self.uid('global_windowing_strategy')
        global_windowing_strategy_proto = core.Windowing(
            window.GlobalWindows()).to_runner_api(pipeline_context)
        coders = dict(pipeline_context.coders.get_id_to_proto_map())

        def make_coder(urn, *components):
            # type: (str, str) -> str
            coder_proto = beam_runner_api_pb2.Coder(
                spec=beam_runner_api_pb2.FunctionSpec(urn=urn),
                component_coder_ids=components)
            coder_id = self.uid('coder')
            coders[coder_id] = coder_proto
            pipeline_context.coders.put_proto(coder_id, coder_proto)
            return coder_id

        bytes_coder_id = make_coder(common_urns.coders.BYTES.urn)
        window_coder_id = self._windowing_strategy_proto.window_coder_id
        global_window_coder_id = make_coder(
            common_urns.coders.GLOBAL_WINDOW.urn)
        iter_window_coder_id = make_coder(common_urns.coders.ITERABLE.urn,
                                          window_coder_id)
        input_coder_id = make_coder(common_urns.coders.KV.urn, bytes_coder_id,
                                    iter_window_coder_id)
        output_coder_id = make_coder(
            common_urns.coders.KV.urn, bytes_coder_id,
            make_coder(
                common_urns.coders.KV.urn, iter_window_coder_id,
                make_coder(
                    common_urns.coders.ITERABLE.urn,
                    make_coder(common_urns.coders.KV.urn, window_coder_id,
                               iter_window_coder_id))))
        windowed_input_coder_id = make_coder(
            common_urns.coders.WINDOWED_VALUE.urn, input_coder_id,
            global_window_coder_id)
        windowed_output_coder_id = make_coder(
            common_urns.coders.WINDOWED_VALUE.urn, output_coder_id,
            global_window_coder_id)

        self.windowed_input_coder_impl = pipeline_context.coders[
            windowed_input_coder_id].get_impl()
        self.windowed_output_coder_impl = pipeline_context.coders[
            windowed_output_coder_id].get_impl()

        self._bundle_processor_id = self.uid('merge_windows')
        return beam_fn_api_pb2.ProcessBundleDescriptor(
            id=self._bundle_processor_id,
            transforms={
                self.TO_SDK_TRANSFORM:
                beam_runner_api_pb2.PTransform(
                    unique_name='MergeWindows/Read',
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=bundle_processor.DATA_INPUT_URN,
                        payload=make_channel_payload(windowed_input_coder_id)),
                    outputs={'input': 'input'}),
                'Merge':
                beam_runner_api_pb2.PTransform(
                    unique_name='MergeWindows/Merge',
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=common_urns.primitives.MERGE_WINDOWS.urn,
                        payload=self._windowing_strategy_proto.window_fn.
                        SerializeToString()),
                    inputs={'input': 'input'},
                    outputs={'output': 'output'}),
                self.FROM_SDK_TRANSFORM:
                beam_runner_api_pb2.PTransform(
                    unique_name='MergeWindows/Write',
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=bundle_processor.DATA_OUTPUT_URN,
                        payload=make_channel_payload(
                            windowed_output_coder_id)),
                    inputs={'output': 'output'}),
            },
            pcollections={
                'input':
                beam_runner_api_pb2.PCollection(
                    unique_name='input',
                    windowing_strategy_id=global_windowing_strategy_id,
                    coder_id=input_coder_id),
                'output':
                beam_runner_api_pb2.PCollection(
                    unique_name='output',
                    windowing_strategy_id=global_windowing_strategy_id,
                    coder_id=output_coder_id),
            },
            coders=coders,
            windowing_strategies={
                global_windowing_strategy_id: global_windowing_strategy_proto,
            },
            environments=dict(self._execution_context_ref().
                              pipeline_components.environments.items()),
            state_api_service_descriptor=state_api_service_descriptor,
            timer_api_service_descriptor=data_api_service_descriptor)
Example #25
def inject_timer_pcollections(stages, pipeline_context):
    """Create PCollections for fired timers and to-be-set timers.

  At execution time, fired timers and timers-to-set are represented as
  PCollections that are managed by the runner.  This phase adds the
  necessary collections, with their reads and writes, to any stages using
  timers.
  """
    for stage in stages:
        for transform in list(stage.transforms):
            if transform.spec.urn == common_urns.primitives.PAR_DO.urn:
                payload = proto_utils.parse_Bytes(
                    transform.spec.payload, beam_runner_api_pb2.ParDoPayload)
                for tag, spec in payload.timer_specs.items():
                    if len(transform.inputs) > 1:
                        raise NotImplementedError('Timers and side inputs.')
                    input_pcoll = pipeline_context.components.pcollections[
                        next(iter(transform.inputs.values()))]
                    # Create the appropriate coder for the timer PCollection.
                    key_coder_id = input_pcoll.coder_id
                    if (pipeline_context.components.coders[key_coder_id].spec.
                            spec.urn == common_urns.coders.KV.urn):
                        key_coder_id = pipeline_context.components.coders[
                            key_coder_id].component_coder_ids[0]
                    key_timer_coder_id = pipeline_context.add_or_get_coder_id(
                        beam_runner_api_pb2.Coder(
                            spec=beam_runner_api_pb2.SdkFunctionSpec(
                                spec=beam_runner_api_pb2.FunctionSpec(
                                    urn=common_urns.coders.KV.urn)),
                            component_coder_ids=[
                                key_coder_id, spec.timer_coder_id
                            ]))
                    # Inject the read and write pcollections.
                    timer_read_pcoll = unique_name(
                        pipeline_context.components.pcollections,
                        '%s_timers_to_read_%s' % (transform.unique_name, tag))
                    timer_write_pcoll = unique_name(
                        pipeline_context.components.pcollections,
                        '%s_timers_to_write_%s' % (transform.unique_name, tag))
                    pipeline_context.components.pcollections[
                        timer_read_pcoll].CopyFrom(
                            beam_runner_api_pb2.PCollection(
                                unique_name=timer_read_pcoll,
                                coder_id=key_timer_coder_id,
                                windowing_strategy_id=input_pcoll.
                                windowing_strategy_id,
                                is_bounded=input_pcoll.is_bounded))
                    pipeline_context.components.pcollections[
                        timer_write_pcoll].CopyFrom(
                            beam_runner_api_pb2.PCollection(
                                unique_name=timer_write_pcoll,
                                coder_id=key_timer_coder_id,
                                windowing_strategy_id=input_pcoll.
                                windowing_strategy_id,
                                is_bounded=input_pcoll.is_bounded))
                    stage.transforms.append(
                        beam_runner_api_pb2.PTransform(
                            unique_name=timer_read_pcoll + '/Read',
                            outputs={'out': timer_read_pcoll},
                            spec=beam_runner_api_pb2.FunctionSpec(
                                urn=bundle_processor.DATA_INPUT_URN,
                                payload=create_buffer_id(timer_read_pcoll,
                                                         kind='timers'))))
                    stage.transforms.append(
                        beam_runner_api_pb2.PTransform(
                            unique_name=timer_write_pcoll + '/Write',
                            inputs={'in': timer_write_pcoll},
                            spec=beam_runner_api_pb2.FunctionSpec(
                                urn=bundle_processor.DATA_OUTPUT_URN,
                                payload=create_buffer_id(timer_write_pcoll,
                                                         kind='timers'))))
                    assert tag not in transform.inputs
                    transform.inputs[tag] = timer_read_pcoll
                    assert tag not in transform.outputs
                    transform.outputs[tag] = timer_write_pcoll
                    stage.timer_pcollections.append(
                        (timer_read_pcoll + '/Read', timer_write_pcoll))
        yield stage
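# NOTE: create_buffer_id is referenced above but not shown in this excerpt.
# A minimal sketch of what it is assumed to do -- tag the PCollection name
# with a kind so the runner can match the injected Read/Write transforms to
# the timer buffer it manages (the body below is an assumption, not the
# verbatim implementation):
def create_buffer_id(name, kind='materialize'):
  # e.g. b'timers:MyParDo_timers_to_read_my_timer'
  return ('%s:%s' % (kind, name)).encode('utf-8')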
def lift_combiners(stages, context):
    """Expands CombinePerKey into pre- and post-grouping stages.

  ... -> CombinePerKey -> ...

  becomes

  ... -> PreCombine -> GBK -> MergeAccumulators -> ExtractOutput -> ...
  """
    for stage in stages:
        assert len(stage.transforms) == 1
        transform = stage.transforms[0]
        if transform.spec.urn == common_urns.composites.COMBINE_PER_KEY.urn:
            combine_payload = proto_utils.parse_Bytes(
                transform.spec.payload, beam_runner_api_pb2.CombinePayload)

            input_pcoll = context.components.pcollections[only_element(
                list(transform.inputs.values()))]
            output_pcoll = context.components.pcollections[only_element(
                list(transform.outputs.values()))]

            element_coder_id = input_pcoll.coder_id
            element_coder = context.components.coders[element_coder_id]
            key_coder_id, _ = element_coder.component_coder_ids
            accumulator_coder_id = combine_payload.accumulator_coder_id

            key_accumulator_coder = beam_runner_api_pb2.Coder(
                spec=beam_runner_api_pb2.SdkFunctionSpec(
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=common_urns.coders.KV.urn)),
                component_coder_ids=[key_coder_id, accumulator_coder_id])
            key_accumulator_coder_id = context.add_or_get_coder_id(
                key_accumulator_coder)

            accumulator_iter_coder = beam_runner_api_pb2.Coder(
                spec=beam_runner_api_pb2.SdkFunctionSpec(
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=common_urns.coders.ITERABLE.urn)),
                component_coder_ids=[accumulator_coder_id])
            accumulator_iter_coder_id = context.add_or_get_coder_id(
                accumulator_iter_coder)

            key_accumulator_iter_coder = beam_runner_api_pb2.Coder(
                spec=beam_runner_api_pb2.SdkFunctionSpec(
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=common_urns.coders.KV.urn)),
                component_coder_ids=[key_coder_id, accumulator_iter_coder_id])
            key_accumulator_iter_coder_id = context.add_or_get_coder_id(
                key_accumulator_iter_coder)

            precombined_pcoll_id = unique_name(context.components.pcollections,
                                               'pcollection')
            context.components.pcollections[precombined_pcoll_id].CopyFrom(
                beam_runner_api_pb2.PCollection(
                    unique_name=transform.unique_name + '/Precombine.out',
                    coder_id=key_accumulator_coder_id,
                    windowing_strategy_id=input_pcoll.windowing_strategy_id,
                    is_bounded=input_pcoll.is_bounded))

            grouped_pcoll_id = unique_name(context.components.pcollections,
                                           'pcollection')
            context.components.pcollections[grouped_pcoll_id].CopyFrom(
                beam_runner_api_pb2.PCollection(
                    unique_name=transform.unique_name + '/Group.out',
                    coder_id=key_accumulator_iter_coder_id,
                    windowing_strategy_id=output_pcoll.windowing_strategy_id,
                    is_bounded=output_pcoll.is_bounded))

            merged_pcoll_id = unique_name(context.components.pcollections,
                                          'pcollection')
            context.components.pcollections[merged_pcoll_id].CopyFrom(
                beam_runner_api_pb2.PCollection(
                    unique_name=transform.unique_name + '/Merge.out',
                    coder_id=key_accumulator_coder_id,
                    windowing_strategy_id=output_pcoll.windowing_strategy_id,
                    is_bounded=output_pcoll.is_bounded))

            def make_stage(base_stage, transform):
                return Stage(
                    transform.unique_name, [transform],
                    downstream_side_inputs=base_stage.downstream_side_inputs,
                    must_follow=base_stage.must_follow,
                    parent=base_stage.name)

            yield make_stage(
                stage,
                beam_runner_api_pb2.PTransform(
                    unique_name=transform.unique_name + '/Precombine',
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=common_urns.combine_components.
                        COMBINE_PER_KEY_PRECOMBINE.urn,
                        payload=transform.spec.payload),
                    inputs=transform.inputs,
                    outputs={'out': precombined_pcoll_id}))

            yield make_stage(
                stage,
                beam_runner_api_pb2.PTransform(
                    unique_name=transform.unique_name + '/Group',
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=common_urns.primitives.GROUP_BY_KEY.urn),
                    inputs={'in': precombined_pcoll_id},
                    outputs={'out': grouped_pcoll_id}))

            yield make_stage(
                stage,
                beam_runner_api_pb2.PTransform(
                    unique_name=transform.unique_name + '/Merge',
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=common_urns.combine_components.
                        COMBINE_PER_KEY_MERGE_ACCUMULATORS.urn,
                        payload=transform.spec.payload),
                    inputs={'in': grouped_pcoll_id},
                    outputs={'out': merged_pcoll_id}))

            yield make_stage(
                stage,
                beam_runner_api_pb2.PTransform(
                    unique_name=transform.unique_name + '/ExtractOutputs',
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=common_urns.combine_components.
                        COMBINE_PER_KEY_EXTRACT_OUTPUTS.urn,
                        payload=transform.spec.payload),
                    inputs={'in': merged_pcoll_id},
                    outputs=transform.outputs))

        else:
            yield stage
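# For orientation, a toy CombineFn (MeanFn is an illustrative name, not part
# of the code above) showing which of its methods the lifted stages exercise:
#   Precombine        -> create_accumulator() / add_input() before the GBK
#   MergeAccumulators -> merge_accumulators() on the grouped accumulators
#   ExtractOutputs    -> extract_output() to produce the final values
import apache_beam as beam

class MeanFn(beam.CombineFn):
  def create_accumulator(self):
    return (0.0, 0)  # (running sum, element count)

  def add_input(self, accumulator, value):
    total, count = accumulator
    return total + value, count + 1

  def merge_accumulators(self, accumulators):
    totals, counts = zip(*accumulators)
    return sum(totals), sum(counts)

  def extract_output(self, accumulator):
    total, count = accumulator
    return total / count if count else float('nan')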
示例#27
0
  def executable_stage_transform(
      self, known_runner_urns, all_consumers, components):
    if (len(self.transforms) == 1
        and self.transforms[0].spec.urn in known_runner_urns):
      return self.transforms[0]

    else:
      all_inputs = set(
          pcoll for t in self.transforms for pcoll in t.inputs.values())
      all_outputs = set(
          pcoll for t in self.transforms for pcoll in t.outputs.values())
      internal_transforms = set(id(t) for t in self.transforms)
      external_outputs = [pcoll for pcoll in all_outputs
                          if all_consumers[pcoll] - internal_transforms]

      stage_components = beam_runner_api_pb2.Components()
      stage_components.CopyFrom(components)

      # Only keep the referenced PCollections.
      for pcoll_id in stage_components.pcollections.keys():
        if pcoll_id not in all_inputs and pcoll_id not in all_outputs:
          del stage_components.pcollections[pcoll_id]

      # Only keep the transforms in this stage.
      # Also gather up payload data as we iterate over the transforms.
      stage_components.transforms.clear()
      main_inputs = set()
      side_inputs = []
      user_states = []
      timers = []
      for ix, transform in enumerate(self.transforms):
        transform_id = 'transform_%d' % ix
        if transform.spec.urn == common_urns.primitives.PAR_DO.urn:
          payload = proto_utils.parse_Bytes(
              transform.spec.payload, beam_runner_api_pb2.ParDoPayload)
          for tag in payload.side_inputs.keys():
            side_inputs.append(
                beam_runner_api_pb2.ExecutableStagePayload.SideInputId(
                    transform_id=transform_id,
                    local_name=tag))
          for tag in payload.state_specs.keys():
            user_states.append(
                beam_runner_api_pb2.ExecutableStagePayload.UserStateId(
                    transform_id=transform_id,
                    local_name=tag))
          for tag in payload.timer_specs.keys():
            timers.append(
                beam_runner_api_pb2.ExecutableStagePayload.TimerId(
                    transform_id=transform_id,
                    local_name=tag))
          main_inputs.update(
              pcoll_id
              for tag, pcoll_id in transform.inputs.items()
              if tag not in payload.side_inputs)
        else:
          main_inputs.update(transform.inputs.values())
        stage_components.transforms[transform_id].CopyFrom(transform)

      main_input_id = only_element(main_inputs - all_outputs)
      named_inputs = dict({
          '%s:%s' % (side.transform_id, side.local_name):
          stage_components.transforms[side.transform_id].inputs[side.local_name]
          for side in side_inputs
      }, main_input=main_input_id)
      payload = beam_runner_api_pb2.ExecutableStagePayload(
          environment=components.environments[self.environment],
          input=main_input_id,
          outputs=external_outputs,
          transforms=stage_components.transforms.keys(),
          components=stage_components,
          side_inputs=side_inputs,
          user_states=user_states,
          timers=timers)

      return beam_runner_api_pb2.PTransform(
          unique_name=unique_name(None, self.name),
          spec=beam_runner_api_pb2.FunctionSpec(
              urn='beam:runner:executable_stage:v1',
              payload=payload.SerializeToString()),
          inputs=named_inputs,
          outputs={'output_%d' % ix: pcoll
                   for ix, pcoll in enumerate(external_outputs)})
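# A sketch of how a consumer of the returned PTransform could recover the
# ExecutableStagePayload, mirroring the parse_Bytes pattern used for
# ParDoPayload above (`stage_transform` is an illustrative variable name):
def stage_payload(stage_transform):
  assert stage_transform.spec.urn == 'beam:runner:executable_stage:v1'
  return proto_utils.parse_Bytes(
      stage_transform.spec.payload,
      beam_runner_api_pb2.ExecutableStagePayload)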
示例#28
0
        def sink_flattens(stages):
            """Sink flattens and remove them from the graph.

      A flatten that cannot be sunk/fused away becomes multiple writes (to the
      same logical sink) followed by a read.
      """
            # TODO(robertwb): Actually attempt to sink rather than always materialize.
            # TODO(robertwb): Possibly fuse this into one of the stages.
            pcollections = pipeline_components.pcollections
            for stage in stages:
                assert len(stage.transforms) == 1
                transform = stage.transforms[0]
                if transform.spec.urn == urns.FLATTEN_TRANSFORM:
                    # This is used later to correlate the read and writes.
                    param = str("materialize:%s" % transform.unique_name)
                    output_pcoll_id, = transform.outputs.values()
                    output_coder_id = pcollections[output_pcoll_id].coder_id
                    flatten_writes = []
                    for local_in, pcoll_in in transform.inputs.items():

                        if pcollections[pcoll_in].coder_id != output_coder_id:
                            # Flatten inputs must all be written with the same coder as is
                            # used to read them.
                            pcollections[pcoll_in].coder_id = output_coder_id
                            transcoded_pcollection = (transform.unique_name +
                                                      '/Transcode/' +
                                                      local_in + '/out')
                            yield Stage(
                                transform.unique_name + '/Transcode/' +
                                local_in, [
                                    beam_runner_api_pb2.PTransform(
                                        unique_name=transform.unique_name +
                                        '/Transcode/' + local_in,
                                        inputs={local_in: pcoll_in},
                                        outputs={
                                            'out': transcoded_pcollection
                                        },
                                        spec=beam_runner_api_pb2.FunctionSpec(
                                            urn=bundle_processor.
                                            IDENTITY_DOFN_URN))
                                ],
                                downstream_side_inputs=frozenset(),
                                must_follow=stage.must_follow)
                            pcollections[transcoded_pcollection].CopyFrom(
                                pcollections[pcoll_in])
                            pcollections[
                                transcoded_pcollection].coder_id = output_coder_id
                        else:
                            transcoded_pcollection = pcoll_in

                        flatten_write = Stage(
                            transform.unique_name + '/Write/' + local_in, [
                                beam_runner_api_pb2.PTransform(
                                    unique_name=transform.unique_name +
                                    '/Write/' + local_in,
                                    inputs={local_in: transcoded_pcollection},
                                    spec=beam_runner_api_pb2.FunctionSpec(
                                        urn=bundle_processor.DATA_OUTPUT_URN,
                                        payload=param))
                            ],
                            downstream_side_inputs=frozenset(),
                            must_follow=stage.must_follow)
                        flatten_writes.append(flatten_write)
                        yield flatten_write

                    yield Stage(transform.unique_name + '/Read', [
                        beam_runner_api_pb2.PTransform(
                            unique_name=transform.unique_name + '/Read',
                            outputs=transform.outputs,
                            spec=beam_runner_api_pb2.FunctionSpec(
                                urn=bundle_processor.DATA_INPUT_URN,
                                payload=param))
                    ],
                                downstream_side_inputs=frozenset(),
                                must_follow=union(frozenset(flatten_writes),
                                                  stage.must_follow))

                else:
                    yield stage
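# Why the shared `param` above matters: at execution time the runner can key a
# buffer by that id so every sunk Write appends to, and the single Read drains,
# the same collection of elements.  Illustrative sketch only -- the real
# buffering lives in the runner's data plane, not in a plain dict:
import collections

flatten_buffers = collections.defaultdict(list)

def handle_data_output(buffer_id, elements):
  flatten_buffers[buffer_id].extend(elements)  # one call per sunk Write

def handle_data_input(buffer_id):
  return flatten_buffers[buffer_id]  # the single Read sees all inputs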
示例#29
0
        def greedily_fuse(stages):
            """Places transforms sharing an edge in the same stage, whenever possible.
      """
            producers_by_pcoll = {}
            consumers_by_pcoll = collections.defaultdict(list)

            # Used to always reference the correct stage as the producer and
            # consumer maps are not updated when stages are fused away.
            replacements = {}

            def replacement(s):
                old_ss = []
                while s in replacements:
                    old_ss.append(s)
                    s = replacements[s]
                for old_s in old_ss[:-1]:
                    replacements[old_s] = s
                return s

            def fuse(producer, consumer):
                fused = producer.fuse(consumer)
                replacements[producer] = fused
                replacements[consumer] = fused

            # First record the producers and consumers of each PCollection.
            for stage in stages:
                for transform in stage.transforms:
                    for input in transform.inputs.values():
                        consumers_by_pcoll[input].append(stage)
                    for output in transform.outputs.values():
                        producers_by_pcoll[output] = stage

            logging.debug('consumers\n%s', consumers_by_pcoll)
            logging.debug('producers\n%s', producers_by_pcoll)

            # Now try to fuse away all pcollections.
            for pcoll, producer in producers_by_pcoll.items():
                pcoll_as_param = str("materialize:%s" % pcoll)
                write_pcoll = None
                for consumer in consumers_by_pcoll[pcoll]:
                    producer = replacement(producer)
                    consumer = replacement(consumer)
                    # Update consumer.must_follow set, as it's used in can_fuse.
                    consumer.must_follow = frozenset(
                        replacement(s) for s in consumer.must_follow)
                    if producer.can_fuse(consumer):
                        fuse(producer, consumer)
                    else:
                        # If we can't fuse, do a read + write.
                        if write_pcoll is None:
                            write_pcoll = Stage(pcoll + '/Write', [
                                beam_runner_api_pb2.PTransform(
                                    unique_name=pcoll + '/Write',
                                    inputs={'in': pcoll},
                                    spec=beam_runner_api_pb2.FunctionSpec(
                                        urn=bundle_processor.DATA_OUTPUT_URN,
                                        payload=pcoll_as_param))
                            ])
                            fuse(producer, write_pcoll)
                        if consumer.has_as_main_input(pcoll):
                            read_pcoll = Stage(pcoll + '/Read', [
                                beam_runner_api_pb2.PTransform(
                                    unique_name=pcoll + '/Read',
                                    outputs={'out': pcoll},
                                    spec=beam_runner_api_pb2.FunctionSpec(
                                        urn=bundle_processor.DATA_INPUT_URN,
                                        payload=pcoll_as_param))
                            ],
                                               must_follow=frozenset(
                                                   [write_pcoll]))
                            fuse(read_pcoll, consumer)
                        else:
                            consumer.must_follow = union(
                                consumer.must_follow, frozenset([write_pcoll]))

            # Everything that was originally a stage or a replacement, but wasn't
            # replaced, should be in the final graph.
            final_stages = frozenset(stages).union(
                replacements.values()).difference(replacements.keys())

            for stage in final_stages:
                # Update all references to their final values before throwing
                # the replacement data away.
                stage.must_follow = frozenset(
                    replacement(s) for s in stage.must_follow)
                # Two reads of the same stage may have been fused.  This is unneeded.
                stage.deduplicate_read()
            return final_stages
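# To make the replacement() bookkeeping concrete, the same logic rerun on
# hypothetical stage names (plain strings stand in for Stage objects):
replacements = {'A': 'AB', 'B': 'AB', 'AB': 'ABC', 'C': 'ABC'}

def replacement(s):
  old_ss = []
  while s in replacements:
    old_ss.append(s)
    s = replacements[s]
  for old_s in old_ss[:-1]:
    replacements[old_s] = s
  return s

assert replacement('A') == 'ABC'   # walks A -> AB -> ABC
assert replacements['A'] == 'ABC'  # the chain is compressed for next time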
示例#30
0
    def _analyze_pipeline(self):
        """Analyzes the pipeline and sets the variables that can be queried.

        This function constructs the Pipeline proto to execute by:
          1. Starting from the target PCollections and recursively inserting
             the producing PTransforms of those PCollections, where the
             producing PTransforms are either ReadCache or PTransforms in the
             original pipeline.
          2. Appending WriteCache PTransforms to the pipeline.

        After running this function, the following variables will be set:
          self._pipeline_proto_to_execute
          self._top_level_referenced_pcoll_ids
          self._top_level_required_transforms
          self._caches_used
          self._read_cache_ids
          self._write_cache_ids
        """
        # We filter PTransforms to be executed bottom-up from these PCollections.
        desired_pcollections = self._desired_pcollections(self._pipeline_info)

        required_transforms = collections.OrderedDict()
        top_level_required_transforms = collections.OrderedDict()

        for pcoll_id in desired_pcollections:
            # TODO(qinyeli): Collections consumed by no-output transforms.
            self._insert_producing_transforms(pcoll_id, required_transforms,
                                              top_level_required_transforms)

        top_level_referenced_pcoll_ids = self._referenced_pcoll_ids(
            top_level_required_transforms)

        for pcoll_id in self._pipeline_info.all_pcollections():
            if not pcoll_id in top_level_referenced_pcoll_ids:
                continue

            if (pcoll_id in desired_pcollections
                    and not pcoll_id in self._caches_used):
                self._insert_caching_transforms(pcoll_id, required_transforms,
                                                top_level_required_transforms)

            if not self._cache_manager.exists(
                    'sample', self._pipeline_info.cache_label(pcoll_id)):
                self._insert_caching_transforms(pcoll_id,
                                                required_transforms,
                                                top_level_required_transforms,
                                                sample=True)

        required_transforms['_root'] = beam_runner_api_pb2.PTransform(
            subtransforms=list(top_level_required_transforms))

        referenced_pcoll_ids = self._referenced_pcoll_ids(required_transforms)
        referenced_pcollections = {}
        for pcoll_id in referenced_pcoll_ids:
            obj = self._context.pcollections.get_by_id(pcoll_id)
            proto = self._context.pcollections.get_proto(obj)
            referenced_pcollections[pcoll_id] = proto

        pipeline_to_execute = beam_runner_api_pb2.Pipeline()
        pipeline_to_execute.root_transform_ids[:] = ['_root']
        set_proto_map(pipeline_to_execute.components.transforms,
                      required_transforms)
        set_proto_map(pipeline_to_execute.components.pcollections,
                      referenced_pcollections)
        set_proto_map(pipeline_to_execute.components.coders,
                      self._context.to_runner_api().coders)
        set_proto_map(pipeline_to_execute.components.windowing_strategies,
                      self._context.to_runner_api().windowing_strategies)

        self._pipeline_proto_to_execute = pipeline_to_execute
        self._top_level_referenced_pcoll_ids = top_level_referenced_pcoll_ids
        self._top_level_required_transforms = top_level_required_transforms
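# set_proto_map is not shown in this excerpt.  A plausible minimal version,
# assuming its job is simply to replace the contents of a protobuf map field
# (map fields cannot be assigned with `=`, hence clear() plus per-key CopyFrom):
def set_proto_map(proto_map, new_value):
  proto_map.clear()
  for key, value in new_value.items():
    proto_map[key].CopyFrom(value)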