Example #1
def _get_transform_overrides(pipeline_options):
    # A list of PTransformOverride objects to be applied before running a pipeline
    # using DirectRunner.
    # Currently this only works for overrides where the input and output types do
    # not change.
    # For internal use only; no backwards-compatibility guarantees.

    # Importing the following locally to avoid a circular dependency.
    from apache_beam.pipeline import PTransformOverride
    from apache_beam.runners.direct.helper_transforms import LiftedCombinePerKey
    from apache_beam.runners.direct.sdf_direct_runner import ProcessKeyedElementsViaKeyedWorkItemsOverride
    from apache_beam.runners.direct.sdf_direct_runner import SplittableParDoOverride

    class CombinePerKeyOverride(PTransformOverride):
        def matches(self, applied_ptransform):
            if isinstance(applied_ptransform.transform, CombinePerKey):
                return applied_ptransform.inputs[0].windowing.is_default()

        def get_replacement_transform_for_applied_ptransform(
                self, applied_ptransform):
            # TODO: Move imports to top. The Pipeline <-> Runner dependency causes
            # problems with resolving imports when they are at the top.
            # pylint: disable=wrong-import-position
            try:
                transform = applied_ptransform.transform
                return LiftedCombinePerKey(transform.fn, transform.args,
                                           transform.kwargs)
            except NotImplementedError:
                return transform

    class StreamingGroupByKeyOverride(PTransformOverride):
        def matches(self, applied_ptransform):
            # Note: we match the exact class, since we replace it with a subclass.
            return applied_ptransform.transform.__class__ == _GroupByKeyOnly

        def get_replacement_transform_for_applied_ptransform(
                self, applied_ptransform):
            # Use specialized streaming implementation.
            transform = _StreamingGroupByKeyOnly()
            return transform

    class StreamingGroupAlsoByWindowOverride(PTransformOverride):
        def matches(self, applied_ptransform):
            # Note: we match the exact class, since we replace it with a subclass.
            transform = applied_ptransform.transform
            return (isinstance(applied_ptransform.transform, ParDo)
                    and isinstance(transform.dofn, _GroupAlsoByWindowDoFn)
                    and transform.__class__ != _StreamingGroupAlsoByWindow)

        def get_replacement_transform_for_applied_ptransform(
                self, applied_ptransform):
            # Use specialized streaming implementation.
            transform = _StreamingGroupAlsoByWindow(
                applied_ptransform.transform.dofn.windowing)
            return transform

    class TestStreamOverride(PTransformOverride):
        def matches(self, applied_ptransform):
            from apache_beam.testing.test_stream import TestStream
            self.applied_ptransform = applied_ptransform
            return isinstance(applied_ptransform.transform, TestStream)

        def get_replacement_transform_for_applied_ptransform(
                self, applied_ptransform):
            from apache_beam.runners.direct.test_stream_impl import _ExpandableTestStream
            return _ExpandableTestStream(applied_ptransform.transform)

    class GroupByKeyPTransformOverride(PTransformOverride):
        """A ``PTransformOverride`` for ``GroupByKey``.

        This replaces the Beam implementation as a primitive.
        """
        def matches(self, applied_ptransform):
            # Imported here to avoid circular dependencies.
            # pylint: disable=wrong-import-order, wrong-import-position
            from apache_beam.transforms.core import GroupByKey
            return isinstance(applied_ptransform.transform, GroupByKey)

        def get_replacement_transform_for_applied_ptransform(
                self, applied_ptransform):
            return _GroupByKey()

    overrides = [
        # This needs to be the first and the last override. Other overrides depend
        # on the GroupByKey implementation to be composed of _GroupByKeyOnly and
        # _GroupAlsoByWindow.
        GroupByKeyPTransformOverride(),
        SplittableParDoOverride(),
        ProcessKeyedElementsViaKeyedWorkItemsOverride(),
        CombinePerKeyOverride(),
        TestStreamOverride(),
    ]

    # Add streaming overrides, if necessary.
    if pipeline_options.view_as(StandardOptions).streaming:
        overrides.append(StreamingGroupByKeyOverride())
        overrides.append(StreamingGroupAlsoByWindowOverride())

    # Add PubSub overrides, if PubSub is available.
    try:
        from apache_beam.io.gcp import pubsub as unused_pubsub
        overrides += _get_pubsub_transform_overrides(pipeline_options)
    except ImportError:
        pass

    # This also needs to be last because other transforms apply GBKs which need to
    # be translated into a DirectRunner-compatible transform.
    overrides.append(GroupByKeyPTransformOverride())

    return overrides
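
A minimal usage sketch for the function above, assuming apache_beam is installed. DirectRunner builds this list itself when it runs a pipeline; applying it by hand via Pipeline.replace_all just makes the mechanics visible. The pipeline contents below are arbitrary illustration and not part of the original example, and _get_transform_overrides is internal API with no compatibility guarantees.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
# Internal helper shown above; it lives in the DirectRunner module.
from apache_beam.runners.direct.direct_runner import _get_transform_overrides

# streaming=True also pulls in the StreamingGroupByKey / GroupAlsoByWindow overrides.
options = PipelineOptions(streaming=True)

pipeline = beam.Pipeline(options=options)
_ = (
    pipeline
    | beam.Create([('k', 1), ('k', 2)])
    | beam.CombinePerKey(sum))

overrides = _get_transform_overrides(options)

# Each override is matched against the applied transforms and substituted in
# place; input and output types must not change.
pipeline.replace_all(overrides)
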
Example #2
def _get_transform_overrides(pipeline_options):
  # A list of PTransformOverride objects to be applied before running a pipeline
  # using DirectRunner.
  # Currently this only works for overrides where the input and output types do
  # not change.
  # For internal use only; no backwards-compatibility guarantees.

  # Importing the following locally to avoid a circular dependency.
  from apache_beam.pipeline import PTransformOverride
  from apache_beam.runners.direct.helper_transforms import LiftedCombinePerKey
  from apache_beam.runners.direct.sdf_direct_runner import ProcessKeyedElementsViaKeyedWorkItemsOverride
  from apache_beam.runners.direct.sdf_direct_runner import SplittableParDoOverride

  class CombinePerKeyOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      if isinstance(applied_ptransform.transform, CombinePerKey):
        return applied_ptransform.inputs[0].windowing.is_default()

    def get_replacement_transform(self, transform):
      # TODO: Move imports to top. The Pipeline <-> Runner dependency causes
      # problems with resolving imports when they are at the top.
      # pylint: disable=wrong-import-position
      try:
        return LiftedCombinePerKey(
            transform.fn, transform.args, transform.kwargs)
      except NotImplementedError:
        return transform

  class StreamingGroupByKeyOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      # Note: we match the exact class, since we replace it with a subclass.
      return applied_ptransform.transform.__class__ == _GroupByKeyOnly

    def get_replacement_transform(self, transform):
      # Use specialized streaming implementation.
      transform = _StreamingGroupByKeyOnly()
      return transform

  class StreamingGroupAlsoByWindowOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      # Note: we match the exact class, since we replace it with a subclass.
      transform = applied_ptransform.transform
      return (
          isinstance(applied_ptransform.transform, ParDo) and
          isinstance(transform.dofn, _GroupAlsoByWindowDoFn) and
          transform.__class__ != _StreamingGroupAlsoByWindow)

    def get_replacement_transform(self, transform):
      # Use specialized streaming implementation.
      transform = _StreamingGroupAlsoByWindow(transform.dofn.windowing)
      return transform

  overrides = [
      SplittableParDoOverride(),
      ProcessKeyedElementsViaKeyedWorkItemsOverride(),
      CombinePerKeyOverride()
  ]

  # Add streaming overrides, if necessary.
  if pipeline_options.view_as(StandardOptions).streaming:
    overrides.append(StreamingGroupByKeyOverride())
    overrides.append(StreamingGroupAlsoByWindowOverride())

  # Add PubSub overrides, if PubSub is available.
  try:
    from apache_beam.io.gcp import pubsub as unused_pubsub
    overrides += _get_pubsub_transform_overrides(pipeline_options)
  except ImportError:
    pass

  return overrides
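
The two examples differ in which PTransformOverride hook they implement: Example #1 overrides get_replacement_transform_for_applied_ptransform(applied_ptransform), which receives the whole AppliedPTransform node, while Example #2 overrides the older get_replacement_transform(transform), which only receives the transform instance. Below is a minimal sketch of a custom override written against the newer hook; the _Identity and _LabelledIdentity transforms are hypothetical placeholders rather than Beam APIs, and on older SDKs that only expose get_replacement_transform that method would have to be implemented instead.

import apache_beam as beam
from apache_beam.pipeline import PTransformOverride


class _Identity(beam.PTransform):
  # Hypothetical transform that the override below will replace.
  def expand(self, pcoll):
    return pcoll | beam.Map(lambda x: x)


class _LabelledIdentity(beam.PTransform):
  # Hypothetical replacement; input and output types are unchanged, which is
  # the constraint both examples state for DirectRunner overrides.
  def expand(self, pcoll):
    return pcoll | 'Labelled' >> beam.Map(lambda x: x)


class _IdentityOverride(PTransformOverride):
  def matches(self, applied_ptransform):
    # True for every applied transform that should be swapped out.
    return isinstance(applied_ptransform.transform, _Identity)

  def get_replacement_transform_for_applied_ptransform(self, applied_ptransform):
    # applied_ptransform also exposes the original transform and its inputs
    # if the replacement needs them.
    return _LabelledIdentity()


pipeline = beam.Pipeline()
_ = pipeline | beam.Create([1, 2, 3]) | 'Ident' >> _Identity()
pipeline.replace_all([_IdentityOverride()])
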