def setUp(self):
    """Build the per-test fixtures: a DirectRunner pipeline and a visitor."""
    self.pipeline = Pipeline(DirectRunner())
    self.visitor = ConsumerTrackingPipelineVisitor()
    # Python 2 test cases expose assertItemsEqual; when it is present, alias
    # it as assertCountEqual. On Python 3 the attribute is missing and
    # nothing is changed.
    legacy_assert = getattr(self, 'assertItemsEqual', None)
    if legacy_assert is not None:
        self.assertCountEqual = legacy_assert
Exemplo n.º 2
0
    def run_pipeline(self, pipeline, options):
        """Run the whole pipeline and hand back a DirectPipelineResult.

        Args:
          pipeline: the pipeline to execute. Configured PTransform overrides
            are applied to it in place before execution.
          options: the pipeline options handed to the evaluation context and
            consulted (as DirectOptions) for the stacked-bundle setting.

        Returns:
          A DirectPipelineResult built from the started executor and its
          evaluation context.
        """
        # TODO: Move imports to top. Pipeline <-> Runner dependency cause
        # problems with resolving imports when they are at top.
        # pylint: disable=wrong-import-position
        from apache_beam.pipeline import PipelineVisitor
        from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import (
            ConsumerTrackingPipelineVisitor)
        from apache_beam.runners.direct.evaluation_context import (
            EvaluationContext)
        from apache_beam.runners.direct.executor import Executor
        from apache_beam.runners.direct.transform_evaluator import (
            TransformEvaluatorRegistry)
        from apache_beam.testing.test_stream import TestStream

        # Apply the configured PTransform overrides first.
        pipeline.replace_all(_get_transform_overrides(options))

        class _TestStreamDetector(PipelineVisitor):
            """Visitor that records whether any transform is a TestStream."""
            def __init__(self):
                self.uses_test_stream = False

            def visit_transform(self, applied_ptransform):
                if not isinstance(applied_ptransform.transform, TestStream):
                    return
                self.uses_test_stream = True

        # A pipeline that uses the TestStream I/O gets a mock test clock.
        detector = _TestStreamDetector()
        pipeline.visit(detector)
        if detector.uses_test_stream:
            clock = TestClock()
        else:
            clock = RealClock()

        # TODO(BEAM-4274): Circular import runners-metrics. Requires
        # refactoring.
        from apache_beam.metrics.execution import MetricsEnvironment
        MetricsEnvironment.set_metrics_supported(True)
        logging.info('Running pipeline with DirectRunner.')

        tracker = ConsumerTrackingPipelineVisitor()
        self.consumer_tracking_visitor = tracker
        pipeline.visit(tracker)

        direct_options = options.view_as(DirectOptions)
        bundle_factory = BundleFactory(
            stacked=direct_options.direct_runner_use_stacked_bundle)
        evaluation_context = EvaluationContext(
            options,
            bundle_factory,
            tracker.root_transforms,
            tracker.value_to_consumers,
            tracker.step_names,
            tracker.views,
            clock)

        executor = Executor(
            tracker.value_to_consumers,
            TransformEvaluatorRegistry(evaluation_context),
            evaluation_context)

        # DirectRunner does not support injecting PipelineOptions values at
        # runtime, so an empty mapping is registered.
        RuntimeValueProvider.set_runtime_options({})

        # Starting the executor is non-blocking: execution proceeds in
        # background threads while this call returns.
        executor.start(tracker.root_transforms)
        return DirectPipelineResult(executor, evaluation_context)
Exemplo n.º 3
0
  def run(self, pipeline):
    """Execute the entire pipeline and return a DirectPipelineResult.

    Args:
      pipeline: the pipeline to run. Its ``options`` are passed to the
        evaluation context and, when truthy, used to register (empty)
        runtime options under ``options._options_id``.

    Returns:
      The DirectPipelineResult wrapping the started executor. When
      ``self._cache`` is truthy (eager mode) this method blocks until the
      pipeline has finished and the cache has been finalized.

    Raises:
      Whatever ``result.wait_until_finish()`` or ``self._cache.finalize()``
      raises in eager mode; the runtime options registered for this run are
      unset before the exception propagates.
    """

    # TODO: Move imports to top. Pipeline <-> Runner dependency cause problems
    # with resolving imports when they are at top.
    # pylint: disable=wrong-import-position
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')
    self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
    pipeline.visit(group_by_key_input_visitor())
    pipeline.visit(self.consumer_tracking_visitor)

    evaluation_context = EvaluationContext(
        pipeline.options,
        BundleFactory(stacked=pipeline.options.view_as(DirectOptions)
                      .direct_runner_use_stacked_bundle),
        self.consumer_tracking_visitor.root_transforms,
        self.consumer_tracking_visitor.value_to_consumers,
        self.consumer_tracking_visitor.step_names,
        self.consumer_tracking_visitor.views)

    evaluation_context.use_pvalue_cache(self._cache)

    executor = Executor(self.consumer_tracking_visitor.value_to_consumers,
                        TransformEvaluatorRegistry(evaluation_context),
                        evaluation_context)

    # Register an empty set of runtime options for this pipeline's options id
    # before any transform starts executing.
    if pipeline.options:
      RuntimeValueProvider.set_runtime_options(pipeline.options._options_id, {})

    # Start the executor. This is a non-blocking call, it will start the
    # execution in background threads and return.
    executor.start(self.consumer_tracking_visitor.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    if self._cache:
      try:
        # We are running in eager mode, block until the pipeline execution
        # completes in order to have full results in the cache.
        result.wait_until_finish()
        self._cache.finalize()
      finally:
        # Unset runtime options once the pipeline has finished, including
        # when it finished by raising, so a failed run does not leave its
        # options registered.
        # TODO: Move this to a post finish hook and clean for all cases.
        if pipeline.options:
          RuntimeValueProvider.unset_runtime_options(
              pipeline.options._options_id)

    return result
Exemplo n.º 4
0
  def run_pipeline(self, pipeline):
    """Run the whole pipeline and hand back a DirectPipelineResult.

    Args:
      pipeline: the pipeline to execute. The runner's configured PTransform
        overrides are applied to it in place, and its ``_options`` are
        passed to the evaluation context.

    Returns:
      A DirectPipelineResult built from the started executor. When
      ``self._cache`` is truthy (eager mode) the call blocks until the
      pipeline has finished and the cache has been finalized.
    """
    # Apply the configured PTransform overrides first.
    pipeline.replace_all(self._ptransform_overrides)

    # TODO: Move imports to top. Pipeline <-> Runner dependency cause problems
    # with resolving imports when they are at top.
    # pylint: disable=wrong-import-position
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import (
        ConsumerTrackingPipelineVisitor)
    from apache_beam.runners.direct.evaluation_context import (
        EvaluationContext)
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import (
        TransformEvaluatorRegistry)

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')

    tracker = ConsumerTrackingPipelineVisitor()
    self.consumer_tracking_visitor = tracker
    pipeline.visit(tracker)

    if self._use_test_clock:
      clock = TestClock()
    else:
      clock = RealClock()

    pipeline_options = pipeline._options
    direct_options = pipeline._options.view_as(DirectOptions)
    bundle_factory = BundleFactory(
        stacked=direct_options.direct_runner_use_stacked_bundle)
    evaluation_context = EvaluationContext(
        pipeline_options,
        bundle_factory,
        tracker.root_transforms,
        tracker.value_to_consumers,
        tracker.step_names,
        tracker.views,
        clock)
    evaluation_context.use_pvalue_cache(self._cache)

    executor = Executor(
        tracker.value_to_consumers,
        TransformEvaluatorRegistry(evaluation_context),
        evaluation_context)

    # DirectRunner does not support injecting PipelineOptions values at
    # runtime, so an empty mapping is registered.
    RuntimeValueProvider.set_runtime_options({})

    # Starting the executor is non-blocking: execution proceeds in background
    # threads while this call returns.
    executor.start(tracker.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    if not self._cache:
      return result

    # Eager mode: wait for the pipeline to complete so the cache holds the
    # full results before it is finalized.
    result.wait_until_finish()
    self._cache.finalize()
    return result
Exemplo n.º 5
0
    def test_visitor_not_sorted(self):
        """Swapping the root's first two subtransforms must not change the
        consumers recorded by ConsumerTrackingPipelineVisitor."""
        p = Pipeline()
        # pylint: disable=expression-not-assigned
        from apache_beam.testing.test_stream import TestStream
        p | TestStream().add_elements(['']) | beam.Map(lambda _: _)

        def visit_graph(graph):
            # Rebuild a pipeline from the proto and walk it with a fresh
            # tracking visitor.
            rebuilt = beam.Pipeline().from_runner_api(
                graph, runner='BundleBasedDirectRunner', options=None)
            tracker = ConsumerTrackingPipelineVisitor()
            rebuilt.visit(tracker)
            return tracker

        def consumer_labels(tracker):
            # Stringify keys and consumers so the two runs can be compared.
            labels = {}
            for pvalue in tracker.value_to_consumers:
                consumers = tracker.value_to_consumers[pvalue]
                labels[str(pvalue)] = [str(consumer) for consumer in consumers]
            return labels

        original_graph = p.to_runner_api(return_context=False)
        out_of_order_graph = p.to_runner_api(return_context=False)

        # Exchange the first two subtransforms of the root transform.
        root_id = out_of_order_graph.root_transform_ids[0]
        root = out_of_order_graph.components.transforms[root_id]
        subtransforms = root.subtransforms
        subtransforms[0], subtransforms[1] = (
            subtransforms[1], subtransforms[0])

        v_out_of_order = visit_graph(out_of_order_graph)
        v_original = visit_graph(original_graph)

        self.assertDictEqual(
            consumer_labels(v_out_of_order), consumer_labels(v_original))
 def setUp(self):
     self.pipeline = Pipeline(DirectRunner())
     self.visitor = ConsumerTrackingPipelineVisitor()