def test_able_to_cache_intermediate_unbounded_source_pcollection(self):
        """Tests being able to cache an intermediate source PCollection.

    In the following pipeline, the source doesn't have a reference and so is
    not automatically cached in the watch() command. This tests that this case
    is taken care of.
    """
        # Create a new interactive environment to make the test idempotent.
        ie.new_env(cache_manager=streaming_cache.StreamingCache(
            cache_dir=None))

        # Create the pipeline that will be instrumented.
        from apache_beam.options.pipeline_options import StandardOptions
        options = StandardOptions(streaming=True)
        p_original = beam.Pipeline(interactive_runner.InteractiveRunner(),
                                   options)

        # pylint: disable=possibly-unused-variable
        source_1 = (
            p_original
            | 'source1' >> beam.io.ReadFromPubSub(
                subscription='projects/fake-project/subscriptions/fake_sub')
            | beam.Map(lambda e: e))

        # Watch but do not cache the PCollections.
        ib.watch(locals())

        def cache_key_of(name, pcoll):
            return name + '_' + str(id(pcoll)) + '_' + str(id(pcoll.producer))

        # Make sure that sources without a user reference are still cached.
        instr.watch_sources(p_original)

        intermediate_source_pcoll = None
        for watching in ie.current_env().watching():
            watching = list(watching)
            for var, watchable in watching:
                if 'synthetic' in var:
                    intermediate_source_pcoll = watchable
                    break

        # Instrument the original pipeline to create the pipeline the user will see.
        p_copy = beam.Pipeline.from_runner_api(
            p_original.to_runner_api(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        instrumenter = instr.build_pipeline_instrument(p_copy)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        intermediate_source_pcoll_cache_key = \
            cache_key_of('synthetic_var_' + str(id(intermediate_source_pcoll)),
                         intermediate_source_pcoll)
        p_expected = beam.Pipeline()

        test_stream = (
            p_expected
            | TestStream(output_tags=[intermediate_source_pcoll_cache_key]))
        # pylint: disable=expression-not-assigned
        (test_stream
         | 'square1' >> beam.Map(lambda e: e)
         | 'reify' >> beam.Map(lambda _: _)
         | cache.WriteCache(ie.current_env().cache_manager(), 'unused'))

        # Test that the TestStream is outputting to the correct PCollection.
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([intermediate_source_pcoll_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(
            self, p_expected.to_runner_api(use_fake_coders=True),
            instrumenter.instrumented_pipeline_proto())
 def setUp(self):
     ie.new_env(cache_manager=InMemoryCache())
示例#3
0
 def setUp(self):
     ie.new_env(cache_manager=cache.FileBasedCacheManager())
示例#4
0
 def setUp(self):
   ie.new_env()
示例#5
0
 def test_display_plain_text_when_kernel_has_no_frontend(
         self, _mocked_sample):
     ie.new_env(
     )  # Resets the notebook check. Should be False in unit tests.
     self.assertIsNone(pv.visualize(self._pcoll))
     _mocked_sample.assert_called_once()
示例#6
0
  def test_instrument_mixed_streaming_batch(self):
    """Tests caching for both batch and streaming sources in the same pipeline.

    This ensures that cached bounded and unbounded sources are read from the
    TestStream.
    """
    # Create a new interactive environment to make the test idempotent.
    ie.new_env(cache_manager=streaming_cache.StreamingCache(cache_dir=None))

    # Create the pipeline that will be instrumented.
    from apache_beam.options.pipeline_options import StandardOptions
    options = StandardOptions(streaming=True)
    p_original = beam.Pipeline(interactive_runner.InteractiveRunner(), options)
    source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    source_2 = p_original | 'source2' >> beam.Create([1, 2, 3, 4, 5])

    # pylint: disable=possibly-unused-variable
    pcoll_1 = ((source_1, source_2)
               | beam.Flatten()
               | 'square1' >> beam.Map(lambda x: x * x))

    # Watch but do not cache the PCollections.
    ib.watch(locals())

    self._mock_write_cache([b''], self.cache_key_of('source_2', source_2))
    ie.current_env().mark_pcollection_computed([source_2])

    # Instrument the original pipeline to create the pipeline the user will see.
    p_copy = beam.Pipeline.from_runner_api(
        p_original.to_runner_api(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)
    instrumenter = instr.build_pipeline_instrument(p_copy)
    actual_pipeline = beam.Pipeline.from_runner_api(
        proto=instrumenter.instrumented_pipeline_proto(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)

    # Now, build the expected pipeline which replaces the unbounded source with
    # a TestStream.
    source_1_cache_key = self.cache_key_of('source_1', source_1)
    source_2_cache_key = self.cache_key_of('source_2', source_2)
    p_expected = beam.Pipeline()

    test_stream = (
        p_expected
        | TestStream(output_tags=[source_1_cache_key, source_2_cache_key]))
    # pylint: disable=expression-not-assigned
    ((
        test_stream[self.cache_key_of('source_1', source_1)],
        test_stream[self.cache_key_of('source_2', source_2)])
     | beam.Flatten()
     | 'square1' >> beam.Map(lambda x: x * x)
     | 'reify' >> beam.Map(lambda _: _)
     | cache.WriteCache(ie.current_env().cache_manager(), 'unused'))

    # Test that the TestStream is outputting to the correct PCollection.
    class TestStreamVisitor(PipelineVisitor):
      def __init__(self):
        self.output_tags = set()

      def enter_composite_transform(self, transform_node):
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        transform = transform_node.transform
        if isinstance(transform, TestStream):
          self.output_tags = transform.output_tags

    v = TestStreamVisitor()
    actual_pipeline.visit(v)
    expected_output_tags = set([source_1_cache_key, source_2_cache_key])
    actual_output_tags = v.output_tags
    self.assertSetEqual(expected_output_tags, actual_output_tags)

    # Test that the pipeline is as expected.
    assert_pipeline_proto_equal(
        self,
        p_expected.to_runner_api(use_fake_coders=True),
        instrumenter.instrumented_pipeline_proto())
示例#7
0
 def test_cleanup_invoked_when_cm_changed(self, mocked_cleanup):
   ie._interactive_beam_env = None
   ie.new_env(cache.FileBasedCacheManager())
   ie.current_env().set_cache_manager(cache.FileBasedCacheManager())
   mocked_cleanup.assert_called_once()
示例#8
0
 def setUp(self):
     self._var_in_class_instance = 'a var in class instance, not directly used'
     ie.new_env()
示例#9
0
 def test_no_cleanup_when_cm_none(self, mocked_atexit):
   ie.new_env(None)
   mocked_atexit.assert_not_called()
示例#10
0
 def test_cleanup_when_cm_not_none(self, mocked_atexit):
   ie.new_env(cache.FileBasedCacheManager())
   mocked_atexit.assert_called_once()
示例#11
0
 def test_get_cache_manager_creates_cache_manager_if_absent(self):
     ie.new_env()
     dummy_pipeline = 'dummy'
     self.assertIsNone(ie.current_env().get_cache_manager(dummy_pipeline))
     self.assertIsNotNone(ie.current_env().get_cache_manager(
         dummy_pipeline, create_if_absent=True))
 def tearDown(self):
     for _, job in ie.current_env()._background_caching_jobs.items():
         job.cancel()
     ie.new_env()
示例#13
0
 def tearDown(self):
   ie.new_env()
示例#14
0
    def test_instrument_example_unbounded_pipeline_to_read_cache_not_cached(
            self):
        """Tests that the instrumenter works when the PCollection is not cached.
    """
        # Create a new interactive environment to make the test idempotent.
        ie.new_env(cache_manager=streaming_cache.StreamingCache(
            cache_dir=None))

        # Create the pipeline that will be instrumented.
        from apache_beam.options.pipeline_options import StandardOptions
        options = StandardOptions(streaming=True)
        p_original = beam.Pipeline(interactive_runner.InteractiveRunner(),
                                   options)
        source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        # pylint: disable=possibly-unused-variable
        pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)

        # Watch but do not cache the PCollections.
        ib.watch(locals())

        def cache_key_of(name, pcoll):
            return name + '_' + str(id(pcoll)) + '_' + str(id(pcoll.producer))

        # Instrument the original pipeline to create the pipeline the user will see.
        p_copy = beam.Pipeline.from_runner_api(
            p_original.to_runner_api(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        instrumenter = instr.build_pipeline_instrument(p_copy)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        source_1_cache_key = cache_key_of('source_1', source_1)
        p_expected = beam.Pipeline()
        test_stream = (
            p_expected
            | TestStream(output_tags=[cache_key_of('source_1', source_1)]))
        # pylint: disable=expression-not-assigned
        (test_stream
         | 'square1' >> beam.Map(lambda x: x * x)
         | 'reify' >> beam.Map(lambda _: _)
         | cache.WriteCache(ie.current_env().cache_manager(), 'unused'))

        # Test that the TestStream is outputting to the correct PCollection.
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([source_1_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(
            self, p_expected.to_runner_api(use_fake_coders=True),
            instrumenter.instrumented_pipeline_proto())
示例#15
0
 def setUp(self):
   self._p = beam.Pipeline()
   self._var_in_class_instance = 'a var in class instance'
   ie.new_env()
示例#16
0
    def test_instrument_example_unbounded_pipeline_to_multiple_read_cache(
            self):
        """Tests that the instrumenter works for multiple unbounded sources.
    """
        # Create a new interactive environment to make the test idempotent.
        ie.new_env(cache_manager=streaming_cache.StreamingCache(
            cache_dir=None))

        # Create the pipeline that will be instrumented.
        p_original = beam.Pipeline(interactive_runner.InteractiveRunner())
        source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        source_2 = p_original | 'source2' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        # pylint: disable=possibly-unused-variable
        pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)
        # pylint: disable=possibly-unused-variable
        pcoll_2 = source_2 | 'square2' >> beam.Map(lambda x: x * x)

        # Mock as if cacheable PCollections are cached.
        ib.watch(locals())

        def cache_key_of(name, pcoll):
            return name + '_' + str(id(pcoll)) + '_' + str(id(pcoll.producer))

        for name, pcoll in locals().items():
            if not isinstance(pcoll, beam.pvalue.PCollection):
                continue
            cache_key = cache_key_of(name, pcoll)
            self._mock_write_cache([b''], cache_key)

        # Instrument the original pipeline to create the pipeline the user will see.
        instrumenter = instr.build_pipeline_instrument(p_original)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=None)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        source_1_cache_key = cache_key_of('source_1', source_1)
        source_2_cache_key = cache_key_of('source_2', source_2)
        p_expected = beam.Pipeline()
        test_stream = (p_expected
                       | TestStream(output_tags=[
                           cache_key_of('source_1', source_1),
                           cache_key_of('source_2', source_2)
                       ]))
        # pylint: disable=expression-not-assigned
        test_stream[source_1_cache_key] | 'square1' >> beam.Map(
            lambda x: x * x)
        # pylint: disable=expression-not-assigned
        test_stream[source_2_cache_key] | 'square2' >> beam.Map(
            lambda x: x * x)

        # Test that the TestStream is outputting to the correct PCollection.
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([source_1_cache_key, source_2_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(self, p_expected.to_runner_api(),
                                    instrumenter.instrumented_pipeline_proto())
示例#17
0
 def setUp(self):
     self._var_in_class_instance = 'a var in class instance'
     ie.new_env()