Exemplo n.º 1
0
    def test_clear(self):
        """Checks that `clear` only empties the cache of its own pipeline."""

        # Two independent pipelines are needed to show that clearing one of
        # them leaves the other one's recordings alone.
        p1 = beam.Pipeline(InteractiveRunner())
        elems_1 = p1 | 'elems 1' >> beam.Create([0, 1, 2])

        p2 = beam.Pipeline(InteractiveRunner())
        elems_2 = p2 | 'elems 2' >> beam.Create([0, 1, 2])

        # A notebook environment would normally watch the pipelines and
        # PCollections automatically; here it has to be done by hand. The
        # local names above are handed over through `locals()`.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Record each pipeline in turn, blocking until its recording is done.
        # Calling `record` starts a new PipelineFragment that computes the
        # given PCollections and caches them.
        first_manager = RecordingManager(p1)
        first_recording = first_manager.record(
            [elems_1], max_n=3, max_duration=500)
        first_recording.wait_until_finish()

        second_manager = RecordingManager(p2)
        second_recording = second_manager.record(
            [elems_2], max_n=3, max_duration=500)
        second_recording.wait_until_finish()

        # After recording, both managers must report a non-zero size.
        for manager in (first_manager, second_manager):
            self.assertGreater(manager.describe()['size'], 0)

        # Clearing the first manager must leave the second one untouched.
        first_manager.clear()
        self.assertEqual(first_manager.describe()['size'], 0)
        self.assertGreater(second_manager.describe()['size'], 0)

        # Clearing the second manager brings its size down to zero as well.
        second_manager.clear()
        self.assertEqual(second_manager.describe()['size'], 0)
Exemplo n.º 2
0
    def test_record_pipeline(self):
        """Tests that `record_pipeline` starts a single background recording.

        A streaming pipeline reading from a TestStream is recorded with a test
        limiter that triggers as soon as the RecordingManager reports a
        non-zero size. The first `record_pipeline` call must return True, a
        second call made right afterwards must return False, and the manager
        must eventually report the CANCELLED state.
        """
        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        # pylint: disable=unused-variable
        _ = (p
             | TestStream()
                 .advance_watermark_to(0)
                 .advance_processing_time(1)
                 .add_elements(list(range(10)))
                 .advance_processing_time(1))  # yapf: disable

        # Watch the local scope for Interactive Beam so that referenced
        # PCollections will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this
        # here.
        ie.current_env().track_user_pipelines()

        # Create a limiter that stops the background caching job when
        # something is written to cache. This is used to ensure that the
        # pipeline is functioning properly and that there are no data races
        # with the test.
        class SizeLimiter(Limiter):
            """Limiter that triggers once the manager reports a size > 0."""
            def __init__(self, p):
                self.pipeline = p
                self._rm = None

            def set_recording_manager(self, rm):
                self._rm = rm

            def is_triggered(self):
                # The manager is attached only after it has been constructed,
                # so report "not triggered" until then.
                if not self._rm:
                    return False
                return self._rm.describe()['size'] > 0

        # The limiter needs the manager and the manager needs the limiter, so
        # build the limiter first and attach the manager afterwards.
        size_limiter = SizeLimiter(p)
        rm = RecordingManager(p, test_limiters=[size_limiter])
        size_limiter.set_recording_manager(rm)
        self.assertEqual(rm.describe()['state'], PipelineState.STOPPED)
        self.assertTrue(rm.record_pipeline())

        # A recording is in progress, no need to start another one.
        self.assertFalse(rm.record_pipeline())

        # Poll once per second, for at most 60 iterations, until the manager
        # reports that the pipeline was cancelled.
        for _ in range(60):
            if rm.describe()['state'] == PipelineState.CANCELLED:
                break
            time.sleep(1)

        # assertEqual (rather than assertTrue on a comparison) makes a failure
        # report the state that was actually observed.
        self.assertEqual(
            rm.describe()['state'], PipelineState.CANCELLED,
            'Test timed out waiting for pipeline to be cancelled. This indicates '
            'that the BackgroundCachingJob did not cache anything.')
Exemplo n.º 3
0
    def test_recording_manager_clears_cache(self):
        """Tests that the RecordingManager clears the cache before recording.

        A job may leave incomplete PCollections behind when it terminates.
        Clearing the cache ensures that correct results are computed on every
        run.
        """
        # Register TestStream as a source that may be cached.
        ib.options.capturable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream()
                     .advance_watermark_to(0)
                     .advance_processing_time(1)
                     .add_elements(list(range(10)))
                     .advance_processing_time(1))  # yapf: disable
        squares = elems | beam.Map(lambda x: x**2)

        # Let Interactive Beam watch the local scope so that the PCollections
        # referenced here will be cached.
        ib.watch(locals())

        # interactive_utils normally does this when a transform is applied,
        # but that requires an IPython environment, so do it manually.
        ie.current_env().track_user_pipelines()

        # First recording: remember when the fragment was started, then stop.
        manager = RecordingManager(p)
        manager.record([squares], max_n=10, max_duration=2)
        first_start = manager.describe()['start']
        manager.cancel()

        # Look up the cache manager and the cache key of the PCollection.
        instrument = pi.PipelineInstrument(p)
        cache_manager = ie.current_env().get_cache_manager(p)
        squares_key = instrument.cache_key(squares)

        # Replace the cache's `clear` with a mock so that the call made for
        # the uncomputed PCollection can be inspected afterwards.
        cache_manager.clear = MagicMock()

        # Second recording. If the cache was cleared correctly, the fragment
        # is rerun for the PCollection (which was not considered computed) and
        # therefore reports a later start time than the first recording.
        manager.record([squares], max_n=10, max_duration=1)
        second_start = manager.describe()['start']
        manager.cancel()
        self.assertGreater(second_start, first_start)

        # The PCollection must have been cleared from the cache.
        cache_manager.clear.assert_called_with('full', squares_key)
Exemplo n.º 4
0
    def test_clear(self):
        """Checks that `clear` brings the reported recording size back to 0."""
        p1 = beam.Pipeline(InteractiveRunner())
        elems_1 = p1 | 'elems 1' >> beam.Create([0, 1, 2])

        # Hand the local pipeline and PCollection over to Interactive Beam.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Record the PCollection and wait for the recording to complete.
        manager = RecordingManager(p1)
        finished_recording = manager.record(
            [elems_1], max_n=3, max_duration=500)
        finished_recording.wait_until_finish()

        # Something was recorded, so the reported size is positive ...
        size_before_clear = manager.describe()['size']
        self.assertGreater(size_before_clear, 0)

        # ... and clearing drops it to zero.
        manager.clear()
        size_after_clear = manager.describe()['size']
        self.assertEqual(size_after_clear, 0)
Exemplo n.º 5
0
    def test_basic_execution(self):
        """Smoke test: record two PCollections and check elements and sizes."""

        # One pipeline with two sources: the numbers 0, 1, 2 and three letters.
        p = beam.Pipeline(InteractiveRunner())
        numbers = p | 'numbers' >> beam.Create([0, 1, 2])
        letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

        # A notebook environment would watch the pipeline and PCollections
        # automatically; in a test this has to be done manually.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Calling `record` starts a new PipelineFragment that computes the
        # given PCollections and caches them. Grab the stream before waiting
        # for the recording to finish.
        manager = RecordingManager(p)
        recorded_numbers = manager.record(
            [numbers], max_n=3, max_duration_secs=500)
        stream = recorded_numbers.stream(numbers)
        recorded_numbers.wait_until_finish()

        # With the fragment finished, every element has been written to the
        # cache and can be read back from the stream.
        actual = list(stream.read())
        expected = [
            WindowedValue(value, MIN_TIMESTAMP, [GlobalWindow()])
            for value in (0, 1, 2)
        ]
        self.assertListEqual(actual, expected)

        # Record a second PCollection so the description covers two
        # recordings.
        recorded_letters = manager.record(
            [letters], max_n=3, max_duration_secs=500)
        recorded_letters.wait_until_finish()

        # The manager's size must equal the sum of its recordings' sizes.
        total_size = manager.describe()['size']
        numbers_size = recorded_numbers.describe()['size']
        letters_size = recorded_letters.describe()['size']
        self.assertEqual(total_size, numbers_size + letters_size)

        manager.cancel()