Example #1
    def test_clear(self):
        """Tests that clear can empty the cache for a specific pipeline."""

        # Create two pipelines so we can check that clearing the cache of one
        # pipeline doesn't clear the caches of the others.
        p1 = beam.Pipeline(InteractiveRunner())
        elems_1 = p1 | 'elems 1' >> beam.Create([0, 1, 2])

        p2 = beam.Pipeline(InteractiveRunner())
        elems_2 = p2 | 'elems 2' >> beam.Create([0, 1, 2])

        # Watch the pipeline and PCollections. This is normally done in a notebook
        # environment automatically, but we have to do it manually here.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Create the recording objects. Calling `record` starts a new
        # PipelineFragment to compute the given PCollections and cache them to
        # disk.
        rm_1 = RecordingManager(p1)
        recording = rm_1.record([elems_1], max_n=3, max_duration=500)
        recording.wait_until_finish()

        rm_2 = RecordingManager(p2)
        recording = rm_2.record([elems_2], max_n=3, max_duration=500)
        recording.wait_until_finish()

        # Assert that clearing one recording clears only that recording.
        self.assertGreater(rm_1.describe()['size'], 0)
        self.assertGreater(rm_2.describe()['size'], 0)
        rm_1.clear()
        self.assertEqual(rm_1.describe()['size'], 0)
        self.assertGreater(rm_2.describe()['size'], 0)

        rm_2.clear()
        self.assertEqual(rm_2.describe()['size'], 0)
Example #2
    def test_basic_wordcount(self):
        """A wordcount to be used as a smoke test."""

        # Create the pipeline that will emit 0, 1, 2.
        p = beam.Pipeline(InteractiveRunner())
        elems = p | beam.Create([0, 1, 2])

        # Watch the pipeline and PCollections. This is normally done in a notebook
        # environment automatically, but we have to do it manually here.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Create the recording objects. Calling `record` starts a new
        # PipelineFragment to compute the given PCollections and cache them to
        # disk.
        rm = RecordingManager(p)
        recording = rm.record([elems], max_n=3, max_duration_secs=500)
        stream = recording.stream(elems)
        recording.wait_until_finish()

        # Once the pipeline fragment completes, we can read from the stream and know
        # that all elements were written to cache.
        elems = list(stream.read())
        expected_elems = [
            WindowedValue(i, MIN_TIMESTAMP, [GlobalWindow()]) for i in range(3)
        ]
        self.assertListEqual(elems, expected_elems)
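
Despite its name, the test above only records the elements 0, 1, 2. A hedged sketch of what a fuller wordcount smoke test could look like with the same record/stream pattern (the data and transform chain are illustrative, not from the original suite):

        p = beam.Pipeline(InteractiveRunner())
        words = p | beam.Create(['to', 'be', 'or', 'not', 'to', 'be'])
        counts = (words
                  | beam.Map(lambda word: (word, 1))
                  | beam.CombinePerKey(sum))

        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        rm = RecordingManager(p)
        recording = rm.record([counts], max_n=4, max_duration_secs=500)
        stream = recording.stream(counts)
        recording.wait_until_finish()

        # Each cached element is a WindowedValue wrapping a (word, count) pair.
        counted = {wv.value[0]: wv.value[1] for wv in stream.read()}
        # Expected: {'to': 2, 'be': 2, 'or': 1, 'not': 1}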
Example #3
    def test_cancel_stops_recording(self):
        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)

        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream()
                     .advance_watermark_to(0)
                     .advance_processing_time(1)
                     .add_elements(list(range(10)))
                     .advance_processing_time(1))  # yapf: disable
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Get the recording then the BackgroundCachingJob.
        rm = RecordingManager(p)
        recording = rm.record([squares], max_n=10, max_duration=30)

        # The BackgroundCachingJob is still waiting for more elements, so it isn't
        # done yet.
        bcj = ie.current_env().get_background_caching_job(p)
        self.assertFalse(bcj.is_done())

        # Assert that something was read and that the BackgroundCachingJob was
        # successfully stopped.
        self.assertTrue(list(recording.stream(squares).read()))
        rm.cancel()
        self.assertTrue(bcj.is_done())
Example #4
    def test_record_pipeline(self):
        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        # pylint: disable=unused-variable
        _ = (p
             | TestStream()
                 .advance_watermark_to(0)
                 .advance_processing_time(1)
                 .add_elements(list(range(10)))
                 .advance_processing_time(1))  # yapf: disable

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Create a limiter that stops the background caching job when something
        # is written to cache. This is used to ensure that the pipeline is
        # functioning properly and that there are no data races with the test.
        class SizeLimiter(Limiter):
            def __init__(self, p):
                self.pipeline = p
                self._rm = None

            def set_recording_manager(self, rm):
                self._rm = rm

            def is_triggered(self):
                return self._rm.describe()['size'] > 0 if self._rm else False

        # Wire the limiter and the RecordingManager together, then start
        # recording the pipeline.
        size_limiter = SizeLimiter(p)
        rm = RecordingManager(p, test_limiters=[size_limiter])
        size_limiter.set_recording_manager(rm)
        self.assertEqual(rm.describe()['state'], PipelineState.STOPPED)
        self.assertTrue(rm.record_pipeline())

        # A recording is in progress, no need to start another one.
        self.assertFalse(rm.record_pipeline())

        for _ in range(60):
            if rm.describe()['state'] == PipelineState.CANCELLED:
                break
            time.sleep(1)
        self.assertTrue(
            rm.describe()['state'] == PipelineState.CANCELLED,
            'Test timed out waiting for pipeline to be cancelled. This indicates '
            'that the BackgroundCachingJob did not cache anything.')
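
The SizeLimiter above shows the Limiter contract these tests rely on: the RecordingManager polls is_triggered() and stops the background caching job once it returns True. A hedged sketch of another limiter under the same contract (a wall-clock deadline; illustrative, not part of the original suite):

        class DeadlineLimiter(Limiter):
            """Triggers once a wall-clock budget has elapsed."""
            def __init__(self, max_secs):
                self._deadline = time.time() + max_secs

            def is_triggered(self):
                return time.time() >= self._deadline

        # Usage mirrors SizeLimiter:
        #   rm = RecordingManager(p, test_limiters=[DeadlineLimiter(10)])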
Example #5
    def test_recording_manager_clears_cache(self):
        """Tests that the RecordingManager clears the cache before recording.

    A job may have incomplete PCollections when the job terminates. Clearing the
    cache ensures that correct results are computed every run.
    """
        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream()
                     .advance_watermark_to(0)
                     .advance_processing_time(1)
                     .add_elements(list(range(10)))
                     .advance_processing_time(1))  # yapf: disable
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Do the first recording to get the timestamp of the first time the fragment
        # was run.
        rm = RecordingManager(p)
        rm.record([squares], max_n=10, max_duration=2)
        first_recording_start = rm.describe()['start']
        rm.cancel()

        # Get the cache and the cache key used to read the PCollection from
        # the cache.
        pipeline_instrument = pi.PipelineInstrument(p)
        cache = ie.current_env().get_cache_manager(p)
        cache_key = pipeline_instrument.cache_key(squares)

        # Set up a mock for the Cache's clear function which will be used to clear
        # uncomputed PCollections.
        cache.clear = MagicMock()

        # Rerun the fragment. If the cache was cleared correctly then the starting
        # time of the second recording will be later than the first. This is because
        # the PCollection wasn't considered to be computed and was cleared from
        # cache. Thus the pipeline fragment was rerun for that PCollection at a
        # later time.
        rm.record([squares], max_n=10, max_duration=1)
        second_recording_start = rm.describe()['start']
        rm.cancel()
        self.assertGreater(second_recording_start, first_recording_start)

        # Assert that the cache cleared the PCollection.
        cache.clear.assert_called_with('full', cache_key)
Example #6
    def test_clear(self):
        p1 = beam.Pipeline(InteractiveRunner())
        elems_1 = p1 | 'elems 1' >> beam.Create([0, 1, 2])

        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        recording_manager = RecordingManager(p1)
        recording = recording_manager.record([elems_1],
                                             max_n=3,
                                             max_duration=500)
        recording.wait_until_finish()
        record_describe = recording_manager.describe()
        self.assertGreater(record_describe['size'], 0)
        recording_manager.clear()
        self.assertEqual(recording_manager.describe()['size'], 0)
Example #7
    def test_duration_parsing(self):
        p = beam.Pipeline(InteractiveRunner())
        elems = p | beam.Create([0, 1, 2])

        # Watch the pipeline and PCollections. This is normally done in a notebook
        # environment automatically, but we have to do it manually here.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Create the recording objects.
        rm = RecordingManager(p)
        recording = rm.record([elems], max_n=3, max_duration='500s')
        recording.wait_until_finish()

        # Assert that the duration was parsed correctly to integer seconds.
        self.assertEqual(recording.describe()['duration'], 500)
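
Other examples in this collection pass max_duration as a plain integer, so presumably both forms spell the same 500-second budget (an inference from these examples, not documented behavior):

        # Assumed-equivalent spelling of the same duration as an integer:
        recording = rm.record([elems], max_n=3, max_duration=500)
        self.assertEqual(recording.describe()['duration'], 500)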
Example #8
    def test_describe(self):
        p = beam.Pipeline(InteractiveRunner())
        numbers = p | 'numbers' >> beam.Create([0, 1, 2])
        letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

        ib.watch(locals())

        # Create a MockPipelineResult to control the state of a fake run of the
        # pipeline.
        mock_result = MockPipelineResult()
        ie.current_env().track_user_pipelines()
        ie.current_env().set_pipeline_result(p, mock_result)

        cache_manager = InMemoryCache()
        ie.current_env().set_cache_manager(cache_manager, p)

        # Create a recording with an arbitrary start time.
        start_time = 100
        recording = Recording(p, [numbers, letters],
                              mock_result,
                              pi.PipelineInstrument(p),
                              max_n=10,
                              max_duration_secs=60,
                              start_time_for_test=start_time)

        # Get the cache key of the stream and write something to cache. This is
        # so that a pipeline doesn't have to run in the test.
        numbers_stream = recording.stream(numbers)
        cache_manager.write([0, 1, 2], 'full', numbers_stream.cache_key)
        cache_manager.save_pcoder(None, 'full', numbers_stream.cache_key)

        letters_stream = recording.stream(letters)
        cache_manager.write(['a', 'b', 'c'], 'full', letters_stream.cache_key)
        cache_manager.save_pcoder(None, 'full', letters_stream.cache_key)

        # Get the description.
        description = recording.describe()
        size = description['size']
        start = description['start']

        self.assertEqual(
            size,
            cache_manager.size('full', numbers_stream.cache_key) +
            cache_manager.size('full', letters_stream.cache_key))
        self.assertEqual(start, start_time)
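
Across this collection, describe() exposes at least 'size' and 'start' on a Recording, plus 'duration' and 'state' on a RecordingManager. A hedged sketch of inspecting a description (the field set is inferred from these examples, not from a documented schema):

        description = recording.describe()
        # 'size' is the total cached size across the recording's streams;
        # 'start' is the recording's start time (here the injected start_time).
        print(description['size'], description['start'])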
Example #9
    def test_basic_execution(self):
        """A basic pipeline to be used as a smoke test."""

        # Create the pipeline that will emit 0, 1, 2.
        p = beam.Pipeline(InteractiveRunner())
        numbers = p | 'numbers' >> beam.Create([0, 1, 2])
        letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

        # Watch the pipeline and PCollections. This is normally done in a notebook
        # environment automatically, but we have to do it manually here.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Create the recording objects. Calling `record` starts a new
        # PipelineFragment to compute the given PCollections and cache them to
        # disk.
        rm = RecordingManager(p)
        numbers_recording = rm.record([numbers],
                                      max_n=3,
                                      max_duration_secs=500)
        numbers_stream = numbers_recording.stream(numbers)
        numbers_recording.wait_until_finish()

        # Once the pipeline fragment completes, we can read from the stream and know
        # that all elements were written to cache.
        elems = list(numbers_stream.read())
        expected_elems = [
            WindowedValue(i, MIN_TIMESTAMP, [GlobalWindow()]) for i in range(3)
        ]
        self.assertListEqual(elems, expected_elems)

        # Make an extra recording and test the description.
        letters_recording = rm.record([letters],
                                      max_n=3,
                                      max_duration_secs=500)
        letters_recording.wait_until_finish()

        self.assertEqual(
            rm.describe()['size'],
            numbers_recording.describe()['size'] +
            letters_recording.describe()['size'])

        rm.cancel()
Example #10
    def test_recording_manager_clears_cache(self):
        """Tests that the RecordingManager clears the cache before recording.

    A job may have incomplete PCollections when the job terminates. Clearing the
    cache ensures that correct results are computed every run.
    """
        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream()
                     .advance_watermark_to(0)
                     .advance_processing_time(1)
                     .add_elements(list(range(10)))
                     .advance_processing_time(1))  # yapf: disable
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Create the RecordingManager; its cache-clearing behavior is verified
        # below.
        rm = RecordingManager(p)

        # Get the pipeline instrument used to compute the PCollections' cache
        # keys.
        pipeline_instrument = pi.PipelineInstrument(p)

        # Set up a mock for the Cache's clear function which will be used to clear
        # uncomputed PCollections.
        rm._clear_pcolls = MagicMock()
        rm.record([squares], max_n=1, max_duration=500)
        rm.cancel()

        # Assert that the cache cleared the PCollection.
        rm._clear_pcolls.assert_any_call(
            unittest.mock.ANY,
            set(pipeline_instrument.cache_key(pc) for pc in (elems, squares)))
Example #11
    def test_record_detects_remote_runner(self, mock_pipeline_fragment,
                                          mock_clear_pcolls):
        """Tests that a remote runner is detected, resulting in the
    PipelineFragment instance having blocking enabled."""

        # Create the pipeline that will emit 0, 1, 2.
        p = beam.Pipeline(InteractiveRunner())
        numbers = p | 'numbers' >> beam.Create([0, 1, 2])

        # Set the cache directory for Interactive Beam to be in a GCS bucket.
        ib.options.cache_root = 'gs://test-bucket/'

        # Create the recording objects. Calling `record` starts a new
        # PipelineFragment to compute the given PCollections and cache them to
        # disk.
        rm = RecordingManager(p)

        # Run record() and check that PipelineFragment.run was called with
        # blocking=True due to the GCS cache_root value.
        rm.record([numbers], max_n=3, max_duration=500)
        mock_pipeline_fragment.assert_called_with(blocking=True)

        # Reset cache_root value.
        ib.options.cache_root = None
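
The test signature receives mock_pipeline_fragment and mock_clear_pcolls, so its @patch decorators were evidently trimmed from this snippet. A hedged reconstruction, working back from the argument names and the assert_called_with(blocking=True) check (the exact patch targets are assumptions):

    # Assumed patch targets -- a reconstruction, not the original decorators:
    @patch('apache_beam.runners.interactive.recording_manager.'
           'RecordingManager._clear_pcolls')
    @patch('apache_beam.runners.interactive.pipeline_fragment.'
           'PipelineFragment.run')
    def test_record_detects_remote_runner(self, mock_pipeline_fragment,
                                          mock_clear_pcolls):
        ...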
Example #12
    def test_read_in_interactive_runner(self):
        p = beam.Pipeline(InteractiveRunner(), argv=self.args)
        pcoll = p | beam.io.ReadFromBigQuery(query="SELECT 1")
        result = interactive_beam.collect(pcoll)
        assert result.iloc[0, 0] == 1
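
The .iloc access implies that interactive_beam.collect() materializes the PCollection as a pandas DataFrame. A brief usage sketch under that assumption:

        # collect() returns a pandas DataFrame, which is why .iloc works.
        result = interactive_beam.collect(pcoll)
        print(type(result))   # expected: pandas.core.frame.DataFrame
        print(result.head())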
Example #13
    def test_detect_pipeline_no_underlying_runner(self):
        from apache_beam.runners.direct.direct_runner import DirectRunner

        p = beam.Pipeline(InteractiveRunner())
        pipeline_runner = utils.detect_pipeline_runner(p)
        self.assertIsInstance(pipeline_runner, DirectRunner)
Example #14
    def test_detect_pipeline_underlying_runner(self):
        p = beam.Pipeline(InteractiveRunner(underlying_runner=FlinkRunner()))
        pipeline_runner = utils.detect_pipeline_runner(p)
        self.assertIsInstance(pipeline_runner, FlinkRunner)
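
Taken together, these two tests suggest that utils.detect_pipeline_runner unwraps an InteractiveRunner to its underlying runner and falls back to DirectRunner when none was supplied. A hedged sketch of that logic (the _underlying_runner attribute name is an assumption):

    def detect_pipeline_runner_sketch(pipeline):
        runner = pipeline.runner
        if isinstance(runner, InteractiveRunner):
            # Fall back to DirectRunner when no underlying runner was given.
            return getattr(runner, '_underlying_runner', None) or DirectRunner()
        return runner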
Example #15
    def test_computed(self):
        """Tests that a PCollection is marked as computed only in a complete state.

    Because the background caching job is now long-lived, repeated runs of a
    PipelineFragment may yield different results for the same PCollection.
    """

        p = beam.Pipeline(InteractiveRunner())
        elems = p | beam.Create([0, 1, 2])

        ib.watch(locals())

        # Create a MockPipelineResult to control the state of a fake run of the
        # pipeline.
        mock_result = MockPipelineResult()
        ie.current_env().track_user_pipelines()
        ie.current_env().set_pipeline_result(p, mock_result)

        # Create a mock BackgroundCachingJob that will control whether to set the
        # PCollections as computed or not.
        bcj_mock_result = MockPipelineResult()
        background_caching_job = bcj.BackgroundCachingJob(bcj_mock_result, [])

        # Create a recording.
        recording = Recording(p, [elems],
                              mock_result,
                              max_n=10,
                              max_duration_secs=60)

        # The background caching job and the recording aren't done yet, so
        # there may be more elements to be recorded.
        self.assertFalse(recording.is_computed())
        self.assertFalse(recording.computed())
        self.assertTrue(recording.uncomputed())

        # The recording is finished but the background caching job is not. There
        # may still be more elements to record, or the intermediate PCollection may
        # have stopped caching in an incomplete state, e.g. before a window could
        # fire.
        mock_result.set_state(PipelineState.DONE)
        recording.wait_until_finish()

        self.assertFalse(recording.is_computed())
        self.assertFalse(recording.computed())
        self.assertTrue(recording.uncomputed())

        # The background caching job finished before we started a recording,
        # which is a sure signal that there will be no more elements.
        bcj_mock_result.set_state(PipelineState.DONE)
        ie.current_env().set_background_caching_job(p, background_caching_job)
        recording = Recording(p, [elems],
                              mock_result,
                              max_n=10,
                              max_duration_secs=60)
        recording.wait_until_finish()

        # There are no more elements and the recording finished, meaning that the
        # intermediate PCollections are in a complete state. They can now be marked
        # as computed.
        self.assertTrue(recording.is_computed())
        self.assertTrue(recording.computed())
        self.assertFalse(recording.uncomputed())
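
A hedged summary of the contract this test pins down (inferred from the assertions above, not from documentation): a Recording may be marked computed only when both its own pipeline result and the BackgroundCachingJob have reached a terminal state.

        # Implied invariant, assuming MockPipelineResult exposes the state set
        # via set_state() through its .state property:
        recording_done = mock_result.state == PipelineState.DONE
        bcj_done = bcj_mock_result.state == PipelineState.DONE
        assert recording.is_computed() == (recording_done and bcj_done)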