def test_cancel_stops_recording(self):
    """Tests that cancel() stops the pipeline's BackgroundCachingJob.

    Starts a recording over an unbounded TestStream source, verifies that
    the background caching job is still running while elements can be read,
    then cancels the RecordingManager and asserts the job has terminated.
    """
    # Add the TestStream so that it can be cached.
    ib.options.capturable_sources.add(TestStream)

    p = beam.Pipeline(
        InteractiveRunner(), options=PipelineOptions(streaming=True))
    elems = (
        p
        | TestStream().advance_watermark_to(0).advance_processing_time(
            1).add_elements(list(range(10))).advance_processing_time(1))
    squares = elems | beam.Map(lambda x: x**2)

    # Watch the local scope for Interactive Beam so that referenced
    # PCollections will be cached.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    # Get the recording then the BackgroundCachingJob.
    rm = RecordingManager(p)
    recording = rm.record([squares], max_n=10, max_duration=30)

    # The BackgroundCachingJob is still waiting for more elements, so it
    # isn't done yet.
    bcj = ie.current_env().get_background_caching_job(p)
    self.assertFalse(bcj.is_done())

    # Assert that something was read and that the BackgroundCachingJob was
    # successfully stopped.
    self.assertTrue(list(recording.stream(squares).read()))
    rm.cancel()
    self.assertTrue(bcj.is_done())
def test_recording_manager_clears_cache(self):
    """Tests that the RecordingManager clears the cache before recording.

    A job may have incomplete PCollections when the job terminates. Clearing
    the cache ensures that correct results are computed every run.
    """
    # Add the TestStream so that it can be cached.
    ib.options.capturable_sources.add(TestStream)
    p = beam.Pipeline(
        InteractiveRunner(), options=PipelineOptions(streaming=True))
    elems = (
        p
        | TestStream().advance_watermark_to(0).advance_processing_time(
            1).add_elements(list(range(10))).advance_processing_time(1))
    squares = elems | beam.Map(lambda x: x**2)

    # Watch the local scope for Interactive Beam so that referenced
    # PCollections will be cached.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    # Do the first recording to get the timestamp of the first time the
    # fragment was run.
    rm = RecordingManager(p)
    rm.record([squares], max_n=10, max_duration=2)
    first_recording_start = rm.describe()['start']
    rm.cancel()

    # Get the cache, key, and coder to read the PCollection from the cache.
    pipeline_instrument = pi.PipelineInstrument(p)
    cache = ie.current_env().get_cache_manager(p)
    cache_key = pipeline_instrument.cache_key(squares)

    # Set up a mock for the Cache's clear function which will be used to
    # clear uncomputed PCollections.
    cache.clear = MagicMock()

    # Rerun the fragment. If the cache was cleared correctly then the
    # starting time of the second recording will be later than the first.
    # This is because the PCollection wasn't considered to be computed and
    # was cleared from cache. Thus the pipeline fragment was rerun for that
    # PCollection at a later time.
    rm.record([squares], max_n=10, max_duration=1)
    second_recording_start = rm.describe()['start']
    rm.cancel()
    self.assertGreater(second_recording_start, first_recording_start)

    # Assert that the cache cleared the PCollection.
    cache.clear.assert_called_with('full', cache_key)
def test_basic_execution(self): """A basic pipeline to be used as a smoke test.""" # Create the pipeline that will emit 0, 1, 2. p = beam.Pipeline(InteractiveRunner()) numbers = p | 'numbers' >> beam.Create([0, 1, 2]) letters = p | 'letters' >> beam.Create(['a', 'b', 'c']) # Watch the pipeline and PCollections. This is normally done in a notebook # environment automatically, but we have to do it manually here. ib.watch(locals()) ie.current_env().track_user_pipelines() # Create the recording objects. By calling `record` a new PipelineFragment # is started to compute the given PCollections and cache to disk. rm = RecordingManager(p) numbers_recording = rm.record([numbers], max_n=3, max_duration_secs=500) numbers_stream = numbers_recording.stream(numbers) numbers_recording.wait_until_finish() # Once the pipeline fragment completes, we can read from the stream and know # that all elements were written to cache. elems = list(numbers_stream.read()) expected_elems = [ WindowedValue(i, MIN_TIMESTAMP, [GlobalWindow()]) for i in range(3) ] self.assertListEqual(elems, expected_elems) # Make an extra recording and test the description. letters_recording = rm.record([letters], max_n=3, max_duration_secs=500) letters_recording.wait_until_finish() self.assertEqual( rm.describe()['size'], numbers_recording.describe()['size'] + letters_recording.describe()['size']) rm.cancel()
def test_recording_manager_clears_cache(self): """Tests that the RecordingManager clears the cache before recording. A job may have incomplete PCollections when the job terminates. Clearing the cache ensures that correct results are computed every run. """ # Add the TestStream so that it can be cached. ib.options.recordable_sources.add(TestStream) p = beam.Pipeline(InteractiveRunner(), options=PipelineOptions(streaming=True)) elems = (p | TestStream().advance_watermark_to( 0).advance_processing_time(1).add_elements(list( range(10))).advance_processing_time(1)) squares = elems | beam.Map(lambda x: x**2) # Watch the local scope for Interactive Beam so that referenced PCollections # will be cached. ib.watch(locals()) # This is normally done in the interactive_utils when a transform is # applied but needs an IPython environment. So we manually run this here. ie.current_env().track_user_pipelines() # Do the first recording to get the timestamp of the first time the fragment # was run. rm = RecordingManager(p) # Get the cache, key, and coder to read the PCollection from the cache. pipeline_instrument = pi.PipelineInstrument(p) # Set up a mock for the Cache's clear function which will be used to clear # uncomputed PCollections. rm._clear_pcolls = MagicMock() rm.record([squares], max_n=1, max_duration=500) rm.cancel() # Assert that the cache cleared the PCollection. rm._clear_pcolls.assert_any_call( unittest.mock.ANY, set(pipeline_instrument.cache_key(pc) for pc in (elems, squares)))