def test_clear(self):
  """Checks that `clear` empties the cache of one pipeline only."""
  # Two independent pipelines are built so that clearing the recordings of
  # one of them can be shown to leave the other one's data untouched.
  p1 = beam.Pipeline(InteractiveRunner())
  elems_1 = p1 | 'elems 1' >> beam.Create([0, 1, 2])

  p2 = beam.Pipeline(InteractiveRunner())
  elems_2 = p2 | 'elems 2' >> beam.Create([0, 1, 2])

  # A notebook environment registers pipelines and PCollections on its own;
  # inside a unit test the registration has to be triggered by hand.
  ib.watch(locals())
  ie.current_env().track_user_pipelines()

  # Record each pipeline through its own manager and block until the
  # recording has finished before moving on.
  first_manager = RecordingManager(p1)
  first_recording = first_manager.record(
      [elems_1], max_n=3, max_duration=500)
  first_recording.wait_until_finish()

  second_manager = RecordingManager(p2)
  second_recording = second_manager.record(
      [elems_2], max_n=3, max_duration=500)
  second_recording.wait_until_finish()

  # Before any clearing, both managers report a non-empty size.
  self.assertGreater(first_manager.describe()['size'], 0)
  self.assertGreater(second_manager.describe()['size'], 0)

  # Clearing the first manager must not affect the second one.
  first_manager.clear()
  self.assertEqual(first_manager.describe()['size'], 0)
  self.assertGreater(second_manager.describe()['size'], 0)

  # Clearing the second manager empties it as well.
  second_manager.clear()
  self.assertEqual(second_manager.describe()['size'], 0)
def test_basic_wordcount(self):
  """Smoke test: record a tiny pipeline and read its elements back."""
  # The pipeline simply emits the integers 0, 1 and 2.
  p = beam.Pipeline(InteractiveRunner())
  elems = p | beam.Create([0, 1, 2])

  # A notebook environment registers pipelines and PCollections on its own;
  # inside a unit test the registration has to be triggered by hand.
  ib.watch(locals())
  ie.current_env().track_user_pipelines()

  # `record` starts a new PipelineFragment that computes the requested
  # PCollections and writes them to the cache.
  manager = RecordingManager(p)
  recording = manager.record([elems], max_n=3, max_duration_secs=500)
  stream = recording.stream(elems)
  recording.wait_until_finish()

  # The fragment has completed, so every element is available in the cache
  # and can be read through the stream.
  actual = list(stream.read())
  expected = [
      WindowedValue(value, MIN_TIMESTAMP, [GlobalWindow()])
      for value in range(3)
  ]
  self.assertListEqual(actual, expected)
def test_cancel_stops_recording(self):
  """Checks that cancelling the manager stops the background caching job."""
  # Register TestStream as a source whose output may be cached.
  ib.options.capturable_sources.add(TestStream)
  p = beam.Pipeline(
      InteractiveRunner(), options=PipelineOptions(streaming=True))
  elems = (
      p
      | TestStream()
      .advance_watermark_to(0)
      .advance_processing_time(1)
      .add_elements(list(range(10)))
      .advance_processing_time(1))
  squares = elems | beam.Map(lambda x: x**2)

  # Let Interactive Beam watch this scope so the PCollections referenced
  # here get cached.
  ib.watch(locals())

  # Tracking normally happens in interactive_utils when a transform is
  # applied, but that path needs an IPython environment, so it is invoked
  # manually.
  ie.current_env().track_user_pipelines()

  # Start the recording, then look up the BackgroundCachingJob for `p`.
  manager = RecordingManager(p)
  recording = manager.record([squares], max_n=10, max_duration=30)

  # The caching job is still waiting for more elements, hence not done.
  caching_job = ie.current_env().get_background_caching_job(p)
  self.assertFalse(caching_job.is_done())

  # Something must have been read, and cancelling must have stopped the
  # BackgroundCachingJob successfully.
  self.assertTrue(list(recording.stream(squares).read()))
  manager.cancel()
  self.assertTrue(caching_job.is_done())
def test_record_pipeline(self):
  """Checks that `record_pipeline` starts one recording and gets cancelled.

  The first `record_pipeline` call must report that a recording was started
  and a second call must report that none was. A test limiter then triggers
  once the manager reports a non-zero cache size, after which the manager
  state is expected to become CANCELLED.
  """
  # Add the TestStream so that it can be cached.
  ib.options.recordable_sources.add(TestStream)
  p = beam.Pipeline(
      InteractiveRunner(), options=PipelineOptions(streaming=True))
  # pylint: disable=unused-variable
  _ = (p
       | TestStream()
       .advance_watermark_to(0)
       .advance_processing_time(1)
       .add_elements(list(range(10)))
       .advance_processing_time(1))  # yapf: disable

  # Watch the local scope for Interactive Beam so that referenced
  # PCollections will be cached.
  ib.watch(locals())

  # This is normally done in the interactive_utils when a transform is
  # applied but needs an IPython environment. So we manually run this here.
  ie.current_env().track_user_pipelines()

  # Create a limiter that stops the background caching job when something is
  # written to cache. This is used to ensure that the pipeline is
  # functioning properly and that there are no data races with the test.
  class SizeLimiter(Limiter):
    """Limiter that triggers once its RecordingManager reports data."""
    def __init__(self, p):
      self.pipeline = p
      self._rm = None

    def set_recording_manager(self, rm):
      # The manager is injected after construction because the manager
      # itself is built with this limiter as an argument.
      self._rm = rm

    def is_triggered(self):
      # Without a manager there is nothing to measure, so never trigger.
      if not self._rm:
        return False
      return self._rm.describe()['size'] > 0

  size_limiter = SizeLimiter(p)
  rm = RecordingManager(p, test_limiters=[size_limiter])
  size_limiter.set_recording_manager(rm)

  self.assertEqual(rm.describe()['state'], PipelineState.STOPPED)
  self.assertTrue(rm.record_pipeline())

  # A recording is in progress, no need to start another one.
  self.assertFalse(rm.record_pipeline())

  # Poll for up to 60 seconds for the limiter to cancel the job.
  for _ in range(60):
    if rm.describe()['state'] == PipelineState.CANCELLED:
      break
    time.sleep(1)

  # assertEqual (rather than assertTrue on a comparison) reports the state
  # that was actually observed when the test times out.
  self.assertEqual(
      rm.describe()['state'],
      PipelineState.CANCELLED,
      'Test timed out waiting for pipeline to be cancelled. This indicates '
      'that the BackgroundCachingJob did not cache anything.')
def test_recording_manager_clears_cache(self):
  """Checks that the RecordingManager clears the cache before recording.

  When a job terminates, some of its PCollections may be incomplete.
  Clearing the cache makes sure every run computes correct results.
  """
  # Register TestStream as a source whose output may be cached.
  ib.options.capturable_sources.add(TestStream)
  p = beam.Pipeline(
      InteractiveRunner(), options=PipelineOptions(streaming=True))
  elems = (
      p
      | TestStream()
      .advance_watermark_to(0)
      .advance_processing_time(1)
      .add_elements(list(range(10)))
      .advance_processing_time(1))
  squares = elems | beam.Map(lambda x: x**2)

  # Let Interactive Beam watch this scope so the PCollections referenced
  # here get cached.
  ib.watch(locals())

  # Tracking normally happens in interactive_utils when a transform is
  # applied, but that path needs an IPython environment, so it is invoked
  # manually.
  ie.current_env().track_user_pipelines()

  # First recording: remember the start timestamp reported by the manager.
  manager = RecordingManager(p)
  manager.record([squares], max_n=10, max_duration=2)
  first_start = manager.describe()['start']
  manager.cancel()

  # Look up the cache manager and the cache key of `squares`.
  instrument = pi.PipelineInstrument(p)
  cache = ie.current_env().get_cache_manager(p)
  squares_key = instrument.cache_key(squares)

  # Replace the cache's `clear` with a mock so that the clearing of
  # uncomputed PCollections can be observed.
  cache.clear = MagicMock()

  # Second recording. If the cache was cleared correctly the PCollection was
  # not considered computed, so the fragment ran again and the second start
  # time is later than the first one.
  manager.record([squares], max_n=10, max_duration=1)
  second_start = manager.describe()['start']
  manager.cancel()
  self.assertGreater(second_start, first_start)

  # The PCollection must have been cleared from the cache.
  cache.clear.assert_called_with('full', squares_key)
def test_clear(self):
  """Checks that `clear` drops everything a RecordingManager recorded."""
  p1 = beam.Pipeline(InteractiveRunner())
  elems_1 = p1 | 'elems 1' >> beam.Create([0, 1, 2])

  # Register the pipeline and its PCollection with Interactive Beam.
  ib.watch(locals())
  ie.current_env().track_user_pipelines()

  # Record the PCollection and wait for the recording to complete.
  manager = RecordingManager(p1)
  finished = manager.record([elems_1], max_n=3, max_duration=500)
  finished.wait_until_finish()

  # Data is present before clearing and gone afterwards.
  size_before_clear = manager.describe()['size']
  self.assertGreater(size_before_clear, 0)

  manager.clear()
  size_after_clear = manager.describe()['size']
  self.assertEqual(size_after_clear, 0)
def test_duration_parsing(self):
  """Checks that a string `max_duration` is reported as integer seconds."""
  p = beam.Pipeline(InteractiveRunner())
  elems = p | beam.Create([0, 1, 2])

  # A notebook environment registers pipelines and PCollections on its own;
  # inside a unit test the registration has to be triggered by hand.
  ib.watch(locals())
  ie.current_env().track_user_pipelines()

  # Record with the duration given as the string '500s'.
  manager = RecordingManager(p)
  finished = manager.record([elems], max_n=3, max_duration='500s')
  finished.wait_until_finish()

  # The described duration is expected to be the integer 500.
  reported_duration = finished.describe()['duration']
  self.assertEqual(reported_duration, 500)
def test_describe(self):
  """Checks the `size` and `start` fields returned by Recording.describe."""
  p = beam.Pipeline(InteractiveRunner())
  numbers = p | 'numbers' >> beam.Create([0, 1, 2])
  letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

  ib.watch(locals())

  # A MockPipelineResult stands in for the result of a real pipeline run so
  # that the test controls its state.
  fake_result = MockPipelineResult()
  ie.current_env().track_user_pipelines()
  ie.current_env().set_pipeline_result(p, fake_result)

  in_memory_cache = InMemoryCache()
  ie.current_env().set_cache_manager(in_memory_cache, p)

  # Build the recording with an arbitrary, known start time.
  arbitrary_start = 100
  recording = Recording(
      p, [numbers, letters],
      fake_result,
      pi.PipelineInstrument(p),
      max_n=10,
      max_duration_secs=60,
      start_time_for_test=arbitrary_start)

  # Write data straight into the cache under each stream's cache key so that
  # no pipeline has to be run by the test.
  numbers_key = recording.stream(numbers).cache_key
  in_memory_cache.write([0, 1, 2], 'full', numbers_key)
  in_memory_cache.save_pcoder(None, 'full', numbers_key)

  letters_key = recording.stream(letters).cache_key
  in_memory_cache.write(['a', 'b', 'c'], 'full', letters_key)
  in_memory_cache.save_pcoder(None, 'full', letters_key)

  # The described size must equal the combined size of both cache entries,
  # and the described start must be the injected start time.
  summary = recording.describe()
  expected_size = (
      in_memory_cache.size('full', numbers_key) +
      in_memory_cache.size('full', letters_key))
  self.assertEqual(summary['size'], expected_size)
  self.assertEqual(summary['start'], arbitrary_start)
def test_basic_execution(self):
  """Smoke test: record two PCollections and compare the reported sizes."""
  # The pipeline emits the integers 0, 1, 2 and the letters a, b, c.
  p = beam.Pipeline(InteractiveRunner())
  numbers = p | 'numbers' >> beam.Create([0, 1, 2])
  letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

  # A notebook environment registers pipelines and PCollections on its own;
  # inside a unit test the registration has to be triggered by hand.
  ib.watch(locals())
  ie.current_env().track_user_pipelines()

  # `record` starts a new PipelineFragment that computes the requested
  # PCollections and writes them to the cache.
  manager = RecordingManager(p)
  first_recording = manager.record(
      [numbers], max_n=3, max_duration_secs=500)
  first_stream = first_recording.stream(numbers)
  first_recording.wait_until_finish()

  # The fragment has completed, so every element is available in the cache
  # and can be read through the stream.
  actual = list(first_stream.read())
  expected = [
      WindowedValue(value, MIN_TIMESTAMP, [GlobalWindow()])
      for value in range(3)
  ]
  self.assertListEqual(actual, expected)

  # A second recording is made so the manager-level description can be
  # compared with the sum of the per-recording descriptions.
  second_recording = manager.record(
      [letters], max_n=3, max_duration_secs=500)
  second_recording.wait_until_finish()

  self.assertEqual(
      manager.describe()['size'],
      first_recording.describe()['size'] +
      second_recording.describe()['size'])

  manager.cancel()
def test_recording_manager_clears_cache(self):
  """Checks that the RecordingManager clears the cache before recording.

  When a job terminates, some of its PCollections may be incomplete.
  Clearing the cache makes sure every run computes correct results.
  """
  # Register TestStream as a source whose output may be cached.
  ib.options.recordable_sources.add(TestStream)
  p = beam.Pipeline(
      InteractiveRunner(), options=PipelineOptions(streaming=True))
  elems = (
      p
      | TestStream()
      .advance_watermark_to(0)
      .advance_processing_time(1)
      .add_elements(list(range(10)))
      .advance_processing_time(1))
  squares = elems | beam.Map(lambda x: x**2)

  # Let Interactive Beam watch this scope so the PCollections referenced
  # here get cached.
  ib.watch(locals())

  # Tracking normally happens in interactive_utils when a transform is
  # applied, but that path needs an IPython environment, so it is invoked
  # manually.
  ie.current_env().track_user_pipelines()

  manager = RecordingManager(p)

  # The instrument is only needed to compute the expected cache keys.
  instrument = pi.PipelineInstrument(p)

  # Swap the manager's `_clear_pcolls` for a mock so that the call made
  # while recording can be inspected afterwards.
  manager._clear_pcolls = MagicMock()
  manager.record([squares], max_n=1, max_duration=500)
  manager.cancel()

  # The manager must have asked for both `elems` and `squares` to be
  # cleared; the first argument of that call is not checked.
  expected_keys = {instrument.cache_key(pc) for pc in (elems, squares)}
  manager._clear_pcolls.assert_any_call(unittest.mock.ANY, expected_keys)
def test_record_detects_remote_runner(
    self, mock_pipeline_fragment, mock_clear_pcolls):
  """Checks that a GCS cache root makes the fragment run with blocking on.

  With `ib.options.cache_root` pointing at a GCS bucket, `record` is expected
  to invoke the mocked PipelineFragment run with `blocking=True`.

  Args:
    mock_pipeline_fragment: mock whose last call is asserted on; presumably
      injected by a patch decorator outside this view.
    mock_clear_pcolls: mock that is not used directly by the test body;
      presumably injected by a patch decorator outside this view.
  """
  # Create the pipeline that will emit 0, 1, 2.
  p = beam.Pipeline(InteractiveRunner())
  numbers = p | 'numbers' >> beam.Create([0, 1, 2])

  # Set the cache directory for Interactive Beam to be in a GCS bucket.
  ib.options.cache_root = 'gs://test-bucket/'
  try:
    # Create the recording objects. By calling `record` a new
    # PipelineFragment is started to compute the given PCollections and
    # cache to disk.
    rm = RecordingManager(p)

    # Run record() and check if the PipelineFragment.run had blocking set to
    # True due to the GCS cache_root value.
    rm.record([numbers], max_n=3, max_duration=500)
    mock_pipeline_fragment.assert_called_with(blocking=True)
  finally:
    # Reset the global option even when the recording or the assertion
    # fails, so the GCS cache root cannot leak into other tests.
    ib.options.cache_root = None
def test_read_in_interactive_runner(self):
  """Checks that a BigQuery read can be collected via the InteractiveRunner."""
  p = beam.Pipeline(InteractiveRunner(), argv=self.args)
  query_rows = p | beam.io.ReadFromBigQuery(query="SELECT 1")

  # The object returned by `collect` is indexed with `.iloc`; the query
  # selects the constant 1, so the top-left cell must be 1.
  collected = interactive_beam.collect(query_rows)
  first_cell = collected.iloc[0, 0]
  assert first_cell == 1
def test_detect_pipeline_no_underlying_runner(self):
  """Checks the runner detected when no underlying runner is supplied.

  An InteractiveRunner constructed without arguments is expected to be
  detected as a DirectRunner.
  """
  # Imported locally, as in the original test body.
  from apache_beam.runners.direct.direct_runner import DirectRunner

  p = beam.Pipeline(InteractiveRunner())
  pipeline_runner = utils.detect_pipeline_runner(p)

  # assertIsInstance reports the actual type on failure, unlike
  # assertTrue(isinstance(...)).
  self.assertIsInstance(pipeline_runner, DirectRunner)
def test_detect_pipeline_underlying_runner(self):
  """Checks that an explicitly supplied underlying runner is detected.

  An InteractiveRunner wrapping a FlinkRunner is expected to be detected as
  a FlinkRunner.
  """
  p = beam.Pipeline(InteractiveRunner(underlying_runner=FlinkRunner()))
  pipeline_runner = utils.detect_pipeline_runner(p)

  # assertIsInstance reports the actual type on failure, unlike
  # assertTrue(isinstance(...)).
  self.assertIsInstance(pipeline_runner, FlinkRunner)
def test_computed(self):
  """Checks that PCollections count as computed only when complete.

  The background caching job is long-lived, so running a PipelineFragment
  repeatedly may give different results for the same PCollection.
  """
  p = beam.Pipeline(InteractiveRunner())
  elems = p | beam.Create([0, 1, 2])

  ib.watch(locals())

  # A MockPipelineResult stands in for the result of a real pipeline run so
  # that the test controls its state.
  recording_result = MockPipelineResult()
  ie.current_env().track_user_pipelines()
  ie.current_env().set_pipeline_result(p, recording_result)

  # A BackgroundCachingJob built on a second mock result decides whether the
  # PCollections may be marked as computed.
  caching_result = MockPipelineResult()
  caching_job = bcj.BackgroundCachingJob(caching_result, [])

  # Phase 1: neither the recording nor the background caching job is done,
  # so more elements may still arrive.
  pending = Recording(
      p, [elems], recording_result, max_n=10, max_duration_secs=60)
  self.assertFalse(pending.is_computed())
  self.assertFalse(pending.computed())
  self.assertTrue(pending.uncomputed())

  # Phase 2: the recording has finished but the background caching job has
  # not. More elements may still be recorded, or the intermediate
  # PCollection may have stopped caching in an incomplete state, e.g. before
  # a window could fire.
  recording_result.set_state(PipelineState.DONE)
  pending.wait_until_finish()

  self.assertFalse(pending.is_computed())
  self.assertFalse(pending.computed())
  self.assertTrue(pending.uncomputed())

  # Phase 3: the background caching job finished before a new recording is
  # started, a sure signal that no further elements will arrive.
  caching_result.set_state(PipelineState.DONE)
  ie.current_env().set_background_caching_job(p, caching_job)
  complete = Recording(
      p, [elems], recording_result, max_n=10, max_duration_secs=60)
  complete.wait_until_finish()

  # No more elements and a finished recording mean the intermediate
  # PCollections are complete and can be marked as computed.
  self.assertTrue(complete.is_computed())
  self.assertTrue(complete.computed())
  self.assertFalse(complete.uncomputed())