def read(self, tail=True):
  # type: (boolean) -> Any

  """Yields the elements recorded so far for this PCollection.

  Reading stops when either the element-count limit (`self._n`) or the
  processing-time limit (`self._duration_secs`) is hit, or when the cache
  runs out of elements.
  """
  # Look up the cache manager for the user pipeline; its pcoder is needed to
  # decode the raw bytes stored in the cache.
  cache_manager = ie.current_env().get_cache_manager(self._pipeline)
  coder = cache_manager.load_pcoder('full', self._cache_key)

  # Imported inside the function (not at module level) to avoid a circular
  # import with the capture_limiters module.
  from apache_beam.runners.interactive.options.capture_limiters import CountLimiter
  from apache_beam.runners.interactive.options.capture_limiters import ProcessingTimeLimiter

  reader, _ = cache_manager.read('full', self._cache_key, tail=tail)

  count_limiter = CountLimiter(self._n)
  time_limiter = ProcessingTimeLimiter(self._duration_secs)
  limiters = (count_limiter, time_limiter)

  # A single TestStreamFileRecord may expand into several elements, so the
  # element count is enforced again inside to_element_list via `n`.
  #
  # The loop below exits either because a limiter fired or because the cache
  # ran dry. In the latter case the pipeline may still be producing, and a
  # later invocation of `read` will yield the new elements.
  element_stream = utils.to_element_list(
      reader,
      coder,
      include_window_info=True,
      n=self._n,
      include_time_events=True)
  for element in element_stream:
    # With include_time_events set, to_element_list interleaves
    # TestStreamPayload.Events with decoded elements. Only decoded elements
    # may count toward the user's requested `n`; the events drive the
    # processing-time limiter instead.
    if isinstance(element, TestStreamPayload.Event):
      time_limiter.update(element)
    else:
      count_limiter.update(element)
    yield element
    if count_limiter.is_triggered() or time_limiter.is_triggered():
      break

  # A triggered limiter means the user's request is fulfilled, and a
  # terminated pipeline means the cache will never grow again; in either case
  # reading the cache again won't yield anything new.
  if (any(l.is_triggered() for l in limiters) or
      ie.current_env().is_terminated(self._pipeline)):
    self._done = True
def test_count_limiter(self):
  """CountLimiter triggers exactly when the n-th element arrives."""
  limiter = CountLimiter(5)

  # Four updates: still below the threshold of five.
  for element in range(4):
    limiter.update(element)
  self.assertFalse(limiter.is_triggered())

  # The fifth update crosses the threshold.
  limiter.update(4)
  self.assertTrue(limiter.is_triggered())
def test_count_limiter_with_dataframes(self):
  """Empty DataFrames contribute nothing to the element count."""
  limiter = CountLimiter(5)

  # Feed ten empty frames; none of them should count toward the limit.
  for _ in range(10):
    limiter.update(WindowedValue(pd.DataFrame(), 0, []))
  self.assertFalse(limiter.is_triggered())

  # A single frame with ten rows pushes the count past five.
  full_frame = WindowedValue(pd.DataFrame({'col': list(range(10))}), 0, [])
  limiter.update(full_frame)
  self.assertTrue(limiter.is_triggered())
def test_read_with_count_limiter(self):
  """Test the condition where the cache is read once after written once."""
  prefix = 'full'
  cache_label = 'some-cache-label'
  cached_values = ['cache', 'version', 'one']
  self.mock_write_cache(cached_values, prefix, cache_label)

  # A CountLimiter of two should stop the read after the second element.
  reader, version = self.cache_manager.read(
      prefix, cache_label, limiters=[CountLimiter(2)])
  self.assertListEqual(list(reader), ['cache', 'version'])

  # Only one write happened, so the version read back is the first one and
  # is still the latest.
  self.assertEqual(version, 0)
  self.assertTrue(
      self.cache_manager.is_latest_version(version, prefix, cache_label))
def test_single_reader_with_count_limiter(self):
  """Tests that we expect to see all the correctly emitted TestStreamPayloads.
  """
  CACHED_PCOLLECTION_KEY = repr(CacheKey('arbitrary_key', '', '', ''))

  # Record three elements, each separated by a one-second processing-time
  # advance.
  values = (FileRecordsBuilder(tag=CACHED_PCOLLECTION_KEY)
            .add_element(element=0, event_time_secs=0)
            .advance_processing_time(1)
            .add_element(element=1, event_time_secs=1)
            .advance_processing_time(1)
            .add_element(element=2, event_time_secs=2)
            .build()) # yapf: disable

  cache = StreamingCache(cache_dir=None)
  cache.write(values, CACHED_PCOLLECTION_KEY)

  # With a CountLimiter of two, the reader should stop after emitting the
  # second element (element=2 is never read back).
  reader, _ = cache.read(CACHED_PCOLLECTION_KEY, limiters=[CountLimiter(2)])
  coder = coders.FastPrimitivesCoder()
  events = list(reader)

  # Units here are in microseconds.
  # These are a slice of the original values such that we only get two
  # elements.
  expected = [
      TestStreamPayload.Event(
          element_event=TestStreamPayload.Event.AddElements(
              elements=[
                  TestStreamPayload.TimestampedElement(
                      encoded_element=coder.encode(0), timestamp=0)
              ],
              tag=CACHED_PCOLLECTION_KEY)),
      TestStreamPayload.Event(
          processing_time_event=TestStreamPayload.Event.AdvanceProcessingTime(
              advance_duration=1 * 10**6)),
      TestStreamPayload.Event(
          element_event=TestStreamPayload.Event.AddElements(
              elements=[
                  TestStreamPayload.TimestampedElement(
                      encoded_element=coder.encode(1),
                      timestamp=1 * 10**6)
              ],
              tag=CACHED_PCOLLECTION_KEY)),
      TestStreamPayload.Event(
          processing_time_event=TestStreamPayload.Event.AdvanceProcessingTime(
              advance_duration=1 * 10**6)),
  ]
  self.assertSequenceEqual(events, expected)
def read(self, tail=True):
  # type: (boolean) -> Any

  """Reads the elements currently recorded.

  Yields decoded elements (and interleaved processing-time events) until
  either the count limit (`self._n`) or the processing-time limit
  (`self._duration_secs`) triggers, or the cache is exhausted.
  """
  # Get the cache manager and wait until the file exists.
  cache_manager = ie.current_env().get_cache_manager(self._pipeline)

  # Retrieve the coder for the particular PCollection which will be used to
  # decode elements read from cache.
  coder = cache_manager.load_pcoder('full', self._cache_key)

  # Read the elements from the cache.
  count_limiter = CountLimiter(self._n)
  time_limiter = ProcessingTimeLimiter(self._duration_secs)
  limiters = (count_limiter, time_limiter)
  reader, _ = cache_manager.read('full', self._cache_key, tail=tail)

  # Because a single TestStreamFileRecord can yield multiple elements, we
  # limit the count again here in the to_element_list call.
  #
  # There are two ways of exiting this loop: either a limiter was triggered
  # or all elements from the cache were read. In the latter situation, it
  # may be the case that the pipeline was still running. Thus, another
  # invocation of `read` will yield new elements.
  #
  # BUGFIX: include_time_events=True interleaves TestStreamPayload.Events
  # (processing-time advances) with the decoded elements. Previously every
  # yielded item was fed to every limiter, so the ProcessingTimeLimiter
  # received decoded elements instead of the time events it understands and
  # the duration limit was never enforced. Route each item to the limiter
  # that can interpret it: events drive the time limiter, decoded elements
  # drive the count limiter.
  for e in utils.to_element_list(reader,
                                 coder,
                                 include_window_info=True,
                                 n=self._n,
                                 include_time_events=True):
    if isinstance(e, TestStreamPayload.Event):
      time_limiter.update(e)
    else:
      count_limiter.update(e)
    yield e
    if any(l.is_triggered() for l in limiters):
      break

  # A limiter being triggered means that we have fulfilled the user's
  # request. This implies that reading from the cache again won't yield any
  # new elements. WLOG, this applies to the user pipeline being terminated.
  if any(l.is_triggered() for l in limiters) or ie.current_env().is_terminated(
      self._pipeline):
    self._done = True