def test_sessions_default(self): self.run_trigger_simple( Sessions(10), # pyformat break DefaultTrigger(), AccumulationMode.ACCUMULATING, [(1, 'a'), (2, 'b')], {IntervalWindow(1, 12): [set('ab')]}, 1, 2, -2, -1) self.run_trigger_simple( Sessions(10), # pyformat break AfterWatermark(), AccumulationMode.ACCUMULATING, [(1, 'a'), (2, 'b'), (15, 'c'), (16, 'd'), (30, 'z'), (9, 'e'), (10, 'f'), (30, 'y')], { IntervalWindow(1, 26): [set('abcdef')], IntervalWindow(30, 40): [set('yz')] }, 1, 2, 3, 4, 5, 6, -4, -2, -1)
def test_reshuffle_window_fn_preserved(self): pipeline = TestPipeline() data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)] expected_windows = [TestWindowedValue(v, t, [w]) for (v, t, w) in [ ((1, 1), 1.0, IntervalWindow(1.0, 3.0)), ((2, 1), 1.0, IntervalWindow(1.0, 3.0)), ((3, 1), 1.0, IntervalWindow(1.0, 3.0)), ((1, 2), 2.0, IntervalWindow(2.0, 4.0)), ((2, 2), 2.0, IntervalWindow(2.0, 4.0)), ((1, 4), 4.0, IntervalWindow(4.0, 6.0))]] expected_merged_windows = [TestWindowedValue(v, t, [w]) for (v, t, w) in [ ((1, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)), ((2, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)), ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)), ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))]] before_reshuffle = (pipeline | 'start' >> beam.Create(data) | 'add_timestamp' >> beam.Map( lambda v: TimestampedValue(v, v[1])) | 'window' >> beam.WindowInto(Sessions(gap_size=2))) assert_that(before_reshuffle, equal_to(expected_windows), label='before_reshuffle', reify_windows=True) after_reshuffle = before_reshuffle | beam.Reshuffle() assert_that(after_reshuffle, equal_to(expected_windows), label='after_reshuffle', reify_windows=True) after_group = after_reshuffle | beam.GroupByKey() assert_that(after_group, equal_to(expected_merged_windows), label='after_group', reify_windows=True) pipeline.run()
def test_reshuffle_windows_unchanged(self): pipeline = TestPipeline() data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)] expected_data = [ TestWindowedValue(v, t, [w]) for (v, t, w) in [((1, [2, 1]), 4.0, IntervalWindow(1.0, 4.0) ), ((2, [2, 1]), 4.0, IntervalWindow(1.0, 4.0) ), ((3, [1]), 3.0, IntervalWindow(1.0, 3.0) ), ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))] ] before_reshuffle = ( pipeline | 'start' >> beam.Create(data) | 'add_timestamp' >> beam.Map(lambda v: beam.window.TimestampedValue(v, v[1])) | 'window' >> beam.WindowInto(Sessions(gap_size=2)) | 'group_by_key' >> beam.GroupByKey()) assert_that(before_reshuffle, equal_to(expected_data), label='before_reshuffle', reify_windows=True) after_reshuffle = (before_reshuffle | 'reshuffle' >> beam.Reshuffle()) assert_that(after_reshuffle, equal_to(expected_data), label='after reshuffle', reify_windows=True) pipeline.run()
def get_window(self): """ Returns a selected beam windowing strategy :return: the selected windowing strategy """ return Sessions(self.gap_threshold)
def test_sessions_repeatedly_after_count(self): self.run_trigger_simple( Sessions(10), # pyformat break Repeatedly(AfterCount(2)), AccumulationMode.ACCUMULATING, [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')], {IntervalWindow(1, 25): [set('abc'), set('abcde')]}, 1, 3) self.run_trigger_simple( Sessions(10), # pyformat break Repeatedly(AfterCount(2)), AccumulationMode.DISCARDING, [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')], {IntervalWindow(1, 25): [set('abc'), set('de')]}, 1, 3)
def test_windowfn_encoding(self): for window_fn in (GlobalWindows(), FixedWindows(37), SlidingWindows(2, 389), Sessions(5077)): context = pipeline_context.PipelineContext() self.assertEqual( window_fn, WindowFn.from_runner_api(window_fn.to_runner_api(context), context))
def test_sessions_after_all(self): self.run_trigger_simple( Sessions(10), # pyformat break AfterAll(AfterCount(2), AfterWatermark()), AccumulationMode.ACCUMULATING, [(1, 'a'), (2, 'b'), (3, 'c')], {IntervalWindow(1, 13): [set('abc')]}, 1, 2) self.run_trigger_simple( Sessions(10), # pyformat break AfterAll(AfterCount(5), AfterWatermark()), AccumulationMode.ACCUMULATING, [(1, 'a'), (2, 'b'), (3, 'c')], {IntervalWindow(1, 13): [set('abcxy')]}, 1, 2, late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
def test_sessions_after_each(self): self.run_trigger_simple( Sessions(10), # pyformat break AfterEach(AfterCount(2), AfterCount(3)), AccumulationMode.ACCUMULATING, zip(range(10), 'abcdefghij'), {IntervalWindow(0, 11): [set('ab')], IntervalWindow(0, 15): [set('abcdef')]}, 2) self.run_trigger_simple( Sessions(10), # pyformat break Repeatedly(AfterEach(AfterCount(2), AfterCount(3))), AccumulationMode.ACCUMULATING, zip(range(10), 'abcdefghij'), {IntervalWindow(0, 11): [set('ab')], IntervalWindow(0, 15): [set('abcdef')], IntervalWindow(0, 17): [set('abcdefgh')]}, 2)
def test_sessions_watermark(self): self.run_trigger_simple( Sessions(10), # pyformat break AfterWatermark(), AccumulationMode.ACCUMULATING, [(1, 'a'), (2, 'b')], {IntervalWindow(1, 12): [set('ab')]}, 1, 2, -2, -1)
def test_sessions(self): with TestPipeline() as p: pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27) result = (pcoll | 'w' >> WindowInto(Sessions(10)) | GroupByKey() | sort_values | reify_windows) expected = [('key @ [1.0, 13.0)', [1, 2, 3]), ('key @ [20.0, 45.0)', [20, 27, 35])] assert_that(result, equal_to(expected))
def test_sessions_after_count(self): self.run_trigger_simple( Sessions(10), # pyformat break AfterCount(2), AccumulationMode.ACCUMULATING, [(1, 'a'), (15, 'b'), (6, 'c'), (30, 's'), (31, 't'), (50, 'z'), (50, 'y')], {IntervalWindow(1, 25): [set('abc')], IntervalWindow(30, 41): [set('st')], IntervalWindow(50, 60): [set('yz')]}, 1, 2, 3)
def test_sessions_and_complex_trigger_accumulating(self): def tsv(key, value, ts): return TimestampedValue((key, value), timestamp=ts) # yapf: disable test_stream = ( TestStream() .advance_watermark_to(0) .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15), tsv('k1', 3, 7), tsv('k1', 4, 30)]) .advance_watermark_to(50) .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2),]) .add_elements([tsv('k1', -1, 21)]) .advance_watermark_to_infinity()) # yapf: enable # Fixed, one-second windows with DefaultTrigger (after watermark) windowing = Windowing(Sessions(10), triggerfn=AfterWatermark(early=AfterCount(2), late=AfterCount(1)), accumulation_mode=AccumulationMode.ACCUMULATING, allowed_lateness=MAX_TIMESTAMP.seconds()) with TestPipeline() as p: result = (p | test_stream | WindowInto(windowing.windowfn) | ParDo(trigger_manager._ReifyWindows()) | ParDo(trigger_manager._GroupBundlesByKey()) | ParDo( trigger_manager.GeneralTriggerManagerDoFn(windowing)) | Map(lambda elm: (elm[0], elm[1][0].windows[0], set(v.value for v in elm[1])))) assert_that( result, equal_to([ ('k1', IntervalWindow(1, 25), {1, 2, 3}), # early ('k1', IntervalWindow(1, 25), {1, 2, 3}), # on time ('k1', IntervalWindow(30, 40), {4}), # on time ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}), # late ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2, -1}), # late ]))
def test_sessions_watermark_with_early_late(self): self.run_trigger_simple( Sessions(10), # pyformat break AfterWatermark(early=AfterCount(2), late=AfterCount(1)), AccumulationMode.ACCUMULATING, [(1, 'a'), (15, 'b'), (7, 'c'), (30, 'd')], { IntervalWindow(1, 25): [ set('abc'), # early set('abc'), # on time set('abcxy') # late ], IntervalWindow(30, 40): [ set('d'), # on time ], IntervalWindow(1, 40): [ set('abcdxyz') # late ], }, 2, late_data=[(1, 'x'), (2, 'y'), (21, 'z')])
def test_sessions_merging(self): windowfn = Sessions(10) def merge(*timestamps): windows = [ windowfn.assign(context(None, t, [])) for t in timestamps ] running = set() class TestMergeContext(WindowFn.MergeContext): def __init__(self): super(TestMergeContext, self).__init__(running) def merge(self, to_be_merged, merge_result): for w in to_be_merged: if w in running: running.remove(w) running.add(merge_result) for ws in windows: running.update(ws) windowfn.merge(TestMergeContext()) windowfn.merge(TestMergeContext()) return sorted(running) self.assertEqual([IntervalWindow(2, 12)], merge(2)) self.assertEqual([IntervalWindow(2, 12), IntervalWindow(19, 29)], merge(2, 19)) self.assertEqual([IntervalWindow(2, 19)], merge(2, 9)) self.assertEqual([IntervalWindow(2, 19)], merge(9, 2)) self.assertEqual([IntervalWindow(2, 19), IntervalWindow(19, 29)], merge(2, 9, 19)) self.assertEqual([IntervalWindow(2, 19), IntervalWindow(19, 29)], merge(19, 9, 2)) self.assertEqual([IntervalWindow(2, 25)], merge(2, 15, 10))
def expand(self, pcoll): return (pcoll | 'ComputeSessionsWindow' >> beam.WindowInto( Sessions(gap_size=ONE_HOUR_IN_SECONDS)) | combiners.Count.PerElement())
{ "userId": "Andy", "click": 1, "timestamp": 1603113600 }, # Event time: 13:20 ]) # fmt: on # Assign timestamp to metadata of elements such that Beam's window functions can # access and use them to group events. timestamped_events = events | "AddTimestamp" >> beam.ParDo( AddTimestampDoFn()) windowed_events = timestamped_events | beam.WindowInto( # Each session must be separated by a time gap of at least 30 minutes (1800 sec) Sessions(gap_size=30 * 60), # Triggers determine when to emit the aggregated results of each window. Default # trigger outputs the aggregated result when it estimates all data has arrived, # and discards all subsequent data for that window. trigger=None, # Since a trigger can fire multiple times, the accumulation mode determines # whether the system accumulates the window panes as the trigger fires, or # discards them. accumulation_mode=None, # Policies for combining timestamps that occur within a window. Only relevant if # a grouping operation is applied to windows. timestamp_combiner=None, # By setting allowed_lateness we can handle late data. If allowed lateness is # set, the default trigger will emit new results immediately whenever late # data arrives. allowed_lateness=Duration(seconds=1 * 24 * 60 * 60), # 1 day