def decode_from_stream(self, in_, nested):
  global IntervalWindow
  if IntervalWindow is None:
    # Imported lazily and cached in a module-level global.
    from apache_beam.transforms.window import IntervalWindow
  # Construct with placeholder bounds, then set the micros fields directly:
  # the stream stores the window end (shifted, big-endian millis) followed by
  # the span (varint millis).
  typed_value = IntervalWindow(None, None)
  typed_value._end_micros = (
      1000 * self._to_normal_time(in_.read_bigendian_uint64()))
  typed_value._start_micros = (
      typed_value._end_micros - 1000 * in_.read_var_int64())
  return typed_value
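# The decoder above reads the window end (a shifted, big-endian count of
# millis) followed by the span (a varint of millis), reconstructing the start
# as end - span. Below is a minimal sketch of the matching encoder, assuming
# `_from_normal_time` is the inverse of `_to_normal_time` and that `out_`
# exposes the corresponding stream writers; this is a hypothetical pairing
# for illustration, not the verbatim library code.
def encode_to_stream(self, value, out_, nested):
  end_millis = value._end_micros // 1000
  span_millis = end_millis - value._start_micros // 1000
  # Write the end first so the decoder can derive the start as end - span.
  out_.write_bigendian_uint64(self._from_normal_time(end_millis))
  out_.write_var_int64(span_millis)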
def test_sessions_after_count(self):
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      AfterCount(2),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (15, 'b'), (6, 'c'), (30, 's'), (31, 't'), (50, 'z'),
       (50, 'y')],
      {
          IntervalWindow(1, 25): [set('abc')],
          IntervalWindow(30, 41): [set('st')],
          IntervalWindow(50, 60): [set('yz')]
      },
      1,
      2,
      3)
def test_fixed_watermark_with_early(self):
  self.run_trigger_simple(
      FixedWindows(10),  # pyformat break
      AfterWatermark(early=AfterCount(2)),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (3, 'c')],
      {IntervalWindow(0, 10): [set('ab'), set('abc')]},
      2)
  self.run_trigger_simple(
      FixedWindows(10),  # pyformat break
      AfterWatermark(early=AfterCount(2)),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (3, 'c')],
      {IntervalWindow(0, 10): [set('abc'), set('abc')]},
      3)
def test_reified_value_assert_fail_unmatched_window(self):
  expected = [
      TestWindowedValue(v, MIN_TIMESTAMP, [IntervalWindow(0, 1)])
      for v in [1, 2, 3]
  ]
  with self.assertRaises(Exception):
    with TestPipeline() as p:
      assert_that(
          p | Create([2, 3, 1]), equal_to(expected), reify_windows=True)
def decode_from_stream(self, in_, nested):
  # The stream stores the window end (shifted, big-endian millis) followed by
  # the span in millis; the start is recovered as end - span.
  end_millis = self._to_normal_time(in_.read_bigendian_uint64())
  start_millis = end_millis - in_.read_var_int64()
  from apache_beam.transforms.window import IntervalWindow
  ret = IntervalWindow(
      start=Timestamp(micros=start_millis * 1000),
      end=Timestamp(micros=end_millis * 1000))
  return ret
def test_late_data_behavior(self):
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  with TestPipeline(options=options) as p:
    base_json_pickup = (
        '{"ride_id":"x","point_idx":1,"latitude":0.0,"longitude":0.0,'
        '"timestamp":"00:00:00","meter_reading":1.0,"meter_increment":0.1,'
        '"ride_status":"pickup","passenger_count":1}')
    test_stream = (
        TestStream().advance_watermark_to(0).add_elements([
            TimestampedValue(base_json_pickup, 0),
            TimestampedValue(base_json_pickup, 0),
        ]).advance_watermark_to(60).advance_processing_time(60).add_elements([
            TimestampedValue(base_json_pickup, 0)
        ]).advance_watermark_to(300).advance_processing_time(
            240).add_elements([TimestampedValue(base_json_pickup, 0)]))

    # On-time and late results.
    EXPECTED_RESULTS = {IntervalWindow(0, 60): [2, 3]}

    taxi_counts_late = (p | test_stream | TaxiCountTransform())
    assert_that(
        taxi_counts_late,
        equal_to_per_window(EXPECTED_RESULTS),
        reify_windows=True)
def test_fixed_watermark(self):
  self.run_trigger_simple(
      FixedWindows(10),  # pyformat break
      AfterWatermark(),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (13, 'c')],
      {
          IntervalWindow(0, 10): [set('ab')],
          IntervalWindow(10, 20): [set('c')]
      },
      1,
      2,
      3,
      -3,
      -2,
      -1)
def test_reshuffle_window_fn_preserved(self):
  any_order = contains_in_any_order
  with TestPipeline() as pipeline:
    data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
    expected_windows = [
        TestWindowedValue(v, t, [w]) for (v, t, w) in [
            ((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
            ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
            ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
            ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
            ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
            ((1, 4), 4.0, IntervalWindow(4.0, 6.0)),
        ]
    ]
    expected_merged_windows = [
        TestWindowedValue(v, t - .001, [w]) for (v, t, w) in [
            ((1, any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
            ((2, any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
            ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
            ((1, [4]), 6.0, IntervalWindow(4.0, 6.0)),
        ]
    ]
    before_reshuffle = (
        pipeline
        | 'start' >> beam.Create(data)
        | 'add_timestamp' >> beam.Map(lambda v: TimestampedValue(v, v[1]))
        | 'window' >> beam.WindowInto(Sessions(gap_size=2)))
    assert_that(
        before_reshuffle,
        equal_to(expected_windows),
        label='before_reshuffle',
        reify_windows=True)
    after_reshuffle = before_reshuffle | beam.Reshuffle()
    assert_that(
        after_reshuffle,
        equal_to(expected_windows),
        label='after_reshuffle',
        reify_windows=True)
    after_group = after_reshuffle | beam.GroupByKey()
    assert_that(
        after_group,
        equal_to(expected_merged_windows),
        label='after_group',
        reify_windows=True)
def test_fixed_after_count(self):
  self.run_trigger_simple(
      FixedWindows(10),  # pyformat break
      AfterCount(2),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (3, 'c'), (11, 'z')],
      {IntervalWindow(0, 10): [set('ab')]},
      1,
      2)
  self.run_trigger_simple(
      FixedWindows(10),  # pyformat break
      AfterCount(2),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (3, 'c'), (11, 'z')],
      {IntervalWindow(0, 10): [set('abc')]},
      3,
      4)
def test_sessions_repeatedly_after_count(self):
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      Repeatedly(AfterCount(2)),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')],
      {IntervalWindow(1, 25): [set('abc'), set('abcde')]},
      1,
      3)
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      Repeatedly(AfterCount(2)),
      AccumulationMode.DISCARDING,
      [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')],
      {IntervalWindow(1, 25): [set('abc'), set('de')]},
      1,
      3)
def test_fixed_after_first(self):
  self.run_trigger_simple(
      FixedWindows(10),  # pyformat break
      AfterAny(AfterCount(2), AfterWatermark()),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (3, 'c')],
      {IntervalWindow(0, 10): [set('ab')]},
      1,
      2)
  self.run_trigger_simple(
      FixedWindows(10),  # pyformat break
      AfterAny(AfterCount(5), AfterWatermark()),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (3, 'c')],
      {IntervalWindow(0, 10): [set('abc')]},
      1,
      2,
      late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
def test_sessions_after_all(self):
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      AfterAll(AfterCount(2), AfterWatermark()),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (3, 'c')],
      {IntervalWindow(1, 13): [set('abc')]},
      1,
      2)
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      AfterAll(AfterCount(5), AfterWatermark()),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (3, 'c')],
      {IntervalWindow(1, 13): [set('abcxy')]},
      1,
      2,
      late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
def test_fixed_windows_simple_watermark(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 0), tsv('k2', 1, 0),
                     tsv('k1', 2, 0), tsv('k2', 2, 0)])
      .add_elements([tsv('k1', 3, 0), tsv('k2', 3, 0)])
      .add_elements([tsv('k1', 4, 1), tsv('k2', 4, 1)])
      .add_elements([tsv('k1', 5, 1), tsv('k2', 5, 1)])
      .advance_watermark_to(1)
      .add_elements([tsv('k1', 6, 0)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, one-second windows with DefaultTrigger (after watermark)
  windowing = Windowing(
      FixedWindows(1), allowed_lateness=MAX_TIMESTAMP.seconds())

  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
            ('k2', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
            ('k1', IntervalWindow(1, 2), [4, 5]),  # On the watermark
            ('k2', IntervalWindow(1, 2), [4, 5]),  # On the watermark
            ('k1', IntervalWindow(0, 1), [6]),  # After the watermark
        ]))
def test_sessions_and_complex_trigger_accumulating(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                     tsv('k1', 3, 7), tsv('k1', 4, 30)])
      .advance_watermark_to(50)
      .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2)])
      .add_elements([tsv('k1', -1, 21)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Session windows with a 10-second gap, an early/late-firing watermark
  # trigger, and accumulating panes.
  windowing = Windowing(
      Sessions(10),
      triggerfn=AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
      accumulation_mode=AccumulationMode.ACCUMULATING,
      allowed_lateness=MAX_TIMESTAMP.seconds())

  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], set(v.value for v in elm[1]))))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
            ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
            ('k1', IntervalWindow(30, 40), {4}),  # on time
            ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
            ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2, -1}),  # late
        ]))
def test_sessions_after_each(self):
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      AfterEach(AfterCount(2), AfterCount(3)),
      AccumulationMode.ACCUMULATING,
      zip(range(10), 'abcdefghij'),
      {
          IntervalWindow(0, 11): [set('ab')],
          IntervalWindow(0, 15): [set('abcdef')]
      },
      2)
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      Repeatedly(AfterEach(AfterCount(2), AfterCount(3))),
      AccumulationMode.ACCUMULATING,
      zip(range(10), 'abcdefghij'),
      {
          IntervalWindow(0, 11): [set('ab')],
          IntervalWindow(0, 15): [set('abcdef')],
          IntervalWindow(0, 17): [set('abcdefgh')]
      },
      2)
def test_param_windowed_value_coder(self):
  from apache_beam.transforms.window import IntervalWindow
  from apache_beam.utils.windowed_value import PaneInfo
  wv = windowed_value.create(
      b'',
      # Milliseconds to microseconds
      1000 * 1000,
      (IntervalWindow(11, 21),),
      PaneInfo(True, False, 1, 2, 3))
  windowed_value_coder = coders.WindowedValueCoder(
      coders.BytesCoder(), coders.IntervalWindowCoder())
  payload = windowed_value_coder.encode(wv)
  coder = coders.ParamWindowedValueCoder(
      payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()])

  # Test binary representation
  self.assertEqual(
      b'\x01', coder.encode(window.GlobalWindows.windowed_value(1)))

  # Test unnested
  self.check_coder(
      coders.ParamWindowedValueCoder(
          payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()]),
      windowed_value.WindowedValue(
          3, 1, (window.IntervalWindow(11, 21),),
          PaneInfo(True, False, 1, 2, 3)),
      windowed_value.WindowedValue(
          1, 1, (window.IntervalWindow(11, 21),),
          PaneInfo(True, False, 1, 2, 3)))

  # Test nested
  self.check_coder(
      coders.TupleCoder((
          coders.ParamWindowedValueCoder(
              payload, [coders.FloatCoder(), coders.IntervalWindowCoder()]),
          coders.ParamWindowedValueCoder(
              payload,
              [coders.StrUtf8Coder(), coders.IntervalWindowCoder()]))),
      (windowed_value.WindowedValue(
          1.5, 1, (window.IntervalWindow(11, 21),),
          PaneInfo(True, False, 1, 2, 3)),
       windowed_value.WindowedValue(
           "abc", 1, (window.IntervalWindow(11, 21),),
           PaneInfo(True, False, 1, 2, 3))))
def test_reshuffle_windows_unchanged(self):
  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  expected_data = [
      TestWindowedValue(v, t, [w]) for (v, t, w) in [
          ((1, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
          ((2, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
          ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
          ((1, [4]), 6.0, IntervalWindow(4.0, 6.0)),
      ]
  ]
  before_reshuffle = (
      pipeline
      | 'start' >> beam.Create(data)
      | 'add_timestamp' >> beam.Map(
          lambda v: beam.window.TimestampedValue(v, v[1]))
      | 'window' >> beam.WindowInto(Sessions(gap_size=2))
      | 'group_by_key' >> beam.GroupByKey())
  assert_that(
      before_reshuffle,
      equal_to(expected_data),
      label='before_reshuffle',
      reify_windows=True)
  after_reshuffle = before_reshuffle | beam.Reshuffle()
  assert_that(
      after_reshuffle,
      equal_to(expected_data),
      label='after reshuffle',
      reify_windows=True)
  pipeline.run()
def test_sessions_watermark(self):
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      AfterWatermark(),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b')],
      {IntervalWindow(1, 12): [set('ab')]},
      1,
      2,
      -2,
      -1)
def test_sessions_watermark_with_early_late(self):
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (15, 'b'), (7, 'c'), (30, 'd')],
      {
          IntervalWindow(1, 25): [
              set('abc'),  # early
              set('abc'),  # on time
              set('abcxy')  # late
          ],
          IntervalWindow(30, 40): [
              set('d'),  # on time
          ],
          IntervalWindow(1, 40): [
              set('abcdxyz')  # late
          ],
      },
      2,
      late_data=[(1, 'x'), (2, 'y'), (21, 'z')])
def test_shard_naming(self):
  namer = fileio.default_file_naming(prefix='/path/to/file', suffix='.txt')
  self.assertEqual(
      namer(GlobalWindow(), None, None, None, None, None),
      '/path/to/file.txt')
  self.assertEqual(
      namer(GlobalWindow(), None, 1, 5, None, None),
      '/path/to/file-00001-of-00005.txt')
  self.assertEqual(
      namer(GlobalWindow(), None, 1, 5, 'gz', None),
      '/path/to/file-00001-of-00005.txt.gz')
  self.assertEqual(
      namer(IntervalWindow(0, 100), None, 1, 5, None, None),
      '/path/to/file'
      '-1970-01-01T00:00:00-1970-01-01T00:01:40-00001-of-00005.txt')
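# The window-qualified name above comes from formatting the interval bounds
# as UTC timestamps: IntervalWindow(0, 100) covers the first 100 seconds of
# the epoch, hence 1970-01-01T00:00:00 and 1970-01-01T00:01:40. A standalone
# check of that arithmetic (a sketch, not fileio's actual formatting code):
import datetime

def window_bound_str(seconds):
  return datetime.datetime.utcfromtimestamp(seconds).strftime(
      '%Y-%m-%dT%H:%M:%S')

assert window_bound_str(0) == '1970-01-01T00:00:00'
assert window_bound_str(100) == '1970-01-01T00:01:40'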
def test_sessions_merging(self):
  windowfn = Sessions(10)

  def merge(*timestamps):
    windows = [windowfn.assign(context(None, t, [])) for t in timestamps]
    running = set()

    class TestMergeContext(WindowFn.MergeContext):
      def __init__(self):
        super(TestMergeContext, self).__init__(running)

      def merge(self, to_be_merged, merge_result):
        for w in to_be_merged:
          if w in running:
            running.remove(w)
        running.add(merge_result)

    for ws in windows:
      running.update(ws)
      windowfn.merge(TestMergeContext())
    windowfn.merge(TestMergeContext())
    return sorted(running)

  self.assertEqual([IntervalWindow(2, 12)], merge(2))
  self.assertEqual([IntervalWindow(2, 12), IntervalWindow(19, 29)],
                   merge(2, 19))
  self.assertEqual([IntervalWindow(2, 19)], merge(2, 9))
  self.assertEqual([IntervalWindow(2, 19)], merge(9, 2))
  self.assertEqual([IntervalWindow(2, 19), IntervalWindow(19, 29)],
                   merge(2, 9, 19))
  self.assertEqual([IntervalWindow(2, 19), IntervalWindow(19, 29)],
                   merge(19, 9, 2))
  self.assertEqual([IntervalWindow(2, 25)], merge(2, 15, 10))
def test_fixed_watermark_with_early_late(self):
  self.run_trigger_simple(
      FixedWindows(100),  # pyformat break
      AfterWatermark(early=AfterCount(3), late=AfterCount(2)),
      AccumulationMode.DISCARDING,
      zip(range(9), 'abcdefghi'),
      {
          IntervalWindow(0, 100): [
              set('abcd'),
              set('efgh'),  # early
              set('i'),  # on time
              set('vw'),
              set('xy')  # late
          ]
      },
      2,
      late_data=zip(range(5), 'vwxyz'))
def merge(self, merge_context):
  to_merge = []
  end = MIN_TIMESTAMP
  _logger.info("%d windows" % len(merge_context.windows))
  for w in sorted(merge_context.windows, key=lambda w: w.start):
    _logger.info(
        "WINDOW: (%s, %s)" %
        (format_timestamp(w.start), format_timestamp(w.end)))
    if to_merge:
      if end > w.start:
        # window `w` overlaps with `to_merge`: add it
        to_merge.append(w)
        if w.end == MAX_TIMESTAMP:
          _logger.info(
              "FINAL: (%s, %s)" %
              (format_timestamp(to_merge[0].start), format_timestamp(end)))
          # we don't want any more windows on this key
          end = w.start
          break
        elif w.end > end:
          end = w.end
      else:
        # FIXME: this check seems superfluous
        if len(to_merge) > 1:
          _logger.info(
              "NEW: (%s, %s)" %
              (format_timestamp(to_merge[0].start), format_timestamp(end)))
          merge_context.merge(
              to_merge, IntervalWindow(to_merge[0].start, end))
        to_merge = [w]
        end = w.end
    else:
      to_merge = [w]
      end = w.end
  if len(to_merge) > 1:
    _logger.info(
        "NEW: (%s, %s)" %
        (format_timestamp(to_merge[0].start), format_timestamp(end)))
    merge_context.merge(to_merge, IntervalWindow(to_merge[0].start, end))
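# Apart from its MAX_TIMESTAMP sentinel handling, the merge() above is the
# classic merge-overlapping-intervals sweep: sort windows by start, grow the
# current run while the next window starts strictly before the running `end`
# (so windows that merely touch stay separate), and flush a run of two or
# more as a single IntervalWindow. The same invariant on plain (start, end)
# pairs, as a self-contained illustration (not part of the WindowFn above):
def merge_plain_intervals(intervals):
  merged = []
  for start, end in sorted(intervals):
    if merged and merged[-1][1] > start:
      # Overlap: extend the current run if this interval reaches further.
      merged[-1] = (merged[-1][0], max(merged[-1][1], end))
    else:
      merged.append((start, end))
  return merged

assert merge_plain_intervals([(2, 12), (9, 19), (30, 40)]) == [(2, 19),
                                                               (30, 40)]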
def test_repeatedly_after_first(self):
  self.run_trigger_simple(
      FixedWindows(100),  # pyformat break
      Repeatedly(AfterAny(AfterCount(3), AfterWatermark())),
      AccumulationMode.ACCUMULATING,
      zip(range(7), 'abcdefg'),
      {
          IntervalWindow(0, 100): [
              set('abc'),
              set('abcdef'),
              set('abcdefg'),
              set('abcdefgx'),
              set('abcdefgxy'),
              set('abcdefgxyz')
          ]
      },
      1,
      late_data=zip(range(3), 'xyz'))
def test_sessions_watermark(self):
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      AfterWatermark(),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b')],
      {IntervalWindow(1, 12): [set('ab')]},
      1,
      2)
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      AfterWatermark(),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (15, 'c'), (16, 'd'), (30, 'z'), (9, 'e'),
       (10, 'f'), (30, 'y')],
      {
          IntervalWindow(1, 26): [set('abcdef')],
          IntervalWindow(30, 40): [set('yz')]
      },
      1,
      2,
      3,
      4,
      5,
      6)
def test_sliding_windows_simple_watermark(self):
  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([('k1', 1), ('k2', 1), ('k1', 1), ('k2', 1)])
      .add_elements([('k1', 1), ('k2', 1)])
      .advance_watermark_to(1)
      .add_elements([('k1', 2), ('k2', 2)])
      .add_elements([('k1', 2), ('k2', 2)])
      .advance_watermark_to(2)
      .add_elements([('k1', 3), ('k2', 3)])
      .add_elements([('k1', 3), ('k2', 3)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Sliding windows of size 2 and period 1 with DefaultTrigger (after
  # watermark).
  windowing = Windowing(SlidingWindows(2, 1))

  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(-1, 1), [1, 1, 1]),
            ('k2', IntervalWindow(-1, 1), [1, 1, 1]),
            ('k1', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
            ('k2', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
            ('k1', IntervalWindow(1, 3), [2, 2, 3, 3]),
            ('k2', IntervalWindow(1, 3), [2, 2, 3, 3]),
            ('k1', IntervalWindow(2, 4), [3, 3]),
            ('k2', IntervalWindow(2, 4), [3, 3]),
        ]))
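# SlidingWindows(size=2, period=1) assigns a timestamp t to every window
# [s, s + 2) whose start s is a multiple of the period and satisfies
# s <= t < s + 2, which is why each element above lands in two windows (e.g.
# t=0 falls in (-1, 1) and (0, 2)). A sketch of that assignment rule, not
# Beam's implementation:
def sliding_window_starts(t, size, period):
  last_start = t - (t % period)
  return list(range(last_start - size + period, last_start + 1, period))

assert sliding_window_starts(0, 2, 1) == [-1, 0]
assert sliding_window_starts(2, 2, 1) == [1, 2]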
def test_fixed_windows(self):
  # Test windows with offset: 2, 7, 12, 17, ...
  windowfn = FixedWindows(size=5, offset=2)
  self.assertEqual([IntervalWindow(7, 12)], windowfn.assign(context('v', 7)))
  self.assertEqual([IntervalWindow(7, 12)], windowfn.assign(context('v', 11)))
  self.assertEqual([IntervalWindow(12, 17)],
                   windowfn.assign(context('v', 12)))

  # Test windows without offset: 0, 5, 10, 15, ...
  windowfn = FixedWindows(size=5)
  self.assertEqual([IntervalWindow(5, 10)], windowfn.assign(context('v', 5)))
  self.assertEqual([IntervalWindow(5, 10)], windowfn.assign(context('v', 9)))
  self.assertEqual([IntervalWindow(10, 15)],
                   windowfn.assign(context('v', 10)))

  # Test windows with offset out of range.
  windowfn = FixedWindows(size=5, offset=12)
  self.assertEqual([IntervalWindow(7, 12)], windowfn.assign(context('v', 11)))
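# FixedWindows(size, offset) puts a timestamp t into the window starting at
# t - (t - offset) % size; because Python's % returns a non-negative result
# for a positive modulus, an offset larger than the size wraps around, which
# is why offset=12 with size=5 behaves like offset=2 above. A quick
# standalone check of that arithmetic (a sketch of the assignment rule, not
# Beam's implementation):
def fixed_window_bounds(t, size, offset=0):
  start = t - (t - offset) % size
  return (start, start + size)

assert fixed_window_bounds(7, 5, offset=2) == (7, 12)
assert fixed_window_bounds(11, 5, offset=2) == (7, 12)
assert fixed_window_bounds(12, 5, offset=2) == (12, 17)
assert fixed_window_bounds(11, 5, offset=12) == (7, 12)  # offset wraps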
def test_with_trigger_window_that_finish(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 0), tsv('k1', 2, 0)])
      .add_elements([tsv('k1', 3, 0)])
      .advance_watermark_to(2)
      .add_elements([tsv('k1', 6, 0)])  # This value is discarded.
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, one-second windows with an AfterWatermark trigger, zero allowed
  # lateness, and discarding panes: each window fires exactly once, and the
  # late element is dropped.
  windowing = Windowing(
      FixedWindows(1),
      triggerfn=AfterWatermark(),
      allowed_lateness=0,
      accumulation_mode=AccumulationMode.DISCARDING)

  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
        ]))
def _execute(
    self,
    window_fn,
    trigger_fn,
    accumulation_mode,
    timestamp_combiner,
    allowed_lateness,
    transcript,
    spec):
  if timestamp_combiner == TimestampCombiner.OUTPUT_AT_EARLIEST_TRANSFORMED:
    self.skipTest(
        'Non-fnapi timestamp combiner: %s' % spec.get('timestamp_combiner'))
  if accumulation_mode != AccumulationMode.ACCUMULATING:
    self.skipTest('Batch mode only makes sense for accumulating.')

  watermark = MIN_TIMESTAMP
  for action, params in transcript:
    if action == 'watermark':
      watermark = params
    elif action == 'input':
      if any(t <= watermark for t in params):
        self.skipTest('Batch mode never has late data.')

  inputs = sum([vs for action, vs in transcript if action == 'input'], [])
  final_panes_by_window = {}
  for action, params in transcript:
    if action == 'expect':
      for expected in params:
        trimmed = {}
        for field in ('window', 'values', 'timestamp'):
          if field in expected:
            trimmed[field] = expected[field]
        final_panes_by_window[tuple(expected['window'])] = trimmed
  final_panes = list(final_panes_by_window.values())

  if window_fn.is_merging():
    merged_away = set()

    class MergeContext(WindowFn.MergeContext):
      def merge(_, to_be_merged, merge_result):
        for window in to_be_merged:
          if window != merge_result:
            merged_away.add(window)

    all_windows = [IntervalWindow(*pane['window']) for pane in final_panes]
    window_fn.merge(MergeContext(all_windows))
    final_panes = [
        pane for pane in final_panes
        if IntervalWindow(*pane['window']) not in merged_away
    ]

  with TestPipeline() as p:
    input_pc = (
        p
        | beam.Create(inputs)
        | beam.Map(lambda t: TimestampedValue(('key', t), t))
        | beam.WindowInto(
            window_fn,
            trigger=trigger_fn,
            accumulation_mode=accumulation_mode,
            timestamp_combiner=timestamp_combiner,
            allowed_lateness=allowed_lateness))
    grouped = input_pc | 'Grouped' >> (
        beam.GroupByKey()
        | beam.MapTuple(_windowed_value_info_map_fn)
        | beam.MapTuple(lambda _, value: value))
    combined = input_pc | 'Combined' >> (
        beam.CombinePerKey(_ConcatCombineFn())
        | beam.MapTuple(_windowed_value_info_map_fn)
        | beam.MapTuple(lambda _, value: value))
    assert_that(
        grouped,
        lambda actual: _windowed_value_info_check(actual, final_panes),
        label='CheckGrouped')
    assert_that(
        combined,
        lambda actual: _windowed_value_info_check(actual, final_panes),
        label='CheckCombined')
class StandardCodersTest(unittest.TestCase):

  _urn_to_json_value_parser = {
      'beam:coder:bytes:v1': lambda x: x.encode('utf-8'),
      'beam:coder:string_utf8:v1': lambda x: x,
      'beam:coder:varint:v1': lambda x: x,
      'beam:coder:kv:v1': lambda x, key_parser, value_parser: (
          key_parser(x['key']), value_parser(x['value'])),
      'beam:coder:interval_window:v1': lambda x: IntervalWindow(
          start=Timestamp(micros=(x['end'] - x['span']) * 1000),
          end=Timestamp(micros=x['end'] * 1000)),
      'beam:coder:iterable:v1': lambda x, parser: list(map(parser, x)),
      'beam:coder:global_window:v1': lambda x: window.GlobalWindow(),
      'beam:coder:windowed_value:v1': lambda x, value_parser, window_parser: (
          windowed_value.create(
              value_parser(x['value']),
              x['timestamp'] * 1000,
              tuple([window_parser(w) for w in x['windows']]))),
      'beam:coder:timer:v1': lambda x, payload_parser: dict(
          payload=payload_parser(x['payload']),
          timestamp=Timestamp(micros=x['timestamp'] * 1000)),
      'beam:coder:double:v1': parse_float,
  }

  def test_standard_coders(self):
    for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
      logging.info('Executing %s test.', name)
      self._run_standard_coder(name, spec)

  def _run_standard_coder(self, name, spec):
    def assert_equal(actual, expected):
      """Handle nan values which self.assertEqual fails on."""
      if (isinstance(actual, float) and isinstance(expected, float) and
          math.isnan(actual) and math.isnan(expected)):
        return
      self.assertEqual(actual, expected)

    coder = self.parse_coder(spec['coder'])
    parse_value = self.json_value_parser(spec['coder'])
    nested_list = [spec['nested']] if 'nested' in spec else [True, False]
    for nested in nested_list:
      for expected_encoded, json_value in spec['examples'].items():
        value = parse_value(json_value)
        expected_encoded = expected_encoded.encode('latin1')
        if not spec['coder'].get('non_deterministic', False):
          actual_encoded = encode_nested(coder, value, nested)
          if self.fix and actual_encoded != expected_encoded:
            self.to_fix[spec['index'], expected_encoded] = actual_encoded
          else:
            self.assertEqual(expected_encoded, actual_encoded)
            decoded = decode_nested(coder, expected_encoded, nested)
            assert_equal(decoded, value)
        else:
          # Only verify decoding for a non-deterministic coder
          self.assertEqual(
              decode_nested(coder, expected_encoded, nested), value)

  def parse_coder(self, spec):
    context = pipeline_context.PipelineContext()
    coder_id = str(hash(str(spec)))
    component_ids = [
        context.coders.get_id(self.parse_coder(c))
        for c in spec.get('components', ())
    ]
    context.coders.put_proto(
        coder_id,
        beam_runner_api_pb2.Coder(
            spec=beam_runner_api_pb2.FunctionSpec(
                urn=spec['urn'], payload=spec.get('payload')),
            component_coder_ids=component_ids))
    return context.coders.get_by_id(coder_id)

  def json_value_parser(self, coder_spec):
    component_parsers = [
        self.json_value_parser(c) for c in coder_spec.get('components', ())
    ]
    return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
        x, *component_parsers)

  # Used when --fix is passed.
  fix = False
  to_fix = {}

  @classmethod
  def tearDownClass(cls):
    if cls.fix and cls.to_fix:
      print("FIXING", len(cls.to_fix), "TESTS")
      doc_sep = '\n---\n'
      docs = open(STANDARD_CODERS_YAML).read().split(doc_sep)

      def quote(s):
        return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')

      for (doc_ix, expected_encoded), actual_encoded in cls.to_fix.items():
        print(quote(expected_encoded), "->", quote(actual_encoded))
        docs[doc_ix] = docs[doc_ix].replace(
            quote(expected_encoded) + ':', quote(actual_encoded) + ':')
      open(STANDARD_CODERS_YAML, 'w').write(doc_sep.join(docs))
def test_streaming_wordcount(self):
  class WordExtractingDoFn(beam.DoFn):
    def process(self, element):
      text_line = element.strip()
      words = text_line.split()
      return words

  # Add the TestStream so that it can be cached.
  ib.options.capturable_sources.add(TestStream)

  p = beam.Pipeline(
      runner=interactive_runner.InteractiveRunner(),
      options=StandardOptions(streaming=True))

  data = (
      p
      | TestStream()
          .advance_watermark_to(0)
          .advance_processing_time(1)
          .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
          .advance_watermark_to(20)
          .advance_processing_time(1)
          .add_elements(['that', 'is', 'the', 'question'])
      | beam.WindowInto(beam.window.FixedWindows(10)))  # yapf: disable

  counts = (
      data
      | 'split' >> beam.ParDo(WordExtractingDoFn())
      | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

  # Watch the local scope for Interactive Beam so that referenced
  # PCollections will be cached.
  ib.watch(locals())

  # This is normally done in the interactive_utils when a transform is
  # applied but needs an IPython environment. So we manually run this here.
  ie.current_env().track_user_pipelines()

  # Create a fake limiter that cancels the BCJ once the main job receives the
  # expected amount of results.
  class FakeLimiter:
    def __init__(self, p, pcoll):
      self.p = p
      self.pcoll = pcoll

    def is_triggered(self):
      result = ie.current_env().pipeline_result(self.p)
      if result:
        try:
          results = result.get(self.pcoll)
        except ValueError:
          return False
        return len(results) >= 10
      return False

  # This sets the limiters to stop reading when the test receives 10
  # elements.
  ie.current_env().options.capture_control.set_limiters_for_test(
      [FakeLimiter(p, data)])

  # This tests that the data was correctly cached.
  pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
  expected_data_df = pd.DataFrame([
      ('to', 0, [IntervalWindow(0, 10)], pane_info),
      ('be', 0, [IntervalWindow(0, 10)], pane_info),
      ('or', 0, [IntervalWindow(0, 10)], pane_info),
      ('not', 0, [IntervalWindow(0, 10)], pane_info),
      ('to', 0, [IntervalWindow(0, 10)], pane_info),
      ('be', 0, [IntervalWindow(0, 10)], pane_info),
      ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
      ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
      ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
      ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
  ], columns=[0, 'event_time', 'windows', 'pane_info'])  # yapf: disable

  data_df = ib.collect(data, include_window_info=True)
  pd.testing.assert_frame_equal(expected_data_df, data_df)

  # This tests that the windowing was passed correctly so that all the data
  # is aggregated also correctly.
  pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
  expected_counts_df = pd.DataFrame([
      ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
      ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
      ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
      ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
      ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
      ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
      ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
      ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
  ], columns=[0, 1, 'event_time', 'windows', 'pane_info'])  # yapf: disable

  counts_df = ib.collect(counts, include_window_info=True)

  # The group by key has no guarantee of order. So we post-process the DF by
  # sorting so we can test equality.
  sorted_counts_df = (
      counts_df
      .sort_values(['event_time', 0], ascending=True)
      .reset_index(drop=True))  # yapf: disable
  pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)