def test_multi_triggered_gbk_side_input(self):
  """Test a GBK sideinput, with multiple triggering."""
  # TODO(BEAM-9322): Remove use of this experiment.
  # This flag is only necessary when using the multi-output TestStream b/c
  # it relies on using the PCollection output tags as the PCollection output
  # ids.
  p = TestPipeline(additional_pipeline_args=[
      '--experiments=' + 'passthrough_pcollection_output_ids'
  ])

  test_stream = (p | 'Mixed TestStream' >> TestStream()
                 .advance_watermark_to(3, tag='main')
                 .add_elements(['a1'], tag='main')
                 .advance_watermark_to(8, tag='main')
                 .add_elements(['a2'], tag='main')
                 .add_elements([window.TimestampedValue(('k', 100), 2)],
                               tag='side')
                 .add_elements([window.TimestampedValue(('k', 400), 7)],
                               tag='side')
                 .advance_watermark_to_infinity(tag='main')
                 .advance_watermark_to_infinity(tag='side'))

  main_data = (
      test_stream['main']
      | 'Main windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))

  side_data = (
      test_stream['side']
      | 'Side windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.CombinePerKey(sum)
      | 'Values' >> Map(lambda k_vs: k_vs[1]))

  class RecordFn(beam.DoFn):
    def process(self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (
      main_data
      | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

  expected_window_to_elements = {
      window.IntervalWindow(0, 5): [
          ('a1', Timestamp(3), [100, 0]),
      ],
      window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
  }

  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      use_global_window=False,
      label='assert per window')

  p.run()
def test_basic_execution_sideinputs(self):
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  with TestPipeline(options=options) as p:
    test_stream = (p | TestStream()
                   .advance_watermark_to(0, tag='side')
                   .advance_watermark_to(10, tag='main')
                   .add_elements(['e'], tag='main')
                   .add_elements([window.TimestampedValue(2, 2)], tag='side')
                   .add_elements([window.TimestampedValue(1, 1)], tag='side')
                   .add_elements([window.TimestampedValue(7, 7)], tag='side')
                   .add_elements([window.TimestampedValue(4, 4)], tag='side')
                   )  # yapf: disable

    main_stream = test_stream['main']
    side_stream = test_stream['side']

    class RecordFn(beam.DoFn):
      def process(self,
                  elm=beam.DoFn.ElementParam,
                  ts=beam.DoFn.TimestampParam,
                  side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (
        main_stream  # pylint: disable=unused-variable
        | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

    assert_that(records, equal_to([('e', Timestamp(10), [2, 1, 7, 4])]))
def test_combiner_latest(self):
  """Test TimestampCombiner with LATEST."""
  options = PipelineOptions(streaming=True)
  with TestPipeline(options=options) as p:
    result = (
        p
        | TestStream()
          .add_elements([window.TimestampedValue(('k', 100), 2)])
          .add_elements([window.TimestampedValue(('k', 400), 7)])
          .advance_watermark_to_infinity()
        | beam.WindowInto(
            window.FixedWindows(10),
            timestamp_combiner=TimestampCombiner.OUTPUT_AT_LATEST)
        | beam.CombinePerKey(sum))

    records = (
        result
        | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

    # All KV pairs with the same key are grouped (GBK) and the combined
    # value is emitted with the LATEST timestamp of its inputs.
    expected_window_to_elements = {
        window.IntervalWindow(0, 10): [
            (('k', 500), Timestamp(7)),
        ],
    }

    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        use_global_window=False,
        label='assert per window')
def test_globally(self):
  l = [
      window.TimestampedValue(3, 100),
      window.TimestampedValue(1, 200),
      window.TimestampedValue(2, 300)
  ]
  with TestPipeline() as p:
    # Map(lambda x: x) PTransform is added after Create here, because when
    # a PCollection of TimestampedValues is created with Create PTransform,
    # the timestamps are not assigned to it. Adding a Map forces the
    # PCollection to go through a DoFn so that the PCollection consists of
    # the elements with timestamps assigned to them instead of a PCollection
    # of TimestampedValue(element, timestamp).
    pcoll = p | Create(l) | Map(lambda x: x)
    latest = pcoll | combine.Latest.Globally()
    assert_that(latest, equal_to([2]))

    # Now for global combines without default
    windowed = pcoll | 'window' >> WindowInto(FixedWindows(180))
    result_windowed = (
        windowed
        | 'latest wo defaults' >>
        combine.Latest.Globally().without_defaults())
    assert_that(result_windowed, equal_to([3, 2]), label='latest-wo-defaults')
def test_per_key(self):
  l = [
      window.TimestampedValue(('a', 1), 300),
      window.TimestampedValue(('b', 3), 100),
      window.TimestampedValue(('a', 2), 200)
  ]
  with TestPipeline() as p:
    pc = p | Create(l) | Map(lambda x: x)
    latest = pc | combine.Latest.PerKey()
    assert_that(latest, equal_to([('a', 1), ('b', 3)]))
def test_multi_triggered_gbk_side_input(self):
  """Test a GBK sideinput, with multiple triggering."""
  options = StandardOptions(streaming=True)
  p = TestPipeline(options=options)

  test_stream = (p | 'Mixed TestStream' >> TestStream()
                 .advance_watermark_to(3, tag='main')
                 .add_elements(['a1'], tag='main')
                 .advance_watermark_to(8, tag='main')
                 .add_elements(['a2'], tag='main')
                 .add_elements([window.TimestampedValue(('k', 100), 2)],
                               tag='side')
                 .add_elements([window.TimestampedValue(('k', 400), 7)],
                               tag='side')
                 .advance_watermark_to_infinity(tag='main')
                 .advance_watermark_to_infinity(tag='side'))

  main_data = (
      test_stream['main']
      | 'Main windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))

  side_data = (
      test_stream['side']
      | 'Side windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.CombinePerKey(sum)
      | 'Values' >> Map(lambda k_vs: k_vs[1]))

  class RecordFn(beam.DoFn):
    def process(self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (
      main_data
      | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

  expected_window_to_elements = {
      window.IntervalWindow(0, 5): [
          ('a1', Timestamp(3), [100, 0]),
      ],
      window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
  }

  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      use_global_window=False,
      label='assert per window')

  p.run()
def test_basic_execution_sideinputs_fixed_windows(self):
  options = PipelineOptions()
  options.view_as(DebugOptions).add_experiment(
      'passthrough_pcollection_output_ids')
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)

  test_stream = (p | TestStream()
                 .advance_watermark_to(12, tag='side')
                 .add_elements([window.TimestampedValue('s1', 10)], tag='side')
                 .advance_watermark_to(20, tag='side')
                 .add_elements([window.TimestampedValue('s2', 20)], tag='side')
                 .advance_watermark_to(9, tag='main')
                 .add_elements(['a1', 'a2', 'a3', 'a4'], tag='main')
                 .add_elements(['b'], tag='main')
                 .advance_watermark_to(18, tag='main')
                 .add_elements(['c'], tag='main')
                 )  # yapf: disable

  main_stream = (
      test_stream['main']
      | 'main windowInto' >> beam.WindowInto(window.FixedWindows(1)))
  side_stream = (
      test_stream['side']
      | 'side windowInto' >> beam.WindowInto(window.FixedWindows(3)))

  class RecordFn(beam.DoFn):
    def process(
        self,
        elm=beam.DoFn.ElementParam,
        ts=beam.DoFn.TimestampParam,
        side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (
      main_stream  # pylint: disable=unused-variable
      | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

  # assert per window
  expected_window_to_elements = {
      window.IntervalWindow(9, 10): [
          ('a1', Timestamp(9), ['s1']),
          ('a2', Timestamp(9), ['s1']),
          ('a3', Timestamp(9), ['s1']),
          ('a4', Timestamp(9), ['s1']),
          ('b', Timestamp(9), ['s1'])
      ],
      window.IntervalWindow(18, 19): [('c', Timestamp(18), ['s2'])],
  }

  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      label='assert per window')

  p.run()
def test_basic_execution_sideinputs_fixed_windows(self):
  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  global result  # pylint: disable=global-variable-undefined
  result = []

  def recorded_elements(elem):
    result.append(elem)
    return elem

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)

  main_stream = (p | 'main TestStream' >> TestStream()
                 .advance_watermark_to(9)
                 .add_elements(['a1', 'a2', 'a3', 'a4'])
                 .add_elements(['b'])
                 .advance_watermark_to(18)
                 .add_elements(['c'])
                 | 'main windowInto' >> beam.WindowInto(
                     window.FixedWindows(1)))
  side_stream = (p | 'side TestStream' >> TestStream()
                 .advance_watermark_to(12)
                 .add_elements([window.TimestampedValue('s1', 10)])
                 .advance_watermark_to(20)
                 .add_elements([window.TimestampedValue('s2', 20)])
                 | 'side windowInto' >> beam.WindowInto(
                     window.FixedWindows(3)))

  class RecordFn(beam.DoFn):
    def process(self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (main_stream  # pylint: disable=unused-variable
             | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream))
             | beam.Map(recorded_elements))

  p.run()

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  self.assertEqual([('a1', Timestamp(9), ['s1']),
                    ('a2', Timestamp(9), ['s1']),
                    ('a3', Timestamp(9), ['s1']),
                    ('a4', Timestamp(9), ['s1']),
                    ('b', Timestamp(9), ['s1']),
                    ('c', Timestamp(18), ['s2'])], result)
def test_globally(self):
  l = [
      window.TimestampedValue(3, 100),
      window.TimestampedValue(1, 200),
      window.TimestampedValue(2, 300)
  ]
  with TestPipeline() as p:
    # Map(lambda x: x) PTransform is added after Create here, because when
    # a PCollection of TimestampedValues is created with Create PTransform,
    # the timestamps are not assigned to it. Adding a Map forces the
    # PCollection to go through a DoFn so that the PCollection consists of
    # the elements with timestamps assigned to them instead of a PCollection
    # of TimestampedValue(element, timestamp).
    pc = p | Create(l) | Map(lambda x: x)
    latest = pc | combine.Latest.Globally()
    assert_that(latest, equal_to([2]))
def generate_events(self):
  publish_client = pubsub.Client(project=self.project)
  topic = publish_client.topic(self.topic_name)
  sub = topic.subscription(self.subscription_name)

  logging.info('Generating auction events to topic %s', topic.name)

  if self.args.input.startswith('gs://'):
    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    fs = GCSFileSystem(self.pipeline_options)
    with fs.open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)
  else:
    with open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)

  logging.info('Finished event generation.')

  # Read from PubSub into a PCollection.
  if self.args.subscription_name:
    raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
        subscription=sub.full_name)
  else:
    raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
        topic=topic.full_name)

  raw_events = (
      raw_events
      | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEventFn())
      | 'timestamping' >>
      beam.Map(lambda e: window.TimestampedValue(e, e.date_time)))

  return raw_events
def main_without_pubsub(options):
  from rillbeam.transforms import SleepFn

  with beam.Pipeline(options=options) as pipe:
    # FIXME: still can't "fake" timestamp data like we get from pubsub...
    graph = (
        pipe
        | 'start' >> beam.Create([(k, k) for k in range(5)])
        # The purpose of the WindowInto transform is to establish a
        # FixedWindows windowing function for the PCollection.
        # It does not bucket elements into windows since the timestamps
        # from Create are not spaced 5 seconds apart and very likely they
        # all fall into the same window.
        | 'w' >> beam.WindowInto(window.FixedWindows(5))
        # Generate timestamped values using the values as timestamps.
        # Now there are values 5 seconds apart and since Map propagates the
        # windowing function from input to output the output PCollection
        # will have elements falling into different 5-second windows.
        | beam.Map(lambda x_t2: window.TimestampedValue(x_t2[0], x_t2[1])))

    b1 = (graph
          | 'AsInt' >> beam.Map(lambda x: int(x))
          | 'LogInt' >> Log())

    b2 = (graph
          | 'AsStr' >> beam.Map(lambda x: str(x))
          | 'LogStr' >> Log())

    b3 = (b1
          | 'Sleep' >> beam.ParDo(SleepFn(), duration=0.2)
          | 'AsFloat' >> beam.Map(lambda x: float(x))
          | 'LogFloat' >> Log())

    ((b1, b2, b3)
     | Sync()
     | 'SyncLog' >> Log())
def test_stateful_set_state_clean_portably(self):

  class SetStateClearingStatefulDoFn(beam.DoFn):

    SET_STATE = SetStateSpec('buffer', VarIntCoder())
    EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.WATERMARK)

    def process(self,
                element,
                set_state=beam.DoFn.StateParam(SET_STATE),
                emit_timer=beam.DoFn.TimerParam(EMIT_TIMER)):
      _, value = element
      set_state.add(value)

      all_elements = [element for element in set_state.read()]

      if len(all_elements) == 5:
        set_state.clear()
        set_state.add(100)
        emit_timer.set(1)

    @on_timer(EMIT_TIMER)
    def emit_values(self, set_state=beam.DoFn.StateParam(SET_STATE)):
      yield sorted(set_state.read())

  with TestPipeline() as p:
    values = p | beam.Create([('key', 1), ('key', 2), ('key', 3), ('key', 4),
                              ('key', 5)])
    actual_values = (
        values
        | beam.Map(lambda t: window.TimestampedValue(t, 1))
        | beam.WindowInto(window.FixedWindows(1))
        | beam.ParDo(SetStateClearingStatefulDoFn()))

    assert_that(actual_values, equal_to([[100]]))
def test_pardo_side_inputs(self):
  def cross_product(elem, sides):
    for side in sides:
      yield elem, side

  with self.create_pipeline() as p:
    main = p | 'main' >> beam.Create(['a', 'b', 'c'])
    side = p | 'side' >> beam.Create(['x', 'y'])
    assert_that(
        main | beam.FlatMap(cross_product, beam.pvalue.AsList(side)),
        equal_to([('a', 'x'), ('b', 'x'), ('c', 'x'),
                  ('a', 'y'), ('b', 'y'), ('c', 'y')]))

    # Now with some windowing.
    pcoll = p | beam.Create(list(range(10))) | beam.Map(
        lambda t: window.TimestampedValue(t, t))
    # Intentionally choosing non-aligned windows to highlight the transition.
    main = pcoll | 'WindowMain' >> beam.WindowInto(window.FixedWindows(5))
    side = pcoll | 'WindowSide' >> beam.WindowInto(window.FixedWindows(7))
    res = main | beam.Map(lambda x, s: (x, sorted(s)),
                          beam.pvalue.AsList(side))
    assert_that(
        res,
        equal_to([
            # The window [0, 5) maps to the window [0, 7).
            (0, list(range(7))),
            (1, list(range(7))),
            (2, list(range(7))),
            (3, list(range(7))),
            (4, list(range(7))),
            # The window [5, 10) maps to the window [7, 14).
            (5, list(range(7, 10))),
            (6, list(range(7, 10))),
            (7, list(range(7, 10))),
            (8, list(range(7, 10))),
            (9, list(range(7, 10)))
        ]),
        label='windowed')
def test_basic_execution_batch_sideinputs(self):
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)

  main_stream = (p | 'main TestStream' >> TestStream()
                 .advance_watermark_to(10)
                 .add_elements(['e'])
                 .advance_watermark_to_infinity())  # yapf: disable
  side = (
      p
      | beam.Create([2, 1, 4])
      | beam.Map(lambda t: window.TimestampedValue(t, t)))

  class RecordFn(beam.DoFn):
    def process(self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
      yield (elm, ts, sorted(side))

  records = (
      main_stream  # pylint: disable=unused-variable
      | beam.ParDo(RecordFn(), beam.pvalue.AsList(side)))

  assert_that(records, equal_to([('e', Timestamp(10), [1, 2, 4])]))

  p.run()
def test_pardo_windowed_side_inputs(self):
  with self.create_pipeline() as p:
    # Now with some windowing.
    pcoll = p | beam.Create(list(range(10))) | beam.Map(
        lambda t: window.TimestampedValue(t, t))
    # Intentionally choosing non-aligned windows to highlight the transition.
    main = pcoll | 'WindowMain' >> beam.WindowInto(window.FixedWindows(5))
    side = pcoll | 'WindowSide' >> beam.WindowInto(window.FixedWindows(7))
    res = main | beam.Map(lambda x, s: (x, sorted(s)),
                          beam.pvalue.AsList(side))
    assert_that(
        res,
        equal_to([
            # The window [0, 5) maps to the window [0, 7).
            (0, list(range(7))),
            (1, list(range(7))),
            (2, list(range(7))),
            (3, list(range(7))),
            (4, list(range(7))),
            # The window [5, 10) maps to the window [7, 14).
            (5, list(range(7, 10))),
            (6, list(range(7, 10))),
            (7, list(range(7, 10))),
            (8, list(range(7, 10))),
            (9, list(range(7, 10)))
        ]),
        label='windowed')
def read_from_file(self):
  return (
      self.pipeline
      | 'reading_from_file' >> beam.io.ReadFromText(self.args.input)
      | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEventFn())
      | 'timestamping' >>
      beam.Map(lambda e: window.TimestampedValue(e, e.date_time)))
def assign_timevalue(v):
  # Attach a timestamp to each element of the PCollection.
  # Downstream windowing transforms split elements into windows based on
  # this timestamp. Here a random offset is used just for illustration.
  import apache_beam.transforms.window as window
  import random
  import time
  return window.TimestampedValue(v, int(time.time()) + random.randint(0, 1))
def process(self, element, init_result):
  bundle = element
  writer = self.sink.open_writer(init_result, str(uuid.uuid4()))
  for e in bundle[1]:  # values
    writer.write(e)
  return [
      window.TimestampedValue(writer.close(), timestamp.MAX_TIMESTAMP)
  ]
def test_deduplication_with_event_time(self):
  deduplicate_duration = 60
  with self.create_pipeline() as p:
    test_stream = (
        TestStream(coder=coders.StrUtf8Coder())
        .with_output_types(str)
        .advance_watermark_to(0)
        .add_elements([
            window.TimestampedValue('k1', 0),
            window.TimestampedValue('k2', 20),
            window.TimestampedValue('k3', 30)
        ])
        .advance_watermark_to(30)
        .add_elements([
            window.TimestampedValue('k1', 40),
            window.TimestampedValue('k2', 50),
            window.TimestampedValue('k3', 60)
        ])
        .advance_watermark_to(deduplicate_duration)
        .add_elements([window.TimestampedValue('k1', 70)])
        .advance_watermark_to_infinity())

    res = (
        p
        | test_stream
        | deduplicate.Deduplicate(
            event_time_duration=Duration(deduplicate_duration))
        | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

    assert_that(
        res,
        equal_to([('k1', Timestamp(0)), ('k2', Timestamp(20)),
                  ('k3', Timestamp(30)), ('k1', Timestamp(70))]))
def test_windowing(self):
  with self.create_pipeline() as p:
    res = (
        p
        | beam.Create([1, 2, 100, 101, 102])
        | beam.Map(lambda t: window.TimestampedValue(('k', t), t))
        | beam.WindowInto(beam.transforms.window.Sessions(10))
        | beam.GroupByKey()
        | beam.Map(lambda k_vs1: (k_vs1[0], sorted(k_vs1[1]))))
    assert_that(res, equal_to([('k', [1, 2]), ('k', [100, 101, 102])]))
def _run_pardo_state_timers(self, windowed):
  state_spec = userstate.BagStateSpec('state', beam.coders.StrUtf8Coder())
  timer_spec = userstate.TimerSpec('timer', userstate.TimeDomain.WATERMARK)
  elements = list('abcdefgh')
  buffer_size = 3

  class BufferDoFn(beam.DoFn):
    def process(self,
                kv,
                ts=beam.DoFn.TimestampParam,
                timer=beam.DoFn.TimerParam(timer_spec),
                state=beam.DoFn.StateParam(state_spec)):
      _, element = kv
      state.add(element)
      buffer = state.read()
      # For real use, we'd keep track of this size separately.
      if len(list(buffer)) >= 3:
        state.clear()
        yield buffer
      else:
        timer.set(ts + 1)

    @userstate.on_timer(timer_spec)
    def process_timer(self, state=beam.DoFn.StateParam(state_spec)):
      buffer = state.read()
      state.clear()
      yield buffer

  def is_buffered_correctly(actual):
    # Pickling self in the closure for asserts gives errors (only on jenkins).
    self = FnApiRunnerTest('__init__')
    # Actual should be a grouping of the inputs into batches of size
    # at most buffer_size, but the actual batching is nondeterministic
    # based on ordering and trigger firing timing.
    self.assertEqual(sorted(sum((list(b) for b in actual), [])), elements)
    self.assertEqual(max(len(list(buffer)) for buffer in actual), buffer_size)
    if windowed:
      # Elements were assigned to windows based on their parity.
      # Assert that each grouping consists of elements belonging to the
      # same window to ensure states and timers were properly partitioned.
      for b in actual:
        parity = set(ord(e) % 2 for e in b)
        self.assertEqual(1, len(parity), b)

  with self.create_pipeline() as p:
    actual = (
        p
        | beam.Create(elements)
        # Send even and odd elements to different windows.
        | beam.Map(lambda e: window.TimestampedValue(e, ord(e) % 2))
        | beam.WindowInto(
            window.FixedWindows(1) if windowed else window.GlobalWindows())
        | beam.Map(lambda x: ('key', x))
        | beam.ParDo(BufferDoFn()))

    assert_that(actual, is_buffered_correctly)
def test_basic_execution_sideinputs(self):
  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  global result  # pylint: disable=global-variable-undefined
  result = []

  def recorded_elements(elem):
    result.append(elem)
    return elem

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)

  main_stream = (p | 'main TestStream' >> TestStream()
                 .advance_watermark_to(10)
                 .add_elements(['e']))
  side_stream = (p | 'side TestStream' >> TestStream()
                 .add_elements([window.TimestampedValue(2, 2)])
                 .add_elements([window.TimestampedValue(1, 1)])
                 .add_elements([window.TimestampedValue(7, 7)])
                 .add_elements([window.TimestampedValue(4, 4)]))

  class RecordFn(beam.DoFn):
    def process(self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (main_stream  # pylint: disable=unused-variable
             | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream))
             | beam.Map(recorded_elements))

  p.run()

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  self.assertEqual([('e', Timestamp(10), [2, 1, 7, 4])], result)
def test_basic_execution_sideinputs(self):
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)

  main_stream = (p | 'main TestStream' >> TestStream()
                 .advance_watermark_to(10)
                 .add_elements(['e']))
  side_stream = (p | 'side TestStream' >> TestStream()
                 .add_elements([window.TimestampedValue(2, 2)])
                 .add_elements([window.TimestampedValue(1, 1)])
                 .add_elements([window.TimestampedValue(7, 7)])
                 .add_elements([window.TimestampedValue(4, 4)]))

  class RecordFn(beam.DoFn):
    def process(self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (main_stream  # pylint: disable=unused-variable
             | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

  # assert per window
  expected_window_to_elements = {
      window.IntervalWindow(0, 15): [
          ('e', Timestamp(10), [2, 1, 7, 4]),
      ],
  }
  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      custom_windowing=window.FixedWindows(15),
      label='assert per window')

  assert_that(records, equal_to([('e', Timestamp(10), [2, 1, 7, 4])]))

  p.run()
def _finalize_write(_, sink, init_result, write_results, min_shards):
  write_results = list(write_results)
  extra_shards = []
  if len(write_results) < min_shards:
    logging.debug(
        'Creating %s empty shard(s).', min_shards - len(write_results))
    for _ in range(min_shards - len(write_results)):
      writer = sink.open_writer(init_result, str(uuid.uuid4()))
      extra_shards.append(writer.close())
  outputs = sink.finalize_write(init_result, write_results + extra_shards)
  if outputs:
    return (
        window.TimestampedValue(v, window.MAX_TIMESTAMP) for v in outputs)
def _success_write(self, _, outputs):
  """Writes a success file to the final dir.

  :param outputs: outputs of the finalized write, passed through unchanged.
  :return: the original outputs followed by the success file name.
  """
  main_dir = os.path.dirname(self.sink.file_path_prefix.get())
  success_filename = '/'.join([main_dir, '_SUCCESS'])
  FileSystems.create(success_filename, 'text/plain').close()
  for v in outputs:
    yield v
  yield window.TimestampedValue(success_filename, window.MAX_TIMESTAMP)
def test_windowed_batches(self):
  # Assumes a single bundle, in order...
  with TestPipeline() as p:
    res = (
        p
        | beam.Create(range(47))
        | beam.Map(lambda t: window.TimestampedValue(t, t))
        | beam.WindowInto(window.FixedWindows(30))
        | util.BatchElements(
            min_batch_size=5, max_batch_size=10, clock=FakeClock())
        | beam.Map(len))
    assert_that(
        res,
        equal_to([
            5, 5, 10, 10,  # elements in [0, 30)
            10, 7,  # elements in [30, 47)
        ]))
def test_deduplication_in_different_windows(self):
  with self.create_pipeline() as p:
    test_stream = (
        TestStream(coder=coders.StrUtf8Coder())
        .advance_watermark_to(0)
        .add_elements([
            window.TimestampedValue('k1', 0),
            window.TimestampedValue('k2', 10),
            window.TimestampedValue('k3', 20),
            window.TimestampedValue('k1', 30),
            window.TimestampedValue('k2', 40),
            window.TimestampedValue('k3', 50),
            window.TimestampedValue('k4', 60),
            window.TimestampedValue('k5', 70),
            window.TimestampedValue('k6', 80)
        ])
        .advance_watermark_to_infinity())

    res = (
        p
        | test_stream
        | beam.WindowInto(window.FixedWindows(30))
        | deduplicate.Deduplicate(processing_time_duration=10 * 60)
        | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

    # Deduplication should happen per window.
    expect_unique_keys_per_window = {
        window.IntervalWindow(0, 30): [('k1', Timestamp(0)),
                                       ('k2', Timestamp(10)),
                                       ('k3', Timestamp(20))],
        window.IntervalWindow(30, 60): [('k1', Timestamp(30)),
                                        ('k2', Timestamp(40)),
                                        ('k3', Timestamp(50))],
        window.IntervalWindow(60, 90): [('k4', Timestamp(60)),
                                        ('k5', Timestamp(70)),
                                        ('k6', Timestamp(80))],
    }
    assert_that(
        res,
        equal_to_per_window(expect_unique_keys_per_window),
        use_global_window=False,
        label='assert per window')
def test_basic_execution_batch_sideinputs_fixed_windows(self):
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)

  main_stream = (
      p
      | 'main TestStream' >> TestStream()
        .advance_watermark_to(2)
        .add_elements(['a'])
        .advance_watermark_to(4)
        .add_elements(['b'])
        .advance_watermark_to_infinity()
      | 'main window' >> beam.WindowInto(window.FixedWindows(1)))
  side = (
      p
      | beam.Create([2, 1, 4])
      | beam.Map(lambda t: window.TimestampedValue(t, t))
      | beam.WindowInto(window.FixedWindows(2)))

  class RecordFn(beam.DoFn):
    def process(
        self,
        elm=beam.DoFn.ElementParam,
        ts=beam.DoFn.TimestampParam,
        side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (
      main_stream  # pylint: disable=unused-variable
      | beam.ParDo(RecordFn(), beam.pvalue.AsList(side)))

  # assert per window
  expected_window_to_elements = {
      window.IntervalWindow(2, 3): [('a', Timestamp(2), [2])],
      window.IntervalWindow(4, 5): [('b', Timestamp(4), [4])]
  }
  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      label='assert per window')

  p.run()
def run_windowed_side_inputs(self,
                             elements,
                             main_window_fn,
                             side_window_fn=None,
                             side_input_type=beam.pvalue.AsList,
                             combine_fn=None,
                             expected=None):
  with self.create_pipeline() as p:
    pcoll = p | beam.Create(elements) | beam.Map(
        lambda t: window.TimestampedValue(t, t))
    main = pcoll | 'WindowMain' >> beam.WindowInto(main_window_fn)
    side = pcoll | 'WindowSide' >> beam.WindowInto(
        side_window_fn or main_window_fn)
    kw = {}
    if combine_fn is not None:
      side |= beam.CombineGlobally(combine_fn).without_defaults()
      kw['default_value'] = 0
    elif side_input_type == beam.pvalue.AsDict:
      side |= beam.Map(lambda x: ('k%s' % x, 'v%s' % x))
    res = main | beam.Map(lambda x, s: (x, s), side_input_type(side, **kw))
    if side_input_type in (beam.pvalue.AsIter, beam.pvalue.AsList):
      # Sort the side input contents for a deterministic comparison.
      res |= beam.Map(lambda x_s: (x_s[0], sorted(x_s[1])))
    assert_that(res, equal_to(expected))
def test_sessions_combine(self):
  with TestPipeline() as p:
    input = (
        p
        | beam.Create([('c', 1), ('c', 9), ('c', 12), ('d', 2), ('d', 4)])
        | beam.MapTuple(lambda k, v: window.TimestampedValue((k, v), v))
        | beam.WindowInto(window.Sessions(4)))

    global_sum = (
        input
        | beam.Values()
        | beam.CombineGlobally(sum).without_defaults())
    sum_per_key = input | beam.CombinePerKey(sum)

    # The first window has 3 elements: ('c', 1), ('d', 2), ('d', 4).
    # The second window has 2 elements: ('c', 9), ('c', 12).
    assert_that(global_sum, equal_to([7, 21]), label='global sum')
    assert_that(
        sum_per_key,
        equal_to([('c', 1), ('c', 21), ('d', 6)]),
        label='sum per key')