def test_model_other_composite_triggers(self):
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(StandardOptions).streaming = True
  with TestPipeline(options=pipeline_options) as p:
    test_stream = (
        TestStream()
        .advance_watermark_to(10)
        .add_elements(['a', 'a'])
        .add_elements(['a', 'b', 'b'])
        .advance_processing_time(60)
        .add_elements(['a'] * 100))
    pcollection = (
        p
        | test_stream
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

    counts = (
        # [START model_other_composite_triggers]
        pcollection | WindowInto(
            FixedWindows(1 * 60),
            trigger=Repeatedly(
                AfterAny(AfterCount(100), AfterProcessingTime(1 * 60))),
            accumulation_mode=AccumulationMode.DISCARDING)
        # [END model_other_composite_triggers]
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(
            lambda word_ones: (word_ones[0], sum(word_ones[1]))))
    assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 100)]))

def test_fixed_after_count_accumulating(self):
  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([('k1', 1), ('k1', 1), ('k2', 1), ('k2', 1)])
      .add_elements([('k1', 1), ('k1', 1)])
      .advance_watermark_to(2)
      .add_elements([('k1', 2), ('k2', 2)])  # These values are discarded.
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, two-second windows, firing repeatedly after every two elements.
  windowing = Windowing(
      FixedWindows(2),
      triggerfn=Repeatedly(AfterCount(2)),
      accumulation_mode=AccumulationMode.ACCUMULATING)

  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 2), [1, 1]),
            ('k2', IntervalWindow(0, 2), [1, 1]),
            ('k1', IntervalWindow(0, 2), [1, 1, 1, 1]),
        ]))

def test_sessions_repeatedly_after_count(self):
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      Repeatedly(AfterCount(2)),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')],
      {IntervalWindow(1, 25): [set('abc'), set('abcde')]},
      1,
      3)
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      Repeatedly(AfterCount(2)),
      AccumulationMode.DISCARDING,
      [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')],
      {IntervalWindow(1, 25): [set('abc'), set('de')]},
      1,
      3)

def expand(self, pcoll):
  windowing_saved = pcoll.windowing
  if windowing_saved.is_default():
    # In this (common) case we can use a trivial trigger driver
    # and avoid the (expensive) window param.
    globally_windowed = window.GlobalWindows.windowed_value(None)
    MIN_TIMESTAMP = window.MIN_TIMESTAMP

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      if timestamp == MIN_TIMESTAMP:
        timestamp = None
      return key, (value, timestamp)

    def restore_timestamps(element):
      key, values = element
      return [
          globally_windowed.with_value((key, value)) if timestamp is None else
          window.GlobalWindows.windowed_value((key, value), timestamp)
          for (value, timestamp) in values
      ]
  else:

    def reify_timestamps(
        element, timestamp=DoFn.TimestampParam, window=DoFn.WindowParam):
      key, value = element
      # Transport the window as part of the value and restore it later.
      return key, windowed_value.WindowedValue(value, timestamp, [window])

    def restore_timestamps(element):
      key, windowed_values = element
      return [wv.with_value((key, wv.value)) for wv in windowed_values]

  ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

  # TODO(BEAM-8104): Use the global window as one of the standard windows.
  # This is to mitigate the Dataflow Java Runner Harness limitation of
  # accepting only standard coders.
  ungrouped._windowing = Windowing(
      window.GlobalWindows(),
      triggerfn=Repeatedly(AfterCount(1)),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
  result = (
      ungrouped
      | GroupByKey()
      | FlatMap(restore_timestamps).with_output_types(Any))
  result._windowing = windowing_saved
  return result

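# A minimal usage sketch for the transform whose expand() is shown above. Its
# body matches a Reshuffle-style PTransform, so the example below applies it
# via beam.Reshuffle() (an assumption about where this method lives): the
# reified timestamps/windows survive the GroupByKey, and the saved windowing
# is restored on the output.
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('k', 1), ('k', 2), ('k', 3)])
      | beam.Reshuffle()  # redistributes elements; timestamps are preserved
      | beam.Map(print))
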
def test_trigger_encoding(self):
  for trigger_fn in (DefaultTrigger(),
                     AfterAll(AfterCount(1), AfterCount(10)),
                     AfterAny(AfterCount(10), AfterCount(100)),
                     AfterWatermark(early=AfterCount(1000)),
                     AfterWatermark(early=AfterCount(1000),
                                    late=AfterCount(1)),
                     Repeatedly(AfterCount(100)),
                     trigger.OrFinally(AfterCount(3), AfterCount(10))):
    context = pipeline_context.PipelineContext()
    self.assertEqual(
        trigger_fn,
        TriggerFn.from_runner_api(trigger_fn.to_runner_api(context), context))

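# For reference, a single iteration of the round trip exercised above, written
# out as a sketch with the same imports the test uses (Repeatedly, AfterCount,
# TriggerFn, pipeline_context):
context = pipeline_context.PipelineContext()
proto = Repeatedly(AfterCount(100)).to_runner_api(context)  # portable proto
restored = TriggerFn.from_runner_api(proto, context)
assert restored == Repeatedly(AfterCount(100))
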
def test_repeatedly_after_first(self):
  self.run_trigger_simple(
      FixedWindows(100),  # pyformat break
      Repeatedly(AfterAny(AfterCount(3), AfterWatermark())),
      AccumulationMode.ACCUMULATING,
      zip(range(7), 'abcdefg'),
      {IntervalWindow(0, 100): [
          set('abc'),
          set('abcdef'),
          set('abcdefg'),
          set('abcdefgx'),
          set('abcdefgxy'),
          set('abcdefgxyz')]},
      1,
      late_data=zip(range(3), 'xyz'))

def test_sessions_after_each(self):
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      AfterEach(AfterCount(2), AfterCount(3)),
      AccumulationMode.ACCUMULATING,
      zip(range(10), 'abcdefghij'),
      {IntervalWindow(0, 11): [set('ab')],
       IntervalWindow(0, 15): [set('abcdef')]},
      2)
  self.run_trigger_simple(
      Sessions(10),  # pyformat break
      Repeatedly(AfterEach(AfterCount(2), AfterCount(3))),
      AccumulationMode.ACCUMULATING,
      zip(range(10), 'abcdefghij'),
      {IntervalWindow(0, 11): [set('ab')],
       IntervalWindow(0, 15): [set('abcdef')],
       IntervalWindow(0, 17): [set('abcdefgh')]},
      2)

def run(argv=None):
  """Build and run the pipeline."""
  args = [
      "--runner=PortableRunner",
      "--job_endpoint=localhost:8099",
      "--streaming"
  ]
  if argv:
    args.extend(argv)

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--count',
      dest='count',
      default=0,
      help='Number of triggers to generate (0 means emit forever).')
  parser.add_argument(
      '--interval_ms',
      dest='interval_ms',
      default=500,
      help='Interval between records per parallel Flink subtask.')
  known_args, pipeline_args = parser.parse_known_args(args)

  pipeline_options = PipelineOptions(pipeline_args)
  p = beam.Pipeline(options=pipeline_options)

  messages = (
      p
      | FlinkStreamingImpulseSource()
        .set_message_count(known_args.count)
        .set_interval_ms(known_args.interval_ms))

  _ = (
      messages
      | 'decode' >> beam.Map(lambda x: ('', 1))
      | 'window' >> beam.WindowInto(
          window.GlobalWindows(),
          trigger=Repeatedly(AfterProcessingTime(5 * 1000)),
          accumulation_mode=AccumulationMode.DISCARDING)
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(count)
      | 'log' >> beam.Map(lambda x: logging.info("%d" % x[1])))

  result = p.run()
  result.wait_until_finish()

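# A minimal sketch of the count helper referenced above (an assumption; its
# original definition is not shown): sums the grouped ones per key, matching
# the 'log' step that prints x[1] as an integer.
def count(kv):
  key, ones = kv
  return key, sum(ones)
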
def test_buffering_timer_in_global_window_streaming(self):
  max_buffering_duration_secs = 42

  start_time = timestamp.Timestamp(0)
  test_stream = TestStream().advance_watermark_to(start_time)
  for i, value in enumerate(GroupIntoBatchesTest._create_test_data()):
    test_stream.add_elements(
        [TimestampedValue(value, start_time + i)]) \
      .advance_processing_time(5)
  test_stream.advance_watermark_to(
      start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1) \
    .advance_watermark_to_infinity()

  with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
    # Set a batch size larger than the total number of elements.
    # Since we're in a global window, we would have been waiting
    # for all the elements to arrive without the buffering time limit.
    batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2

    # To trigger the processing-time timer, use a fake clock whose start time
    # is Timestamp(0). The fake clock never advances during pipeline
    # execution, so the timer is always set to the same value and fires on
    # every element after the first firing.
    fake_clock = FakeClock(now=start_time)

    num_elements_per_batch = (
        pipeline
        | test_stream
        | WindowInto(
            GlobalWindows(),
            trigger=Repeatedly(AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | util.GroupIntoBatches(
            batch_size, max_buffering_duration_secs, fake_clock)
        | 'count elements in batch' >> Map(lambda x: (None, len(x[1])))
        | GroupByKey()
        | FlatMapTuple(lambda k, vs: vs))

    # We will flush twice: when the max buffering duration is reached, and
    # when the global window ends.
    assert_that(num_elements_per_batch, equal_to([9, 1]))

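# A minimal sketch of the FakeClock test helper used above (an assumption;
# the real helper may differ): a callable clock frozen at a fixed timestamp,
# which GroupIntoBatches consults when setting its buffering timer.
class FakeClock(object):
  def __init__(self, now):
    self._now = now

  def __call__(self):
    return self._now
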
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # 128 KiB
   | WithKeys('')
   | WindowInto(
       GlobalWindows(),
       trigger=Repeatedly(
           AfterAny(AfterCount(BATCH_SIZE),
                    AfterProcessingTime(BUFFERING_SECS))),
       accumulation_mode=AccumulationMode.DISCARDING)
   | GroupByKey()
   | Map(
       lambda kv: logging.info(
           'key: %s, value count: %s', kv[0], len(kv[1]))))

  run = p.run()
  run.wait_until_finish()

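# A minimal sketch of the make_large_elements helper referenced above (an
# assumption; its original definition is not shown): emits one ~128 KiB
# string per input element, matching the inline comment.
def make_large_elements(i):
  yield str(i).ljust(128 * 1024)  # pad to roughly 128 KiB
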
def test_repeatedly_unsafe_underlying(self):
  self._test(Repeatedly(AfterCount(42)), 0, DataLossReason.NO_POTENTIAL_LOSS)

def test_repeatedly_condition_underlying(self):
  self._test(Repeatedly(AfterCount(2)), 0, DataLossReason.NO_POTENTIAL_LOSS)

  return [(Store_id, Store_location, Product_id, Product_category,
           sold_unit, buy_rate, sell_price, profit, transaction_date)]


# ---------- Create Pipeline ----------
stream_data = (
    p
    | 'Read from PubSub' >> beam.io.ReadFromPubSub(subscription=inputs_pattern)
    | 'Remove spaces in the data' >> beam.Map(lambda row: row.strip())
    | 'Split data' >> beam.Map(lambda row: row.decode().split(','))
    | 'Calculate profit' >> beam.Map(calculateProfit)
    | 'Apply custom timestamp' >> beam.Map(custom_timestamp)
    | 'Make key-value pairs' >> beam.Map(lambda row: (row[:-2], row[-1]))
    | 'Set fixed window of 30 sec' >> beam.WindowInto(
        window.FixedWindows(30),
        trigger=Repeatedly(AfterAny(AfterCount(5), AfterProcessingTime(10))),
        accumulation_mode=AccumulationMode.DISCARDING)
    | 'Combine result of 30 sec' >> beam.CombinePerKey(sum)
    | 'Format result and append time' >> beam.ParDo(BuildRecordFn())
    | 'Prepare data for BigQuery' >> beam.Map(covert_to_dict)
    # | 'Write to Text' >> beam.io.WriteToText(outputs_prefix)
    | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        table='sales', dataset='beam', project='beam-290211'))

p.run().wait_until_finish()

if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()

def test_after_any_all_safe(self):
  self._test(
      AfterAny(Repeatedly(AfterCount(42)), DefaultTrigger()),
      0,
      DataLossReason.NO_POTENTIAL_LOSS)

def test_after_each_all_safe(self):
  self._test(
      AfterEach(Repeatedly(AfterCount(1)), DefaultTrigger()),
      0,
      DataLossReason.NO_POTENTIAL_LOSS)

def test_after_any_different_reasons(self):
  # AfterProcessingTime may finish (MAY_FINISH) and AfterCount(2) may never
  # be satisfied (CONDITION_NOT_GUARANTEED); AfterAny combines both reasons.
  self._test(
      AfterAny(Repeatedly(AfterCount(2)), AfterProcessingTime()),
      0,
      DataLossReason.MAY_FINISH | DataLossReason.CONDITION_NOT_GUARANTEED)

def test_repeatedly_condition_underlying(self):
  self._test(Repeatedly(AfterCount(2)), 0,
             DataLossReason.CONDITION_NOT_GUARANTEED)

def test_repeatedly_may_finish_underlying(self):
  self._test(Repeatedly(AfterCount(1)), 0, DataLossReason.NO_POTENTIAL_LOSS)

def test_repeatedly_safe_underlying(self):
  self._test(Repeatedly(DefaultTrigger()), 0, DataLossReason.NO_POTENTIAL_LOSS)