def test_fixed_windows(self):
    """Verify which window FixedWindows assigns to individual timestamps."""

    def assigned(fn, ts):
        # Assign a dummy value 'v' stamped at `ts` and return its windows.
        return fn.assign(context('v', ts))

    # Size 5 with offset 2: boundaries fall at 2, 7, 12, 17, ...
    shifted = FixedWindows(size=5, offset=2)
    self.assertEqual([IntervalWindow(7, 12)], assigned(shifted, 7))
    self.assertEqual([IntervalWindow(7, 12)], assigned(shifted, 11))
    self.assertEqual([IntervalWindow(12, 17)], assigned(shifted, 12))

    # Size 5 with the default offset: boundaries fall at 0, 5, 10, 15, ...
    aligned = FixedWindows(size=5)
    self.assertEqual([IntervalWindow(5, 10)], assigned(aligned, 5))
    self.assertEqual([IntervalWindow(5, 10)], assigned(aligned, 9))
    self.assertEqual([IntervalWindow(10, 15)], assigned(aligned, 10))

    # Offset larger than the window size: timestamp 11 is still expected in
    # [7, 12), the same window the offset=2 configuration produces.
    oversized = FixedWindows(size=5, offset=12)
    self.assertEqual([IntervalWindow(7, 12)], assigned(oversized, 11))
def test_equal_to_per_window_passes(self):
    """equal_to_per_window accepts output that lands in the expected window."""
    # The expected window is 20 seconds wide and starts 5 seconds before
    # MIN_TIMESTAMP (converted from microseconds to whole seconds).
    window_start = int(MIN_TIMESTAMP.micros // 1e6) - 5
    window_end = window_start + 20
    expected = {
        window.IntervalWindow(window_start, window_end): [('k', [1])],
    }

    with TestPipeline(options=StandardOptions(streaming=True)) as p:
        windowed = p | Create([1]) | beam.WindowInto(
            FixedWindows(20),
            trigger=trigger.AfterWatermark(),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        grouped = (
            windowed
            | beam.Map(lambda x: ('k', x))
            | beam.GroupByKey())
        assert_that(
            grouped, equal_to_per_window(expected), reify_windows=True)
def test_repeatedly_after_first(self):
    """Run Repeatedly(AfterAny(AfterCount(3), AfterWatermark())), accumulating."""
    trigger_fn = Repeatedly(AfterAny(AfterCount(3), AfterWatermark()))
    on_time_input = zip(range(7), 'abcdefg')
    late_input = zip(range(3), 'xyz')

    # Accumulating mode: each expected pane contains everything seen so far,
    # first for the on-time elements and then for the late 'x', 'y', 'z'.
    expected_panes = {
        IntervalWindow(0, 100): [
            set('abc'),
            set('abcdef'),
            set('abcdefg'),
            set('abcdefgx'),
            set('abcdefgxy'),
            set('abcdefgxyz'),
        ]
    }

    self.run_trigger_simple(
        FixedWindows(100),
        trigger_fn,
        AccumulationMode.ACCUMULATING,
        on_time_input,
        expected_panes,
        1,
        late_data=late_input)
def _pipeline_runner():
    """Build and run a DirectRunner pipeline fed by a batched TestStream.

    Elements 0..size-1 are added in batches of up to 100, each paired with a
    random integer in [0, 1000]; the watermark advances by 100 after every
    batch. The stream is windowed into 100-unit fixed windows and fanned out
    into NUM_PARALLEL_STAGES chains built by _build_serial_stages.
    """
    with beam.Pipeline(runner=DirectRunner()) as p:
        stream = TestStream().advance_watermark_to(0)
        remaining = iter(range(size))
        watermark = 0

        # iter(callable, sentinel) keeps pulling batches until an empty list
        # comes back, i.e. until the element iterator is exhausted.
        def take_batch():
            return list(itertools.islice(remaining, 100))

        for batch in iter(take_batch, []):
            stream = stream.add_elements(
                [(i, random.randint(0, 1000)) for i in batch])
            watermark += 100
            stream = stream.advance_watermark_to(watermark)
        stream = stream.advance_watermark_to_infinity()

        windowed_input = p | stream | WindowInto(FixedWindows(100))
        for stage_index in range(NUM_PARALLEL_STAGES):
            _build_serial_stages(
                windowed_input, NUM_SERIAL_STAGES, stage_index)
def test_buffering_timer_in_fixed_window_streaming(self):
    """GroupIntoBatches flushes on batch size, buffering timeout and window end."""
    window_duration = 6
    max_buffering_duration_secs = 100
    start_time = timestamp.Timestamp(0)

    # One element per second, starting at start_time.
    timestamped_input = [
        TimestampedValue(value, start_time + offset)
        for offset, value in enumerate(
            GroupIntoBatchesTest._create_test_data())
    ]
    test_stream = TestStream().add_elements(timestamped_input)
    test_stream = test_stream.advance_processing_time(150)
    test_stream = test_stream.advance_watermark_to(
        start_time + window_duration)
    test_stream = test_stream.advance_watermark_to(
        start_time + window_duration + 1)
    test_stream = test_stream.advance_watermark_to_infinity()

    expected_batch_sizes = [
        # Window duration is 6 and batch size is 5, so the first batch is
        # flushed because the batch size was reached.
        5,
        # Only one element is left in that window; it is flushed because
        # the max buffering duration was reached.
        1,
        # The collection has 10 elements, so 4 remain; they are flushed
        # because the end of the window was reached.
        4,
    ]

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
        # To trigger the processing-time timer, use a fake clock whose start
        # time is Timestamp(0).
        fake_clock = FakeClock(now=start_time)
        batches = (
            pipeline
            | test_stream
            | "fixed window" >> WindowInto(FixedWindows(window_duration))
            | util.GroupIntoBatches(
                GroupIntoBatchesTest.BATCH_SIZE,
                max_buffering_duration_secs,
                fake_clock))
        num_elements_per_batch = (
            batches
            | "count elements in batch" >> Map(lambda x: (None, len(x[1])))
            | "global window" >> WindowInto(GlobalWindows())
            | GroupByKey()
            | FlatMapTuple(lambda k, vs: vs))

        assert_that(
            num_elements_per_batch,
            equal_to(expected_batch_sizes),
            "assert2")
def run(argv=None):
    """Stream face records from Pub/Sub into a windowed BigQuery table.

    Messages are read from a fixed Pub/Sub topic, decoded and parsed,
    filtered and expanded by the project helpers, stamped with each record's
    own 'ts_seconds', windowed into 30-second fixed windows (early and late
    firings, 60 seconds of allowed lateness, discarding panes), grouped by
    'emotion' and appended to BigQuery.

    Args:
        argv: Not used by this function; options come from
            get_pipeline_options().
    """
    from apache_beam.transforms.window import TimestampedValue, FixedWindows

    pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'
    window_size_s = 30
    allowed_lateness_s = 60

    # Leaving the `with` block runs the pipeline and waits for it to finish,
    # so there must be no explicit pipeline.run() here: the original called it
    # as well, which executed the whole pipeline a second time.
    with beam.Pipeline(options=get_pipeline_options()) as pipeline:
        logging.info("pubsub_input_topic = {}".format(pubsub_input_topic))

        json_messages = (
            pipeline
            | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(
                topic=pubsub_input_topic).with_output_types(bytes)
            | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message))

        high_confidence_faces_grouped_by_emotion_count_per_window = (
            json_messages
            | 'ParseJsonMessage' >> beam.Map(parse_jsons)
            | 'FilterHighFaceConfidence' >> beam.ParDo(
                FilterHighConfidenceFacesDoFn())
            | 'FlatMapFAcesWithHighEmotionLikelihood' >> beam.FlatMap(
                get_faces_with_high_emotion_likelihood)
            # Use the record's own timestamp as event time.
            | 'UseCustomTimestamp' >> beam.Map(
                lambda face_info: TimestampedValue(
                    face_info, face_info['ts_seconds']))
            | 'WindowFaceInfo' >> beam.WindowInto(
                FixedWindows(window_size_s, 0),
                trigger=AfterWatermark(
                    early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                    late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
                allowed_lateness=allowed_lateness_s,
                accumulation_mode=AccumulationMode.DISCARDING)
            | 'PairEmotionWithFace' >> beam.Map(
                lambda face_info: (face_info['emotion'], face_info))
            | 'GroupByEmotion' >> beam.GroupByKey()
            | 'FormatOutputForBigQuery' >> beam.ParDo(
                FormatFaceInfoPerWindow()))

        log_p_collection(
            high_confidence_faces_grouped_by_emotion_count_per_window,
            "OutputToBigQuery")

        (high_confidence_faces_grouped_by_emotion_count_per_window
         | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
             bq_faces_windowed_table_name,
             schema={"fields": bq_faces_windowed_table_schema},
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def test_gbk_execution_no_triggers(self):
    """GroupByKey over 15-second fixed windows with no explicit trigger.

    Includes a late element (timestamp 12, added after the watermark reached
    300) that is expected as its own group in window [0, 15).
    """
    stream = TestStream().advance_watermark_to(10)
    stream = stream.add_elements(['a', 'b', 'c'])
    stream = stream.advance_watermark_to(20)
    stream = stream.add_elements(['d']).add_elements(['e'])
    stream = stream.advance_processing_time(10)
    stream = stream.advance_watermark_to(300)
    stream = stream.add_elements([TimestampedValue('late', 12)])
    stream = stream.add_elements([TimestampedValue('last', 310)])
    test_stream = stream.advance_watermark_to_infinity()

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    windowed = p | test_stream | beam.WindowInto(
        FixedWindows(15), allowed_lateness=300)
    records = (
        windowed
        | beam.Map(lambda x: ('k', x))
        | beam.GroupByKey())

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner. The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # Expected groups, keyed by the window they are emitted in.
    expected_window_to_elements = {
        window.IntervalWindow(0, 15): [
            ('k', ['a', 'b', 'c']),
            ('k', ['late']),
        ],
        window.IntervalWindow(15, 30): [
            ('k', ['d', 'e']),
        ],
        window.IntervalWindow(300, 315): [
            ('k', ['last']),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        label='assert per window')

    p.run()
def test_gbk_execution_no_triggers(self):
    """GroupByKey over 15-second fixed windows, checked via a global list.

    Fired groups are recorded in the module-level `result` list and checked
    after the run, in addition to the assert_that on the output.
    """
    stream = TestStream().advance_watermark_to(10)
    stream = stream.add_elements(['a', 'b', 'c'])
    stream = stream.advance_watermark_to(20)
    stream = stream.add_elements(['d']).add_elements(['e'])
    stream = stream.advance_processing_time(10)
    stream = stream.advance_watermark_to(300)
    stream = stream.add_elements([TimestampedValue('late', 12)])
    test_stream = stream.add_elements([TimestampedValue('last', 310)])

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result  # pylint: disable=global-variable-undefined
    result = []

    def fired_elements(elem):
        # Record every group that reaches the end of the pipeline.
        result.append(elem)
        return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    grouped = (
        p
        | test_stream
        | beam.WindowInto(FixedWindows(15))
        | beam.Map(lambda x: ('k', x))
        | beam.GroupByKey())
    records = grouped | beam.Map(fired_elements)

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner. The test below should also verify the
    # timestamps of the outputted elements once this is implemented.
    assert_that(
        records,
        equal_to([
            ('k', ['a', 'b', 'c']),
            ('k', ['d', 'e']),
            ('k', ['late']),
            ('k', ['last']),
        ]))
    p.run()

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    expected_in_firing_order = [
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last']),
    ]
    self.assertEqual(expected_in_firing_order, result)
def test_fixed_windows_simple_watermark(self):
    """GeneralTriggerManagerDoFn over one-second fixed windows, two keys."""

    def tsv(key, value, ts):
        return TimestampedValue((key, value), timestamp=ts)

    stream = TestStream().advance_watermark_to(0)
    stream = stream.add_elements([
        tsv('k1', 1, 0), tsv('k2', 1, 0), tsv('k1', 2, 0), tsv('k2', 2, 0)
    ])
    stream = stream.add_elements([tsv('k1', 3, 0), tsv('k2', 3, 0)])
    stream = stream.add_elements([tsv('k1', 4, 1), tsv('k2', 4, 1)])
    stream = stream.add_elements([tsv('k1', 5, 1), tsv('k2', 5, 1)])
    stream = stream.advance_watermark_to(1)
    # Timestamp 0 added once the watermark is already at 1.
    stream = stream.add_elements([tsv('k1', 6, 0)])
    test_stream = stream.advance_watermark_to_infinity()

    # One-second fixed windows; no trigger is passed to Windowing, and the
    # allowed lateness is MAX_TIMESTAMP.seconds().
    windowing = Windowing(
        FixedWindows(1), allowed_lateness=MAX_TIMESTAMP.seconds())

    expected = [
        ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
        ('k2', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
        ('k1', IntervalWindow(1, 2), [4, 5]),  # On the watermark
        ('k2', IntervalWindow(1, 2), [4, 5]),  # On the watermark
        ('k1', IntervalWindow(0, 1), [6]),  # After the watermark
    ]

    with TestPipeline() as p:
        fired = (
            p
            | test_stream
            | WindowInto(windowing.windowfn)
            | ParDo(trigger_manager._ReifyWindows())
            | ParDo(trigger_manager._GroupBundlesByKey())
            | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing)))
        # Flatten each firing to (key, window of first value, plain values).
        result = fired | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]]))
        assert_that(result, equal_to(expected))
def test_top(self):
    """combine.Top globally, windowed without defaults, and per key."""
    values = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    with TestPipeline() as pipeline:
        timestamp = 0

        # Global combines first.
        pcoll = pipeline | 'start' >> Create(values)
        result_top = pcoll | 'top' >> combine.Top.Largest(5)
        result_bot = pcoll | 'bot' >> combine.Top.Smallest(4)
        assert_that(
            result_top, equal_to([[9, 6, 6, 5, 3]]), label='assert:top')
        assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')

        # Same combines on a fixed-window collection, without defaults.
        timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
        windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))
        result_windowed_top = (
            windowed
            | 'top-wo-defaults' >> combine.Top.Largest(5, has_defaults=False))
        result_windowed_bot = (
            windowed
            | 'bot-wo-defaults' >> combine.Top.Smallest(
                4, has_defaults=False))
        assert_that(
            result_windowed_top,
            equal_to([[9, 6, 6, 5, 3]]),
            label='assert:top-wo-defaults')
        assert_that(
            result_windowed_bot,
            equal_to([[0, 1, 1, 1]]),
            label='assert:bot-wo-defaults')

        # Per-key combines, with every value under key 'a'.
        pcoll = pipeline | 'start-perkey' >> Create(
            [('a', x) for x in values])
        result_key_top = pcoll | 'top-perkey' >> combine.Top.LargestPerKey(5)
        result_key_bot = pcoll | 'bot-perkey' >> combine.Top.SmallestPerKey(4)
        assert_that(
            result_key_top,
            equal_to([('a', [9, 6, 6, 5, 3])]),
            label='key:top')
        assert_that(
            result_key_bot, equal_to([('a', [0, 1, 1, 1])]), label='key:bot')
def test_builtin_combines(self):
    """Mean and Count combiners: global, windowed without defaults, per key."""
    with TestPipeline() as pipeline:
        vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
        size = len(vals)
        mean = sum(vals) / float(size)
        timestamp = 0

        # Global combines first.
        pcoll = pipeline | 'start' >> Create(vals)
        result_mean = pcoll | 'mean' >> combine.Mean.Globally()
        result_count = pcoll | 'count' >> combine.Count.Globally()
        assert_that(result_mean, equal_to([mean]), label='assert:mean')
        assert_that(result_count, equal_to([size]), label='assert:size')

        # Same combines on a fixed-window collection, without defaults.
        timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
        windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))
        windowed_mean_fn = combine.Mean.Globally().without_defaults()
        result_windowed_mean = windowed | 'mean-wo-defaults' >> windowed_mean_fn
        assert_that(
            result_windowed_mean,
            equal_to([mean]),
            label='assert:mean-wo-defaults')
        windowed_count_fn = combine.Count.Globally().without_defaults()
        result_windowed_count = (
            windowed | 'count-wo-defaults' >> windowed_count_fn)
        assert_that(
            result_windowed_count,
            equal_to([size]),
            label='assert:count-wo-defaults')

        # Per-key combines, with every value under key 'a'.
        pcoll = pipeline | 'start-perkey' >> Create([('a', x) for x in vals])
        result_key_mean = pcoll | 'mean-perkey' >> combine.Mean.PerKey()
        result_key_count = pcoll | 'count-perkey' >> combine.Count.PerKey()
        assert_that(
            result_key_mean, equal_to([('a', mean)]), label='key:mean')
        assert_that(
            result_key_count, equal_to([('a', size)]), label='key:size')
def run(argv=None):
    """Sum values per key from a Pub/Sub topic and append them to BigQuery.

    Messages are read from the topic given by --input, decoded as UTF-8,
    parsed by ParsePubSubMessageFn, placed in 30-second fixed windows, summed
    per key, and written to the table given by --output as rows with the
    fields 'id' and 'total'.

    Args:
        argv: Command-line arguments; anything argparse does not recognise is
            passed on to PipelineOptions.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input', dest='input', help='PubSub topic to read from.')
    parser.add_argument(
        '--output',
        dest='output',
        help=
        'BigQuery output dataset and table name in the format dataset.tablename'
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # save_main_session is needed because one or more DoFns in this workflow
    # rely on global context (e.g. a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Read, decode and parse the Pub/Sub messages.
        raw_messages = p | 'ReadFromPubSub' >> ReadFromPubSub(
            topic=known_args.input).with_output_types(bytes)
        lines = (
            raw_messages
            | 'DecodeMessage' >> beam.Map(lambda x: x.decode('utf-8'))
            | 'ParseMessage' >> beam.ParDo(ParsePubSubMessageFn()))

        # Sum the values of each key within 30-second fixed windows.
        windows = (
            lines
            | 'WindowInto' >> beam.WindowInto(FixedWindows(30, 0))
            | 'SumValues' >> beam.CombinePerKey(sum))

        # Format rows and write to BigQuery.
        rows = windows | 'ConvertToDictionary' >> beam.Map(
            lambda row: {
                'id': row[0], 'total': row[1]
            })
        rows | 'WriteToBigQuery' >> WriteToBigQuery(
            known_args.output,
            schema='id:INTEGER, total:INTEGER',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
def test_after_count(self):
    """AfterCount(3) in discarding mode over 10-second fixed windows.

    Changes from the original:
      * The pipeline is built inside `with TestPipeline()`, so it actually
        runs; before, it was never run and the assertion never executed.
      * Tuple-parameter lambdas (`lambda (k, t): ...`) are a syntax error on
        Python 3 and were replaced with index access.
      * `dict.iteritems()` does not exist on Python 3; `items()` works on
        both Python 2 and 3.
    """
    with TestPipeline() as p:
        result = (
            p
            | beam.Create([1, 2, 3, 4, 5, 10, 11])
            | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
            # Use each pair's numeric value as its event timestamp.
            | beam.Map(lambda k_t: TimestampedValue((k_t[0], k_t[1]), k_t[1]))
            | beam.WindowInto(
                FixedWindows(10),
                trigger=AfterCount(3),
                accumulation_mode=AccumulationMode.DISCARDING)
            | beam.GroupByKey()
            # Re-key each group as '<key>-<group size>' with a set of values.
            | beam.Map(
                lambda k_v: ('%s-%s' % (k_v[0], len(k_v[1])), set(k_v[1]))))
        assert_that(
            result,
            equal_to({
                'A-5': {1, 2, 3, 4, 5},
                # A-10, A-11 never emitted due to AfterCount(3) never firing.
                'B-4': {6, 7, 8, 9},
                'B-3': {10, 15, 16},
            }.items()))
def test_gbk_execution_after_watermark_trigger(self):
    """AfterWatermark with an early AfterCount(1) firing, discarding mode.

    Each window is expected to produce the early pane holding its element
    and a second, empty pane.
    """
    stream = TestStream().advance_watermark_to(10)
    stream = stream.add_elements([TimestampedValue('a', 11)])
    stream = stream.advance_watermark_to(20)
    stream = stream.add_elements([TimestampedValue('b', 21)])
    test_stream = stream.advance_watermark_to_infinity()

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    windowed = p | test_stream | beam.WindowInto(
        FixedWindows(15),
        trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
    records = (
        windowed
        | beam.Map(lambda x: ('k', x))
        | beam.GroupByKey())

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner. The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # Expected panes, keyed by the window they are emitted in.
    expected_window_to_elements = {
        window.IntervalWindow(0, 15): [('k', ['a']), ('k', [])],
        window.IntervalWindow(15, 30): [('k', ['b']), ('k', [])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        label='assert per window')

    p.run()
def test_with_trigger_window_that_finish(self):
    """With zero allowed lateness, data added after the window ends is dropped."""

    def tsv(key, value, ts):
        return TimestampedValue((key, value), timestamp=ts)

    stream = TestStream().advance_watermark_to(0)
    stream = stream.add_elements([tsv('k1', 1, 0), tsv('k1', 2, 0)])
    stream = stream.add_elements([tsv('k1', 3, 0)])
    stream = stream.advance_watermark_to(2)
    # This value is discarded.
    stream = stream.add_elements([tsv('k1', 6, 0)])
    test_stream = stream.advance_watermark_to_infinity()

    # One-second fixed windows, AfterWatermark trigger, no allowed lateness,
    # discarding panes.
    windowing = Windowing(
        FixedWindows(1),
        triggerfn=AfterWatermark(),
        allowed_lateness=0,
        accumulation_mode=AccumulationMode.DISCARDING)

    expected = [
        ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
    ]

    with TestPipeline() as p:
        fired = (
            p
            | test_stream
            | WindowInto(windowing.windowfn)
            | ParDo(trigger_manager._ReifyWindows())
            | ParDo(trigger_manager._GroupBundlesByKey())
            | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing)))
        # Flatten each firing to (key, window of first value, plain values).
        result = fired | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]]))
        assert_that(result, equal_to(expected))
def run_pipeline_with_micro_batches(inference_type, pubsub_topic, runner, args=None):
    """Run a streaming pipeline that scores Pub/Sub messages in micro-batches.

    Messages are read from `pubsub_topic`, placed in WINDOW_SIZE-second fixed
    windows, passed to `estimate`, and written to the BigQuery table given by
    the module-level PROJECT, DATASET and TABLE.

    Args:
        inference_type: Passed through to `estimate`; also used in the step
            label.
        pubsub_topic: Pub/Sub topic to read strings from.
        runner: Runner handed to beam.Pipeline.
        args: Optional mapping of pipeline option keyword arguments. May be
            None, which is treated as no extra options.
    """
    # `args` defaults to None and `**None` raises TypeError, so fall back to
    # an empty mapping when the caller passes nothing.
    options = beam.pipeline.PipelineOptions(flags=[], **(args or {}))

    pipeline = beam.Pipeline(runner, options=options)

    (
        pipeline
        | 'Read from PubSub' >> beam.io.ReadStringsFromPubSub(
            topic=pubsub_topic)
        | 'Micro-batch - Window Size: {} Seconds'.format(WINDOW_SIZE) >>
        beam.WindowInto(FixedWindows(size=WINDOW_SIZE))
        | 'Estimate Targets - {}'.format(inference_type) >> beam.FlatMap(
            lambda messages: estimate(messages, inference_type))
        | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
            project=PROJECT,
            dataset=DATASET,
            table=TABLE,
            schema=schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))

    pipeline.run()
def test_to_set(self):
    """combine.ToSet yields one set of distinct elements, global and windowed.

    Changes from the original:
      * The pipeline is built inside `with TestPipeline()`, so it actually
        runs; before, it was never run and neither assert_that executed.
      * The custom matcher indexed its expected set (`expected[0]`), which
        raises TypeError, so the output is compared with equal_to against a
        one-element list holding the expected set.
    """
    with TestPipeline() as pipeline:
        the_list = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
        timestamp = 0
        pcoll = pipeline | 'start' >> Create(the_list)
        result = pcoll | 'to set' >> combine.ToSet()

        # Same combine on a fixed-window collection, without defaults.
        timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
        windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))
        result_windowed = (
            windowed
            | 'to set wo defaults' >> combine.ToSet().without_defaults())

        assert_that(result, equal_to([set(the_list)]))
        assert_that(
            result_windowed,
            equal_to([set(the_list)]),
            label='to-set-wo-defaults')
def test_model_setting_trigger(self):
    """Count words in a one-minute window fired by a processing-time trigger."""
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    window_secs = 1 * 60
    trigger_delay_secs = 10 * 60

    with TestPipeline(options=pipeline_options) as p:
        stream = TestStream().advance_watermark_to(10)
        stream = stream.add_elements(['a', 'a', 'a', 'b', 'b'])
        stream = stream.advance_watermark_to(70)
        test_stream = stream.advance_processing_time(600)

        pcollection = (
            p
            | test_stream
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

        windowed = pcollection | WindowInto(
            FixedWindows(window_secs),
            trigger=AfterProcessingTime(trigger_delay_secs),
            accumulation_mode=AccumulationMode.DISCARDING)
        counts = (
            windowed
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(
                lambda word_ones: (word_ones[0], sum(word_ones[1]))))

        assert_that(counts, equal_to([('a', 3), ('b', 2)]))
def test_to_list_and_to_dict2(self):
    """combine.ToDict yields a single dict of the pairs, global and windowed."""
    with TestPipeline() as pipeline:
        pairs = [(1, 2), (3, 4), (5, 6)]
        timestamp = 0
        pcoll = pipeline | 'start-pairs' >> Create(pairs)
        result = pcoll | 'to dict' >> combine.ToDict()

        # Same combine on a fixed-window collection, without defaults.
        timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
        windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))
        windowed_to_dict = combine.ToDict().without_defaults()
        result_windowed = windowed | 'to dict wo defaults' >> windowed_to_dict

        def matcher():
            def match(actual):
                # Exactly one output element, whose items are the pairs.
                equal_to([1])([len(actual)])
                equal_to(pairs)(actual[0].items())

            return match

        assert_that(result, matcher())
        assert_that(result_windowed, matcher(), label='to-dict-wo-defaults')
def test_after_count(self):
    """AfterCount(3) in discarding mode over 10-second fixed windows.

    The expected pairs are taken with `dict.items()` rather than
    `dict.iteritems()`, which does not exist on Python 3; `items()` works on
    both Python 2 and 3.
    """
    with TestPipeline() as p:

        def construct_timestamped(k_t):
            # Use each pair's numeric value as its event timestamp.
            return TimestampedValue((k_t[0], k_t[1]), k_t[1])

        def format_result(k_v):
            # Re-key each group as '<key>-<group size>' with a set of values.
            return ('%s-%s' % (k_v[0], len(k_v[1])), set(k_v[1]))

        result = (
            p
            | beam.Create([1, 2, 3, 4, 5, 10, 11])
            | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
            | beam.Map(construct_timestamped)
            | beam.WindowInto(
                FixedWindows(10),
                trigger=AfterCount(3),
                accumulation_mode=AccumulationMode.DISCARDING)
            | beam.GroupByKey()
            | beam.Map(format_result))
        assert_that(
            result,
            equal_to({
                'A-5': {1, 2, 3, 4, 5},
                # A-10, A-11 never emitted due to AfterCount(3) never firing.
                'B-4': {6, 7, 8, 9},
                'B-3': {10, 15, 16},
            }.items()))
def test_gbk_execution_after_watermark_trigger(self):
    """AfterWatermark with an early AfterCount(1) firing, checked via a global.

    Fired groups are recorded in the module-level `result` list and compared
    after the run.
    """
    stream = TestStream().advance_watermark_to(10)
    stream = stream.add_elements(['a'])
    test_stream = stream.advance_watermark_to(20)

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result  # pylint: disable=global-variable-undefined
    result = []

    def fired_elements(elem):
        # Record every group that reaches the end of the pipeline.
        result.append(elem)
        return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    windowed = p | test_stream | beam.WindowInto(
        FixedWindows(15),
        trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
    records = (  # pylint: disable=unused-variable
        windowed
        | beam.Map(lambda x: ('k', x))
        | beam.GroupByKey()
        | beam.Map(fired_elements))

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner. The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # TODO(BEAM-3377): Reinstate after assert_that in streaming is fixed.
    # assert_that(records, equal_to([
    #     ('k', ['a']), ('k', [])]))

    p.run()

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    expected_firings = [('k', ['a']), ('k', [])]
    self.assertEqual(expected_firings, result)
def test_global_sample(self):
    """Sample.FixedSizeGlobally(3) returns one valid 3-element sample."""
    acceptable_samples = [[1, 1, 2], [1, 2, 2]]

    def is_good_sample(actual):
        # Exactly one output, which must be a 3-element sample of [1, 1, 2, 2].
        assert len(actual) == 1
        assert sorted(actual[0]) in acceptable_samples, actual

    with TestPipeline() as pipeline:
        timestamp = 0
        pcoll = pipeline | 'start' >> Create([1, 1, 2, 2])

        # Fixed-window copy of the input for the without-defaults variant.
        timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
        windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))

        # Apply the sampler nine times, each under its own labels.
        for ix in range(9):
            global_sample = (
                pcoll
                | 'sample-%d' % ix >> combine.Sample.FixedSizeGlobally(3))
            assert_that(
                global_sample, is_good_sample, label='check-%d' % ix)

            result_windowed = (
                windowed
                | 'sample-wo-defaults-%d' % ix >>
                combine.Sample.FixedSizeGlobally(3).without_defaults())
            assert_that(
                result_windowed,
                is_good_sample,
                label='check-wo-defaults-%d' % ix)
def test_streaming_different_file_types(self):
    """WriteToFiles picks a sink per destination across four 10s windows.

    Records are routed by their 'foundation' field; the 'apache' destination
    uses the CSV sink and every other destination the JSON sink. A second
    pipeline checks that one file per window exists for each of the two
    destinations.
    """
    out_dir = self._new_tempdir()
    records = iter(WriteFilesTest.SIMPLE_COLLECTION)

    # Two records per 10-second step: watermarks 0, 10, 20 and 30.
    ts = TestStream()
    for watermark in (0, 10, 20, 30):
        ts = ts.advance_watermark_to(watermark)
        ts = ts.add_elements([next(records), next(records)])
    ts = ts.advance_watermark_to(40).advance_watermark_to_infinity()

    def no_colon_file_naming(*args):
        # Same as the default destination-prefix naming, with ':' replaced.
        file_name = fileio.destination_prefix_naming()(*args)
        return file_name.replace(':', '_')

    def window_file_matchers(prefix):
        # One regexp matcher per 10-second window from 00:00:00 to 00:00:40.
        return [
            stringmatches.matches_regexp(
                '.*%s-1970-01-01T00_00_%02d-1970-01-01T00_00_%02d--.*' %
                (prefix, start, start + 10))
            for start in (0, 10, 20, 30)
        ]

    with TestPipeline() as p:
        _ = (
            p
            | ts
            | beam.WindowInto(FixedWindows(10))
            | beam.io.fileio.WriteToFiles(
                path=out_dir,
                destination=lambda record: record['foundation'],
                sink=lambda dest: (
                    WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                    if dest == 'apache' else WriteFilesTest.JsonSink()),
                file_naming=no_colon_file_naming,
                max_writers_per_bundle=0,
            ))

    with TestPipeline() as p:
        cncf_files = (
            p
            | fileio.MatchFiles(FileSystems.join(out_dir, 'cncf*'))
            | "CncfFileNames" >> beam.Map(lambda fm: fm.path))

        apache_files = (
            p
            | "MatchApache" >> fileio.MatchFiles(
                FileSystems.join(out_dir, 'apache*'))
            | "ApacheFileNames" >> beam.Map(lambda fm: fm.path))

        assert_that(
            cncf_files,
            matches_all(window_file_matchers('cncf')),
            label='verifyCNCFFiles')

        assert_that(
            apache_files,
            matches_all(window_file_matchers('apache')),
            label='verifyApacheFiles')
def test_streaming_complex_timing(self):
    """Write a keyed streaming collection to files, then verify the files.

    The first pipeline groups the stream in 10-second fixed windows and
    writes the values with WriteToFiles, recording each written path through
    record_dofn. The second pipeline matches the written files and compares
    their names and contents with what was recorded and with the input.
    """
    # Use state on the TestCase class, since other references would be
    # pickled into a closure and not have the desired side effects.
    #
    # TODO(BEAM-5295): Use assert_that after it works for the cases here in
    # streaming mode.
    WriteFilesTest.all_records = []

    out_dir = '%s%s' % (self._new_tempdir(), os.sep)

    # Input: one keyed element per entry; at every non-zero multiple of 5 the
    # processing time advances by 5 and the watermark moves to the timestamp.
    ts = TestStream().advance_watermark_to(0)
    for elm in WriteFilesTest.LARGER_COLLECTION:
        timestamp = int(elm)
        ts.add_elements([('key', '%s' % elm)])
        if timestamp != 0 and timestamp % 5 == 0:
            # TODO(BEAM-3759): Add many firings per window after getting
            # PaneInfo.
            ts.advance_processing_time(5)
            ts.advance_watermark_to(timestamp)
    ts.advance_watermark_to_infinity()

    def no_colon_file_naming(*args):
        # Same as the default destination-prefix naming, with ':' replaced.
        file_name = fileio.destination_prefix_naming()(*args)
        return file_name.replace(':', '_')

    # The pipeline under test.
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
        # Fire on the watermark and discard already-emitted elements.
        windowed = p | ts | beam.WindowInto(
            FixedWindows(10),
            trigger=trigger.AfterWatermark(),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        res = (
            windowed
            | beam.GroupByKey()
            | beam.FlatMap(lambda x: x[1]))

        _ = (
            res
            | beam.io.fileio.WriteToFiles(
                path=out_dir,
                file_naming=no_colon_file_naming,
                max_writers_per_bundle=0)
            | beam.Map(lambda fr: FileSystems.join(out_dir, fr.file_name))
            | beam.ParDo(self.record_dofn()))

    # Verification pipeline.
    with TestPipeline() as p:
        files = p | beam.io.fileio.MatchFiles(FileSystems.join(out_dir, '*'))

        file_names = files | beam.Map(lambda fm: fm.path)

        file_contents = (
            files
            | beam.io.fileio.ReadMatches()
            | beam.Map(
                lambda rf:
                (rf.metadata.path, rf.read_utf8().strip().split('\n'))))

        content = file_contents | beam.FlatMap(
            lambda fc: [ln.strip() for ln in fc[1]])

        assert_that(
            file_names,
            equal_to(WriteFilesTest.all_records),
            label='AssertFilesMatch')
        assert_that(
            content,
            matches_all(WriteFilesTest.LARGER_COLLECTION),
            label='AssertContentsMatch')
def test_multiple_outputs_with_watermark_advancement(self):
    """Check that TestStream advances the watermark separately per output tag."""
    # The 'numbers' watermark is deliberately moved to 20 before the
    # 'letters' watermark is moved to 5, to show that watermark advancement
    # is per PCollection.
    #
    # Two PCollections are created, (a, b, c) and (1, 2, 3), emitted at
    # different times so that they land in different windows. If the
    # watermark did not advance, the windows would be [-inf, -inf); if the
    # watermarks did not advance separately, both PCollections would be
    # windowed in [15, 30).
    letters_elements = [
        TimestampedValue(letter, ts)
        for letter, ts in (('a', 6), ('b', 7), ('c', 8))
    ]
    numbers_elements = [
        TimestampedValue(number, ts)
        for number, ts in (('1', 21), ('2', 22), ('3', 23))
    ]

    stream = TestStream()
    stream = stream.advance_watermark_to(0, tag='letters')
    stream = stream.advance_watermark_to(0, tag='numbers')
    stream = stream.advance_watermark_to(20, tag='numbers')
    stream = stream.advance_watermark_to(5, tag='letters')
    stream = stream.add_elements(letters_elements, tag='letters')
    stream = stream.advance_watermark_to(10, tag='letters')
    stream = stream.add_elements(numbers_elements, tag='numbers')
    test_stream = stream.advance_watermark_to(30, tag='numbers')

    options = StandardOptions(streaming=True)
    p = TestPipeline(is_integration_test=True, options=options)

    main = p | test_stream

    # An AfterWatermark trigger with an early firing is used to check that
    # the watermark advances properly and that each element is emitted in
    # the correct window.
    letters = (
        main['letters']
        | 'letter windows' >> beam.WindowInto(
            FixedWindows(15),
            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | 'letter with key' >> beam.Map(lambda x: ('k', x))
        | 'letter gbk' >> beam.GroupByKey())

    numbers = (
        main['numbers']
        | 'number windows' >> beam.WindowInto(
            FixedWindows(15),
            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | 'number with key' >> beam.Map(lambda x: ('k', x))
        | 'number gbk' >> beam.GroupByKey())

    # The letters were emitted when the watermark was at 5, so they are
    # expected in the [0, 15) window. The early trigger is there to make sure
    # the ON_TIME empty pane is also emitted with a TestStream: the early
    # firing emits the elements before the end of the window, and discarding
    # mode drops them once fired, leaving the on-time pane empty.
    expected_letters = {
        window.IntervalWindow(0, 15): [
            ('k', ['a', 'b', 'c']),
            ('k', []),
        ],
    }

    # Same for the numbers, which were emitted at watermark 20 and therefore
    # belong to the [15, 30) window.
    expected_numbers = {
        window.IntervalWindow(15, 30): [
            ('k', ['1', '2', '3']),
            ('k', []),
        ],
    }
    assert_that(
        letters,
        equal_to_per_window(expected_letters),
        label='letters assert per window')
    assert_that(
        numbers,
        equal_to_per_window(expected_numbers),
        label='numbers assert per window')

    p.run()
def test_fixed_windows(self):
    """Table-driven check of the window FixedWindows assigns to a timestamp."""
    cases = [
        # (FixedWindows kwargs, timestamp, expected (start, end))
        # Windows with offset: 2, 7, 12, 17, ...
        ({'size': 5, 'offset': 2}, 7, (7, 12)),
        ({'size': 5, 'offset': 2}, 11, (7, 12)),
        ({'size': 5, 'offset': 2}, 12, (12, 17)),
        # Windows without offset: 0, 5, 10, 15, ...
        ({'size': 5}, 5, (5, 10)),
        ({'size': 5}, 9, (5, 10)),
        ({'size': 5}, 10, (10, 15)),
        # Windows with offset out of range.
        ({'size': 5, 'offset': 12}, 11, (7, 12)),
    ]
    for kwargs, ts, (start, end) in cases:
        windowfn = FixedWindows(**kwargs)
        self.assertEqual(
            [IntervalWindow(start, end)],
            windowfn.assign(context('v', ts)))
def run_pipeline_with_micro_batches(inference_type, project, pubsub_topic, pubsub_subscription, bq_dataset, bq_table, window_size, runner, args=None):
    """Run a streaming pipeline that scores Pub/Sub messages in micro-batches.

    The Pub/Sub source and BigQuery sink are prepared first. Messages are
    then read from the subscription, placed in `window_size`-second fixed
    windows, passed to `estimate`, and written to BigQuery.

    Args:
        inference_type: Passed through to `estimate`; also used in the step
            label.
        project: Project used for the subscription path and the BigQuery
            table.
        pubsub_topic: Topic name handed to prepare_steaming_source.
        pubsub_subscription: Subscription name to read from.
        bq_dataset: BigQuery dataset to write to.
        bq_table: BigQuery table to write to.
        window_size: Fixed window size, in seconds.
        runner: Runner handed to beam.Pipeline.
        args: Optional mapping of pipeline option keyword arguments. May be
            None, which is treated as no extra options.
    """
    prepare_steaming_source(project, pubsub_topic, pubsub_subscription)
    prepare_steaming_sink(project, bq_dataset, bq_table)

    pubsub_subscription_url = "projects/{}/subscriptions/{}".format(
        project, pubsub_subscription)

    # `args` defaults to None and `**None` raises TypeError, so fall back to
    # an empty mapping when the caller passes nothing.
    options = beam.pipeline.PipelineOptions(flags=[], **(args or {}))

    pipeline = beam.Pipeline(runner, options=options)

    (
        pipeline
        | 'Read from PubSub' >> beam.io.ReadStringsFromPubSub(
            subscription=pubsub_subscription_url, id_label="source_id")
        | 'Micro-batch - Window Size: {} Seconds'.format(window_size) >>
        beam.WindowInto(FixedWindows(size=window_size))
        | 'Estimate Targets - {}'.format(inference_type) >> beam.FlatMap(
            lambda messages: estimate(messages, inference_type))
        | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
            project=project, dataset=bq_dataset, table=bq_table))

    pipeline.run()
def expand(self, input):
    """Window `input` into fixed windows of self.duration and sum scores per team."""
    windowed = input | WindowInto(FixedWindows(self.duration))
    return windowed | ExtractAndSumScore("team")
def expand(self, pcoll):
    """Assign each element of `pcoll` to a fixed window of self.window_size."""
    # Window info is bound to each element from its timestamp (the element
    # timestamp, or publish time).
    fixed_windows = beam.WindowInto(FixedWindows(self.window_size))
    return pcoll | 'window' >> fixed_windows
def test_windowing(self):
    """ReverseTestStream output for a grouped stream in 5-second windows."""
    stream = TestStream().advance_watermark_to(0)
    stream = stream.add_elements(['a', 'b', 'c'])
    # Five one-second processing-time steps before the watermark moves.
    for _ in range(5):
        stream = stream.advance_processing_time(1)
    stream = stream.advance_watermark_to(5)
    stream = stream.add_elements(['1', '2', '3'])
    stream = stream.advance_processing_time(1)
    # Then move the watermark one second at a time, from 6 up to 15, each
    # followed by a one-second processing-time step.
    for watermark in range(6, 16):
        stream = stream.advance_watermark_to(watermark)
        stream = stream.advance_processing_time(1)
    test_stream = stream

    options = StandardOptions(streaming=True)
    p = TestPipeline(options=options)

    grouped = (
        p
        | test_stream
        | 'letter windows' >> beam.WindowInto(
            FixedWindows(5),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | 'letter with key' >> beam.Map(lambda x: ('k', x))
        | 'letter gbk' >> beam.GroupByKey())
    records = grouped | ReverseTestStream(
        sample_resolution_sec=1, output_tag=None)

    def tick(watermark_secs):
        # A one-second processing-time step paired with a watermark event,
        # with the watermark given in whole seconds.
        return [
            ProcessingTimeEvent(1),
            WatermarkEvent(watermark_secs * 1000000),
        ]

    expected_events = [
        [ProcessingTimeEvent(5), WatermarkEvent(4999998)],
        [
            ElementEvent(
                [TimestampedValue(('k', ['a', 'b', 'c']), 4.999999)])
        ],
    ]
    expected_events.extend(tick(secs) for secs in range(5, 10))
    expected_events.append([
        ElementEvent([TimestampedValue(('k', ['1', '2', '3']), 9.999999)])
    ])
    expected_events.extend(tick(secs) for secs in range(10, 16))

    assert_that(
        records,
        equal_to_per_window({
            beam.window.GlobalWindow(): expected_events,
        }))

    p.run()