def test_multiple_accumulating_firings(self):
    """Check that ACCUMULATING mode re-emits earlier elements on later firings.

    Ten keyed string elements ('1' .. '10') are streamed into one
    FixedWindows(10) window. After the 5th and the 10th element the watermark
    is advanced to that element's value and processing time moves forward
    by 5. The recorded output is expected to be the first five elements
    followed by all ten.
    """
    # The PCollection holds the values 1 through 10.
    values = list(range(1, 11))

    stream = TestStream().advance_watermark_to(0)
    for value in values:
        stream.add_elements([('key', str(value))])
        if value % 5 != 0:
            continue
        # Every fifth element moves both clocks forward.
        stream.advance_watermark_to(value)
        stream.advance_processing_time(5)

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True

    early_firing = AfterAll(AfterCount(1), AfterProcessingTime(5))
    accumulating_window = beam.WindowInto(
        FixedWindows(10),
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
        trigger=AfterWatermark(early=early_firing))

    with TestPipeline(options=streaming_options) as p:
        _ = (
            p
            | stream
            | accumulating_window
            | beam.GroupByKey()
            | beam.FlatMap(lambda keyed: keyed[1])
            | beam.ParDo(self.record_dofn()))

    # Two firings are expected: one after 5 seconds and one after 10.
    # Because panes accumulate, the second firing repeats the first five.
    expected_records = [str(value) for value in values if value <= 5]
    expected_records += [str(value) for value in values]
    self.assertListEqual(expected_records, TriggerPipelineTest.all_records)
def test_combining_with_accumulation_mode_and_fanout(self):
    """Check accumulated sums from CombineGlobally with fanout on early firings.

    The values 1 .. 5 are streamed into the global window with an
    AfterCount(1) early trigger in ACCUMULATING mode and summed with
    ``with_fanout(2)``. The sorted output must start with the running sums
    1, 3, 6, 10 and be followed only by 15s.
    """
    def has_expected_values(actual):
        from hamcrest.core import assert_that as hamcrest_assert
        from hamcrest.library.collection import contains
        from hamcrest.library.collection import only_contains

        in_order = sorted(actual)
        early_sums, final_sums = in_order[:4], in_order[4:]
        # Early firings.
        hamcrest_assert(early_sums, contains(1, 3, 6, 10))
        # The number of 15s differs between runners, but there should be
        # at least one of them.
        hamcrest_assert(final_sums, only_contains(15))

    # The PCollection holds the values 1 through 5.
    values = list(range(1, 6))

    stream = TestStream().advance_watermark_to(0)
    for value in values:
        stream.add_elements([value])
    stream.advance_watermark_to_infinity()

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True

    accumulating_window = beam.WindowInto(
        GlobalWindows(),
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
        trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
    fanned_out_sum = beam.CombineGlobally(sum).without_defaults().with_fanout(2)

    with TestPipeline(options=streaming_options) as p:
        result = p | stream | accumulating_window | fanned_out_sum
        assert_that(result, has_expected_values)
def test_sessions_after_all(self):
    """Exercise AfterAll(AfterCount(n), AfterWatermark()) on session windows.

    Both scenarios go through ``self.run_trigger_simple`` with Sessions(10)
    windowing in ACCUMULATING mode and the same three timestamped inputs.
    The first uses AfterCount(2) and expects a single pane {'a', 'b', 'c'}
    for IntervalWindow(1, 13). The second uses AfterCount(5), supplies late
    data 'x', 'y', 'z', and expects a single pane {'a', 'b', 'c', 'x', 'y'}.
    The trailing positional arguments 1 and 2 are forwarded unchanged.
    """
    timestamped_inputs = ((1, 'a'), (2, 'b'), (3, 'c'))
    late_inputs = ((1, 'x'), (2, 'y'), (3, 'z'))

    # Scenario 1: the count condition is met by the on-time data alone.
    self.run_trigger_simple(
        Sessions(10),
        AfterAll(AfterCount(2), AfterWatermark()),
        AccumulationMode.ACCUMULATING,
        list(timestamped_inputs),
        {IntervalWindow(1, 13): [{'a', 'b', 'c'}]},
        1,
        2)

    # Scenario 2: the count condition needs late data to be met.
    self.run_trigger_simple(
        Sessions(10),
        AfterAll(AfterCount(5), AfterWatermark()),
        AccumulationMode.ACCUMULATING,
        list(timestamped_inputs),
        {IntervalWindow(1, 13): [{'a', 'b', 'c', 'x', 'y'}]},
        1,
        2,
        late_data=list(late_inputs))
def test_trigger_encoding(self):
    """Round-trip a set of triggers through their runner API form.

    Each trigger is converted with ``to_runner_api`` and rebuilt with
    ``TriggerFn.from_runner_api`` using a fresh PipelineContext; the rebuilt
    trigger must compare equal to the one it came from.
    """
    sample_triggers = (
        DefaultTrigger(),
        AfterAll(AfterCount(1), AfterCount(10)),
        AfterAny(AfterCount(10), AfterCount(100)),
        AfterWatermark(early=AfterCount(1000)),
        AfterWatermark(early=AfterCount(1000), late=AfterCount(1)),
        Repeatedly(AfterCount(100)),
        trigger.OrFinally(AfterCount(3), AfterCount(10)),
    )
    for original in sample_triggers:
        context = pipeline_context.PipelineContext()
        encoded = original.to_runner_api(context)
        decoded = TriggerFn.from_runner_api(encoded, context)
        self.assertEqual(original, decoded)
def run(argv=None):
    """Build and run the pipeline that windows face detections by emotion.

    Messages are read from a Pub/Sub topic, decoded and parsed, filtered by
    ``FilterHighConfidenceFacesDoFn``, expanded by
    ``get_faces_with_high_emotion_likelihood``, re-timestamped from each
    record's ``ts_seconds`` field, placed in 30-second fixed windows,
    grouped by the record's ``emotion`` field, formatted by
    ``FormatFaceInfoPerWindow`` and appended to a BigQuery table.

    Args:
        argv: Not used by this function; the pipeline options come from
            ``get_pipeline_options()``.
    """
    from apache_beam.transforms.window import FixedWindows
    from apache_beam.transforms.window import TimestampedValue

    pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'
    window_size_s = 30
    allowed_lateness_s = 60

    # Leaving the ``with`` block runs the pipeline and waits for it, so no
    # explicit ``pipeline.run()`` is made here: calling it as well would
    # execute the pipeline a second time and duplicate the appended rows.
    with beam.Pipeline(options=get_pipeline_options()) as pipeline:
        logging.info("pubsub_input_topic = %s", pubsub_input_topic)

        json_messages = (
            pipeline
            | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(
                topic=pubsub_input_topic).with_output_types(bytes)
            | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message))

        # Early panes fire on 5 elements or 10 units of processing time,
        # whichever comes first; late panes need both 2 elements and 20
        # units of processing time. Panes are discarded after firing.
        face_window = beam.WindowInto(
            FixedWindows(window_size_s, 0),
            trigger=AfterWatermark(
                early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
            allowed_lateness=allowed_lateness_s,
            accumulation_mode=AccumulationMode.DISCARDING)

        high_confidence_faces_grouped_by_emotion_count_per_window = (
            json_messages
            | 'ParseJsonMessage' >> beam.Map(parse_jsons)
            | 'FilterHighFaceConfidence' >> beam.ParDo(
                FilterHighConfidenceFacesDoFn())
            | 'FlatMapFAcesWithHighEmotionLikelihood' >> beam.FlatMap(
                get_faces_with_high_emotion_likelihood)
            | 'UseCustomTimestamp' >> beam.Map(
                lambda face_info: TimestampedValue(
                    face_info, face_info['ts_seconds']))
            | 'WindowFaceInfo' >> face_window
            | 'PairEmotionWithFace' >> beam.Map(
                lambda face_info: (face_info['emotion'], face_info))
            | 'GroupByEmotion' >> beam.GroupByKey()
            | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow()))

        log_p_collection(
            high_confidence_faces_grouped_by_emotion_count_per_window,
            "OutputToBigQuery")

        _ = (
            high_confidence_faces_grouped_by_emotion_count_per_window
            | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
                bq_faces_windowed_table_name,
                schema={"fields": bq_faces_windowed_table_schema},
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def test_combining_with_accumulation_mode_and_fanout(self):
    """Check the exact accumulated sums from CombineGlobally with fanout.

    The values 1 .. 5 are streamed into the global window with an
    AfterCount(1) early trigger in ACCUMULATING mode and summed with
    ``with_fanout(2)``. The output is compared against the running sums
    1, 3, 6, 10 followed by three 15s.
    """
    # The PCollection holds the values 1 through 5.
    values = list(range(1, 6))

    stream = TestStream().advance_watermark_to(0)
    for value in values:
        stream.add_elements([value])
    stream.advance_watermark_to_infinity()

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True

    # In DISCARDING mode the firings would be [1, 2, 3, 4, 5, 0, 0].
    expected_firings = [1, 3, 6, 10, 15, 15, 15]

    accumulating_window = beam.WindowInto(
        GlobalWindows(),
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
        trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
    fanned_out_sum = beam.CombineGlobally(sum).without_defaults().with_fanout(2)

    with TestPipeline(options=streaming_options) as p:
        result = p | stream | accumulating_window | fanned_out_sum
        assert_that(result, equal_to(expected_firings))
def test_after_all_safe(self):
    """Expect NO_POTENTIAL_LOSS for AfterAll(Repeatedly(AfterCount(1)), DefaultTrigger()).

    The trigger is passed to ``self._test`` with 0 as the second argument.
    """
    trigger_under_test = AfterAll(Repeatedly(AfterCount(1)), DefaultTrigger())
    expected_reason = DataLossReason.NO_POTENTIAL_LOSS
    self._test(trigger_under_test, 0, expected_reason)
def test_after_all_some_unsafe(self):
    """Expect MAY_FINISH for AfterAll(AfterCount(1), DefaultTrigger()).

    The trigger is passed to ``self._test`` with 0 as the second argument.
    """
    trigger_under_test = AfterAll(AfterCount(1), DefaultTrigger())
    expected_reason = DataLossReason.MAY_FINISH
    self._test(trigger_under_test, 0, expected_reason)
def test_afer_all_all_may_finish(self):
    """Expect MAY_FINISH for AfterAll(AfterCount(42), AfterProcessingTime(42)).

    The trigger is passed to ``self._test`` with 0 as the second argument.
    """
    trigger_under_test = AfterAll(AfterCount(42), AfterProcessingTime(42))
    expected_reason = DataLossReason.MAY_FINISH
    self._test(trigger_under_test, 0, expected_reason)
def test_after_all_some_may_finish(self):
    """Expect NO_POTENTIAL_LOSS for AfterAll(AfterCount(1), DefaultTrigger()).

    The trigger is passed to ``self._test`` with 0 as the second argument.
    """
    trigger_under_test = AfterAll(AfterCount(1), DefaultTrigger())
    expected_reason = DataLossReason.NO_POTENTIAL_LOSS
    self._test(trigger_under_test, 0, expected_reason)