Example #1
File: window_test.py  Project: ziel/beam
 def test_windowing_encoding(self):
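   # Each Windowing configuration should survive a round trip through its
   # runner API proto representation unchanged.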
   for windowing in (
       Windowing(GlobalWindows()),
       Windowing(FixedWindows(1, 3), AfterCount(6),
                 accumulation_mode=AccumulationMode.ACCUMULATING),
       Windowing(SlidingWindows(10, 15, 21), AfterCount(28),
                 timestamp_combiner=TimestampCombiner.OUTPUT_AT_LATEST,
                 accumulation_mode=AccumulationMode.DISCARDING)):
     context = pipeline_context.PipelineContext()
     self.assertEqual(
         windowing,
         Windowing.from_runner_api(windowing.to_runner_api(context), context))
Example #2
 def test_trigger_encoding(self):
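   # Every trigger, including the composite ones, should survive a round trip
   # through its runner API proto representation unchanged.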
   for trigger_fn in (DefaultTrigger(),
                      AfterAll(AfterCount(1), AfterCount(10)),
                      AfterAny(AfterCount(10), AfterCount(100)),
                      AfterWatermark(early=AfterCount(1000)),
                      AfterWatermark(early=AfterCount(1000),
                                     late=AfterCount(1)),
                      Repeatedly(AfterCount(100)),
                      trigger.OrFinally(AfterCount(3), AfterCount(10))):
     context = pipeline_context.PipelineContext()
     self.assertEqual(
         trigger_fn,
         TriggerFn.from_runner_api(trigger_fn.to_runner_api(context), context))
Example #3
 def test_fixed_watermark_with_early_late(self):
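   # run_trigger_simple takes the window fn, trigger, accumulation mode, the
   # (timestamp, value) inputs, the expected panes per window, then the bundle
   # sizes to try and an optional late_data kwarg (signature assumed from
   # Beam's trigger_test helper).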
   self.run_trigger_simple(
       FixedWindows(100),  # pyformat break
       AfterWatermark(early=AfterCount(3),
                      late=AfterCount(2)),
       AccumulationMode.DISCARDING,
       zip(range(9), 'abcdefghi'),
       {IntervalWindow(0, 100): [
           set('abcd'), set('efgh'),  # early
           set('i'),                  # on time
           set('vw'), set('xy')       # late
           ]},
       2,
       late_data=zip(range(5), 'vwxyz'))
Example #4
 def test_fixed_watermark_with_early(self):
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterWatermark(early=AfterCount(2)),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(0, 10): [set('ab'), set('abc')]},
         2)
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterWatermark(early=AfterCount(2)),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(0, 10): [set('abc'), set('abc')]},
         3)
Example #5
File: util.py  Project: samwagg/beam
    def expand(self, pcoll):
        class ReifyTimestamps(DoFn):
            def process(self, element, timestamp=DoFn.TimestampParam):
                yield element[0], TimestampedValue(element[1], timestamp)

        class RestoreTimestamps(DoFn):
            def process(self, element, window=DoFn.WindowParam):
                # Pass the current window since _IdentityWindowFn wouldn't know how
                # to generate it.
                yield windowed_value.WindowedValue(
                    (element[0], element[1].value), element[1].timestamp,
                    [window])

        windowing_saved = pcoll.windowing
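        # Re-window through an identity WindowFn with a fire-per-element
        # trigger so the GroupByKey emits each value as soon as it arrives
        # instead of buffering until the end of the original window.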
        # The linter is confused.
        # pylint: disable=abstract-class-instantiated
        result = (
            pcoll
            | ParDo(ReifyTimestamps())
            | 'IdentityWindow' >> WindowInto(
                _IdentityWindowFn(windowing_saved.windowfn.get_window_coder()),
                trigger=AfterCount(1),
                accumulation_mode=AccumulationMode.DISCARDING,
                timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST,
            )
            | GroupByKey()
            | 'ExpandIterable' >> FlatMap(lambda e: [(e[0], value)
                                                     for value in e[1]])
            | ParDo(RestoreTimestamps()))
        result._windowing = windowing_saved
        return result
Example #6
    def test_multiple_accumulating_firings(self):
        # PCollection will contain elements from 1 to 10.
        elements = [i for i in range(1, 11)]

        ts = TestStream().advance_watermark_to(0)
        for i in elements:
            ts.add_elements([('key', str(i))])
            if i % 5 == 0:
                ts.advance_watermark_to(i)
                ts.advance_processing_time(5)

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            _ = (
                p
                | ts
                | beam.WindowInto(
                    FixedWindows(10),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                    trigger=AfterWatermark(
                        early=AfterAll(AfterCount(1), AfterProcessingTime(5))))
                | beam.GroupByKey()
                | beam.FlatMap(lambda x: x[1])
                | beam.ParDo(self.record_dofn()))

        # The trigger should fire twice: once after 5 seconds and once after
        # 10. The firings should accumulate the output.
        first_firing = [str(i) for i in elements if i <= 5]
        second_firing = [str(i) for i in elements]
        self.assertListEqual(first_firing + second_firing,
                             TriggerPipelineTest.all_records)
Example #7
    def test_fixed_after_count_accumulating(self):
        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([('k1', 1), ('k1', 1), ('k2', 1), ('k2', 1)])
              .add_elements([('k1', 1), ('k1', 1)])
              .advance_watermark_to(2)
              .add_elements([('k1', 2), ('k2', 2)])  # These values are discarded.
              .advance_watermark_to_infinity())
        # yapf: enable

        # Fixed two-second windows, firing (repeatedly) after every two
        # elements per key and accumulating the panes.
        windowing = Windowing(FixedWindows(2),
                              triggerfn=Repeatedly(AfterCount(2)),
                              accumulation_mode=AccumulationMode.ACCUMULATING)

        with TestPipeline() as p:
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                | Map(lambda elm: (elm[0], elm[1][0].windows[0],
                                   [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(0, 2), [1, 1]),
                    ('k2', IntervalWindow(0, 2), [1, 1]),
                    ('k1', IntervalWindow(0, 2), [1, 1, 1, 1]),
                ]))
Example #8
    def test_after_count_streaming(self):
        test_options = PipelineOptions(
            flags=['--allow_unsafe_triggers', '--streaming'])
        with TestPipeline(options=test_options) as p:
            # yapf: disable
            test_stream = (
                TestStream()
                .advance_watermark_to(0)
                .add_elements([('A', 1), ('A', 2), ('A', 3)])
                .add_elements([('A', 4), ('A', 5), ('A', 6)])
                .add_elements([('B', 1), ('B', 2), ('B', 3)])
                .advance_watermark_to_infinity())
            # yapf: enable

            results = (p
                       | test_stream
                       | beam.WindowInto(
                           FixedWindows(10),
                           trigger=AfterCount(3),
                           accumulation_mode=AccumulationMode.ACCUMULATING)
                       | beam.GroupByKey())

            assert_that(
                results,
                equal_to(
                    list({
                        # 4 - 6 discarded because the trigger finished.
                        'A': [1, 2, 3],
                        'B': [1, 2, 3]
                    }.items())))
Example #9
  def test_model_early_late_triggers(self):
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
      test_stream = (
          TestStream()
          .advance_watermark_to(10)
          .add_elements(['a', 'a', 'a', 'b', 'b'])
          .add_elements([TimestampedValue('a', 10)])
          .advance_watermark_to(20)
          .advance_processing_time(60)
          .add_elements([TimestampedValue('a', 10)]))
      trigger = (
          # [START model_early_late_triggers]
          AfterWatermark(
              early=AfterProcessingTime(delay=1 * 60), late=AfterCount(1))
          # [END model_early_late_triggers]
      )
      counts = (
          p
          | test_stream
          | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
          | WindowInto(
              FixedWindows(15),
              trigger=trigger,
              allowed_lateness=20,
              accumulation_mode=AccumulationMode.DISCARDING)
          | 'group' >> beam.GroupByKey()
          | 'count' >>
          beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
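      # Expected panes: one on-time firing with ('a', 4) and ('b', 2) when the
      # watermark passes the end of the window, then a late firing of ('a', 1)
      # for the element that arrives after the watermark.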
      assert_that(counts, equal_to([('a', 4), ('b', 2), ('a', 1)]))
Example #10
    def test_after_count(self):
        with TestPipeline() as p:

            def construct_timestamped(k_t):
                return TimestampedValue((k_t[0], k_t[1]), k_t[1])

            def format_result(k_v):
                return ('%s-%s' % (k_v[0], len(k_v[1])), set(k_v[1]))

            result = (p
                      | beam.Create([1, 2, 3, 4, 5, 10, 11])
                      | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
                      | beam.Map(construct_timestamped)
                      | beam.WindowInto(
                          FixedWindows(10),
                          trigger=AfterCount(3),
                          accumulation_mode=AccumulationMode.DISCARDING)
                      | beam.GroupByKey()
                      | beam.Map(format_result))
            assert_that(
                result,
                equal_to(
                    list({
                        'A-5': {1, 2, 3, 4, 5},
                        # A-10, A-11 never emitted due to AfterCount(3) never firing.
                        'B-4': {6, 7, 8, 9},
                        'B-3': {10, 15, 16},
                    }.items())))
Example #11
  def test_combining_with_accumulation_mode_and_fanout(self):
    # PCollection will contain elements from 1 to 5.
    elements = [i for i in range(1, 6)]

    ts = TestStream().advance_watermark_to(0)
    for i in elements:
      ts.add_elements([i])
    ts.advance_watermark_to_infinity()

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
      result = (
          p
          | ts
          | beam.WindowInto(
              GlobalWindows(),
              accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
              trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
          | beam.CombineGlobally(sum).without_defaults().with_fanout(2))

      def has_expected_values(actual):
        from hamcrest.core import assert_that as hamcrest_assert
        from hamcrest.library.collection import contains
        from hamcrest.library.collection import only_contains
        ordered = sorted(actual)
        # Early firings.
        hamcrest_assert(ordered[:4], contains(1, 3, 6, 10))
        # Different runners have different number of 15s, but there should
        # be at least one 15.
        hamcrest_assert(ordered[4:], only_contains(15))

      assert_that(result, has_expected_values)
Example #12
  def test_model_other_composite_triggers(self):
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
      test_stream = (
          TestStream()
          .advance_watermark_to(10)
          .add_elements(['a', 'a'])
          .add_elements(['a', 'b', 'b'])
          .advance_processing_time(60)
          .add_elements(['a'] * 100))
      pcollection = (
          p
          | test_stream
          | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

      counts = (
          # [START model_other_composite_triggers]
          pcollection | WindowInto(
              FixedWindows(1 * 60),
              trigger=Repeatedly(
                  AfterAny(AfterCount(100), AfterProcessingTime(1 * 60))),
              accumulation_mode=AccumulationMode.DISCARDING)
          # [END model_other_composite_triggers]
          | 'group' >> beam.GroupByKey()
          | 'count' >>
          beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
      assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 100)]))
Example #13
 def test_fixed_after_count(self):
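     # Bundle size matters here: with bundles of 1-2 elements the trigger
     # fires as soon as two elements are seen ('ab'); with bundles of 3-4 all
     # three on-time elements arrive together, so the one firing holds 'abc'.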
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterCount(2),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c'), (11, 'z')],
         {IntervalWindow(0, 10): [set('ab')]},
         1,
         2)
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterCount(2),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c'), (11, 'z')],
         {IntervalWindow(0, 10): [set('abc')]},
         3,
         4)
Example #14
 def test_sessions_repeatedly_after_count(self):
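     # The elements at timestamps 1, 2, 6, 7 and 15 chain into a single merged
     # session [1, 25); Repeatedly(AfterCount(2)) keeps firing as pairs arrive.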
     self.run_trigger_simple(
         Sessions(10),  # pyformat break
         Repeatedly(AfterCount(2)),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')],
         {IntervalWindow(1, 25): [set('abc'), set('abcde')]},
         1,
         3)
     self.run_trigger_simple(
         Sessions(10),  # pyformat break
         Repeatedly(AfterCount(2)),
         AccumulationMode.DISCARDING,
         [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')],
         {IntervalWindow(1, 25): [set('abc'), set('de')]},
         1,
         3)
Example #15
 def test_sessions_after_all(self):
     self.run_trigger_simple(
         Sessions(10),  # pyformat break
         AfterAll(AfterCount(2), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(1, 13): [set('abc')]},
         1,
         2)
     self.run_trigger_simple(
         Sessions(10),  # pyformat break
         AfterAll(AfterCount(5), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(1, 13): [set('abcxy')]},
         1,
         2,
         late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
Example #16
 def expand(self, events):
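     # Emit a speculative count for the day after every element (early
     # firings) and a final count once the watermark passes the end of the
     # one-day window.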
     return (events
             | beam.WindowInto(
                 FixedWindows(1 * 24 * 60 * 60),  # 1 Day Window
                 trigger=AfterWatermark(early=AfterCount(1)),
                 accumulation_mode=AccumulationMode.ACCUMULATING,
                 allowed_lateness=Duration(seconds=0))
             | beam.CombineGlobally(
                 beam.combiners.CountCombineFn()).without_defaults())
Example #17
 def test_fixed_after_first(self):
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterAny(AfterCount(2), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(0, 10): [set('ab')]},
         1,
         2)
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterAny(AfterCount(5), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(0, 10): [set('abc')]},
         1,
         2,
         late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
Example #18
def run(argv=None):
    from apache_beam.transforms.window import TimestampedValue, FixedWindows

    pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'

    with beam.Pipeline(options=get_pipeline_options()) as pipeline:
        logging.info("pubsub_input_topic = {}".format(pubsub_input_topic))

        json_messages = \
            (pipeline
             | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(topic=pubsub_input_topic).with_output_types(bytes)
             | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message)
             )

        window_size_s = 30
        allowed_lateness_s = 60
        high_confidence_faces_grouped_by_emotion_count_per_window = (
                json_messages
                | 'ParseJsonMessage' >> beam.Map(parse_jsons)
                | 'FilterHighFaceConfidence' >> beam.ParDo(FilterHighConfidenceFacesDoFn())
                | 'FlatMapFacesWithHighEmotionLikelihood' >> beam.FlatMap(get_faces_with_high_emotion_likelihood)
                | 'UseCustomTimestamp' >> beam.Map(lambda face_info:
                                                   TimestampedValue(face_info, face_info['ts_seconds']))
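                # Early panes fire after 5 elements or 10s of processing time,
                # whichever comes first (AfterAny); late panes need both 2 late
                # elements and 20s of processing time (AfterAll).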
                | 'WindowFaceInfo' >> beam.WindowInto(
                        FixedWindows(window_size_s, 0),
                        trigger=AfterWatermark(
                            early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                            late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
                        allowed_lateness=allowed_lateness_s,
                        accumulation_mode=AccumulationMode.DISCARDING)
                | 'PairEmotionWithFace' >> beam.Map(lambda face_info: (face_info['emotion'], face_info))
                | 'GroupByEmotion' >> beam.GroupByKey()
                | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow())
        )

        log_p_collection(high_confidence_faces_grouped_by_emotion_count_per_window, "OutputToBigQuery")

        high_confidence_faces_grouped_by_emotion_count_per_window | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            bq_faces_windowed_table_name,
            schema={"fields": bq_faces_windowed_table_schema},
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

Example #19
    def test_sessions_and_complex_trigger_accumulating(self):
        def tsv(key, value, ts):
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                             tsv('k1', 3, 7), tsv('k1', 4, 30)])
              .advance_watermark_to(50)
              .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2),])
              .add_elements([tsv('k1', -1, 21)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Session windows with a 10-second gap, firing early after every two
        # elements and once more per late element, accumulating the panes.
        windowing = Windowing(Sessions(10),
                              triggerfn=AfterWatermark(early=AfterCount(2),
                                                       late=AfterCount(1)),
                              accumulation_mode=AccumulationMode.ACCUMULATING,
                              allowed_lateness=MAX_TIMESTAMP.seconds())

        with TestPipeline() as p:
            result = (p
                      | test_stream
                      | WindowInto(windowing.windowfn)
                      | ParDo(trigger_manager._ReifyWindows())
                      | ParDo(trigger_manager._GroupBundlesByKey())
                      | ParDo(
                          trigger_manager.GeneralTriggerManagerDoFn(windowing))
                      | Map(lambda elm: (elm[0], elm[1][0].windows[0],
                                         set(v.value for v in elm[1]))))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
                    ('k1', IntervalWindow(30, 40), {4}),  # on time
                    ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
                    ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2,
                                                   -1}),  # late
                ]))
Example #20
  def test_sessions_after_each(self):
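    # AfterEach runs its sub-triggers in sequence: one firing once the first
    # two elements are seen, the next once at least three more have arrived;
    # without Repeatedly it then stops firing for the growing session.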
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterEach(AfterCount(2), AfterCount(3)),
        AccumulationMode.ACCUMULATING,
        zip(range(10), 'abcdefghij'),
        {IntervalWindow(0, 11): [set('ab')],
         IntervalWindow(0, 15): [set('abcdef')]},
        2)

    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        Repeatedly(AfterEach(AfterCount(2), AfterCount(3))),
        AccumulationMode.ACCUMULATING,
        zip(range(10), 'abcdefghij'),
        {IntervalWindow(0, 11): [set('ab')],
         IntervalWindow(0, 15): [set('abcdef')],
         IntervalWindow(0, 17): [set('abcdefgh')]},
        2)
Example #21
File: util.py  Project: wangjie05/beam
  def expand(self, pcoll):
    windowing_saved = pcoll.windowing
    if windowing_saved.is_default():
      # In this (common) case we can use a trivial trigger driver
      # and avoid the (expensive) window param.
      globally_windowed = window.GlobalWindows.windowed_value(None)
      window_fn = window.GlobalWindows()
      MIN_TIMESTAMP = window.MIN_TIMESTAMP

      def reify_timestamps(element, timestamp=DoFn.TimestampParam):
        key, value = element
        if timestamp == MIN_TIMESTAMP:
          timestamp = None
        return key, (value, timestamp)

      def restore_timestamps(element):
        key, values = element
        return [
            globally_windowed.with_value((key, value))
            if timestamp is None
            else window.GlobalWindows.windowed_value((key, value), timestamp)
            for (value, timestamp) in values]

    else:
      # The linter is confused.
      # hash(1) is used to force "runtime" selection of _IdentityWindowFn
      # pylint: disable=abstract-class-instantiated
      cls = hash(1) and _IdentityWindowFn
      window_fn = cls(
          windowing_saved.windowfn.get_window_coder())

      def reify_timestamps(element, timestamp=DoFn.TimestampParam):
        key, value = element
        return key, TimestampedValue(value, timestamp)

      def restore_timestamps(element, window=DoFn.WindowParam):
        # Pass the current window since _IdentityWindowFn wouldn't know how
        # to generate it.
        key, values = element
        return [
            windowed_value.WindowedValue(
                (key, value.value), value.timestamp, [window])
            for value in values]

    ungrouped = pcoll | Map(reify_timestamps)
    ungrouped._windowing = Windowing(
        window_fn,
        triggerfn=AfterCount(1),
        accumulation_mode=AccumulationMode.DISCARDING,
        timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
    result = (ungrouped
              | GroupByKey()
              | FlatMap(restore_timestamps))
    result._windowing = windowing_saved
    return result
Example #22
    def expand(self, pcoll):
        windowing_saved = pcoll.windowing
        if windowing_saved.is_default():
            # In this (common) case we can use a trivial trigger driver
            # and avoid the (expensive) window param.
            globally_windowed = window.GlobalWindows.windowed_value(None)
            MIN_TIMESTAMP = window.MIN_TIMESTAMP

            def reify_timestamps(element, timestamp=DoFn.TimestampParam):
                key, value = element
                if timestamp == MIN_TIMESTAMP:
                    timestamp = None
                return key, (value, timestamp)

            def restore_timestamps(element):
                key, values = element
                return [
                    globally_windowed.with_value((key, value)) if
                    timestamp is None else window.GlobalWindows.windowed_value(
                        (key, value), timestamp)
                    for (value, timestamp) in values
                ]

        else:

            def reify_timestamps(element,
                                 timestamp=DoFn.TimestampParam,
                                 window=DoFn.WindowParam):
                key, value = element
                # Transport the window as part of the value and restore it later.
                return key, windowed_value.WindowedValue(
                    value, timestamp, [window])

            def restore_timestamps(element):
                key, windowed_values = element
                return [
                    wv.with_value((key, wv.value)) for wv in windowed_values
                ]

        ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

        # TODO(BEAM-8104) Using global window as one of the standard window.
        # This is to mitigate the Dataflow Java Runner Harness limitation to
        # accept only standard coders.
        ungrouped._windowing = Windowing(
            window.GlobalWindows(),
            triggerfn=Repeatedly(AfterCount(1)),
            accumulation_mode=AccumulationMode.DISCARDING,
            timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
        result = (ungrouped
                  | GroupByKey()
                  | FlatMap(restore_timestamps).with_output_types(Any))
        result._windowing = windowing_saved
        return result
Example #23
 def test_sessions_watermark_with_early_late(self):
     self.run_trigger_simple(
         Sessions(10),  # pyformat break
         AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (15, 'b'), (7, 'c'), (30, 'd')],
         {
             IntervalWindow(1, 25): [
                 set('abc'),  # early
                 set('abc'),  # on time
                 set('abcxy')  # late
             ],
             IntervalWindow(30, 40): [
                 set('d'),  # on time
             ],
             IntervalWindow(1, 40): [
                 set('abcdxyz')  # late
             ],
         },
         2,
         late_data=[(1, 'x'), (2, 'y'), (21, 'z')])
Example #24
 def test_sessions_after_count(self):
   self.run_trigger_simple(
       Sessions(10),  # pyformat break
       AfterCount(2),
       AccumulationMode.ACCUMULATING,
       [(1, 'a'), (15, 'b'), (6, 'c'), (30, 's'), (31, 't'), (50, 'z'),
        (50, 'y')],
       {IntervalWindow(1, 25): [set('abc')],
        IntervalWindow(30, 41): [set('st')],
        IntervalWindow(50, 60): [set('yz')]},
       1,
       2,
       3)
Example #25
 def test_repeatedly_after_first(self):
   self.run_trigger_simple(
       FixedWindows(100),  # pyformat break
       Repeatedly(AfterAny(AfterCount(3), AfterWatermark())),
       AccumulationMode.ACCUMULATING,
       zip(range(7), 'abcdefg'),
       {IntervalWindow(0, 100): [
           set('abc'),
           set('abcdef'),
           set('abcdefg'),
           set('abcdefgx'),
           set('abcdefgxy'),
           set('abcdefgxyz')]},
       1,
       late_data=zip(range(3), 'xyz'))
Example #26
    def expand(self, pcoll):
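        # Count taxi pickups per one-minute window; each late element (within
        # the 60s allowed lateness) re-fires the window, and ACCUMULATING mode
        # makes the late pane carry the updated total.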

        output = (pcoll
                  | "ParseJson" >> beam.ParDo(JsonToTaxiRide())
                  | "FilterForPickups" >>
                  beam.Filter(lambda x: x.ride_status == 'pickup')
                  | "WindowByMinute" >> beam.WindowInto(
                      beam.window.FixedWindows(60),
                      trigger=AfterWatermark(late=AfterCount(1)),
                      allowed_lateness=60,
                      accumulation_mode=AccumulationMode.ACCUMULATING)
                  | "CountPerMinute" >> beam.CombineGlobally(
                      CountCombineFn()).without_defaults())

        return output
Example #27
 def test_after_count(self):
   p = Pipeline('DirectRunner')
   result = (p
             | beam.Create([1, 2, 3, 4, 5, 10, 11])
             | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
              | beam.Map(lambda k_t: TimestampedValue((k_t[0], k_t[1]), k_t[1]))
             | beam.WindowInto(FixedWindows(10), trigger=AfterCount(3),
                               accumulation_mode=AccumulationMode.DISCARDING)
             | beam.GroupByKey()
              | beam.Map(lambda k_v: ('%s-%s' % (k_v[0], len(k_v[1])), set(k_v[1]))))
   assert_that(result, equal_to(list(
       {
           'A-5': {1, 2, 3, 4, 5},
           # A-10, A-11 never emitted due to AfterCount(3) never firing.
           'B-4': {6, 7, 8, 9},
           'B-3': {10, 15, 16},
       }.items())))
   p.run()
Example #28
File: util_test.py  Project: Hzwords/beam
  def test_buffering_timer_in_global_window_streaming(self):
    max_buffering_duration_secs = 42

    start_time = timestamp.Timestamp(0)
    test_stream = TestStream().advance_watermark_to(start_time)
    for i, value in enumerate(GroupIntoBatchesTest._create_test_data()):
      test_stream.add_elements(
          [TimestampedValue(value, start_time + i)]) \
        .advance_processing_time(5)
    test_stream.advance_watermark_to(
        start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1) \
      .advance_watermark_to_infinity()

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # Set a batch size larger than the total number of elements.
      # Since we're in a global window, we would have been waiting
      # for all the elements to arrive without the buffering time limit.
      batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2

      # To trigger the processing time timer, use a fake clock with start time
      # being Timestamp(0). Since the fake clock never really advances during
      # the pipeline execution, meaning that the timer is always set to the same
      # value, the timer will be fired on every element after the first firing.
      fake_clock = FakeClock(now=start_time)

      num_elements_per_batch = (
          pipeline | test_stream
          | WindowInto(
              GlobalWindows(),
              trigger=Repeatedly(AfterCount(1)),
              accumulation_mode=trigger.AccumulationMode.DISCARDING)
          | util.GroupIntoBatches(
              batch_size, max_buffering_duration_secs, fake_clock)
          | 'count elements in batch' >> Map(lambda x: (None, len(x[1])))
          | GroupByKey()
          | FlatMapTuple(lambda k, vs: vs))

      # We will flush twice when the max buffering duration is reached and when
      # the global window ends.
      assert_that(num_elements_per_batch, equal_to([9, 1]))
Example #29
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    BATCH_SIZE = 1000000
    BUFFERING_SECS = 600

    p = Pipeline(options=options)
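    # Group into batches of up to BATCH_SIZE elements per key, flushing early
    # when BUFFERING_SECS of processing time elapses first.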
    (p
     | Create(range(100), reshuffle=True)
     | ParDo(make_large_elements)  # 128 KiB
     | WithKeys('')
     | WindowInto(GlobalWindows(),
                  trigger=Repeatedly(
                      AfterAny(AfterCount(BATCH_SIZE),
                               AfterProcessingTime(BUFFERING_SECS))),
                  accumulation_mode=AccumulationMode.DISCARDING)
     | GroupByKey()
     | Map(lambda kv: logging.info('key: %s, value count: %s',
                                   kv[0], len(kv[1]))))

    run = p.run()
    run.wait_until_finish()
Example #30
    def test_combining_with_accumulation_mode_and_fanout(self):
        # PCollection will contain elements from 1 to 5.
        elements = [i for i in range(1, 6)]

        ts = TestStream().advance_watermark_to(0)
        for i in elements:
            ts.add_elements([i])
        ts.advance_watermark_to_infinity()

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            result = (
                p
                | ts
                | beam.WindowInto(
                    GlobalWindows(),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                    trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
                | beam.CombineGlobally(sum).without_defaults().with_fanout(2))

            # The firings for DISCARDING mode would be [1, 2, 3, 4, 5, 0, 0].
            firings = [1, 3, 6, 10, 15, 15, 15]
            assert_that(result, equal_to(firings))