Exemplo n.º 1
0
 def decode_from_stream(self, in_, nested):
   """Decode an IntervalWindow from the input stream.

   Wire format: a big-endian uint64 end timestamp in millis (in the
   stream's shifted representation — converted via _to_normal_time),
   followed by a var-int span in millis.
   """
   # IntervalWindow is imported lazily and cached in a module-level
   # global — presumably to break an import cycle with the window
   # module; TODO confirm.
   global IntervalWindow
   if IntervalWindow is None:
     from apache_beam.transforms.window import IntervalWindow
   # Construct a placeholder window, then assign the private micros
   # fields directly (millis * 1000), skipping Timestamp construction.
   typed_value = IntervalWindow(None, None)
   typed_value._end_micros = (
       1000 * self._to_normal_time(in_.read_bigendian_uint64()))
   typed_value._start_micros = (
       typed_value._end_micros - 1000 * in_.read_var_int64())
   return typed_value
Exemplo n.º 2
0
 def test_sessions_after_count(self):
     """AfterCount(2) firing over merging session windows, accumulating."""
     elements = [(1, 'a'), (15, 'b'), (6, 'c'), (30, 's'), (31, 't'),
                 (50, 'z'), (50, 'y')]
     expected_panes = {
         IntervalWindow(1, 25): [set('abc')],
         IntervalWindow(30, 41): [set('st')],
         IntervalWindow(50, 60): [set('yz')],
     }
     self.run_trigger_simple(
         Sessions(10),
         AfterCount(2),
         AccumulationMode.ACCUMULATING,
         elements,
         expected_panes,
         1, 2, 3)
Exemplo n.º 3
0
 def test_fixed_watermark_with_early(self):
     """Early firings via AfterCount(2), then the on-time watermark pane."""
     elements = [(1, 'a'), (2, 'b'), (3, 'c')]
     # The two cases differ only in expected panes and bundle size.
     for expected_panes, bundle_size in (
         ({IntervalWindow(0, 10): [set('ab'), set('abc')]}, 2),
         ({IntervalWindow(0, 10): [set('abc'), set('abc')]}, 3)):
       self.run_trigger_simple(
           FixedWindows(10),
           AfterWatermark(early=AfterCount(2)),
           AccumulationMode.ACCUMULATING,
           elements,
           expected_panes,
           bundle_size)
Exemplo n.º 4
0
 def test_reified_value_assert_fail_unmatched_window(self):
   """assert_that with reify_windows must fail when windows don't match."""
   expected = [
       TestWindowedValue(v, MIN_TIMESTAMP, [IntervalWindow(0, 1)])
       for v in [1, 2, 3]
   ]
   with self.assertRaises(Exception):
     with TestPipeline() as p:
       assert_that(
           p | Create([2, 3, 1]), equal_to(expected), reify_windows=True)
Exemplo n.º 5
0
 def decode_from_stream(self, in_, nested):
     """Read an IntervalWindow: big-endian end (millis), then var-int span."""
     from apache_beam.transforms.window import IntervalWindow
     end_ms = self._to_normal_time(in_.read_bigendian_uint64())
     span_ms = in_.read_var_int64()
     start_ms = end_ms - span_ms
     return IntervalWindow(
         start=Timestamp(micros=start_ms * 1000),
         end=Timestamp(micros=end_ms * 1000))
    def test_late_data_behavior(self):
        """Late elements should still update their original window's count.

        Two pickups arrive on time in the (0, 60) window; further
        elements with event time 0 arrive after the watermark has passed
        60 and are therefore late. The expectation lists the on-time
        pane (2) and a late-updated pane (3); the element added after
        watermark 300 produces no pane — presumably beyond the
        transform's allowed lateness, TODO confirm against
        TaxiCountTransform.
        """
        # Late-data semantics require streaming mode.
        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True

        with TestPipeline(options=options) as p:

            # Minimal taxi-ride JSON record reused for every element.
            base_json_pickup = "{\"ride_id\":\"x\",\"point_idx\":1,\"latitude\":0.0,\"longitude\":0.0," \
                        "\"timestamp\":\"00:00:00\",\"meter_reading\":1.0,\"meter_increment\":0.1," \
                        "\"ride_status\":\"pickup\",\"passenger_count\":1}"

            # All elements carry event time 0; the last two are added
            # only after the watermark has advanced past 60, making
            # them late for the (0, 60) window.
            test_stream = TestStream().advance_watermark_to(0).add_elements([
                TimestampedValue(base_json_pickup, 0),
                TimestampedValue(base_json_pickup, 0),
            ]).advance_watermark_to(
                60).advance_processing_time(60).add_elements([
                    TimestampedValue(base_json_pickup, 0)
                ]).advance_watermark_to(300).advance_processing_time(
                    240).add_elements([TimestampedValue(base_json_pickup, 0)])

            EXPECTED_RESULTS = {
                IntervalWindow(0, 60): [2, 3]
            }  # On-time pane and pane after the late update.

            taxi_counts_late = (p | test_stream | TaxiCountTransform())

            assert_that(taxi_counts_late,
                        equal_to_per_window(EXPECTED_RESULTS),
                        reify_windows=True)
Exemplo n.º 7
0
 def test_fixed_watermark(self):
     """Default watermark trigger: one on-time pane per fixed window."""
     expected_panes = {
         IntervalWindow(0, 10): [set('ab')],
         IntervalWindow(10, 20): [set('c')],
     }
     self.run_trigger_simple(
         FixedWindows(10),
         AfterWatermark(),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (13, 'c')],
         expected_panes,
         1, 2, 3, -3, -2, -1)
Exemplo n.º 8
0
 def test_reshuffle_window_fn_preserved(self):
   """Reshuffle must preserve both window assignment and the window fn.

   Windows are checked before and after the reshuffle, and the window fn
   is exercised afterwards via a merging GroupByKey.
   """
   any_order = contains_in_any_order
   with TestPipeline() as pipeline:
     data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
     # Pre-merge: each element sits in its own 2-second session window.
     expected_windows = [
         TestWindowedValue(value, timestamp, [win])
         for value, timestamp, win in [
             ((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
             ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
             ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
             ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
             ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
             ((1, 4), 4.0, IntervalWindow(4.0, 6.0)),
         ]
     ]
     # Post-GroupByKey: overlapping sessions are merged per key.
     expected_merged_windows = [
         TestWindowedValue(value, timestamp - .001, [win])
         for value, timestamp, win in [
             ((1, any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
             ((2, any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
             ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
             ((1, [4]), 6.0, IntervalWindow(4.0, 6.0)),
         ]
     ]
     before_reshuffle = (
         pipeline
         | 'start' >> beam.Create(data)
         | 'add_timestamp' >> beam.Map(lambda v: TimestampedValue(v, v[1]))
         | 'window' >> beam.WindowInto(Sessions(gap_size=2)))
     assert_that(
         before_reshuffle,
         equal_to(expected_windows),
         label='before_reshuffle',
         reify_windows=True)
     after_reshuffle = before_reshuffle | beam.Reshuffle()
     assert_that(
         after_reshuffle,
         equal_to(expected_windows),
         label='after_reshuffle',
         reify_windows=True)
     after_group = after_reshuffle | beam.GroupByKey()
     assert_that(
         after_group,
         equal_to(expected_merged_windows),
         label='after_group',
         reify_windows=True)
Exemplo n.º 9
0
 def test_fixed_after_count(self):
     """AfterCount(2) fires once; the trigger then finishes the window."""
     elements = [(1, 'a'), (2, 'b'), (3, 'c'), (11, 'z')]
     # Bundle sizes determine how many elements the firing pane holds.
     for expected_panes, sizes in (
         ({IntervalWindow(0, 10): [set('ab')]}, (1, 2)),
         ({IntervalWindow(0, 10): [set('abc')]}, (3, 4))):
       self.run_trigger_simple(
           FixedWindows(10),
           AfterCount(2),
           AccumulationMode.ACCUMULATING,
           elements,
           expected_panes,
           *sizes)
Exemplo n.º 10
0
 def test_sessions_repeatedly_after_count(self):
     """Repeatedly(AfterCount(2)) refires; pane content depends on mode."""
     elements = [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')]
     for mode, panes in (
         (AccumulationMode.ACCUMULATING, [set('abc'), set('abcde')]),
         (AccumulationMode.DISCARDING, [set('abc'), set('de')])):
       self.run_trigger_simple(
           Sessions(10),
           Repeatedly(AfterCount(2)),
           mode,
           elements,
           {IntervalWindow(1, 25): panes},
           1, 3)
Exemplo n.º 11
0
 def test_fixed_after_first(self):
     """AfterAny fires on whichever of its subtriggers fires first."""
     elements = [(1, 'a'), (2, 'b'), (3, 'c')]
     self.run_trigger_simple(
         FixedWindows(10),
         AfterAny(AfterCount(2), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         elements,
         {IntervalWindow(0, 10): [set('ab')]},
         1, 2)
     # With a count of 5 the watermark side fires first; the expected
     # pane shows the late elements are then dropped.
     self.run_trigger_simple(
         FixedWindows(10),
         AfterAny(AfterCount(5), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         elements,
         {IntervalWindow(0, 10): [set('abc')]},
         1, 2,
         late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
Exemplo n.º 12
0
 def test_sessions_after_all(self):
     """AfterAll waits until every subtrigger is ready before firing."""
     elements = [(1, 'a'), (2, 'b'), (3, 'c')]
     self.run_trigger_simple(
         Sessions(10),
         AfterAll(AfterCount(2), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         elements,
         {IntervalWindow(1, 13): [set('abc')]},
         1, 2)
     # AfterCount(5) is only satisfied once late elements arrive, so the
     # single pane includes late data (x, y) as well.
     self.run_trigger_simple(
         Sessions(10),
         AfterAll(AfterCount(5), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         elements,
         {IntervalWindow(1, 13): [set('abcxy')]},
         1, 2,
         late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
Exemplo n.º 13
0
    def test_fixed_windows_simple_watermark(self):
        """GeneralTriggerManagerDoFn emits on-time panes as the TestStream
        watermark advances, plus a late pane for the element added after
        the watermark has passed its window."""
        def tsv(key, value, ts):
            # Shorthand for a keyed TimestampedValue.
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 0), tsv('k2', 1, 0),
                             tsv('k1', 2, 0), tsv('k2', 2, 0)])
              .add_elements([tsv('k1', 3, 0), tsv('k2', 3, 0)])
              .add_elements([tsv('k1', 4, 1), tsv('k2', 4, 1)])
              .add_elements([tsv('k1', 5, 1), tsv('k2', 5, 1)])
              .advance_watermark_to(1)
              .add_elements([tsv('k1', 6, 0)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Fixed, one-second windows with DefaultTrigger (after watermark)
        windowing = Windowing(FixedWindows(1),
                              allowed_lateness=MAX_TIMESTAMP.seconds())

        with TestPipeline() as p:
            # Flatten each fired pane into (key, window, values) triples.
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                |
                Map(lambda elm:
                    (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(0, 1), [1, 2,
                                                  3]),  # On the watermark
                    ('k2', IntervalWindow(0, 1), [1, 2,
                                                  3]),  # On the watermark
                    ('k1', IntervalWindow(1, 2), [4, 5]),  # On the watermark
                    ('k2', IntervalWindow(1, 2), [4, 5]),  # On the watermark
                    ('k1', IntervalWindow(0, 1), [6]),  # After the watermark
                ]))
Exemplo n.º 14
0
    def test_sessions_and_complex_trigger_accumulating(self):
        """Early, on-time and late panes for merging session windows with
        AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
        accumulating across firings."""
        def tsv(key, value, ts):
            # Shorthand for a keyed TimestampedValue.
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                             tsv('k1', 3, 7), tsv('k1', 4, 30)])
              .advance_watermark_to(50)
              .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2),])
              .add_elements([tsv('k1', -1, 21)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Session windows (gap 10) with early/late firings, accumulating.
        windowing = Windowing(Sessions(10),
                              triggerfn=AfterWatermark(early=AfterCount(2),
                                                       late=AfterCount(1)),
                              accumulation_mode=AccumulationMode.ACCUMULATING,
                              allowed_lateness=MAX_TIMESTAMP.seconds())

        with TestPipeline() as p:
            # Collapse each fired pane into (key, window, value set).
            result = (p
                      | test_stream
                      | WindowInto(windowing.windowfn)
                      | ParDo(trigger_manager._ReifyWindows())
                      | ParDo(trigger_manager._GroupBundlesByKey())
                      | ParDo(
                          trigger_manager.GeneralTriggerManagerDoFn(windowing))
                      | Map(lambda elm: (elm[0], elm[1][0].windows[0],
                                         set(v.value for v in elm[1]))))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
                    ('k1', IntervalWindow(30, 40), {4}),  # on time
                    ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
                    # The late element at 21 merges the sessions into (1, 40).
                    ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2,
                                                   -1}),  # late
                ]))
Exemplo n.º 15
0
  def test_sessions_after_each(self):
    """AfterEach runs its subtriggers in sequence within a session.

    Bare AfterEach stops after the last subtrigger fires; wrapping it in
    Repeatedly cycles the sequence and keeps firing.
    """
    # Note: zip(...) is built fresh per call — the iterator would be
    # exhausted if shared between the two runs.
    self.run_trigger_simple(
        Sessions(10),
        AfterEach(AfterCount(2), AfterCount(3)),
        AccumulationMode.ACCUMULATING,
        zip(range(10), 'abcdefghij'),
        {
            IntervalWindow(0, 11): [set('ab')],
            IntervalWindow(0, 15): [set('abcdef')],
        },
        2)

    self.run_trigger_simple(
        Sessions(10),
        Repeatedly(AfterEach(AfterCount(2), AfterCount(3))),
        AccumulationMode.ACCUMULATING,
        zip(range(10), 'abcdefghij'),
        {
            IntervalWindow(0, 11): [set('ab')],
            IntervalWindow(0, 15): [set('abcdef')],
            IntervalWindow(0, 17): [set('abcdefgh')],
        },
        2)
Exemplo n.º 16
0
  def test_param_windowed_value_coder(self):
    """ParamWindowedValueCoder encodes only the value; the timestamp,
    windows and pane info are taken from the constant payload prototype."""
    from apache_beam.transforms.window import IntervalWindow
    from apache_beam.utils.windowed_value import PaneInfo
    # Prototype windowed value whose encoding becomes the coder payload.
    wv = windowed_value.create(
        b'',
        # Milliseconds to microseconds
        1000 * 1000,
        (IntervalWindow(11, 21),),
        PaneInfo(True, False, 1, 2, 3))
    windowed_value_coder = coders.WindowedValueCoder(
        coders.BytesCoder(), coders.IntervalWindowCoder())
    payload = windowed_value_coder.encode(wv)
    coder = coders.ParamWindowedValueCoder(
        payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()])

    # Test binary representation: only the value itself (var-int 1) is
    # encoded; everything else lives in the payload.
    self.assertEqual(b'\x01',
                     coder.encode(window.GlobalWindows.windowed_value(1)))

    # Test unnested
    self.check_coder(
        coders.ParamWindowedValueCoder(
            payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()]),
        windowed_value.WindowedValue(
            3,
            1,
            (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)),
        windowed_value.WindowedValue(
            1,
            1,
            (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)))

    # Test nested
    self.check_coder(
        coders.TupleCoder((
            coders.ParamWindowedValueCoder(
                payload, [
                    coders.FloatCoder(),
                    coders.IntervalWindowCoder()]),
            coders.ParamWindowedValueCoder(
                payload, [
                    coders.StrUtf8Coder(),
                    coders.IntervalWindowCoder()]))),
        (windowed_value.WindowedValue(
            1.5,
            1,
            (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)),
         windowed_value.WindowedValue(
             "abc",
             1,
             (window.IntervalWindow(11, 21),),
             PaneInfo(True, False, 1, 2, 3))))
Exemplo n.º 17
0
 def test_reshuffle_windows_unchanged(self):
   """Reshuffle after a GroupByKey must preserve windows and timestamps.

   Fix: the pipeline was created bare and finished with a trailing
   ``pipeline.run()``; using ``with TestPipeline() as pipeline`` runs
   the pipeline and waits for completion on exit, matching the sibling
   reshuffle test in this file.
   """
   data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
   # 2-second session windows, merged per key by GroupByKey.
   expected_data = [TestWindowedValue(v, t, [w]) for (v, t, w) in [
       ((1, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
       ((2, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
       ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
       ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))]]
   with TestPipeline() as pipeline:
     before_reshuffle = (pipeline
                         | 'start' >> beam.Create(data)
                         | 'add_timestamp' >> beam.Map(
                             lambda v: beam.window.TimestampedValue(v, v[1]))
                         | 'window' >> beam.WindowInto(Sessions(gap_size=2))
                         | 'group_by_key' >> beam.GroupByKey())
     assert_that(before_reshuffle, equal_to(expected_data),
                 label='before_reshuffle', reify_windows=True)
     after_reshuffle = before_reshuffle | beam.Reshuffle()
     # Windows and timestamps must be identical after the reshuffle.
     assert_that(after_reshuffle, equal_to(expected_data),
                 label='after reshuffle', reify_windows=True)
Exemplo n.º 18
0
 def test_sessions_watermark(self):
     """A single session fires once when the watermark passes its end."""
     self.run_trigger_simple(
         Sessions(10),
         AfterWatermark(),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b')],
         {IntervalWindow(1, 12): [set('ab')]},
         1, 2, -2, -1)
Exemplo n.º 19
0
 def test_sessions_watermark_with_early_late(self):
     """Early, on-time and late panes over merging sessions, accumulating."""
     elements = [(1, 'a'), (15, 'b'), (7, 'c'), (30, 'd')]
     late = [(1, 'x'), (2, 'y'), (21, 'z')]
     # The late element at 21 merges the two sessions into (1, 40).
     expected_panes = {
         IntervalWindow(1, 25): [
             set('abc'),   # early
             set('abc'),   # on time
             set('abcxy'), # late
         ],
         IntervalWindow(30, 40): [
             set('d'),     # on time
         ],
         IntervalWindow(1, 40): [
             set('abcdxyz')  # late
         ],
     }
     self.run_trigger_simple(
         Sessions(10),
         AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
         AccumulationMode.ACCUMULATING,
         elements,
         expected_panes,
         2,
         late_data=late)
Exemplo n.º 20
0
 def test_shard_naming(self):
     """default_file_naming composes prefix, shard info, window and suffix."""
     namer = fileio.default_file_naming(prefix='/path/to/file',
                                        suffix='.txt')
     # Call shape: namer(window, pane, shard_index, total_shards,
     # compression, destination) — per fileio's FileNaming contract.
     self.assertEqual(
         namer(GlobalWindow(), None, None, None, None, None),
         '/path/to/file.txt')
     self.assertEqual(
         namer(GlobalWindow(), None, 1, 5, None, None),
         '/path/to/file-00001-of-00005.txt')
     self.assertEqual(
         namer(GlobalWindow(), None, 1, 5, 'gz', None),
         '/path/to/file-00001-of-00005.txt.gz')
     # A non-global window embeds the window's start/end in the name.
     self.assertEqual(
         namer(IntervalWindow(0, 100), None, 1, 5, None, None),
         '/path/to/file'
         '-1970-01-01T00:00:00-1970-01-01T00:01:40-00001-of-00005.txt')
Exemplo n.º 21
0
    def test_sessions_merging(self):
        """Sessions(10) merges windows whose 10-unit spans overlap."""
        windowfn = Sessions(10)

        def merge(*timestamps):
            # Assign one session window per timestamp, feed them through
            # the window fn's merge protocol, and return the survivors.
            windows = [
                windowfn.assign(context(None, t, [])) for t in timestamps
            ]
            running = set()

            class TestMergeContext(WindowFn.MergeContext):
                def __init__(self):
                    super(TestMergeContext, self).__init__(running)

                def merge(self, to_be_merged, merge_result):
                    # Replace the merged-away windows with their result.
                    for w in to_be_merged:
                        if w in running:
                            running.remove(w)
                    running.add(merge_result)

            for ws in windows:
                running.update(ws)
                windowfn.merge(TestMergeContext())
            # One extra pass so the merge reaches a fixed point.
            windowfn.merge(TestMergeContext())
            return sorted(running)

        self.assertEqual([IntervalWindow(2, 12)], merge(2))
        self.assertEqual([IntervalWindow(2, 12),
                          IntervalWindow(19, 29)], merge(2, 19))

        self.assertEqual([IntervalWindow(2, 19)], merge(2, 9))
        self.assertEqual([IntervalWindow(2, 19)], merge(9, 2))

        self.assertEqual([IntervalWindow(2, 19),
                          IntervalWindow(19, 29)], merge(2, 9, 19))
        self.assertEqual([IntervalWindow(2, 19),
                          IntervalWindow(19, 29)], merge(19, 9, 2))

        self.assertEqual([IntervalWindow(2, 25)], merge(2, 15, 10))
Exemplo n.º 22
0
 def test_fixed_watermark_with_early_late(self):
   """Discarding mode: each pane holds only elements since the last fire."""
   expected_panes = {
       IntervalWindow(0, 100): [
           set('abcd'), set('efgh'),  # early firings
           set('i'),                  # on-time firing
           set('vw'), set('xy'),      # late firings
       ]
   }
   self.run_trigger_simple(
       FixedWindows(100),
       AfterWatermark(early=AfterCount(3), late=AfterCount(2)),
       AccumulationMode.DISCARDING,
       zip(range(9), 'abcdefghi'),
       expected_panes,
       2,
       late_data=zip(range(5), 'vwxyz'))
Exemplo n.º 23
0
 def merge(self, merge_context):
     """Merge each run of overlapping windows into one spanning window.

     Scans the candidate windows in start order, accumulating a run of
     overlapping windows in `to_merge`; whenever a gap is found, the run
     is merged into IntervalWindow(run_start, run_end). A window ending
     at MAX_TIMESTAMP short-circuits the scan (and resets `end` to that
     window's start before the final merge — TODO confirm intent).
     """
     to_merge = []
     end = MIN_TIMESTAMP
     _logger.info("%d windows" % len(merge_context.windows))
     for w in sorted(merge_context.windows, key=lambda w: w.start):
         _logger.info("WINDOW: (%s, %s)" %
                      (format_timestamp(w.start), format_timestamp(w.end)))
         if to_merge:
             if end > w.start:
                 # window `w` overlaps with `to_merge`: add it
                 to_merge.append(w)
                 if w.end == MAX_TIMESTAMP:
                     _logger.info("FINAL: (%s, %s)" % (format_timestamp(
                         to_merge[0].start), format_timestamp(end)))
                     # we don't want any more windows on this key
                     end = w.start
                     break
                 elif w.end > end:
                     # Extend the running merge result's end.
                     end = w.end
             else:
                 # Gap found: flush the completed run, start a new one.
                 # FIXME: this check seems superfluous
                 if len(to_merge) > 1:
                     _logger.info("NEW: (%s, %s)" % (format_timestamp(
                         to_merge[0].start), format_timestamp(end)))
                     merge_context.merge(
                         to_merge, IntervalWindow(to_merge[0].start, end))
                 to_merge = [w]
                 end = w.end
         else:
             # First window seeds the initial run.
             to_merge = [w]
             end = w.end
     # Flush the trailing run if it actually combined anything.
     if len(to_merge) > 1:
         _logger.info(
             "NEW: (%s, %s)" %
             (format_timestamp(to_merge[0].start), format_timestamp(end)))
         merge_context.merge(to_merge,
                             IntervalWindow(to_merge[0].start, end))
Exemplo n.º 24
0
 def test_repeatedly_after_first(self):
   """Repeatedly(AfterAny(...)) keeps firing, including for late data."""
   expected_panes = {
       IntervalWindow(0, 100): [
           set('abc'),
           set('abcdef'),
           set('abcdefg'),
           set('abcdefgx'),
           set('abcdefgxy'),
           set('abcdefgxyz'),
       ]
   }
   self.run_trigger_simple(
       FixedWindows(100),
       Repeatedly(AfterAny(AfterCount(3), AfterWatermark())),
       AccumulationMode.ACCUMULATING,
       zip(range(7), 'abcdefg'),
       expected_panes,
       1,
       late_data=zip(range(3), 'xyz'))
Exemplo n.º 25
0
  def test_sessions_watermark(self):
    """Watermark trigger over sessions, including mid-stream merges."""
    self.run_trigger_simple(
        Sessions(10),
        AfterWatermark(),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b')],
        {IntervalWindow(1, 12): [set('ab')]},
        1, 2)

    # The elements at 9 and 10 bridge the early and mid sessions, so
    # everything except y/z collapses into one (1, 26) session.
    self.run_trigger_simple(
        Sessions(10),
        AfterWatermark(),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (15, 'c'), (16, 'd'), (30, 'z'), (9, 'e'),
         (10, 'f'), (30, 'y')],
        {
            IntervalWindow(1, 26): [set('abcdef')],
            IntervalWindow(30, 40): [set('yz')],
        },
        1, 2, 3, 4, 5, 6)
Exemplo n.º 26
0
    def test_sliding_windows_simple_watermark(self):
        """Each element lands in two overlapping sliding windows; the
        trigger manager fires each window once on the watermark."""
        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([('k1', 1), ('k2', 1), ('k1', 1), ('k2', 1)])
              .add_elements([('k1', 1), ('k2', 1)])
              .advance_watermark_to(1)
              .add_elements([('k1', 2), ('k2', 2)])
              .add_elements([('k1', 2), ('k2', 2)])
              .advance_watermark_to(2)
              .add_elements([('k1', 3), ('k2', 3)])
              .add_elements([('k1', 3), ('k2', 3)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Sliding windows of size 2 and period 1, with the default
        # (after-watermark) trigger.
        windowing = Windowing(SlidingWindows(2, 1))

        with TestPipeline() as p:
            # Flatten each fired pane into (key, window, values) triples.
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                |
                Map(lambda elm:
                    (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(-1, 1), [1, 1, 1]),
                    ('k2', IntervalWindow(-1, 1), [1, 1, 1]),
                    ('k1', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
                    ('k2', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
                    ('k1', IntervalWindow(1, 3), [2, 2, 3, 3]),
                    ('k2', IntervalWindow(1, 3), [2, 2, 3, 3]),
                    ('k1', IntervalWindow(2, 4), [3, 3]),
                    ('k2', IntervalWindow(2, 4), [3, 3]),
                ]))
Exemplo n.º 27
0
  def test_fixed_windows(self):
    """FixedWindows.assign maps a timestamp to its enclosing window."""
    cases = [
        # (size, offset, timestamp, expected window)
        # Windows with offset 2: 2, 7, 12, 17, ...
        (5, 2, 7, IntervalWindow(7, 12)),
        (5, 2, 11, IntervalWindow(7, 12)),
        (5, 2, 12, IntervalWindow(12, 17)),
        # Windows without offset: 0, 5, 10, 15, ...
        (5, 0, 5, IntervalWindow(5, 10)),
        (5, 0, 9, IntervalWindow(5, 10)),
        (5, 0, 10, IntervalWindow(10, 15)),
        # Offset out of range wraps around (12 % 5 == 2).
        (5, 12, 11, IntervalWindow(7, 12)),
    ]
    for size, offset, timestamp, expected in cases:
      windowfn = FixedWindows(size=size, offset=offset)
      self.assertEqual([expected],
                       windowfn.assign(context('v', timestamp)))
Exemplo n.º 28
0
    def test_with_trigger_window_that_finish(self):
        """Once the watermark trigger fires with zero allowed lateness,
        later input for that window is dropped, not re-fired."""
        def tsv(key, value, ts):
            # Shorthand for a keyed TimestampedValue.
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 0), tsv('k1', 2, 0)])
              .add_elements([tsv('k1', 3, 0)])
              .advance_watermark_to(2)
              .add_elements([tsv('k1', 6, 0)])  # This value is discarded.
              .advance_watermark_to_infinity())
        # yapf: enable

        # Fixed, one-second windows with DefaultTrigger (after watermark)
        windowing = Windowing(FixedWindows(1),
                              triggerfn=AfterWatermark(),
                              allowed_lateness=0,
                              accumulation_mode=AccumulationMode.DISCARDING)

        with TestPipeline() as p:
            # Flatten each fired pane into (key, window, values) triples.
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                |
                Map(lambda elm:
                    (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(0, 1), [1, 2,
                                                  3]),  # On the watermark
                ]))
Exemplo n.º 29
0
    def _execute(self, window_fn, trigger_fn, accumulation_mode,
                 timestamp_combiner, allowed_lateness, transcript, spec):
        """Replay a trigger-transcript spec as a batch pipeline.

        Batch execution produces a single final pane per window, so
        transcripts that depend on late data or non-accumulating panes
        are skipped, and only the last expected pane per window is
        verified — via both a GroupByKey and a CombinePerKey path.
        """
        if timestamp_combiner == TimestampCombiner.OUTPUT_AT_EARLIEST_TRANSFORMED:
            self.skipTest('Non-fnapi timestamp combiner: %s' %
                          spec.get('timestamp_combiner'))

        if accumulation_mode != AccumulationMode.ACCUMULATING:
            self.skipTest('Batch mode only makes sense for accumulating.')

        # Any input at or before the transcript's watermark would have
        # been late — batch mode cannot reproduce that, so skip.
        watermark = MIN_TIMESTAMP
        for action, params in transcript:
            if action == 'watermark':
                watermark = params
            elif action == 'input':
                if any(t <= watermark for t in params):
                    self.skipTest('Batch mode never has late data.')

        inputs = sum([vs for action, vs in transcript if action == 'input'],
                     [])
        # Keep only the final expected pane per window: later 'expect'
        # entries overwrite earlier ones keyed by the window tuple.
        final_panes_by_window = {}
        for action, params in transcript:
            if action == 'expect':
                for expected in params:
                    trimmed = {}
                    for field in ('window', 'values', 'timestamp'):
                        if field in expected:
                            trimmed[field] = expected[field]
                    final_panes_by_window[tuple(expected['window'])] = trimmed
        final_panes = list(final_panes_by_window.values())

        if window_fn.is_merging():
            # Drop panes whose windows would be merged away by the
            # window fn, since only the merge results survive.
            merged_away = set()

            class MergeContext(WindowFn.MergeContext):
                def merge(_, to_be_merged, merge_result):
                    for window in to_be_merged:
                        if window != merge_result:
                            merged_away.add(window)

            all_windows = [
                IntervalWindow(*pane['window']) for pane in final_panes
            ]
            window_fn.merge(MergeContext(all_windows))
            final_panes = [
                pane for pane in final_panes
                if IntervalWindow(*pane['window']) not in merged_away
            ]

        with TestPipeline() as p:
            # Key everything under a single key and window per the spec.
            input_pc = (p
                        | beam.Create(inputs)
                        | beam.Map(lambda t: TimestampedValue(('key', t), t))
                        | beam.WindowInto(
                            window_fn,
                            trigger=trigger_fn,
                            accumulation_mode=accumulation_mode,
                            timestamp_combiner=timestamp_combiner,
                            allowed_lateness=allowed_lateness))

            grouped = input_pc | 'Grouped' >> (
                beam.GroupByKey()
                | beam.MapTuple(_windowed_value_info_map_fn)
                | beam.MapTuple(lambda _, value: value))

            combined = input_pc | 'Combined' >> (
                beam.CombinePerKey(_ConcatCombineFn())
                | beam.MapTuple(_windowed_value_info_map_fn)
                | beam.MapTuple(lambda _, value: value))

            # Both aggregation paths must yield the same final panes.
            assert_that(
                grouped,
                lambda actual: _windowed_value_info_check(actual, final_panes),
                label='CheckGrouped')

            assert_that(
                combined,
                lambda actual: _windowed_value_info_check(actual, final_panes),
                label='CheckCombined')
Exemplo n.º 30
0
class StandardCodersTest(unittest.TestCase):
    """Round-trip tests for the Beam standard coders.

    Each test case comes from STANDARD_CODERS_YAML: a coder spec plus
    examples mapping an expected encoded byte string to a JSON value.
    """

    # Maps a coder URN to a callable that converts the JSON example value
    # into the Python object the coder encodes.  Parsers for composite
    # coders (kv, iterable, windowed_value, timer) additionally receive the
    # parsers of their component coders as positional arguments (wired up
    # in json_value_parser).
    _urn_to_json_value_parser = {
        'beam:coder:bytes:v1':
        lambda x: x.encode('utf-8'),
        'beam:coder:string_utf8:v1':
        lambda x: x,
        'beam:coder:varint:v1':
        lambda x: x,
        'beam:coder:kv:v1':
        lambda x, key_parser, value_parser:
        (key_parser(x['key']), value_parser(x['value'])),
        # YAML gives 'end' and 'span' in millis; IntervalWindow takes
        # Timestamps, so values are scaled by 1000 into micros.
        'beam:coder:interval_window:v1':
        lambda x: IntervalWindow(start=Timestamp(micros=(x['end'] - x['span'])
                                                 * 1000),
                                 end=Timestamp(micros=x['end'] * 1000)),
        'beam:coder:iterable:v1':
        lambda x, parser: list(map(parser, x)),
        'beam:coder:global_window:v1':
        lambda x: window.GlobalWindow(),
        # 'timestamp' is in millis in the YAML; windowed_value.create takes
        # micros.
        'beam:coder:windowed_value:v1':
        lambda x, value_parser, window_parser: windowed_value.create(
            value_parser(x['value']), x['timestamp'] * 1000,
            tuple([window_parser(w) for w in x['windows']])),
        'beam:coder:timer:v1':
        lambda x, payload_parser: dict(payload=payload_parser(x['payload']),
                                       timestamp=Timestamp(micros=x['timestamp'
                                                                    ] * 1000)),
        'beam:coder:double:v1':
        parse_float,
    }

    def test_standard_coders(self):
        """Run every coder test case found in the standard coders YAML."""
        cases = _load_test_cases(STANDARD_CODERS_YAML)
        for case_name, case_spec in cases:
            logging.info('Executing %s test.', case_name)
            self._run_standard_coder(case_name, case_spec)

    def _run_standard_coder(self, name, spec):
        """Verify one YAML test case: encode/decode round trips for a coder."""

        def assert_equal(actual, expected):
            """Handle nan values which self.assertEqual fails on."""
            both_floats = (isinstance(actual, float)
                           and isinstance(expected, float))
            if both_floats and math.isnan(actual) and math.isnan(expected):
                return
            self.assertEqual(actual, expected)

        coder = self.parse_coder(spec['coder'])
        parse_value = self.json_value_parser(spec['coder'])
        deterministic = not spec['coder'].get('non_deterministic', False)
        if 'nested' in spec:
            nested_options = [spec['nested']]
        else:
            nested_options = [True, False]
        for nested in nested_options:
            for encoded_text, json_value in spec['examples'].items():
                value = parse_value(json_value)
                expected_encoded = encoded_text.encode('latin1')
                if not deterministic:
                    # A non-deterministic coder can only be checked on the
                    # decode side.
                    self.assertEqual(
                        decode_nested(coder, expected_encoded, nested), value)
                    continue
                actual_encoded = encode_nested(coder, value, nested)
                if self.fix and actual_encoded != expected_encoded:
                    # Collect the mismatch; tearDownClass rewrites the YAML.
                    self.to_fix[spec['index'],
                                expected_encoded] = actual_encoded
                else:
                    self.assertEqual(expected_encoded, actual_encoded)
                    decoded = decode_nested(coder, expected_encoded, nested)
                    assert_equal(decoded, value)

    def parse_coder(self, spec):
        """Build a coder instance from its YAML spec, recursing into components."""
        context = pipeline_context.PipelineContext()
        component_ids = []
        for component_spec in spec.get('components', ()):
            component_coder = self.parse_coder(component_spec)
            component_ids.append(context.coders.get_id(component_coder))
        # Derive a (per-run) stable id from the textual form of the spec.
        coder_id = str(hash(str(spec)))
        function_spec = beam_runner_api_pb2.FunctionSpec(
            urn=spec['urn'], payload=spec.get('payload'))
        coder_proto = beam_runner_api_pb2.Coder(
            spec=function_spec, component_coder_ids=component_ids)
        context.coders.put_proto(coder_id, coder_proto)
        return context.coders.get_by_id(coder_id)

    def json_value_parser(self, coder_spec):
        """Return a callable mapping a YAML example value to a Python object."""
        sub_parsers = [
            self.json_value_parser(component)
            for component in coder_spec.get('components', ())
        ]

        def parse(x):
            # Look the base parser up lazily, as the original lambda did, so
            # an unknown URN only raises when a value is actually parsed.
            base_parser = self._urn_to_json_value_parser[coder_spec['urn']]
            return base_parser(x, *sub_parsers)

        return parse

    # Used when --fix is passed.

    # When True, mismatching expected encodings are collected into to_fix
    # instead of failing, and tearDownClass patches them back into
    # STANDARD_CODERS_YAML.
    fix = False
    # Maps (yaml document index, expected encoded bytes) -> actual encoded
    # bytes produced by the coder under test.
    to_fix = {}

    @classmethod
    def tearDownClass(cls):
        """If --fix was passed, rewrite STANDARD_CODERS_YAML so its expected
        encodings match the actual encodings collected in cls.to_fix.

        Patches the file textually, document by document, so formatting and
        comments elsewhere in the YAML are preserved.
        """
        if not (cls.fix and cls.to_fix):
            return
        print("FIXING", len(cls.to_fix), "TESTS")
        doc_sep = '\n---\n'
        # Use context managers so the file handles are always closed and the
        # rewrite is flushed (the original leaked both handles).
        with open(STANDARD_CODERS_YAML) as f:
            docs = f.read().split(doc_sep)

        def quote(s):
            # Encodings are latin1-round-tripped bytes; \0 is the YAML
            # spelling of a NUL byte.
            return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')

        for (doc_ix,
             expected_encoded), actual_encoded in cls.to_fix.items():
            print(quote(expected_encoded), "->", quote(actual_encoded))
            # Replace only the mapping key (hence the trailing ':').
            docs[doc_ix] = docs[doc_ix].replace(
                quote(expected_encoded) + ':',
                quote(actual_encoded) + ':')
        with open(STANDARD_CODERS_YAML, 'w') as f:
            f.write(doc_sep.join(docs))
    def test_streaming_wordcount(self):
        """End-to-end check of the interactive runner on a streaming pipeline.

        Builds a windowed word count over a TestStream, caches the source and
        the counts via Interactive Beam, and verifies that ib.collect returns
        the expected elements together with their window and pane metadata.

        NOTE: ib.watch(locals()) below keys cached PCollections by local
        variable name, so the names `data` and `counts` are significant.
        """
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(),
                          options=StandardOptions(streaming=True))

        # The two watermark advances place the two batches of words in
        # separate 10-second fixed windows: [0, 10) and [20, 30).
        data = (
            p
            | TestStream()
                .advance_watermark_to(0)
                .advance_processing_time(1)
                .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
                .advance_watermark_to(20)
                .advance_processing_time(1)
                .add_elements(['that', 'is', 'the', 'question'])
            | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

        counts = (data
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Create a fake limiter that cancels the BCJ once the main job receives the
        # expected amount of results.
        class FakeLimiter:
            def __init__(self, p, pcoll):
                self.p = p
                self.pcoll = pcoll

            def is_triggered(self):
                # Triggered once the background job has produced all 10
                # input elements for the watched PCollection.
                result = ie.current_env().pipeline_result(self.p)
                if result:
                    try:
                        results = result.get(self.pcoll)
                    except ValueError:
                        # PCollection not yet available in the result.
                        return False
                    return len(results) >= 10
                return False

        # This sets the limiters to stop reading when the test receives 10 elements.
        ie.current_env().options.capture_control.set_limiters_for_test(
            [FakeLimiter(p, data)])

        # This tests that the data was correctly cached.
        # Event times are in micros: 20000000 corresponds to watermark 20s.
        pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
        expected_data_df = pd.DataFrame([
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('or', 0, [IntervalWindow(0, 10)], pane_info),
            ('not', 0, [IntervalWindow(0, 10)], pane_info),
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
        ], columns=[0, 'event_time', 'windows', 'pane_info']) # yapf: disable

        data_df = ib.collect(data, include_window_info=True)
        pd.testing.assert_frame_equal(expected_data_df, data_df)

        # This tests that the windowing was passed correctly so that all the data
        # is aggregated also correctly.
        # Aggregated outputs carry the window's max timestamp (end - 1 micro).
        pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
        expected_counts_df = pd.DataFrame([
            ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ], columns=[0, 1, 'event_time', 'windows', 'pane_info']) # yapf: disable

        counts_df = ib.collect(counts, include_window_info=True)

        # The group by key has no guarantee of order. So we post-process the DF by
        # sorting so we can test equality.
        sorted_counts_df = (counts_df
                            .sort_values(['event_time', 0], ascending=True)
                            .reset_index(drop=True)) # yapf: disable
        pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)