Example No. 1
  def test_generate_sequence_with_realtime_timer(self):
    from apache_beam.transforms.combiners import CountCombineFn

    class GenerateRecords(beam.DoFn):

      EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.REAL_TIME)
      COUNT_STATE = CombiningValueStateSpec(
          'count_state', VarIntCoder(), CountCombineFn())

      def __init__(self, frequency, total_records):
        self.total_records = total_records
        self.frequency = frequency

      def process(self,
                  element,
                  emit_timer=beam.DoFn.TimerParam(EMIT_TIMER)):
        # Processing time timers should be set on ABSOLUTE TIME.
        emit_timer.set(self.frequency)
        yield element[1]

      @on_timer(EMIT_TIMER)
      def emit_values(self,
                      emit_timer=beam.DoFn.TimerParam(EMIT_TIMER),
                      count_state=beam.DoFn.StateParam(COUNT_STATE)):
        count = count_state.read() or 0
        if self.total_records == count:
          return

        count_state.add(1)
        # Processing time timers should be set on ABSOLUTE TIME.
        emit_timer.set(count + 1 + self.frequency)
        yield 'value'

    TOTAL_RECORDS = 3
    FREQUENCY = 1

    test_stream = (TestStream()
                   .advance_watermark_to(0)
                   .add_elements([('key', 0)])
                   .advance_processing_time(1) # Timestamp: 1
                   .add_elements([('key', 1)])
                   .advance_processing_time(1) # Timestamp: 2
                   .add_elements([('key', 2)])
                   .advance_processing_time(1) # Timestamp: 3
                   .add_elements([('key', 3)]))

    with beam.Pipeline(argv=['--streaming', '--runner=DirectRunner']) as p:
      _ = (p
           | test_stream
           | beam.ParDo(GenerateRecords(FREQUENCY, TOTAL_RECORDS))
           | beam.ParDo(self.record_dofn()))

    self.assertEqual(
        # 4 RECORDS go through process
        # 3 values are emitted from timer
        # Timestamp moves gradually.
        [0, 'value', 1, 'value', 2, 'value', 3],
        StatefulDoFnOnDirectRunnerTest.all_records)
Example No. 2
    def test_basic_execution_sideinputs_fixed_windows(self):
        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        p = TestPipeline(options=options)

        main_stream = (
            p
            | 'main TestStream' >> TestStream()
                .advance_watermark_to(9)
                .add_elements(['a1', 'a2', 'a3', 'a4'])
                .add_elements(['b'])
                .advance_watermark_to(18)
                .add_elements(['c'])
            | 'main windowInto' >> beam.WindowInto(window.FixedWindows(1)))
        side_stream = (
            p
            | 'side TestStream' >> TestStream()
                .advance_watermark_to(12)
                .add_elements([window.TimestampedValue('s1', 10)])
                .advance_watermark_to(20)
                .add_elements([window.TimestampedValue('s2', 20)])
            | 'side windowInto' >> beam.WindowInto(window.FixedWindows(3)))

        class RecordFn(beam.DoFn):
            def process(self,
                        elm=beam.DoFn.ElementParam,
                        ts=beam.DoFn.TimestampParam,
                        side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        records = (
            main_stream  # pylint: disable=unused-variable
            | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

        # assert per window
        expected_window_to_elements = {
            window.IntervalWindow(9, 10): [('a1', Timestamp(9), ['s1']),
                                           ('a2', Timestamp(9), ['s1']),
                                           ('a3', Timestamp(9), ['s1']),
                                           ('a4', Timestamp(9), ['s1']),
                                           ('b', Timestamp(9), ['s1'])],
            window.IntervalWindow(18, 19): [('c', Timestamp(18), ['s2'])],
        }
        assert_that(records,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='assert per window')

        p.run()
Example No. 3
    def test_late_event_ignored(self):
        pipeline_options = PipelineOptions()
        pipeline_options.view_as(StandardOptions).streaming = True

        with TestPipeline(options=pipeline_options) as p:
            event_stream = (
                TestStream()
                .advance_watermark_to(10)
                .add_elements([
                    live_offer_stat_event(account_id=1, offer_id=1),
                    live_offer_stat_event(account_id=2, offer_id=1),
                    live_offer_stat_event(account_id=3, offer_id=1),
                    live_offer_stat_event(account_id=1, offer_id=2),
                ])
                .advance_processing_time(20)  # should kick the early trigger
                # the next events' timestamps will default to 20
                .advance_watermark_to(20)
                .add_elements([
                    live_offer_stat_event(account_id=4, offer_id=1),
                    live_offer_stat_event(account_id=2, offer_id=2),
                ])
                .advance_watermark_to(65)
                .add_elements([
                    # late event, should be ignored
                    live_offer_stat_event(account_id=5, offer_id=1,
                                          late_secs=30),
                    # first event of the second window
                    live_offer_stat_event(account_id=1, offer_id=1),
                ])
                # kick the early trigger on the second window
                .advance_processing_time(80)
                # the next events' timestamps will default to 90
                .advance_watermark_to(90)
                .add_elements([
                    # second-window event after the early firing
                    live_offer_stat_event(account_id=2, offer_id=1),
                ]))

            actual = p | event_stream | CountDistinctAccounts()

            expected = [
                # speculative for window 1 offer 1
                live_offer_stat_record(offer_id=1, count=3, window_start=0),
                # speculative for window 1 offer 2
                live_offer_stat_record(offer_id=2, count=1, window_start=0),
                # window end for window 1 offer 1
                live_offer_stat_record(offer_id=1, count=4, window_start=0),
                # window end for window 1 offer 2
                live_offer_stat_record(offer_id=2, count=2, window_start=0),
                # speculative for window 2 offer 1
                live_offer_stat_record(offer_id=1, count=1, window_start=60),
                # window end for window 2 offer 1
                live_offer_stat_record(offer_id=1, count=2, window_start=60)
            ]

            assert_that(actual, equal_to_with_timestamp_tolerance(
                expected=expected, timestamp_tolerance_secs=2))
Example No. 4
def _build_a_test_stream_pipeline():
  test_stream = (
      TestStream().advance_watermark_to(0).add_elements([
          TimestampedValue('a', 1)
      ]).advance_processing_time(5).advance_watermark_to_infinity())
  p = beam.Pipeline(runner=interactive_runner.InteractiveRunner())
  events = p | test_stream  # pylint: disable=possibly-unused-variable
  ib.watch(locals())
  return p
Example No. 5
    def test_test_stream_errors(self):
        with self.assertRaises(
                AssertionError,
                msg=('Watermark must strictly-monotonically advance.')):
            _ = (TestStream().advance_watermark_to(5).advance_watermark_to(4))

        with self.assertRaises(
                AssertionError,
                msg=('Must advance processing time by positive amount.')):
            _ = (TestStream().advance_processing_time(-1))

        with self.assertRaises(
                AssertionError,
                msg=(
                    'Element timestamp must be before timestamp.MAX_TIMESTAMP.'
                )):
            _ = (TestStream().add_elements(
                [TimestampedValue('a', timestamp.MAX_TIMESTAMP)]))
Example No. 6
    def test_pipeline_unbounded(self):
        with beam.Pipeline(options=PipelineOptions(["--streaming"])) as p:
            res = (p
                   | TestStream().add_elements(
                       TestSportTracker.data).advance_watermark_to_infinity()
                   | SportTrackerCalc()
                   | beam.Map(lambda x: "%s:%d,%d" %
                              (x[0], round(x[2]), round(x[1]))))
            assert_that(res, equal_to(["track1:614,257", "track2:5641,1262"]))
Example No. 7
    def _pipeline_runner():
        with beam.Pipeline(runner=DirectRunner()) as p:
            ts = TestStream().advance_watermark_to(0)
            all_elements = iter(range(size))
            watermark = 0
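            # Elements added without explicit timestamps take the current
            # watermark as their timestamp, so each batch of 100 lands in its
            # own FixedWindows(100) window.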
            while True:
                next_batch = list(itertools.islice(all_elements, 100))
                if not next_batch:
                    break
                ts = ts.add_elements([(i, random.randint(0, 1000))
                                      for i in next_batch])
                watermark = watermark + 100
                ts = ts.advance_watermark_to(watermark)
            ts = ts.advance_watermark_to_infinity()

            input_pc = p | ts | WindowInto(FixedWindows(100))
            for i in range(NUM_PARALLEL_STAGES):
                _build_serial_stages(input_pc, NUM_SERIAL_STAGES, i)
Example No. 8
    def test_multi_triggered_gbk_side_input(self):
        """Test a GBK sideinput, with multiple triggering."""
        options = StandardOptions(streaming=True)
        p = TestPipeline(options=options)

        test_stream = (
            p
            | 'Mixed TestStream' >> TestStream()
                .advance_watermark_to(3, tag='main')
                .add_elements(['a1'], tag='main')
                .advance_watermark_to(8, tag='main')
                .add_elements(['a2'], tag='main')
                .add_elements([window.TimestampedValue(('k', 100), 2)],
                              tag='side')
                .add_elements([window.TimestampedValue(('k', 400), 7)],
                              tag='side')
                .advance_watermark_to_infinity(tag='main')
                .advance_watermark_to_infinity(tag='side'))

        main_data = (
            test_stream['main']
            | 'Main windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                accumulation_mode=trigger.AccumulationMode.DISCARDING))

        side_data = (
            test_stream['side']
            | 'Side windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.CombinePerKey(sum)
            | 'Values' >> Map(lambda k_vs: k_vs[1]))

        class RecordFn(beam.DoFn):
            def process(self,
                        elm=beam.DoFn.ElementParam,
                        ts=beam.DoFn.TimestampParam,
                        side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        records = (main_data
                   | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

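        # Each side window fires an early pane after its first element and an
        # on-time pane at the watermark; with DISCARDING accumulation the
        # on-time pane is empty, so its sum is 0.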
        expected_window_to_elements = {
            window.IntervalWindow(0, 5): [
                ('a1', Timestamp(3), [100, 0]),
            ],
            window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
        }

        assert_that(records,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='assert per window')

        p.run()
Example No. 9
  def test_basic_execution(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
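                   # 'late' has timestamp 12, far behind the watermark (now 300).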
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)]))

    class RecordFn(beam.DoFn):
      def process(self, element=beam.DoFn.ElementParam,
                  timestamp=beam.DoFn.TimestampParam):
        yield (element, timestamp)

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    my_record_fn = RecordFn()
    records = p | test_stream | beam.ParDo(my_record_fn)

    assert_that(records, equal_to([
        ('a', timestamp.Timestamp(10)),
        ('b', timestamp.Timestamp(10)),
        ('c', timestamp.Timestamp(10)),
        ('d', timestamp.Timestamp(20)),
        ('e', timestamp.Timestamp(20)),
        ('late', timestamp.Timestamp(12)),
        ('last', timestamp.Timestamp(310)),]))

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(0, 15): [
            ('a', Timestamp(10)),
            ('b', Timestamp(10)),
            ('c', Timestamp(10)),
            ('late', Timestamp(12))
        ],
        window.IntervalWindow(15, 30): [
            ('d', Timestamp(20)),
            ('e', Timestamp(20))
        ],
        window.IntervalWindow(300, 315): [
            ('last', Timestamp(310)),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        custom_windowing=window.FixedWindows(15),
        label='assert per window')

    p.run()
Example No. 10
  def test_basic_execution(self):
    test_stream = (TestStream()
                   .advance_watermark_to(0)
                   .advance_processing_time(5)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(2)
                   .advance_processing_time(1)
                   .advance_watermark_to(4)
                   .advance_processing_time(1)
                   .advance_watermark_to(6)
                   .advance_processing_time(1)
                   .advance_watermark_to(8)
                   .advance_processing_time(1)
                   .advance_watermark_to(10)
                   .advance_processing_time(1)
                   .add_elements([TimestampedValue('1', 15),
                                  TimestampedValue('2', 15),
                                  TimestampedValue('3', 15)]))  # yapf: disable

    options = StandardOptions(streaming=True)
    p = TestPipeline(options=options)

    records = (
        p
        | test_stream
        | ReverseTestStream(sample_resolution_sec=1, output_tag=None))

    assert_that(
        records,
        equal_to_per_window({
            beam.window.GlobalWindow(): [
                [ProcessingTimeEvent(5), WatermarkEvent(0)],
                [
                    ElementEvent([
                        TimestampedValue('a', 0),
                        TimestampedValue('b', 0),
                        TimestampedValue('c', 0)
                    ])
                ],
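                # WatermarkEvent values are in microseconds (2 s -> 2000000).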
                [ProcessingTimeEvent(1), WatermarkEvent(2000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(4000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(6000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(8000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(10000000)],
                [
                    ElementEvent([
                        TimestampedValue('1', 15),
                        TimestampedValue('2', 15),
                        TimestampedValue('3', 15)
                    ])
                ],
            ],
        }))

    p.run()
Example No. 11
    def test_dofn_process_keyparam_error_no_key(self):
        class DoFnProcessWithKeyparam(DoFn):
            def process(self, element, mykey=DoFn.KeyParam):
                yield "{key}-verify".format(key=mykey)

        pipeline_options = PipelineOptions()
        with self.assertRaises(ValueError),\
             TestPipeline(options=pipeline_options) as p:
            test_stream = (TestStream().advance_watermark_to(10).add_elements(
                [1, 2]))
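            # The elements are not (key, value) pairs, so resolving
            # DoFn.KeyParam raises a ValueError.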
            (p | test_stream | beam.ParDo(DoFnProcessWithKeyparam()))
Example No. 12
    def test_record_pipeline(self):
        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        # pylint: disable=unused-variable
        _ = (p
             | TestStream()
                 .advance_watermark_to(0)
                 .advance_processing_time(1)
                 .add_elements(list(range(10)))
                 .advance_processing_time(1))  # yapf: disable

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Create a limiter that stops the background caching job when something
        # is written to cache. This is used to ensure that the pipeline is
        # functioning properly and that there are no data races with the test.
        class SizeLimiter(Limiter):
            def __init__(self, p):
                self.pipeline = p
                self._rm = None

            def set_recording_manager(self, rm):
                self._rm = rm

            def is_triggered(self):
                return self._rm.describe()['size'] > 0 if self._rm else False

        # Do the first recording to get the timestamp of the first time the fragment
        # was run.
        size_limiter = SizeLimiter(p)
        rm = RecordingManager(p, test_limiters=[size_limiter])
        size_limiter.set_recording_manager(rm)
        self.assertEqual(rm.describe()['state'], PipelineState.STOPPED)
        self.assertTrue(rm.record_pipeline())

        # A recording is in progress, no need to start another one.
        self.assertFalse(rm.record_pipeline())

        for _ in range(60):
            if rm.describe()['state'] == PipelineState.CANCELLED:
                break
            time.sleep(1)
        self.assertTrue(
            rm.describe()['state'] == PipelineState.CANCELLED,
            'Test timed out waiting for pipeline to be cancelled. This indicates '
            'that the BackgroundCachingJob did not cache anything.')
Example No. 13
  def test_basic_execution_sideinputs(self):

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result     # pylint: disable=global-variable-undefined
    result = []

    def recorded_elements(elem):
      result.append(elem)
      return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    main_stream = (p
                   | 'main TestStream' >> TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['e']))
    side_stream = (p
                   | 'side TestStream' >> TestStream()
                   .add_elements([window.TimestampedValue(2, 2)])
                   .add_elements([window.TimestampedValue(1, 1)])
                   .add_elements([window.TimestampedValue(7, 7)])
                   .add_elements([window.TimestampedValue(4, 4)])
                  )

    class RecordFn(beam.DoFn):
      def process(self,
                  elm=beam.DoFn.ElementParam,
                  ts=beam.DoFn.TimestampParam,
                  side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

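    # Neither stream applies a WindowInto, so every element falls in the
    # GlobalWindow and AsList sees all four side elements.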
    records = (main_stream        # pylint: disable=unused-variable
               | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream))
               | beam.Map(recorded_elements))

    p.run()

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    self.assertEqual([('e', Timestamp(10), [2, 1, 7, 4])], result)
Example No. 14
  def test_basic_execution_sideinputs(self):
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    main_stream = (p
                   | 'main TestStream' >> TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['e']))
    side_stream = (p
                   | 'side TestStream' >> TestStream()
                   .add_elements([window.TimestampedValue(2, 2)])
                   .add_elements([window.TimestampedValue(1, 1)])
                   .add_elements([window.TimestampedValue(7, 7)])
                   .add_elements([window.TimestampedValue(4, 4)])
                  )

    class RecordFn(beam.DoFn):
      def process(self,
                  elm=beam.DoFn.ElementParam,
                  ts=beam.DoFn.TimestampParam,
                  side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (main_stream        # pylint: disable=unused-variable
               | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(0, 15): [
            ('e', Timestamp(10), [2, 1, 7, 4]),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        custom_windowing=window.FixedWindows(15),
        label='assert per window')

    assert_that(records, equal_to([('e', Timestamp(10), [2, 1, 7, 4])]))
    p.run()
Example No. 15
    def test_pipeline_bounded(self):
        now = time.time()
        now = now - now % 60
        inputs = [("foo", randomPosition(now)), ("bar", randomPosition(now))]
        inputs.append(("foo", move(inputs[0][1], (1.0, 1.0), 3, 180)))
        inputs.append(("bar", move(inputs[1][1], (1.0, 1.0), 2, 240)))
        inputs.append(("foo", move(inputs[2][1], (1.0, 1.0), 2.5, 180)))
        inputs.append(("bar", move(inputs[3][1], (1.0, 1.0), 2.5, 120)))

        stream = TestStream()
        for item in inputs:
            stream = stream.add_elements([item], event_timestamp=item[1][2])
        with beam.Pipeline() as p:
            res = (p
                   | stream.advance_watermark_to_infinity().with_output_types(
                       typing.Tuple[str, typing.Tuple[float, float, float]])
                   | SportTrackerMotivation(60, 300))
            assert_that(
                res,
                equal_to([("foo", False), ("bar", True), ("foo", False),
                          ("bar", True)]))
Example No. 16
  def test_hash_join(self):
    class HashJoinStatefulDoFn(DoFn):
      BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
      UNMATCHED_TIMER = TimerSpec('unmatched', TimeDomain.WATERMARK)

      def process(
          self,
          element,
          state=DoFn.StateParam(BUFFER_STATE),
          timer=DoFn.TimerParam(UNMATCHED_TIMER)):
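        # Buffer the first value seen for a key and arm an event-time timer;
        # a second value for the same key joins with the buffered one and
        # cancels the timer.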
        key, value = element
        existing_values = list(state.read())
        if not existing_values:
          state.add(value)
          timer.set(100)
        else:
          yield b'Record<%s,%s,%s>' % (key, existing_values[0], value)
          state.clear()
          timer.clear()

      @on_timer(UNMATCHED_TIMER)
      def expiry_callback(self, state=DoFn.StateParam(BUFFER_STATE)):
        buffered = list(state.read())
        assert len(buffered) == 1, buffered
        state.clear()
        yield b'Unmatched<%s>' % (buffered[0], )

    with TestPipeline() as p:
      test_stream = (
          TestStream().advance_watermark_to(10).add_elements([
              (b'A', b'a'), (b'B', b'b')
          ]).add_elements([
              (b'A', b'aa'), (b'C', b'c')
          ]).advance_watermark_to(25).add_elements([
              (b'A', b'aaa'), (b'B', b'bb')
          ]).add_elements([
              (b'D', b'd'), (b'D', b'dd'), (b'D', b'ddd'), (b'D', b'dddd')
          ]).advance_watermark_to(125).add_elements([(b'C', b'cc')]))
      (
          p
          | test_stream
          | beam.ParDo(HashJoinStatefulDoFn())
          | beam.ParDo(self.record_dofn()))

    equal_to(StatefulDoFnOnDirectRunnerTest.all_records)([
        b'Record<A,a,aa>',
        b'Record<B,b,bb>',
        b'Record<D,d,dd>',
        b'Record<D,ddd,dddd>',
        b'Unmatched<aaa>',
        b'Unmatched<c>',
        b'Unmatched<cc>'
    ])
Example No. 17
  def test_combining_with_accumulation_mode_and_fanout(self):
    # PCollection will contain elements from 1 to 5.
    elements = [i for i in range(1, 6)]

    ts = TestStream().advance_watermark_to(0)
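    # TestStream's builder methods mutate and return the same instance, so
    # the unassigned calls below still extend the stream.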
    for i in elements:
      ts.add_elements([i])
    ts.advance_watermark_to_infinity()

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
      result = (
          p
          | ts
          | beam.WindowInto(
              GlobalWindows(),
              accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
              trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
          | beam.CombineGlobally(sum).without_defaults().with_fanout(2))

      def has_expected_values(actual):
        from hamcrest.core import assert_that as hamcrest_assert
        from hamcrest.library.collection import contains
        from hamcrest.library.collection import only_contains
        ordered = sorted(actual)
        # Early firings.
        hamcrest_assert(ordered[:4], contains(1, 3, 6, 10))
        # Different runners have different number of 15s, but there should
        # be at least one 15.
        hamcrest_assert(ordered[4:], only_contains(15))

      assert_that(result, has_expected_values)
Example No. 18
    def test_recording_manager_clears_cache(self):
        """Tests that the RecordingManager clears the cache before recording.

    A job may have incomplete PCollections when the job terminates. Clearing the
    cache ensures that correct results are computed every run.
    """
        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream()
                     .advance_watermark_to(0)
                     .advance_processing_time(1)
                     .add_elements(list(range(10)))
                     .advance_processing_time(1))
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Do the first recording to get the timestamp of the first time the fragment
        # was run.
        rm = RecordingManager(p)
        rm.record([squares], max_n=10, max_duration=2)
        first_recording_start = rm.describe()['start']
        rm.cancel()

        # Get the cache, key, and coder to read the PCollection from the cache.
        pipeline_instrument = pi.PipelineInstrument(p)
        cache = ie.current_env().get_cache_manager(p)
        cache_key = pipeline_instrument.cache_key(squares)

        # Set up a mock for the Cache's clear function which will be used to clear
        # uncomputed PCollections.
        cache.clear = MagicMock()

        # Rerun the fragment. If the cache was cleared correctly then the starting
        # time of the second recording will be later than the first. This is because
        # the PCollection wasn't considered to be computed and was cleared from
        # cache. Thus the pipeline fragment was rerun for that PCollection at a
        # later time.
        rm.record([squares], max_n=10, max_duration=1)
        second_recording_start = rm.describe()['start']
        rm.cancel()
        self.assertGreater(second_recording_start, first_recording_start)

        # Assert that the cache cleared the PCollection.
        cache.clear.assert_called_with('full', cache_key)
Example No. 19
  def test_roundtrip_proto(self):
    test_stream = (TestStream()
                   .advance_processing_time(1)
                   .advance_watermark_to(2)
                   .add_elements([1, 2, 3])) # yapf: disable

    p = TestPipeline(options=StandardOptions(streaming=True))
    p | test_stream

    pipeline_proto, context = p.to_runner_api(return_context=True)

    for t in pipeline_proto.components.transforms.values():
      if t.spec.urn == common_urns.primitives.TEST_STREAM.urn:
        test_stream_proto = t

    self.assertTrue(test_stream_proto)
    roundtrip_test_stream = TestStream().from_runner_api(
        test_stream_proto, context)

    self.assertListEqual(test_stream._events, roundtrip_test_stream._events)
    self.assertSetEqual(
        test_stream.output_tags, roundtrip_test_stream.output_tags)
    self.assertEqual(test_stream.coder, roundtrip_test_stream.coder)
Example No. 20
  def test_gbk_execution_no_triggers(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)]))

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p
               | test_stream
               | beam.WindowInto(FixedWindows(15))
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey())

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.
    assert_that(records, equal_to([
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last'])]))

    # assert per window
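    # Note the shift: 'a', 'b', 'c' (ts 10) and 'late' (ts 12) group into
    # [15, 30) rather than [0, 15); see the TimestampCombiner TODO above.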
    expected_window_to_elements = {
        window.IntervalWindow(15, 30): [
            ('k', ['a', 'b', 'c']),
            ('k', ['late']),
        ],
        window.IntervalWindow(30, 45): [
            ('k', ['d', 'e']),
        ],
        window.IntervalWindow(300, 315): [
            ('k', ['last']),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        custom_windowing=window.FixedWindows(15),
        label='assert per window')

    p.run()
Example No. 21
  def test_multiple_outputs(self):
    """Tests that the TestStream supports emitting to multiple PCollections."""
    letters_elements = [
        TimestampedValue('a', 6),
        TimestampedValue('b', 7),
        TimestampedValue('c', 8),
    ]
    numbers_elements = [
        TimestampedValue('1', 11),
        TimestampedValue('2', 12),
        TimestampedValue('3', 13),
    ]
    test_stream = (TestStream()
        .advance_watermark_to(5, tag='letters')
        .add_elements(letters_elements, tag='letters')
        .advance_watermark_to(10, tag='numbers')
        .add_elements(numbers_elements, tag='numbers'))  # yapf: disable

    class RecordFn(beam.DoFn):
      def process(
          self,
          element=beam.DoFn.ElementParam,
          timestamp=beam.DoFn.TimestampParam):
        yield (element, timestamp)

    options = StandardOptions(streaming=True)
    options.view_as(DebugOptions).add_experiment(
        'passthrough_pcollection_output_ids')
    p = TestPipeline(options=options)

    main = p | test_stream
    letters = main['letters'] | 'record letters' >> beam.ParDo(RecordFn())
    numbers = main['numbers'] | 'record numbers' >> beam.ParDo(RecordFn())

    assert_that(
        letters,
        equal_to([('a', Timestamp(6)), ('b', Timestamp(7)),
                  ('c', Timestamp(8))]),
        label='assert letters')

    assert_that(
        numbers,
        equal_to([('1', Timestamp(11)), ('2', Timestamp(12)),
                  ('3', Timestamp(13))]),
        label='assert numbers')

    p.run()
Example No. 22
  def test_fragment_does_not_prune_teststream(self):
    """Tests that the fragment does not prune the TestStream composite parts.
    """
    options = StandardOptions(streaming=True)
    p = beam.Pipeline(ir.InteractiveRunner(), options)

    test_stream = p | TestStream(output_tags=['a', 'b'])

    # pylint: disable=unused-variable
    a = test_stream['a'] | 'a' >> beam.Map(lambda _: _)
    b = test_stream['b'] | 'b' >> beam.Map(lambda _: _)

    fragment = pf.PipelineFragment([b]).deduce_fragment()

    # If the fragment does prune the TestStream composite parts, then the
    # resulting graph is invalid and the following call will raise an exception.
    fragment.to_runner_api()
Example No. 23
    def test_fixed_windows_simple_watermark(self):
        def tsv(key, value, ts):
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 0), tsv('k2', 1, 0),
                             tsv('k1', 2, 0), tsv('k2', 2, 0)])
              .add_elements([tsv('k1', 3, 0), tsv('k2', 3, 0)])
              .add_elements([tsv('k1', 4, 1), tsv('k2', 4, 1)])
              .add_elements([tsv('k1', 5, 1), tsv('k2', 5, 1)])
              .advance_watermark_to(1)
              .add_elements([tsv('k1', 6, 0)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Fixed, one-second windows with DefaultTrigger (after watermark)
        windowing = Windowing(FixedWindows(1),
                              allowed_lateness=MAX_TIMESTAMP.seconds())

        with TestPipeline() as p:
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                | Map(lambda elm: (elm[0], elm[1][0].windows[0],
                                   [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(0, 1), [1, 2,
                                                  3]),  # On the watermark
                    ('k2', IntervalWindow(0, 1), [1, 2,
                                                  3]),  # On the watermark
                    ('k1', IntervalWindow(1, 2), [4, 5]),  # On the watermark
                    ('k2', IntervalWindow(1, 2), [4, 5]),  # On the watermark
                    ('k1', IntervalWindow(0, 1), [6]),  # After the watermark
                ]))
Example No. 24
  def test_buffering_timer_in_fixed_window_streaming(self):
    window_duration = 6
    max_buffering_duration_secs = 100

    start_time = timestamp.Timestamp(0)
    test_stream = (
        TestStream()
        .add_elements([
            TimestampedValue(value, start_time + i)
            for i, value in enumerate(GroupIntoBatchesTest._create_test_data())
        ])
        .advance_processing_time(150)
        .advance_watermark_to(start_time + window_duration)
        .advance_watermark_to(start_time + window_duration + 1)
        .advance_watermark_to_infinity())

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # To trigger the processing time timer, use a fake clock with start time
      # being Timestamp(0).
      fake_clock = FakeClock(now=start_time)

      num_elements_per_batch = (
          pipeline | test_stream
          | "fixed window" >> WindowInto(FixedWindows(window_duration))
          | util.GroupIntoBatches(
              GroupIntoBatchesTest.BATCH_SIZE,
              max_buffering_duration_secs,
              fake_clock)
          | "count elements in batch" >> Map(lambda x: (None, len(x[1])))
          | "global window" >> WindowInto(GlobalWindows())
          | GroupByKey()
          | FlatMapTuple(lambda k, vs: vs))

      # Window duration is 6 and batch size is 5, so output batch size
      # should be 5 (flush because of batch size reached).
      expected_0 = 5
      # There is only one element left in the window so batch size
      # should be 1 (flush because of max buffering duration reached).
      expected_1 = 1
      # Collection has 10 elements, there are only 4 left, so batch size should
      # be 4 (flush because of end of window reached).
      expected_2 = 4
      assert_that(
          num_elements_per_batch,
          equal_to([expected_0, expected_1, expected_2]),
          "assert2")
Example No. 25
  def test_timer_output_timestamp(self):
    class TimerEmittingStatefulDoFn(DoFn):
      EMIT_TIMER_1 = TimerSpec('emit1', TimeDomain.WATERMARK)
      EMIT_TIMER_2 = TimerSpec('emit2', TimeDomain.WATERMARK)
      EMIT_TIMER_3 = TimerSpec('emit3', TimeDomain.WATERMARK)

      def process(self, element,
                  timer1=DoFn.TimerParam(EMIT_TIMER_1),
                  timer2=DoFn.TimerParam(EMIT_TIMER_2),
                  timer3=DoFn.TimerParam(EMIT_TIMER_3)):
        timer1.set(10)
        timer2.set(20)
        timer3.set(30)

      @on_timer(EMIT_TIMER_1)
      def emit_callback_1(self):
        yield 'timer1'

      @on_timer(EMIT_TIMER_2)
      def emit_callback_2(self):
        yield 'timer2'

      @on_timer(EMIT_TIMER_3)
      def emit_callback_3(self):
        yield 'timer3'

    class TimestampReifyingDoFn(DoFn):
      def process(self, element, ts=DoFn.TimestampParam):
        yield (element, int(ts))

    with TestPipeline() as p:
      test_stream = (TestStream()
                     .advance_watermark_to(10)
                     .add_elements([1]))
      (p
       | test_stream
       | beam.Map(lambda x: ('mykey', x))
       | beam.ParDo(TimerEmittingStatefulDoFn())
       | beam.ParDo(TimestampReifyingDoFn())
       | beam.ParDo(self.record_dofn()))

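    # Each timer fires at the time it was set, and the callback's output
    # inherits that firing timestamp.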
    self.assertEqual(
        [('timer1', 10), ('timer2', 20), ('timer3', 30)],
        sorted(StatefulDoFnOnDirectRunnerTest.all_records))
Example No. 26
  def test_gbk_execution_no_triggers(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)]))

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result     # pylint: disable=global-variable-undefined
    result = []

    def fired_elements(elem):
      result.append(elem)
      return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p
               | test_stream
               | beam.WindowInto(FixedWindows(15))
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey()
               | beam.Map(fired_elements))
    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.
    assert_that(records, equal_to([
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last'])]))
    p.run()
    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    self.assertEqual([
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last'])], result)
Example No. 27
    def test_sessions_and_complex_trigger_accumulating(self):
        def tsv(key, value, ts):
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                             tsv('k1', 3, 7), tsv('k1', 4, 30)])
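              # ts 1, 7, and 15 sit within the 10-second session gap of each
              # other and merge into one session; ts 30 opens another.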
              .advance_watermark_to(50)
              .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2),])
              .add_elements([tsv('k1', -1, 21)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Session windows with a 10-second gap, an early/late-firing trigger,
        # and ACCUMULATING panes
        windowing = Windowing(Sessions(10),
                              triggerfn=AfterWatermark(early=AfterCount(2),
                                                       late=AfterCount(1)),
                              accumulation_mode=AccumulationMode.ACCUMULATING,
                              allowed_lateness=MAX_TIMESTAMP.seconds())

        with TestPipeline() as p:
            result = (p
                      | test_stream
                      | WindowInto(windowing.windowfn)
                      | ParDo(trigger_manager._ReifyWindows())
                      | ParDo(trigger_manager._GroupBundlesByKey())
                      | ParDo(
                          trigger_manager.GeneralTriggerManagerDoFn(windowing))
                      | Map(lambda elm: (elm[0], elm[1][0].windows[0],
                                         set(v.value for v in elm[1]))))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
                    ('k1', IntervalWindow(30, 40), {4}),  # on time
                    ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
                    ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2,
                                                   -1}),  # late
                ]))
Example No. 28
  def test_gbk_execution_after_processing_trigger_fired(self):
    """Advance TestClock to (X + delta) and see the pipeline does finish."""
    # TODO(mariagh): Add test_gbk_execution_after_processing_trigger_unfired
    # Advance TestClock to (X + delta) and see the pipeline does finish
    # Possibly to the framework trigger_transcripts.yaml

    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a'])
                   .advance_processing_time(5.1))

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result     # pylint: disable=global-variable-undefined
    result = []

    def fired_elements(elem):
      result.append(elem)
      return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p
               | test_stream
               | beam.WindowInto(
                   beam.window.FixedWindows(15),
                   trigger=trigger.AfterProcessingTime(5),
                   accumulation_mode=trigger.AccumulationMode.DISCARDING
                   )
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey()
               | beam.Map(fired_elements))
    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # TODO(BEAM-3377): Reinstate after assert_that in streaming is fixed.
    assert_that(records, equal_to([
        ('k', ['a'])]))

    p.run()
    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    self.assertEqual([('k', ['a'])], result)
Example No. 29
    def test_cancel_stops_recording(self):
        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)

        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream()
                     .advance_watermark_to(0)
                     .advance_processing_time(1)
                     .add_elements(list(range(10)))
                     .advance_processing_time(1))
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        class SemaphoreLimiter(Limiter):
            def __init__(self):
                self.triggered = False

            def is_triggered(self):
                return self.triggered

        # Get the recording then the BackgroundCachingJob.
        semaphore_limiter = SemaphoreLimiter()
        rm = RecordingManager(p, test_limiters=[semaphore_limiter])
        rm.record([squares], max_n=10, max_duration=500)

        # The BackgroundCachingJob is still waiting for more elements, so it isn't
        # done yet.
        bcj = ie.current_env().get_background_caching_job(p)
        self.assertFalse(bcj.is_done())

        # Assert that something was read and that the BackgroundCachingJob was
        # successfully stopped.
        # self.assertTrue(list(recording.stream(squares).read()))
        semaphore_limiter.triggered = True
        rm.cancel()
        self.assertTrue(bcj.is_done())
Example No. 30
    def test_dynamic_timer_multiple(self):
        class DynamicTimerDoFn(DoFn):
            EMIT_TIMER_FAMILY1 = TimerSpec('emit_family_1',
                                           TimeDomain.WATERMARK)
            EMIT_TIMER_FAMILY2 = TimerSpec('emit_family_2',
                                           TimeDomain.WATERMARK)

            def process(self,
                        element,
                        emit1=DoFn.TimerParam(EMIT_TIMER_FAMILY1),
                        emit2=DoFn.TimerParam(EMIT_TIMER_FAMILY2)):
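                # A timer family can hold several timers at once,
                # distinguished by dynamic_timer_tag.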
                emit1.set(10, dynamic_timer_tag='emit11')
                emit1.set(20, dynamic_timer_tag='emit12')
                emit1.set(30, dynamic_timer_tag='emit13')
                emit2.set(30, dynamic_timer_tag='emit21')
                emit2.set(20, dynamic_timer_tag='emit22')
                emit2.set(10, dynamic_timer_tag='emit23')

            @on_timer(EMIT_TIMER_FAMILY1)
            def emit_callback(self,
                              ts=DoFn.TimestampParam,
                              tag=DoFn.DynamicTimerTagParam):
                yield (tag, ts)

            @on_timer(EMIT_TIMER_FAMILY2)
            def emit_callback_2(self,
                                ts=DoFn.TimestampParam,
                                tag=DoFn.DynamicTimerTagParam):
                yield (tag, ts)

        with TestPipeline() as p:
            test_stream = (TestStream().advance_watermark_to(5).add_elements(
                ['1']).advance_watermark_to_infinity())
            (p
             | test_stream
             | beam.Map(lambda x: ('mykey', x))
             | beam.ParDo(DynamicTimerDoFn())
             | beam.ParDo(self.record_dofn()))

        self.assertEqual([('emit11', 10), ('emit12', 20), ('emit13', 30),
                          ('emit21', 30), ('emit22', 20), ('emit23', 10)],
                         sorted(StatefulDoFnOnDirectRunnerTest.all_records))