Example #1
    def test_as_pipeline(self, temp_dir, shards_per_day):

        file_path_base = temp_dir
        # file_path_base = 'gs://paul-scratch/TestDatePartitionedSink_temp'
        file_name_prefix = 'shard'
        file_path_prefix = pp.join(file_path_base, file_name_prefix)
        file_name_suffix = '.json'

        messages = list(self._sample_data())
        dates = {datetimeFromTimestamp(msg['timestamp']).strftime(DatePartitionedFileSink.DATE_FORMAT)
                 for msg in messages}

        with _TestPipeline() as p:
            writer = WriteToDatePartitionedFiles(
                file_path_prefix, file_name_suffix, shards_per_day=shards_per_day)
            messages = (
                p
                | beam.Create(messages)
                | beam.Map(lambda msg: TimestampedValue(msg, msg['timestamp']))
            )

            result = messages | writer

            expected = []
            for date in dates:
                for shard_num in range(shards_per_day):
                    expected.append(''.join([
                        pp.join(file_path_base, date, file_name_prefix),
                        writer._sink.shard_name_format % dict(
                            shard_num=shard_num, num_shards=shards_per_day),
                        file_name_suffix]))

            assert_that(result, equal_to(expected))
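
Note: the _sample_data helper is referenced above but not shown. A minimal sketch of what it might look like, assuming each message is a dict with a Unix-seconds 'timestamp' field (the fixture values below are illustrative, not from the original):

    def _sample_data(self):
        # Hypothetical fixture: three messages spread across two days, so
        # the test exercises more than one date partition.
        base = 1514764800  # 2018-01-01T00:00:00Z
        for offset in (0, 3600, 90000):  # two on day one, one on day two
            yield {'timestamp': base + offset, 'value': offset}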
Example #2
 def test_reshuffle_window_fn_preserved(self):
   pipeline = TestPipeline()
   data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
   expected_windows = [TestWindowedValue(v, t, [w]) for (v, t, w) in [
       ((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
       ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
       ((1, 4), 4.0, IntervalWindow(4.0, 6.0))]]
   expected_merged_windows = [
       TestWindowedValue(v, t - .001, [w]) for (v, t, w) in [
           ((1, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
           ((2, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
           ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
           ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))]]
   before_reshuffle = (pipeline
                       | 'start' >> beam.Create(data)
                       | 'add_timestamp' >> beam.Map(
                           lambda v: TimestampedValue(v, v[1]))
                       | 'window' >> beam.WindowInto(Sessions(gap_size=2)))
   assert_that(before_reshuffle, equal_to(expected_windows),
               label='before_reshuffle', reify_windows=True)
   after_reshuffle = before_reshuffle | beam.Reshuffle()
   assert_that(after_reshuffle, equal_to(expected_windows),
               label='after_reshuffle', reify_windows=True)
   after_group = after_reshuffle | beam.GroupByKey()
   assert_that(after_group, equal_to(expected_merged_windows),
               label='after_group', reify_windows=True)
   pipeline.run()
Example #3
    def expand(self, input):
        # [START EXERCISE 3]:
        # Docs: https://beam.apache.org/documentation/sdks/pydoc/2.5.0/apache_beam.io.gcp.pubsub.html

        # Determine whether to use files or topic based on options.
        if self.args.input:  # covers both None and the empty string
            return (
                input
                # Read game events from files. See exercise2.
                # Parse each event, then use TimestampedValue to assign
                # event timestamps.
                | beam.io.ReadFromText(self.args.input)
                | ParDo(ParseEventFn())
                | beam.Map(lambda element: TimestampedValue(
                    element, element[self.TIMESTAMP_ATTRIBUTE])))
        else:
            return (
                input
                # Read game events from the Pub/Sub topic self.args.topic,
                # using custom timestamps extracted from the Pub/Sub attribute
                # TIMESTAMP_ATTRIBUTE via ReadFromPubSub's timestamp_attribute
                # parameter.
                # https://beam.apache.org/documentation/sdks/python-streaming/
                | ReadFromPubSub(self.args.topic,
                                 timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

                # Parse the messages the same way as when they come from the
                # text file. No separate timestamp-assignment step is needed,
                # as the timestamps are already set by ReadFromPubSub.
                | ParDo(ParseEventFn()))
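
Note: ParseEventFn is used above but not defined in this example. A minimal sketch of such a DoFn, assuming CSV-style game-event lines in the spirit of the Beam mobile-gaming exercises (the field layout is an assumption):

import apache_beam as beam

class ParseEventFn(beam.DoFn):
    """Hypothetical parser for 'user,team,score,timestamp' lines."""

    def process(self, element):
        try:
            user, team, score, ts = element.split(',')
            yield {'user': user, 'team': team,
                   'score': int(score), 'timestamp': int(ts)}
        except ValueError:
            # Drop malformed records instead of failing the pipeline.
            pass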
Example #4
 def test_timestamped_with_combiners(self):
     p = TestPipeline()
     result = (
         p
         # Create some initial test values.
         | 'start' >> Create([(k, k) for k in range(10)])
         # The purpose of the WindowInto transform is to establish a
         # FixedWindows windowing function for the PCollection.
          # It does not bucket elements into different windows, since the
          # timestamps assigned by Create do not spread across 5-second
          # windows; most likely they all fall into the same window.
         | 'w' >> WindowInto(FixedWindows(5))
          # Generate timestamped values using the values as timestamps.
          # The timestamps now span 10 seconds, and since Map propagates
          # the windowing function from input to output, the output
          # PCollection will have elements falling into two different
          # 5-second windows.
          | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
         # We add a 'key' to each value representing the index of the
         # window. This is important since there is no guarantee of
         # order for the elements of a PCollection.
          | Map(lambda v: (v // 5, v)))
     # Sum all elements associated with a key and window. Although it
     # is called CombinePerKey it is really CombinePerKeyAndWindow the
     # same way GroupByKey is really GroupByKeyAndWindow.
     sum_per_window = result | CombinePerKey(sum)
     # Compute mean per key and window.
     mean_per_window = result | combiners.Mean.PerKey()
     assert_that(sum_per_window,
                 equal_to([(0, 10), (1, 35)]),
                 label='assert:sum')
     assert_that(mean_per_window,
                 equal_to([(0, 2.0), (1, 7.0)]),
                 label='assert:mean')
     p.run()
Example #5
    def expand(self, pcoll):
        start_min_filter = string_to_timestamp(self.start_min)
        end_min_filter = string_to_timestamp(self.stop_min)

        return (
            pcoll
            | 'ParseGameEvent' >> beam.ParDo(ParseEventFn())
            # Filter out data before and after the given times so that it is not
            # included in the calculations. As we collect data in batches (say, by
            # day), the batch for the day that we want to analyze could potentially
            # include some late-arriving data from the previous day. If so, we want
            # to weed it out. Similarly, if we include data from the following day
            # (to scoop up late-arriving events from the day we're analyzing), we
            # need to weed out events that fall after the time period we want to
            # analyze.
            | 'FilterStartTime' >> beam.Filter(
                lambda element: element['timestamp'] > start_min_filter)
            | 'FilterEndTime' >> beam.Filter(
                lambda element: element['timestamp'] < end_min_filter)
            # Add an element timestamp based on the event log, and apply fixed
            # windowing.
            # Convert element['timestamp'] into seconds as expected by
            # TimestampedValue.
            | 'AddEventTimestamps' >> beam.Map(lambda element: TimestampedValue(
                element, element['timestamp'] / 1000.0))
            # Convert window_duration into seconds as expected by FixedWindows.
            | 'FixedWindowsTeam' >> beam.WindowInto(
                FixedWindows(size=self.window_duration * 60))
            # Extract and sum teamname/score pairs from the event data.
            | 'ExtractTeamScore' >> ExtractAndSumScore('team'))
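
Note: ExtractAndSumScore is applied above but not shown. In the Beam mobile-gaming examples it is a small composite transform along these lines (a sketch, not a verbatim copy):

import apache_beam as beam

class ExtractAndSumScore(beam.PTransform):
    """Extracts (field, score) pairs and sums the scores per key."""

    def __init__(self, field):
        super().__init__()
        self.field = field

    def expand(self, pcoll):
        return (pcoll
                | beam.Map(lambda elem: (elem[self.field], elem['score']))
                | beam.CombinePerKey(sum))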
Example #6
    def test_in_streaming_mode(self):
        timestamp_interval = 1
        offset = itertools.count(0)
        start_time = timestamp.Timestamp(0)
        window_duration = 6
        test_stream = (
            TestStream()
            .advance_watermark_to(start_time)
            .add_elements([
                TimestampedValue(x, next(offset) * timestamp_interval)
                for x in GroupIntoBatchesTest._create_test_data()
            ])
            .advance_watermark_to(start_time + (window_duration - 1))
            .advance_watermark_to(start_time + (window_duration + 1))
            .advance_watermark_to(start_time + GroupIntoBatchesTest.NUM_ELEMENTS)
            .advance_watermark_to_infinity())
        pipeline = TestPipeline()
        # Window duration is 6 and batch size is 5, so the first batch has
        # 5 elements (flushed because the batch size was reached).
        expected_0 = 5
        # Only one element is left in the first window, so the next batch
        # has 1 element (flushed because the end of the window was reached).
        expected_1 = 1
        # The collection has 10 elements, so 4 remain for the second window
        # (flushed because the end of the collection was reached).
        expected_2 = 4

        collection = pipeline | test_stream \
                     | WindowInto(FixedWindows(window_duration)) \
                     | util.GroupIntoBatches(GroupIntoBatchesTest.BATCH_SIZE)
        num_elements_in_batches = collection | beam.Map(len)

        # assert_that must be applied before run() so the assertion is part
        # of the pipeline graph.
        assert_that(num_elements_in_batches,
                    equal_to([expected_0, expected_1, expected_2]))

        result = pipeline.run()
        result.wait_until_finish()
Example #7
        def CheckAggregation(inputs_and_expected, aggregation):
            # Split the test stream into a branch of to-be-processed elements, and
            # a branch of expected results.
            inputs, expected = (
                inputs_and_expected
                | beam.MapTuple(
                    lambda tag, value: beam.pvalue.TaggedOutput(tag, value),
                ).with_outputs('input', 'expect'))

            # Process the inputs with the given windowing to produce actual outputs.
            outputs = (
                inputs
                | beam.MapTuple(lambda key, value: TimestampedValue(
                    (key, value), value))
                | beam.WindowInto(window_fn,
                                  trigger=trigger_fn,
                                  accumulation_mode=accumulation_mode,
                                  timestamp_combiner=timestamp_combiner)
                | aggregation
                | beam.MapTuple(_windowed_value_info_map_fn)
                # Place outputs back into the global window to allow flattening
                # and share a single state in Check.
                | 'Global' >> beam.WindowInto(
                    beam.transforms.window.GlobalWindows()))
            # Feed both the expected and actual outputs to Check() for comparison.
            tagged_expected = (expected | beam.FlatMap(
                lambda value: [(key, ('expect', value)) for key in keys]))
            tagged_outputs = (
                outputs
                | beam.MapTuple(lambda key, value: (key, ('actual', value))))
            # pylint: disable=expression-not-assigned
            ([tagged_expected, tagged_outputs]
             | beam.Flatten()
             | beam.ParDo(Check(self.allow_out_of_order)))
Example #8
    def test_stream_payload_to_events(payload, coder):
        """Returns a TestStream Python event object from a TestStream event Proto.
    """
        if payload.HasField('element_event'):
            element_event = payload.element_event
            elements = [
                TimestampedValue(coder.decode(e.encoded_element),
                                 Timestamp(micros=e.timestamp))
                for e in element_event.elements
            ]
            return ElementEvent(timestamped_values=elements,
                                tag=element_event.tag)

        if payload.HasField('watermark_event'):
            watermark_event = payload.watermark_event
            return WatermarkEvent(
                Timestamp(micros=watermark_event.new_watermark),
                tag=watermark_event.tag)

        if payload.HasField('processing_time_event'):
            processing_time_event = payload.processing_time_event
            return ProcessingTimeEvent(
                Duration(micros=processing_time_event.advance_duration))

        raise RuntimeError(
            'Received a proto without the specified fields: {}'.format(
                payload))
Example #9
    def test_rewindow_regroup(self):
        with TestPipeline() as p:
            grouped = (p
                       | Create(range(5))
                       | Map(lambda t: TimestampedValue(('key', t), t))
                       | 'window' >> WindowInto(FixedWindows(5, offset=3))
                       | GroupByKey()
                       | MapTuple(lambda k, vs: (k, sorted(vs))))
            # Both of these group-and-ungroup sequences should be idempotent.
            regrouped1 = (grouped
                          | 'w1' >> WindowInto(FixedWindows(5, offset=3))
                          | 'g1' >> GroupByKey()
                          | FlatMapTuple(lambda k, vs: [(k, v) for v in vs]))
            regrouped2 = (grouped
                          | FlatMapTuple(lambda k, vs: [(k, v) for v in vs])
                          | 'w2' >> WindowInto(FixedWindows(5, offset=3))
                          | 'g2' >> GroupByKey()
                          | MapTuple(lambda k, vs: (k, sorted(vs))))
            with_windows = Map(lambda e, w=beam.DoFn.WindowParam: (e, w))
            expected = [(('key', [0, 1, 2]), IntervalWindow(-2, 3)),
                        (('key', [3, 4]), IntervalWindow(3, 8))]

            assert_that(grouped | 'ww' >> with_windows, equal_to(expected))
            assert_that(regrouped1 | 'ww1' >> with_windows,
                        equal_to(expected),
                        label='r1')
            assert_that(regrouped2 | 'ww2' >> with_windows,
                        equal_to(expected),
                        label='r2')
Example #10
    def process(
        self,
        e,
        timestamp=beam.DoFn.TimestampParam,
        prev_sample_time_state=beam.DoFn.StateParam(PREV_SAMPLE_TIME_STATE)):
        """Buffers elements until the end of the bundle.

    This buffers elements instead of emitting them immediately to keep elements
    that come in the same bundle to be outputted in the same bundle.
    """
        _, (element, timing_info) = e

        if isinstance(element, TestStreamFileHeader):
            self.header = element
        elif isinstance(element, WatermarkEvent):
            # WatermarkEvents come in with a watermark of MIN_TIMESTAMP. Fill in the
            # correct watermark from the runner here.
            element.new_watermark = timing_info.watermark.micros
            if element not in self.timing_events:
                self.timing_events.append(element)

        elif isinstance(element, ProcessingTimeEvent):
            # Because the runner holds the clock, calculate the processing time delta
            # here. The TestStream may have faked out the clock, and thus the
            # delta calculated in the SDK with time.time() will be wrong.
            prev_sample = next(prev_sample_time_state.read(), Timestamp())
            prev_sample_time_state.clear()
            prev_sample_time_state.add(timing_info.processing_time)

            advance_by = timing_info.processing_time - prev_sample

            element.advance_by = advance_by
            self.timing_events.append(element)
        else:
            self.elements.append(TimestampedValue(element, timestamp))
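
Note: the flush side of this buffering is not shown. A plausible companion finish_bundle, sketched under the assumption that buffered elements are re-emitted as globally windowed values (self.elements and self.timing_events come from the snippet above; everything else is an assumption):

    def finish_bundle(self):
        # Hypothetical flush: emit everything buffered during this bundle.
        # finish_bundle must yield WindowedValue objects explicitly.
        from apache_beam.transforms.window import GlobalWindow
        from apache_beam.utils.windowed_value import WindowedValue

        for tv in self.elements:
            yield WindowedValue(tv.value, tv.timestamp, [GlobalWindow()])
        self.elements = []
        self.timing_events = []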
Example #11
    def test_to_list_and_to_dict2(self):
        with TestPipeline() as pipeline:
            pairs = [(1, 2), (3, 4), (5, 6)]
            timestamp = 0
            pcoll = pipeline | 'start-pairs' >> Create(pairs)
            result = pcoll | 'to dict' >> combine.ToDict()

            # Now for global combines without default
            timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
            windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))
            result_windowed = (
                windowed
                | 'to dict wo defaults' >> combine.ToDict().without_defaults())

            def matcher():
                def match(actual):
                    equal_to([1])([len(actual)])
                    equal_to(pairs)(actual[0].items())

                return match

            assert_that(result, matcher())
            assert_that(result_windowed,
                        matcher(),
                        label='to-dict-wo-defaults')
Example #12
    def test_global_sample(self):
        def is_good_sample(actual):
            assert len(actual) == 1
            assert sorted(actual[0]) in [[1, 1, 2], [1, 2, 2]], actual

        with TestPipeline() as pipeline:
            timestamp = 0
            pcoll = pipeline | 'start' >> Create([1, 1, 2, 2])

            # Now for global combines without default
            timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
            windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))

            for ix in range(9):
                assert_that(
                    pcoll
                    | 'sample-%d' % ix >> combine.Sample.FixedSizeGlobally(3),
                    is_good_sample,
                    label='check-%d' % ix)
                result_windowed = (
                    windowed
                    | 'sample-wo-defaults-%d' % ix >>
                    combine.Sample.FixedSizeGlobally(3).without_defaults())
                assert_that(result_windowed,
                            is_good_sample,
                            label='check-wo-defaults-%d' % ix)
Example #13
  def test_gbk_execution_no_triggers(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)])
                   .advance_watermark_to_infinity())

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p
               | test_stream
               | beam.WindowInto(FixedWindows(15))
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey())

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(0, 15): [
            ('k', ['a', 'b', 'c']),
            ('k', ['late']),
        ],
        window.IntervalWindow(15, 30): [
            ('k', ['d', 'e']),
        ],
        window.IntervalWindow(300, 315): [
            ('k', ['last']),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        use_global_window=False,
        label='assert per window')

    p.run()
Example #14
def _build_a_test_stream_pipeline():
    test_stream = (TestStream().advance_watermark_to(0).add_elements([
        TimestampedValue('a', 1)
    ]).advance_processing_time(5).advance_watermark_to_infinity())
    p = beam.Pipeline(runner=interactive_runner.InteractiveRunner())
    events = p | test_stream  # pylint: disable=possibly-unused-variable
    ib.watch(locals())
    return p
Example #15
 def process(self, element):
     """
     Parse the timestamp and add it to the datapoints
     """
     t = element[1][timestamp_column][0]
     timestamp = datetime.fromtimestamp(t, pytz.utc)
     unix_timestamp = Timestamp.from_utc_datetime(timestamp)
     yield TimestampedValue(element, unix_timestamp)
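
Note: timestamp_column is a free variable here, presumably captured from the enclosing scope. A self-contained sketch of how this process method might be packaged and applied (the class name and column name are assumptions):

import apache_beam as beam
import pytz
from datetime import datetime
from apache_beam.transforms.window import TimestampedValue
from apache_beam.utils.timestamp import Timestamp

timestamp_column = 'ts'  # hypothetical: field holding Unix seconds

class AttachTimestampDoFn(beam.DoFn):  # hypothetical name
    def process(self, element):
        # element is assumed to be (key, {column: [values], ...}) as above.
        t = element[1][timestamp_column][0]
        ts = Timestamp.from_utc_datetime(datetime.fromtimestamp(t, pytz.utc))
        yield TimestampedValue(element, ts)

# Hypothetical usage:
#   timestamped = records | beam.ParDo(AttachTimestampDoFn())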
Example #16
 def test_timestamp(self):
   l = [TimestampedValue('a', 100),
        TimestampedValue('b', 200),
        TimestampedValue('c', 300)]
   expected = [TestWindowedValue('a', 100, [GlobalWindow()]),
               TestWindowedValue('b', 200, [GlobalWindow()]),
               TestWindowedValue('c', 300, [GlobalWindow()])]
   with TestPipeline() as p:
     # Map(lambda x: x) PTransform is added after Create here, because when
     # a PCollection of TimestampedValues is created with Create PTransform,
     # the timestamps are not assigned to it. Adding a Map forces the
     # PCollection to go through a DoFn so that the PCollection consists of
     # the elements with timestamps assigned to them instead of a PCollection
     # of TimestampedValue(element, timestamp).
     pc = p | beam.Create(l) | beam.Map(lambda x: x)
     reified_pc = pc | util.Reify.Timestamp()
     assert_that(reified_pc, equal_to(expected), reify_windows=True)
Example #17
 def test_windowing(self):
     with self.create_pipeline() as p:
         res = (p
                | beam.Create([1, 2, 100, 101, 102])
                | beam.Map(lambda t: TimestampedValue(('k', t), t))
                | beam.WindowInto(beam.transforms.window.Sessions(10))
                | beam.GroupByKey()
                 | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
         assert_that(res, equal_to([('k', [1, 2]), ('k', [100, 101, 102])]))
Example #18
  def test_gbk_execution_no_triggers(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)]))

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result     # pylint: disable=global-variable-undefined
    result = []

    def fired_elements(elem):
      result.append(elem)
      return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p
               | test_stream
               | beam.WindowInto(FixedWindows(15))
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey()
               | beam.Map(fired_elements))
    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.
    assert_that(records, equal_to([
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last'])]))
    p.run()
    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    self.assertEqual([
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last'])], result)
Example #19
 def test_timestamped_value(self):
   with TestPipeline() as p:
     result = (p
               | 'start' >> Create([(k, k) for k in range(10)])
               | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
               | 'w' >> WindowInto(FixedWindows(5))
               | Map(lambda v: ('key', v))
               | GroupByKey())
     assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                   ('key', [5, 6, 7, 8, 9])]))
Example #20
 def test_basic_test_stream(self):
      test_stream = (TestStream()
                     .advance_watermark_to(0)
                     .add_elements(
                         ['a', WindowedValue('b', 3, []), TimestampedValue('c', 6)])
                     .advance_processing_time(10)
                     .advance_watermark_to(8)
                     .add_elements(['d'])
                     .advance_watermark_to_infinity())
     self.assertEqual(test_stream.events, [
         WatermarkEvent(0),
         ElementEvent([
             TimestampedValue('a', 0),
             TimestampedValue('b', 3),
             TimestampedValue('c', 6),
         ]),
         ProcessingTimeEvent(10),
         WatermarkEvent(8),
         ElementEvent([
             TimestampedValue('d', 8),
         ]),
         WatermarkEvent(timestamp.MAX_TIMESTAMP),
     ])
Example #21
 def expand(self, pbegin):
     result = (pbegin
               | 'ImpulseElement' >> beam.Create(
                   [(self.start_ts, self.stop_ts, self.interval)])
               | 'GenSequence' >> beam.ParDo(ImpulseSeqGenDoFn())
               | 'MapToTimestamped' >>
               beam.Map(lambda tt: TimestampedValue(tt, tt)))
     if self.apply_windowing:
         result = result | 'ApplyWindowing' >> beam.WindowInto(
             window.FixedWindows(self.interval))
     return result
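
Note: a sketch of how a PTransform carrying this expand might be declared and applied; the class name and constructor are assumptions matching the attributes the expand reads:

import apache_beam as beam

class GenSequence(beam.PTransform):  # hypothetical wrapper for the expand() above
    def __init__(self, start_ts, stop_ts, interval, apply_windowing=False):
        super().__init__()
        self.start_ts = start_ts
        self.stop_ts = stop_ts
        self.interval = interval
        self.apply_windowing = apply_windowing

# Hypothetical usage: one element every 10s over [0, 100), windowed:
#   seq = p | 'Gen' >> GenSequence(0, 100, 10, apply_windowing=True)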
Example #22
  def test_gbk_execution_after_watermark_trigger(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements([TimestampedValue('a', 11)])
                   .advance_watermark_to(20)
                   .add_elements([TimestampedValue('b', 21)])
                   .advance_watermark_to_infinity())

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p            # pylint: disable=unused-variable
               | test_stream
               | beam.WindowInto(
                   FixedWindows(15),
                   trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                   accumulation_mode=trigger.AccumulationMode.DISCARDING)
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey())

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(0, 15): [
            ('k', ['a']),
            ('k', [])
        ],
        window.IntervalWindow(15, 30): [
            ('k', ['b']),
            ('k', [])
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        label='assert per window')

    p.run()
Example #23
 def test_rewindow(self):
   with TestPipeline() as p:
     result = (p
               | Create([(k, k) for k in range(10)])
               | Map(lambda x_t1: TimestampedValue(x_t1[0], x_t1[1]))
               | 'window' >> WindowInto(SlidingWindows(period=2, size=6))
               # Per the model, each element is now duplicated across
               # three windows. Rewindowing must preserve this duplication.
               | 'rewindow' >> WindowInto(FixedWindows(5))
               | 'rewindow2' >> WindowInto(FixedWindows(5))
               | Map(lambda v: ('key', v))
               | GroupByKey())
     assert_that(result, equal_to([('key', sorted([0, 1, 2, 3, 4] * 3)),
                                   ('key', sorted([5, 6, 7, 8, 9] * 3))]))
Example #24
  def test_window_param(self):
    class TestDoFn(DoFn):
      def process(self, element, window=DoFn.WindowParam):
        yield (element, (float(window.start), float(window.end)))

    pipeline = TestPipeline()
    pcoll = (pipeline
             | Create([1, 7])
             | Map(lambda x: TimestampedValue(x, x))
             | WindowInto(windowfn=SlidingWindows(10, 5))
             | ParDo(TestDoFn()))
    assert_that(pcoll, equal_to([(1, (-5, 5)), (1, (0, 10)),
                                 (7, (0, 10)), (7, (5, 15))]))
    pipeline.run()
Example #25
  def test_basic_execution(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)])
                   .advance_watermark_to_infinity())  # yapf: disable

    class RecordFn(beam.DoFn):
      def process(
          self,
          element=beam.DoFn.ElementParam,
          timestamp=beam.DoFn.TimestampParam):
        yield (element, timestamp)

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
      my_record_fn = RecordFn()
      records = p | test_stream | beam.ParDo(my_record_fn)

      assert_that(
          records,
          equal_to([
              ('a', timestamp.Timestamp(10)),
              ('b', timestamp.Timestamp(10)),
              ('c', timestamp.Timestamp(10)),
              ('d', timestamp.Timestamp(20)),
              ('e', timestamp.Timestamp(20)),
              ('late', timestamp.Timestamp(12)),
              ('last', timestamp.Timestamp(310)),
          ]))
Example #26
def run(options):

    visit_args = options.view_as(PortVisitsOptions)
    cloud_args = options.view_as(GoogleCloudOptions)

    p = beam.Pipeline(options=options)

    start_date = datetime.datetime.strptime(
        visit_args.start_date, '%Y-%m-%d').replace(tzinfo=pytz.utc)
    start_window = start_date - datetime.timedelta(
        days=visit_args.start_padding)
    end_date = datetime.datetime.strptime(visit_args.end_date,
                                          '%Y-%m-%d').replace(tzinfo=pytz.utc)

    dataset, table = visit_args.output_table.split('.')

    sink = WriteToBigQueryDatePartitioned(
        temp_gcs_location=cloud_args.temp_location,
        dataset=dataset,
        table=table,
        project=cloud_args.project,
        write_disposition="WRITE_TRUNCATE",
        schema=build_visit_schema())

    queries = VisitEvent.create_queries(visit_args.events_table, start_window,
                                        end_date)

    sources = [(p | "Read_{}".format(i) >> beam.io.Read(
        beam.io.gcp.bigquery.BigQuerySource(query=x)))
               for (i, x) in enumerate(queries)]

    tagged_records = (sources
                      | beam.Flatten()
                      | beam.Map(from_msg)
                      | CreatePortVisits()
                      | "FilterVisits" >> Filter(lambda x: start_date.date(
                      ) <= x.end_timestamp.date() <= end_date.date())
                      | Map(lambda x: TimestampedValue(
                          visit_to_msg(x), _datetime_to_s(x.end_timestamp)))
                      | sink)

    result = p.run()

    success_states = {
        PipelineState.DONE, PipelineState.RUNNING, PipelineState.UNKNOWN}

    logging.info('returning with result.state=%s', result.state)
    return 0 if result.state in success_states else 1
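
Note: _datetime_to_s is assumed above but not defined. A one-line sketch of the intended conversion (the body is an assumption):

def _datetime_to_s(dt):
    # Hypothetical helper: convert a timezone-aware datetime to Unix seconds
    # for use as a Beam element timestamp.
    return dt.timestamp()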
Example #27
  def test_test_stream_errors(self):
    with self.assertRaises(
        AssertionError, msg=('Watermark must strictly-monotonically advance.')):
      _ = (TestStream().advance_watermark_to(5).advance_watermark_to(4))

    with self.assertRaises(
        AssertionError,
        msg=('Must advance processing time by positive amount.')):
      _ = (TestStream().advance_processing_time(-1))

    with self.assertRaises(
        AssertionError,
        msg=('Element timestamp must be before timestamp.MAX_TIMESTAMP.')):
      _ = (
          TestStream().add_elements(
              [TimestampedValue('a', timestamp.MAX_TIMESTAMP)]))
Example #28
def side_input_slow_update(
    src_file_pattern,
    first_timestamp,
    last_timestamp,
    interval,
    sample_main_input_elements,
    main_input_windowing_interval):
  # [START SideInputSlowUpdateSnip1]
  from apache_beam.transforms.periodicsequence import PeriodicImpulse
  from apache_beam.transforms.window import TimestampedValue
  from apache_beam.transforms import window

  # To run indefinitely, import MAX_TIMESTAMP from apache_beam.utils.timestamp
  # and pass last_timestamp=MAX_TIMESTAMP.

  # Any user-defined function can be applied to the main input; a cross join
  # is used here as an example.
  def cross_join(left, rights):
    for x in rights:
      yield (left, x)

  # Create pipeline.
  pipeline_options = PipelineOptions()
  p = beam.Pipeline(options=pipeline_options)
  side_input = (
      p
      | 'PeriodicImpulse' >> PeriodicImpulse(
          first_timestamp, last_timestamp, interval, True)
      | 'MapToFileName' >> beam.Map(lambda x: src_file_pattern + str(x))
      | 'ReadFromFile' >> beam.io.ReadAllFromText())

  main_input = (
      p
      | 'MpImpulse' >> beam.Create(sample_main_input_elements)
      | 'MapMpToTimestamped' >> beam.Map(
          lambda src: TimestampedValue(src, src))
      | 'WindowMpInto' >> beam.WindowInto(
          window.FixedWindows(main_input_windowing_interval)))

  result = (
      main_input
      | 'ApplyCrossJoin' >> beam.FlatMap(
          cross_join, rights=beam.pvalue.AsIter(side_input)))
  # [END SideInputSlowUpdateSnip1]

  return p, result
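
Note: a hedged usage sketch for the snippet above; the file pattern and timing values are illustrative only:

# Poll gs://my-bucket/side_input/ every 60s for 10 minutes while the ten
# sampled main-input elements flow through 20-second fixed windows.
p, result = side_input_slow_update(
    src_file_pattern='gs://my-bucket/side_input/',
    first_timestamp=0,
    last_timestamp=600,
    interval=60,
    sample_main_input_elements=range(10),
    main_input_windowing_interval=20)
p.run().wait_until_finish()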
Example #29
 def test_after_count(self):
   p = Pipeline('DirectRunner')
   result = (p
             | beam.Create([1, 2, 3, 4, 5, 10, 11])
             | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
              | beam.Map(lambda k_t: TimestampedValue(k_t, k_t[1]))
             | beam.WindowInto(FixedWindows(10), trigger=AfterCount(3),
                               accumulation_mode=AccumulationMode.DISCARDING)
             | beam.GroupByKey()
              | beam.MapTuple(lambda k, v: ('%s-%s' % (k, len(v)), set(v))))
   assert_that(result, equal_to(
       {
           'A-5': {1, 2, 3, 4, 5},
           # A-10, A-11 never emitted due to AfterCount(3) never firing.
           'B-4': {6, 7, 8, 9},
           'B-3': {10, 15, 16},
        }.items()))
    p.run()
Example #30
def run(argv=None):
    from apache_beam.transforms.window import TimestampedValue, FixedWindows

    pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'

    with beam.Pipeline(options=get_pipeline_options()) as pipeline:
        logging.info("pubsub_input_topic = {}".format(pubsub_input_topic))

        json_messages = \
            (pipeline
             | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(topic=pubsub_input_topic).with_output_types(bytes)
             | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message)
             )

        window_size_s = 30
        allowed_lateness_s = 60
        high_confidence_faces_grouped_by_emotion_count_per_window = (
                json_messages
                | 'ParseJsonMessage' >> beam.Map(parse_jsons)
                | 'FilterHighFaceConfidence' >> beam.ParDo(FilterHighConfidenceFacesDoFn())
                | 'FlatMapFacesWithHighEmotionLikelihood' >> beam.FlatMap(get_faces_with_high_emotion_likelihood)
                | 'UseCustomTimestamp' >> beam.Map(lambda face_info:
                                                   TimestampedValue(face_info, face_info['ts_seconds']))
                | 'WindowFaceInfo' >> beam.WindowInto(
                        FixedWindows(window_size_s, 0),
                        trigger=AfterWatermark(
                            early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                            late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
                        allowed_lateness=allowed_lateness_s,
                        accumulation_mode=AccumulationMode.DISCARDING)
                | 'PairEmotionWithFace' >> beam.Map(lambda face_info: (face_info['emotion'], face_info))
                | 'GroupByEmotion' >> beam.GroupByKey()
                | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow())
        )

        log_p_collection(high_confidence_faces_grouped_by_emotion_count_per_window, "OutputToBigQuery")

        high_confidence_faces_grouped_by_emotion_count_per_window | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            bq_faces_windowed_table_name,
            schema={"fields": bq_faces_windowed_table_schema},
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

        # No explicit pipeline.run() is needed here: exiting the
        # "with beam.Pipeline(...)" block runs the pipeline and waits for
        # it to finish.
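
Note: decode_message and parse_jsons are assumed helpers. Minimal sketches of what they might do (only the names appear in the pipeline above; the bodies are assumptions):

import json

def decode_message(message_bytes):
    # Hypothetical: Pub/Sub delivers raw bytes; decode to a UTF-8 string.
    return message_bytes.decode('utf-8')

def parse_jsons(json_string):
    # Hypothetical: turn the JSON payload into a dict for downstream DoFns.
    return json.loads(json_string)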