Example #1
  def test_fixed_windows(self):
    # Test windows with offset: 2, 7, 12, 17, ...
    windowfn = FixedWindows(size=5, offset=2)
    self.assertEqual([IntervalWindow(7, 12)],
                     windowfn.assign(context('v', 7)))
    self.assertEqual([IntervalWindow(7, 12)],
                     windowfn.assign(context('v', 11)))
    self.assertEqual([IntervalWindow(12, 17)],
                     windowfn.assign(context('v', 12)))

    # Test windows without offset: 0, 5, 10, 15, ...
    windowfn = FixedWindows(size=5)
    self.assertEqual([IntervalWindow(5, 10)],
                     windowfn.assign(context('v', 5)))
    self.assertEqual([IntervalWindow(5, 10)],
                     windowfn.assign(context('v', 9)))
    self.assertEqual([IntervalWindow(10, 15)],
                     windowfn.assign(context('v', 10)))

    # Test windows with offset out of range.
    windowfn = FixedWindows(size=5, offset=12)
    self.assertEqual([IntervalWindow(7, 12)],
                     windowfn.assign(context('v', 11)))
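
The offset-out-of-range case at the end behaves like offset=2 because the offset only matters modulo the window size. A minimal sketch of the assignment arithmetic (an illustration, not Beam's actual implementation):

def fixed_window_bounds(timestamp, size, offset=0):
    # Normalize the offset into [0, size), then snap the timestamp down to
    # the nearest window boundary.
    offset %= size
    start = timestamp - (timestamp - offset) % size
    return (start, start + size)

assert fixed_window_bounds(11, size=5, offset=2) == (7, 12)
assert fixed_window_bounds(11, size=5, offset=12) == (7, 12)  # 12 % 5 == 2
assert fixed_window_bounds(10, size=5) == (10, 15)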
Example #2
 def test_equal_to_per_window_passes(self):
     start = int(MIN_TIMESTAMP.micros // 1e6) - 5
     end = start + 20
     expected = {
         window.IntervalWindow(start, end): [('k', [1])],
     }
     with TestPipeline(options=StandardOptions(streaming=True)) as p:
         assert_that(
             (p
              | Create([1])
              | beam.WindowInto(
                  FixedWindows(20),
                  trigger=trigger.AfterWatermark(),
                  accumulation_mode=trigger.AccumulationMode.DISCARDING)
              | beam.Map(lambda x: ('k', x))
              | beam.GroupByKey()),
             equal_to_per_window(expected),
             reify_windows=True)
Example #3
 def test_repeatedly_after_first(self):
     self.run_trigger_simple(
         FixedWindows(100),  # pyformat break
         Repeatedly(AfterAny(AfterCount(3), AfterWatermark())),
         AccumulationMode.ACCUMULATING,
         zip(range(7), 'abcdefg'),
         {
             IntervalWindow(0, 100): [
                 set('abc'),
                 set('abcdef'),
                 set('abcdefg'),
                 set('abcdefgx'),
                 set('abcdefgxy'),
                 set('abcdefgxyz')
             ]
         },
         1,
         late_data=zip(range(3), 'xyz'))
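
Because the accumulation mode is ACCUMULATING, each firing of the repeated trigger re-emits everything seen so far in the window, which is why the expected sets grow monotonically. An illustrative contrast in plain Python (not part of the test):

# ACCUMULATING: every pane contains all elements observed so far.
accumulating_panes = [set('abc'), set('abcdef'), set('abcdefg')]
# DISCARDING would instead emit only the elements since the last firing.
discarding_panes = [set('abc'), set('def'), set('g')]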
Example #4
    def _pipeline_runner():
        with beam.Pipeline(runner=DirectRunner()) as p:
            ts = TestStream().advance_watermark_to(0)
            all_elements = iter(range(size))
            watermark = 0
            while True:
                next_batch = list(itertools.islice(all_elements, 100))
                if not next_batch:
                    break
                ts = ts.add_elements([(i, random.randint(0, 1000))
                                      for i in next_batch])
                watermark = watermark + 100
                ts = ts.advance_watermark_to(watermark)
            ts = ts.advance_watermark_to_infinity()

            input_pc = p | ts | WindowInto(FixedWindows(100))
            for i in range(NUM_PARALLEL_STAGES):
                _build_serial_stages(input_pc, NUM_SERIAL_STAGES, i)
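
The while loop above is a standard idiom for draining an iterator in fixed-size chunks: itertools.islice pulls up to 100 items per pass and yields an empty list once the iterator is exhausted. The idiom in isolation:

import itertools

all_elements = iter(range(250))
batch_sizes = []
while True:
    next_batch = list(itertools.islice(all_elements, 100))
    if not next_batch:
        break
    batch_sizes.append(len(next_batch))
assert batch_sizes == [100, 100, 50]

Note that size, NUM_PARALLEL_STAGES, NUM_SERIAL_STAGES and _build_serial_stages are defined elsewhere in the benchmark this snippet was taken from.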
Example #5
  def test_buffering_timer_in_fixed_window_streaming(self):
    window_duration = 6
    max_buffering_duration_secs = 100

    start_time = timestamp.Timestamp(0)
    test_stream = (
        TestStream().add_elements([
            TimestampedValue(value, start_time + i) for i,
            value in enumerate(GroupIntoBatchesTest._create_test_data())
        ]).advance_processing_time(150).advance_watermark_to(
            start_time + window_duration).advance_watermark_to(
                start_time + window_duration +
                1).advance_watermark_to_infinity())

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # To trigger the processing time timer, use a fake clock with start time
      # being Timestamp(0).
      fake_clock = FakeClock(now=start_time)

      num_elements_per_batch = (
          pipeline | test_stream
          | "fixed window" >> WindowInto(FixedWindows(window_duration))
          | util.GroupIntoBatches(
              GroupIntoBatchesTest.BATCH_SIZE,
              max_buffering_duration_secs,
              fake_clock)
          | "count elements in batch" >> Map(lambda x: (None, len(x[1])))
          | "global window" >> WindowInto(GlobalWindows())
          | GroupByKey()
          | FlatMapTuple(lambda k, vs: vs))

      # Window duration is 6 and batch size is 5, so output batch size
      # should be 5 (flush because of batch size reached).
      expected_0 = 5
      # There is only one element left in the window so batch size
      # should be 1 (flush because of max buffering duration reached).
      expected_1 = 1
      # Collection has 10 elements, there are only 4 left, so batch size should
      # be 4 (flush because of end of window reached).
      expected_2 = 4
      assert_that(
          num_elements_per_batch,
          equal_to([expected_0, expected_1, expected_2]),
          "assert2")
Example #6
def run(argv=None):
    from apache_beam.transforms.window import TimestampedValue, FixedWindows

    pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'

    with beam.Pipeline(options=get_pipeline_options()) as pipeline:
        logging.info("pubsub_input_topic = {}".format(pubsub_input_topic))

        json_messages = \
            (pipeline
             | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(topic=pubsub_input_topic).with_output_types(bytes)
             | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message)
             )

        window_size_s = 30
        allowed_lateness_s = 60
        high_confidence_faces_grouped_by_emotion_count_per_window = (
                json_messages
                | 'ParseJsonMessage' >> beam.Map(parse_jsons)
                | 'FilterHighFaceConfidence' >> beam.ParDo(FilterHighConfidenceFacesDoFn())
                | 'FlatMapFacesWithHighEmotionLikelihood' >> beam.FlatMap(get_faces_with_high_emotion_likelihood)
                | 'UseCustomTimestamp' >> beam.Map(lambda face_info:
                                                   TimestampedValue(face_info, face_info['ts_seconds']))
                | 'WindowFaceInfo' >> beam.WindowInto(
                        FixedWindows(window_size_s, 0),
                        trigger=AfterWatermark(
                            early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                            late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
                        allowed_lateness=allowed_lateness_s,
                        accumulation_mode=AccumulationMode.DISCARDING)
                | 'PairEmotionWithFace' >> beam.Map(lambda face_info: (face_info['emotion'], face_info))
                | 'GroupByEmotion' >> beam.GroupByKey()
                | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow())
        )

        log_p_collection(high_confidence_faces_grouped_by_emotion_count_per_window, "OutputToBigQuery")

        high_confidence_faces_grouped_by_emotion_count_per_window | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            bq_faces_windowed_table_name,
            schema={"fields": bq_faces_windowed_table_schema},
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

        # The `with` block runs the pipeline on exit and waits for it to
        # finish, so no explicit pipeline.run() is needed here.
Example #7
  def test_gbk_execution_no_triggers(self):
    test_stream = (
        TestStream().advance_watermark_to(10).add_elements([
            'a', 'b', 'c'
        ]).advance_watermark_to(20).add_elements(['d']).add_elements([
            'e'
        ]).advance_processing_time(10).advance_watermark_to(300).add_elements([
            TimestampedValue('late', 12)
        ]).add_elements([TimestampedValue('last', 310)
                         ]).advance_watermark_to_infinity())

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (
        p
        | test_stream
        | beam.WindowInto(FixedWindows(15), allowed_lateness=300)
        | beam.Map(lambda x: ('k', x))
        | beam.GroupByKey())

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(0, 15): [
            ('k', ['a', 'b', 'c']),
            ('k', ['late']),
        ],
        window.IntervalWindow(15, 30): [
            ('k', ['d', 'e']),
        ],
        window.IntervalWindow(300, 315): [
            ('k', ['last']),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        label='assert per window')

    p.run()
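
Why 'late' (timestamp 12) lands in IntervalWindow(0, 15) and 'last' (timestamp 310) in IntervalWindow(300, 315): FixedWindows(15) assigns each element by event timestamp, independent of when it arrived. In plain arithmetic:

for t in (12, 310):
    start = t - t % 15
    print(t, '->', (start, start + 15))  # 12 -> (0, 15), 310 -> (300, 315)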
Example #8
  def test_gbk_execution_no_triggers(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)]))

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result     # pylint: disable=global-variable-undefined
    result = []

    def fired_elements(elem):
      result.append(elem)
      return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p
               | test_stream
               | beam.WindowInto(FixedWindows(15))
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey()
               | beam.Map(fired_elements))
    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.
    assert_that(records, equal_to([
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last'])]))
    p.run()
    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    self.assertEqual([
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last'])], result)
Example #9
    def test_fixed_windows_simple_watermark(self):
        def tsv(key, value, ts):
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 0), tsv('k2', 1, 0),
                             tsv('k1', 2, 0), tsv('k2', 2, 0)])
              .add_elements([tsv('k1', 3, 0), tsv('k2', 3, 0)])
              .add_elements([tsv('k1', 4, 1), tsv('k2', 4, 1)])
              .add_elements([tsv('k1', 5, 1), tsv('k2', 5, 1)])
              .advance_watermark_to(1)
              .add_elements([tsv('k1', 6, 0)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Fixed, one-second windows with DefaultTrigger (after watermark)
        windowing = Windowing(FixedWindows(1),
                              allowed_lateness=MAX_TIMESTAMP.seconds())

        with TestPipeline() as p:
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                |
                Map(lambda elm:
                    (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(0, 1), [1, 2,
                                                  3]),  # On the watermark
                    ('k2', IntervalWindow(0, 1), [1, 2,
                                                  3]),  # On the watermark
                    ('k1', IntervalWindow(1, 2), [4, 5]),  # On the watermark
                    ('k2', IntervalWindow(1, 2), [4, 5]),  # On the watermark
                    ('k1', IntervalWindow(0, 1), [6]),  # After the watermark
                ]))
Example #10
    def test_top(self):
        with TestPipeline() as pipeline:
            timestamp = 0

            # First for global combines.
            pcoll = pipeline | 'start' >> Create(
                [6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
            result_top = pcoll | 'top' >> combine.Top.Largest(5)
            result_bot = pcoll | 'bot' >> combine.Top.Smallest(4)
            assert_that(result_top,
                        equal_to([[9, 6, 6, 5, 3]]),
                        label='assert:top')
            assert_that(result_bot,
                        equal_to([[0, 1, 1, 1]]),
                        label='assert:bot')

            # Now for global combines without default
            timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
            windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))
            result_windowed_top = windowed | 'top-wo-defaults' >> combine.Top.Largest(
                5, has_defaults=False)
            result_windowed_bot = (windowed
                                   | 'bot-wo-defaults' >> combine.Top.Smallest(
                                       4, has_defaults=False))
            assert_that(result_windowed_top,
                        equal_to([[9, 6, 6, 5, 3]]),
                        label='assert:top-wo-defaults')
            assert_that(result_windowed_bot,
                        equal_to([[0, 1, 1, 1]]),
                        label='assert:bot-wo-defaults')

            # Again for per-key combines.
            pcoll = pipeline | 'start-perkey' >> Create(
                [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
            result_key_top = pcoll | 'top-perkey' >> combine.Top.LargestPerKey(
                5)
            result_key_bot = pcoll | 'bot-perkey' >> combine.Top.SmallestPerKey(
                4)
            assert_that(result_key_top,
                        equal_to([('a', [9, 6, 6, 5, 3])]),
                        label='key:top')
            assert_that(result_key_bot,
                        equal_to([('a', [0, 1, 1, 1])]),
                        label='key:bot')
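
The combiner semantics are easy to restate in plain Python: Top.Largest(n) keeps the n largest elements in descending order, Top.Smallest(n) the n smallest in ascending order. A quick check against the expectations above:

vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
assert sorted(vals, reverse=True)[:5] == [9, 6, 6, 5, 3]  # Top.Largest(5)
assert sorted(vals)[:4] == [0, 1, 1, 1]                   # Top.Smallest(4)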
Example #11
    def test_builtin_combines(self):
        with TestPipeline() as pipeline:

            vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
            mean = sum(vals) / float(len(vals))
            size = len(vals)
            timestamp = 0

            # First for global combines.
            pcoll = pipeline | 'start' >> Create(vals)
            result_mean = pcoll | 'mean' >> combine.Mean.Globally()
            result_count = pcoll | 'count' >> combine.Count.Globally()
            assert_that(result_mean, equal_to([mean]), label='assert:mean')
            assert_that(result_count, equal_to([size]), label='assert:size')

            # Now for global combines without default
            timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
            windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))
            result_windowed_mean = (windowed
                                    | 'mean-wo-defaults' >>
                                    combine.Mean.Globally().without_defaults())
            assert_that(result_windowed_mean,
                        equal_to([mean]),
                        label='assert:mean-wo-defaults')
            result_windowed_count = (
                windowed
                | 'count-wo-defaults' >>
                combine.Count.Globally().without_defaults())
            assert_that(result_windowed_count,
                        equal_to([size]),
                        label='assert:count-wo-defaults')

            # Again for per-key combines.
            pcoll = pipeline | 'start-perkey' >> Create([('a', x)
                                                         for x in vals])
            result_key_mean = pcoll | 'mean-perkey' >> combine.Mean.PerKey()
            result_key_count = pcoll | 'count-perkey' >> combine.Count.PerKey()
            assert_that(result_key_mean,
                        equal_to([('a', mean)]),
                        label='key:mean')
            assert_that(result_key_count,
                        equal_to([('a', size)]),
                        label='key:size')
Example #12
def run(argv=None):
    """Pipeline for reading data from a PubSub topic or a Cloud Storage bucket,
    redacting the data using Cloud DLP and writing the results to BigQuery"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='PubSub topic to read from.')
    parser.add_argument(
        '--output',
        dest='output',
        help=
        'BigQuery output dataset and table name in the format dataset.tablename'
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Read and parse messages from Pub/Sub.
        lines = (p
                 | 'ReadFromPubSub' >> ReadFromPubSub(
                     topic=known_args.input).with_output_types(bytes)
                 | 'DecodeMessage' >> beam.Map(lambda x: x.decode('utf-8'))
                 | 'ParseMessage' >> beam.ParDo(ParsePubSubMessageFn()))

        windows = (lines
                   | 'WindowInto' >> beam.WindowInto(FixedWindows(30, 0))
                   | 'SumValues' >> beam.CombinePerKey(sum))

        # Format rows and write to BigQuery.
        (windows
         | 'ConvertToDictionary' >> beam.Map(lambda row: {
             'id': row[0],
             'total': row[1]
         })
         | 'WriteToBigQuery' >> WriteToBigQuery(
             known_args.output,
             schema='id:INTEGER, total:INTEGER',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
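
ParsePubSubMessageFn is not shown in the snippet; for CombinePerKey(sum) to work it must emit (id, value) pairs. A plausible, hypothetical sketch:

class ParsePubSubMessageFn(beam.DoFn):
    def process(self, message):
        # Hypothetical message format: 'id,value', e.g. '123,45'.
        id_str, value_str = message.split(',')
        yield int(id_str), int(value_str)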
Example #13
 def test_after_count(self):
     p = TestPipeline()
     result = (p
               | beam.Create([1, 2, 3, 4, 5, 10, 11])
               | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
               # Python 3: tuple-unpacking lambdas are no longer valid, so
               # index into the (key, timestamp) pair instead.
               | beam.Map(lambda kt: TimestampedValue(kt, kt[1]))
               | beam.WindowInto(
                   FixedWindows(10),
                   trigger=AfterCount(3),
                   accumulation_mode=AccumulationMode.DISCARDING)
               | beam.GroupByKey()
               | beam.Map(lambda kv: ('%s-%s' % (kv[0], len(kv[1])), set(kv[1]))))
     assert_that(
         result,
         equal_to({
             'A-5': {1, 2, 3, 4, 5},
             # A-10, A-11 never emitted due to AfterCount(3) never firing.
             'B-4': {6, 7, 8, 9},
             'B-3': {10, 15, 16},
         }.items()))
     p.run()
Example #14
  def test_gbk_execution_after_watermark_trigger(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements([TimestampedValue('a', 11)])
                   .advance_watermark_to(20)
                   .add_elements([TimestampedValue('b', 21)])
                   .advance_watermark_to_infinity())

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p            # pylint: disable=unused-variable
               | test_stream
               | beam.WindowInto(
                   FixedWindows(15),
                   trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                   accumulation_mode=trigger.AccumulationMode.DISCARDING)
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey())

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(0, 15): [
            ('k', ['a']),
            ('k', [])
        ],
        window.IntervalWindow(15, 30): [
            ('k', ['b']),
            ('k', [])
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        label='assert per window')

    p.run()
Example #15
    def test_with_trigger_window_that_finish(self):
        def tsv(key, value, ts):
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 0), tsv('k1', 2, 0)])
              .add_elements([tsv('k1', 3, 0)])
              .advance_watermark_to(2)
              .add_elements([tsv('k1', 6, 0)])  # This value is discarded.
              .advance_watermark_to_infinity())
        # yapf: enable

        # Fixed, one-second windows with DefaultTrigger (after watermark)
        windowing = Windowing(FixedWindows(1),
                              triggerfn=AfterWatermark(),
                              allowed_lateness=0,
                              accumulation_mode=AccumulationMode.DISCARDING)

        with TestPipeline() as p:
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                |
                Map(lambda elm:
                    (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(0, 1), [1, 2,
                                                  3]),  # On the watermark
                ]))
Example #16
def run_pipeline_with_micro_batches(inference_type,
                                    pubsub_topic,
                                    runner,
                                    args=None):

    options = beam.pipeline.PipelineOptions(flags=[], **args)

    pipeline = beam.Pipeline(runner, options=options)

    (pipeline
     | 'Read from PubSub' >> beam.io.ReadStringsFromPubSub(topic=pubsub_topic)
     | 'Micro-batch - Window Size: {} Seconds'.format(WINDOW_SIZE) >>
     beam.WindowInto(FixedWindows(size=WINDOW_SIZE))
     | 'Estimate Targets - {}'.format(inference_type) >>
     beam.FlatMap(lambda messages: estimate(messages, inference_type))
     | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
         project=PROJECT,
         dataset=DATASET,
         table=TABLE,
         schema=schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))

    pipeline.run()
Example #17
  def test_to_set(self):
    pipeline = TestPipeline()
    the_list = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    timestamp = 0
    pcoll = pipeline | 'start' >> Create(the_list)
    result = pcoll | 'to set' >> combine.ToSet()

    # Now for global combines without default
    timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
    windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))
    result_windowed = (
        windowed
        | 'to set wo defaults' >> combine.ToSet().without_defaults())

    def matcher(expected):
      def match(actual):
        equal_to(expected[0])(actual[0])

      return match

    assert_that(result, matcher(set(the_list)))
    assert_that(
        result_windowed, matcher(set(the_list)), label='to-set-wo-defaults')
    pipeline.run()
Example #18
  def test_model_setting_trigger(self):
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
      test_stream = (
          TestStream().advance_watermark_to(10).add_elements(
              ['a', 'a', 'a', 'b',
               'b']).advance_watermark_to(70).advance_processing_time(600))
      pcollection = (
          p
          | test_stream
          | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

      counts = (
          pcollection | WindowInto(
              FixedWindows(1 * 60),
              trigger=AfterProcessingTime(10 * 60),
              accumulation_mode=AccumulationMode.DISCARDING)
          | 'group' >> beam.GroupByKey()
          | 'count' >>
          beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
      assert_that(counts, equal_to([('a', 3), ('b', 2)]))
Example #19
  def test_to_list_and_to_dict2(self):
    with TestPipeline() as pipeline:
      pairs = [(1, 2), (3, 4), (5, 6)]
      timestamp = 0
      pcoll = pipeline | 'start-pairs' >> Create(pairs)
      result = pcoll | 'to dict' >> combine.ToDict()

      # Now for global combines without default
      timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
      windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))
      result_windowed = (
          windowed
          | 'to dict wo defaults' >> combine.ToDict().without_defaults())

      def matcher():
        def match(actual):
          equal_to([1])([len(actual)])
          equal_to(pairs)(actual[0].items())

        return match

      assert_that(result, matcher())
      assert_that(result_windowed, matcher(), label='to-dict-wo-defaults')
Example #20
  def test_after_count(self):
    with TestPipeline() as p:
      def construct_timestamped(k_t):
        return TimestampedValue((k_t[0], k_t[1]), k_t[1])

      def format_result(k_v):
        return ('%s-%s' % (k_v[0], len(k_v[1])), set(k_v[1]))

      result = (p
                | beam.Create([1, 2, 3, 4, 5, 10, 11])
                | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
                | beam.Map(construct_timestamped)
                | beam.WindowInto(FixedWindows(10), trigger=AfterCount(3),
                                  accumulation_mode=AccumulationMode.DISCARDING)
                | beam.GroupByKey()
                | beam.Map(format_result))
      assert_that(result, equal_to(
          {
              'A-5': {1, 2, 3, 4, 5},
              # A-10, A-11 never emitted due to AfterCount(3) never firing.
              'B-4': {6, 7, 8, 9},
              'B-3': {10, 15, 16},
          }.items()))
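
Where the expected panes come from: the 'B' elements are t + 5, so the inputs [1, 2, 3, 4, 5, 10, 11] become [6, 7, 8, 9, 10, 15, 16], and FixedWindows(10) splits them by event time. A quick check:

b_values = [t + 5 for t in [1, 2, 3, 4, 5, 10, 11]]
assert {v for v in b_values if 0 <= v < 10} == {6, 7, 8, 9}    # pane 'B-4'
assert {v for v in b_values if 10 <= v < 20} == {10, 15, 16}   # pane 'B-3'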
Example #21
  def test_gbk_execution_after_watermark_trigger(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a'])
                   .advance_watermark_to(20))

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result   # pylint: disable=global-variable-undefined
    result = []

    def fired_elements(elem):
      result.append(elem)
      return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p            # pylint: disable=unused-variable
               | test_stream
               | beam.WindowInto(
                   FixedWindows(15),
                   trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                   accumulation_mode=trigger.AccumulationMode.DISCARDING)
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey()
               | beam.Map(fired_elements))
    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # TODO(BEAM-3377): Reinstate after assert_that in streaming is fixed.
    # assert_that(records, equal_to([
    #     ('k', ['a']), ('k', [])]))

    p.run()
    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    self.assertEqual([('k', ['a']), ('k', [])], result)
Example #22
  def test_global_sample(self):
    def is_good_sample(actual):
      assert len(actual) == 1
      assert sorted(actual[0]) in [[1, 1, 2], [1, 2, 2]], actual

    with TestPipeline() as pipeline:
      timestamp = 0
      pcoll = pipeline | 'start' >> Create([1, 1, 2, 2])

      # Now for global combines without default
      timestamped = pcoll | Map(lambda x: TimestampedValue(x, timestamp))
      windowed = timestamped | 'window' >> WindowInto(FixedWindows(60))

      for ix in range(9):
        assert_that(
            pcoll | 'sample-%d' % ix >> combine.Sample.FixedSizeGlobally(3),
            is_good_sample,
            label='check-%d' % ix)
        result_windowed = (
            windowed
            | 'sample-wo-defaults-%d' % ix >>
            combine.Sample.FixedSizeGlobally(3).without_defaults())
        assert_that(
            result_windowed, is_good_sample, label='check-wo-defaults-%d' % ix)
Example #23
    def test_streaming_different_file_types(self):
        dir = self._new_tempdir()
        input = iter(WriteFilesTest.SIMPLE_COLLECTION)
        ts = (TestStream().advance_watermark_to(0).add_elements(
            [next(input), next(input)]).advance_watermark_to(10).add_elements(
                [next(input),
                 next(input)]).advance_watermark_to(20).add_elements([
                     next(input), next(input)
                 ]).advance_watermark_to(30).add_elements([
                     next(input), next(input)
                 ]).advance_watermark_to(40).advance_watermark_to_infinity())

        def no_colon_file_naming(*args):
            file_name = fileio.destination_prefix_naming()(*args)
            return file_name.replace(':', '_')

        with TestPipeline() as p:
            _ = (p
                 | ts
                 | beam.WindowInto(FixedWindows(10))
                 | beam.io.fileio.WriteToFiles(
                     path=dir,
                     destination=lambda record: record['foundation'],
                     sink=lambda dest:
                     (WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                      if dest == 'apache' else WriteFilesTest.JsonSink()),
                     file_naming=no_colon_file_naming,
                     max_writers_per_bundle=0,
                 ))

        with TestPipeline() as p:
            cncf_files = (p
                          | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
                          | "CncfFileNames" >> beam.Map(lambda fm: fm.path))

            apache_files = (p
                            | "MatchApache" >> fileio.MatchFiles(
                                FileSystems.join(dir, 'apache*'))
                            |
                            "ApacheFileNames" >> beam.Map(lambda fm: fm.path))

            assert_that(
                cncf_files,
                matches_all([
                    stringmatches.matches_regexp(
                        '.*cncf-1970-01-01T00_00_00-1970-01-01T00_00_10--.*'),
                    stringmatches.matches_regexp(
                        '.*cncf-1970-01-01T00_00_10-1970-01-01T00_00_20--.*'),
                    stringmatches.matches_regexp(
                        '.*cncf-1970-01-01T00_00_20-1970-01-01T00_00_30--.*'),
                    stringmatches.matches_regexp(
                        '.*cncf-1970-01-01T00_00_30-1970-01-01T00_00_40--.*')
                ]),
                label='verifyCNCFFiles')

            assert_that(
                apache_files,
                matches_all([
                    stringmatches.matches_regexp(
                        '.*apache-1970-01-01T00_00_00-1970-01-01T00_00_10--.*'
                    ),
                    stringmatches.matches_regexp(
                        '.*apache-1970-01-01T00_00_10-1970-01-01T00_00_20--.*'
                    ),
                    stringmatches.matches_regexp(
                        '.*apache-1970-01-01T00_00_20-1970-01-01T00_00_30--.*'
                    ),
                    stringmatches.matches_regexp(
                        '.*apache-1970-01-01T00_00_30-1970-01-01T00_00_40--.*')
                ]),
                label='verifyApacheFiles')
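
The no_colon_file_naming wrapper exists because destination_prefix_naming() embeds ISO-8601 window boundaries (e.g. 1970-01-01T00:00:00) in file names, and ':' is problematic on some filesystems. The substitution in isolation:

name = 'apache-1970-01-01T00:00:00-1970-01-01T00:00:10--suffix'
assert name.replace(':', '_') == (
    'apache-1970-01-01T00_00_00-1970-01-01T00_00_10--suffix')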
Example #24
    def test_streaming_complex_timing(self):
        # Use state on the TestCase class, since other references would be pickled
        # into a closure and not have the desired side effects.
        #
        # TODO(BEAM-5295): Use assert_that after it works for the cases here in
        # streaming mode.
        WriteFilesTest.all_records = []

        dir = '%s%s' % (self._new_tempdir(), os.sep)

        # Setting up the input (TestStream)
        ts = TestStream().advance_watermark_to(0)
        for elm in WriteFilesTest.LARGER_COLLECTION:
            timestamp = int(elm)

            ts.add_elements([('key', '%s' % elm)])
            if timestamp % 5 == 0 and timestamp != 0:
                # TODO(BEAM-3759): Add many firings per window after getting PaneInfo.
                ts.advance_processing_time(5)
                ts.advance_watermark_to(timestamp)
        ts.advance_watermark_to_infinity()

        def no_colon_file_naming(*args):
            file_name = fileio.destination_prefix_naming()(*args)
            return file_name.replace(':', '_')

        # The pipeline that we are testing
        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            res = (p
                   | ts
                   | beam.WindowInto(
                       FixedWindows(10),
                       trigger=trigger.AfterWatermark(),
                       accumulation_mode=trigger.AccumulationMode.DISCARDING)
                   | beam.GroupByKey()
                   | beam.FlatMap(lambda x: x[1]))
            # Triggering after 5 processing-time seconds, and on the watermark. Also
            # discarding old elements.

            _ = (res
                 | beam.io.fileio.WriteToFiles(
                     path=dir,
                     file_naming=no_colon_file_naming,
                     max_writers_per_bundle=0)
                 | beam.Map(lambda fr: FileSystems.join(dir, fr.file_name))
                 | beam.ParDo(self.record_dofn()))

        # Verification pipeline
        with TestPipeline() as p:
            files = (p | beam.io.fileio.MatchFiles(FileSystems.join(dir, '*')))

            file_names = (files | beam.Map(lambda fm: fm.path))

            file_contents = (
                files
                | beam.io.fileio.ReadMatches()
                | beam.Map(lambda rf: (rf.metadata.path, rf.read_utf8().strip(
                ).split('\n'))))

            content = (file_contents
                       | beam.FlatMap(lambda fc: [ln.strip() for ln in fc[1]]))

            assert_that(file_names,
                        equal_to(WriteFilesTest.all_records),
                        label='AssertFilesMatch')
            assert_that(content,
                        matches_all(WriteFilesTest.LARGER_COLLECTION),
                        label='AssertContentsMatch')
Example #25
    def test_multiple_outputs_with_watermark_advancement(self):
        """Tests that the TestStream can independently control output watermarks."""

        # Purposely set the watermark of numbers to 20 then letters to 5 to test
        # that the watermark advancement is per PCollection.
        #
        # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
        # emitted at different times so that they will have different windows. The
        # watermark advancement is checked by checking their windows. If the
        # watermark does not advance, then the windows will be [-inf, -inf). If the
        # windows do not advance separately, then the PCollections will both be
        # windowed in [15, 30).
        letters_elements = [
            TimestampedValue('a', 6),
            TimestampedValue('b', 7),
            TimestampedValue('c', 8),
        ]
        numbers_elements = [
            TimestampedValue('1', 21),
            TimestampedValue('2', 22),
            TimestampedValue('3', 23),
        ]
        test_stream = (TestStream().advance_watermark_to(
            0, tag='letters').advance_watermark_to(
                0, tag='numbers').advance_watermark_to(
                    20, tag='numbers').advance_watermark_to(
                        5, tag='letters').add_elements(
                            letters_elements,
                            tag='letters').advance_watermark_to(
                                10, tag='letters').add_elements(
                                    numbers_elements,
                                    tag='numbers').advance_watermark_to(
                                        30, tag='numbers'))

        options = StandardOptions(streaming=True)
        p = TestPipeline(is_integration_test=True, options=options)

        main = p | test_stream

        # Use an AfterWatermark trigger with an early firing to test that the
        # watermark is advancing properly and that the element is being emitted in
        # the correct window.
        letters = (
            main['letters']
            | 'letter windows' >> beam.WindowInto(
                FixedWindows(15),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | 'letter with key' >> beam.Map(lambda x: ('k', x))
            | 'letter gbk' >> beam.GroupByKey())

        numbers = (
            main['numbers']
            | 'number windows' >> beam.WindowInto(
                FixedWindows(15),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | 'number with key' >> beam.Map(lambda x: ('k', x))
            | 'number gbk' >> beam.GroupByKey())

        # The letters were emitted when the watermark was at 5, thus we expect to
        # see the elements in the [0, 15) window. We used an early trigger to make
        # sure that the ON_TIME empty pane was also emitted with a TestStream.
        # This pane has no data because the early trigger causes the elements to
        # fire before the end of the window and because the accumulation mode
        # discards any data after the trigger fired.
        expected_letters = {
            window.IntervalWindow(0, 15): [
                ('k', ['a', 'b', 'c']),
                ('k', []),
            ],
        }

        # Same here, except the numbers were emitted at watermark = 20, thus they
        # are in the [15, 30) window.
        expected_numbers = {
            window.IntervalWindow(15, 30): [
                ('k', ['1', '2', '3']),
                ('k', []),
            ],
        }
        assert_that(letters,
                    equal_to_per_window(expected_letters),
                    label='letters assert per window')
        assert_that(numbers,
                    equal_to_per_window(expected_numbers),
                    label='numbers assert per window')

        p.run()
Example #27
def run_pipeline_with_micro_batches(inference_type, project,
                                    pubsub_topic, pubsub_subscription,
                                    bq_dataset, bq_table,
                                    window_size, runner, args=None):

    prepare_steaming_source(project, pubsub_topic, pubsub_subscription)
    prepare_steaming_sink(project, bq_dataset, bq_table)
    pubsub_subscription_url = "projects/{}/subscriptions/{}".format(project, pubsub_subscription)
    options = beam.pipeline.PipelineOptions(flags=[], **args)

    pipeline = beam.Pipeline(runner, options=options)
    (
            pipeline
            | 'Read from PubSub' >> beam.io.ReadStringsFromPubSub(subscription=pubsub_subscription_url, id_label="source_id")
            | 'Micro-batch - Window Size: {} Seconds'.format(window_size) >> beam.WindowInto(FixedWindows(size=window_size))
            | 'Estimate Targets - {}'.format(inference_type) >> beam.FlatMap(lambda messages: estimate(messages, inference_type))
            | 'Write to BigQuery' >> beam.io.WriteToBigQuery(project=project,
                                                             dataset=bq_dataset,
                                                             table=bq_table
                                                             )
    )

    pipeline.run()
Example #28
 def expand(self, input):
     return (input
             | WindowInto(FixedWindows(self.duration))
             | ExtractAndSumScore("team"))
Example #29
 def expand(self, pcoll):
     return (
         pcoll
         # Bind window info to each element using element timestamp (or publish time).
         | 'window' >> beam.WindowInto(FixedWindows(self.window_size)))
Example #30
    def test_windowing(self):
        test_stream = (TestStream()
                       .advance_watermark_to(0)
                       .add_elements(['a', 'b', 'c'])
                       .advance_processing_time(1)
                       .advance_processing_time(1)
                       .advance_processing_time(1)
                       .advance_processing_time(1)
                       .advance_processing_time(1)
                       .advance_watermark_to(5)
                       .add_elements(['1', '2', '3'])
                       .advance_processing_time(1)
                       .advance_watermark_to(6)
                       .advance_processing_time(1)
                       .advance_watermark_to(7)
                       .advance_processing_time(1)
                       .advance_watermark_to(8)
                       .advance_processing_time(1)
                       .advance_watermark_to(9)
                       .advance_processing_time(1)
                       .advance_watermark_to(10)
                       .advance_processing_time(1)
                       .advance_watermark_to(11)
                       .advance_processing_time(1)
                       .advance_watermark_to(12)
                       .advance_processing_time(1)
                       .advance_watermark_to(13)
                       .advance_processing_time(1)
                       .advance_watermark_to(14)
                       .advance_processing_time(1)
                       .advance_watermark_to(15)
                       .advance_processing_time(1)
                       )  # yapf: disable

        options = StandardOptions(streaming=True)
        p = TestPipeline(options=options)

        records = (p
                   | test_stream
                   | 'letter windows' >> beam.WindowInto(
                       FixedWindows(5),
                       accumulation_mode=trigger.AccumulationMode.DISCARDING)
                   | 'letter with key' >> beam.Map(lambda x: ('k', x))
                   | 'letter gbk' >> beam.GroupByKey()
                   | ReverseTestStream(sample_resolution_sec=1,
                                       output_tag=None))

        assert_that(
            records,
            equal_to_per_window({
                beam.window.GlobalWindow(): [
                    [ProcessingTimeEvent(5),
                     WatermarkEvent(4999998)],
                    [
                        ElementEvent([
                            TimestampedValue(('k', ['a', 'b', 'c']), 4.999999)
                        ])
                    ],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(5000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(6000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(7000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(8000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(9000000)],
                    [
                        ElementEvent([
                            TimestampedValue(('k', ['1', '2', '3']), 9.999999)
                        ])
                    ],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(10000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(11000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(12000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(13000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(14000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(15000000)],
                ],
            }))

        p.run()