Example #1
File: query10.py  Project: mahak/beam
def load(events, metadata=None, pipeline_options=None):
  return (
      events
      | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
      # The composite trigger fires as each sub-trigger, executed in order,
      # fires:
      #   1. repeatedly, after at least max_log_events elements in the pane,
      #      until the watermark passes the end of the window; then
      #   2. repeatedly, after at least max_log_events elements in the pane,
      #      or after processing time passes the first element in the pane
      #      plus LATE_BATCHING_PERIOD.
      | 'query10_fix_window' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          trigger=trigger.AfterEach(
              trigger.OrFinally(
                  trigger.Repeatedly(
                      trigger.AfterCount(metadata.get('max_log_events'))),
                  trigger.AfterWatermark()),
              trigger.Repeatedly(
                  trigger.AfterAny(
                      trigger.AfterCount(metadata.get('max_log_events')),
                      trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          # Use a 1 day allowed lateness so that any forgotten hold will stall
          # the pipeline for that period and be very noticeable.
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk' >> beam.GroupByKey()
      | 'query10_write_event' >> beam.ParDo(WriteEventDoFn(), pipeline_options)
      | 'query10_window_log_files' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk_2' >> beam.GroupByKey()
      | 'query10_write_index' >> beam.ParDo(WriteIndexDoFn(), pipeline_options))
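The composite trigger above is the densest part of this example. As a hedged, minimal sketch of the same AfterEach/OrFinally shape (the `events` name, the 10-second window, and the literal count and delay are illustrative assumptions, not values taken from query10):

# Illustrative sketch only. Phase 1 fires repeatedly on element count until
# the watermark passes the end of the window; phase 2 then fires for late
# data on element count or on a processing-time delay.
windowed = events | beam.WindowInto(
    window.FixedWindows(10),
    trigger=trigger.AfterEach(
        trigger.OrFinally(
            trigger.Repeatedly(trigger.AfterCount(100)),
            trigger.AfterWatermark()),
        trigger.Repeatedly(
            trigger.AfterAny(
                trigger.AfterCount(100),
                trigger.AfterProcessingTime(60)))),
    accumulation_mode=trigger.AccumulationMode.DISCARDING)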
Example #2
    def test_multi_triggered_gbk_side_input(self):
        """Test a GBK sideinput, with multiple triggering."""
        # TODO(BEAM-9322): Remove use of this experiment.
        # This flag is only necessary when using the multi-output TestStream b/c
        # it relies on using the PCollection output tags as the PCollection output
        # ids.
        p = TestPipeline(additional_pipeline_args=[
            '--experiments=' + 'passthrough_pcollection_output_ids'
        ])

        test_stream = (
            p
            | 'Mixed TestStream' >> TestStream()
                .advance_watermark_to(3, tag='main')
                .add_elements(['a1'], tag='main')
                .advance_watermark_to(8, tag='main')
                .add_elements(['a2'], tag='main')
                .add_elements(
                    [window.TimestampedValue(('k', 100), 2)], tag='side')
                .add_elements(
                    [window.TimestampedValue(('k', 400), 7)], tag='side')
                .advance_watermark_to_infinity(tag='main')
                .advance_watermark_to_infinity(tag='side'))

        main_data = (
            test_stream['main']
            | 'Main windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                accumulation_mode=trigger.AccumulationMode.DISCARDING))

        side_data = (
            test_stream['side']
            | 'Side windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.CombinePerKey(sum)
            | 'Values' >> Map(lambda k_vs: k_vs[1]))

        class RecordFn(beam.DoFn):
            def process(self,
                        elm=beam.DoFn.ElementParam,
                        ts=beam.DoFn.TimestampParam,
                        side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        records = (main_data
                   | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

        expected_window_to_elements = {
            window.IntervalWindow(0, 5): [
                ('a1', Timestamp(3), [100, 0]),
            ],
            window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
        }

        assert_that(records,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='assert per window')

        p.run()
Example #3
 def test_different_fixed_windows(self):
   self.run_windowed_side_inputs([1, 2, 11, 21, 31],
                                 window.FixedWindows(10),
                                 window.FixedWindows(20),
                                 expected=[(1, [1, 2, 11]), (2, [1, 2, 11]),
                                           (11, [1, 2, 11]), (21, [21, 31]),
                                           (31, [21, 31])])
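The expected values follow from how Beam maps main-input windows onto a side input's windowing: each FixedWindows(10) main window is mapped to the FixedWindows(20) side window that contains it, so element 11 (main window [10, 20)) reads side window [0, 20). A hedged standalone sketch of the same pattern, outside the run_windowed_side_inputs harness (the pipeline scaffolding here is an assumption):

with beam.Pipeline() as p:
    timestamped = (p
                   | beam.Create([1, 2, 11, 21, 31])
                   | beam.Map(lambda t: window.TimestampedValue(t, t)))
    main = timestamped | 'Main10' >> beam.WindowInto(window.FixedWindows(10))
    side = timestamped | 'Side20' >> beam.WindowInto(window.FixedWindows(20))
    # Element 11 sits in main window [10, 20), which maps to side window
    # [0, 20), so it pairs with [1, 2, 11].
    paired = main | beam.Map(lambda x, s: (x, sorted(s)),
                             beam.pvalue.AsList(side))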
Example #4
  def test_pardo_side_inputs(self):
    def cross_product(elem, sides):
      for side in sides:
        yield elem, side
    with self.create_pipeline() as p:
      main = p | 'main' >> beam.Create(['a', 'b', 'c'])
      side = p | 'side' >> beam.Create(['x', 'y'])
      assert_that(main | beam.FlatMap(cross_product, beam.pvalue.AsList(side)),
                  equal_to([('a', 'x'), ('b', 'x'), ('c', 'x'),
                            ('a', 'y'), ('b', 'y'), ('c', 'y')]))

      # Now with some windowing.
      pcoll = p | beam.Create(range(10)) | beam.Map(
          lambda t: window.TimestampedValue(t, t))
      # Intentionally choosing non-aligned windows to highlight the transition.
      main = pcoll | 'WindowMain' >> beam.WindowInto(window.FixedWindows(5))
      side = pcoll | 'WindowSide' >> beam.WindowInto(window.FixedWindows(7))
      res = main | beam.Map(lambda x, s: (x, sorted(s)),
                            beam.pvalue.AsList(side))
      assert_that(
          res,
          equal_to([
              # The window [0, 5) maps to the window [0, 7).
              (0, list(range(7))),
              (1, list(range(7))),
              (2, list(range(7))),
              (3, list(range(7))),
              (4, list(range(7))),
              # The window [5, 10) maps to the window [7, 14).
              (5, list(range(7, 10))),
              (6, list(range(7, 10))),
              (7, list(range(7, 10))),
              (8, list(range(7, 10))),
              (9, list(range(7, 10)))]),
          label='windowed')
Example #5
 def test_pardo_windowed_side_inputs(self):
     with self.create_pipeline() as p:
         # Now with some windowing.
         pcoll = p | beam.Create(list(
             range(10))) | beam.Map(lambda t: window.TimestampedValue(t, t))
         # Intentionally choosing non-aligned windows to highlight the transition.
         main = pcoll | 'WindowMain' >> beam.WindowInto(
             window.FixedWindows(5))
         side = pcoll | 'WindowSide' >> beam.WindowInto(
             window.FixedWindows(7))
         res = main | beam.Map(lambda x, s:
                               (x, sorted(s)), beam.pvalue.AsList(side))
         assert_that(
             res,
             equal_to([
                 # The window [0, 5) maps to the window [0, 7).
                 (0, list(range(7))),
                 (1, list(range(7))),
                 (2, list(range(7))),
                 (3, list(range(7))),
                 (4, list(range(7))),
                 # The window [5, 10) maps to the window [7, 14).
                 (5, list(range(7, 10))),
                 (6, list(range(7, 10))),
                 (7, list(range(7, 10))),
                 (8, list(range(7, 10))),
                 (9, list(range(7, 10)))
             ]),
             label='windowed')
Example #6
    def test_multi_triggered_gbk_side_input(self):
        """Test a GBK sideinput, with multiple triggering."""
        options = StandardOptions(streaming=True)
        p = TestPipeline(options=options)

        test_stream = (
            p
            | 'Mixed TestStream' >> TestStream()
                .advance_watermark_to(3, tag='main')
                .add_elements(['a1'], tag='main')
                .advance_watermark_to(8, tag='main')
                .add_elements(['a2'], tag='main')
                .add_elements(
                    [window.TimestampedValue(('k', 100), 2)], tag='side')
                .add_elements(
                    [window.TimestampedValue(('k', 400), 7)], tag='side')
                .advance_watermark_to_infinity(tag='main')
                .advance_watermark_to_infinity(tag='side'))

        main_data = (
            test_stream['main']
            | 'Main windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                accumulation_mode=trigger.AccumulationMode.DISCARDING))

        side_data = (
            test_stream['side']
            | 'Side windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.CombinePerKey(sum)
            | 'Values' >> Map(lambda k_vs: k_vs[1]))

        class RecordFn(beam.DoFn):
            def process(self,
                        elm=beam.DoFn.ElementParam,
                        ts=beam.DoFn.TimestampParam,
                        side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        records = (main_data
                   | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

        expected_window_to_elements = {
            window.IntervalWindow(0, 5): [
                ('a1', Timestamp(3), [100, 0]),
            ],
            window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
        }

        assert_that(records,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='assert per window')

        p.run()
Example #7
  def test_basic_execution_sideinputs_fixed_windows(self):
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment(
        'passthrough_pcollection_output_ids')
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    test_stream = (p | TestStream()
        .advance_watermark_to(12, tag='side')
        .add_elements([window.TimestampedValue('s1', 10)], tag='side')
        .advance_watermark_to(20, tag='side')
        .add_elements([window.TimestampedValue('s2', 20)], tag='side')

        .advance_watermark_to(9, tag='main')
        .add_elements(['a1', 'a2', 'a3', 'a4'], tag='main')
        .add_elements(['b'], tag='main')
        .advance_watermark_to(18, tag='main')
        .add_elements('c', tag='main')
        ) # yapf: disable

    main_stream = (
        test_stream['main']
        | 'main windowInto' >> beam.WindowInto(window.FixedWindows(1)))

    side_stream = (
        test_stream['side']
        | 'side windowInto' >> beam.WindowInto(window.FixedWindows(3)))

    class RecordFn(beam.DoFn):
      def process(
          self,
          elm=beam.DoFn.ElementParam,
          ts=beam.DoFn.TimestampParam,
          side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (
        main_stream  # pylint: disable=unused-variable
        | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(9, 10): [
            ('a1', Timestamp(9), ['s1']), ('a2', Timestamp(9), ['s1']),
            ('a3', Timestamp(9), ['s1']), ('a4', Timestamp(9), ['s1']),
            ('b', Timestamp(9), ['s1'])
        ],
        window.IntervalWindow(18, 19): [('c', Timestamp(18), ['s2'])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        label='assert per window')

    p.run()
Example #8
  def test_basic_execution_sideinputs_fixed_windows(self):

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result     # pylint: disable=global-variable-undefined
    result = []

    def recorded_elements(elem):
      result.append(elem)
      return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    main_stream = (p
                   | 'main TestStream' >> TestStream()
                   .advance_watermark_to(9)
                   .add_elements(['a1', 'a2', 'a3', 'a4'])
                   .add_elements(['b'])
                   .advance_watermark_to(18)
                   .add_elements('c')
                   | 'main windowInto' >> beam.WindowInto(
                       window.FixedWindows(1))
                  )
    side_stream = (p
                   | 'side TestStream' >> TestStream()
                   .advance_watermark_to(12)
                   .add_elements([window.TimestampedValue('s1', 10)])
                   .advance_watermark_to(20)
                   .add_elements([window.TimestampedValue('s2', 20)])
                   | 'side windowInto' >> beam.WindowInto(
                       window.FixedWindows(3))
                  )

    class RecordFn(beam.DoFn):
      def process(self,
                  elm=beam.DoFn.ElementParam,
                  ts=beam.DoFn.TimestampParam,
                  side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (main_stream     # pylint: disable=unused-variable
               | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream))
               | beam.Map(recorded_elements))
    p.run()

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    self.assertEqual([('a1', Timestamp(9), ['s1']),
                      ('a2', Timestamp(9), ['s1']),
                      ('a3', Timestamp(9), ['s1']),
                      ('a4', Timestamp(9), ['s1']),
                      ('b', Timestamp(9), ['s1']),
                      ('c', Timestamp(18), ['s2'])], result)
Example #9
def run(argv=None):
    """Build and run the pipeline"""
    parser = argparse.ArgumentParser()
    parser.add_argument("--topic", type=str, help='Pub/Sub topic to read from')
    parser.add_argument("--output_bucket", help=('Output local filemane'))
    parser.add_argument('--output_bigquery',
                        default='IoTData.engine',
                        help=('Output BigQuery table: '
                              'PROJECT:DATASET.TABLE '
                              'or DATASET.TABLE.'))
    parser.add_argument('--output_bigquery_avg',
                        default='DeviceData.engine_avr',
                        help=('Output BigQuery table for averages: '
                              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = beam.Pipeline(options=options)
    pubsub_stream = (
        p | 'Read from PubSub' >> beam.io.ReadFromPubSub(topic=args.topic))
    records = (pubsub_stream
               | 'Parse JSON to Dict' >> beam.Map(lambda e: json.loads(e))
               | 'Add timestamp' >> beam.ParDo(AddTimestampToDict()))

    # stream to BigQuery
    (records | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        args.output_bigquery,
        schema=Schema.get_bigquery_schema(),
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # averages
    (records | "Window for avg" >> beam.WindowInto(window.FixedWindows(60))
     | 'Add deviceId Key' >> beam.ParDo(AddKeyToDict())
     | 'Group by Key' >> beam.GroupByKey()
     | 'Count average' >> beam.ParDo(CountAverages())
     | 'Write Avg to BigQuery' >> beam.io.WriteToBigQuery(
         args.output_bigquery_avg,
         schema=Schema.get_bigquery_avg_schema(),
         create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=BigQueryDisposition.WRITE_APPEND))

    (records | "Window for bucket" >> beam.WindowInto(window.FixedWindows(60))
     | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
     | "Group by Dummy Key" >> beam.GroupByKey()
     | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
     | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(args.output_bucket)))

    result = p.run()
    result.wait_until_finish()
Example #10
 def test_windowed_singleton(self):
   self.run_windowed_side_inputs(
       [1, 2, 11],
       window.FixedWindows(10),
       side_input_type=beam.pvalue.AsSingleton,
       combine_fn=sum,
       expected=[(1, 3), (2, 3), (11, 11)])
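Here run_windowed_side_inputs combines each side-input window with sum and reads it as a singleton. A hedged sketch of an equivalent standalone pipeline (how the harness applies combine_fn is an assumption; a per-window CombineGlobally stands in for it):

with beam.Pipeline() as p:
    timestamped = (p
                   | beam.Create([1, 2, 11])
                   | beam.Map(lambda t: window.TimestampedValue(t, t)))
    main = timestamped | 'MainW' >> beam.WindowInto(window.FixedWindows(10))
    side = (timestamped
            | 'SideW' >> beam.WindowInto(window.FixedWindows(10))
            | beam.CombineGlobally(sum).without_defaults())
    # Each element pairs with the sum of its own window:
    # (1, 3), (2, 3), (11, 11).
    paired = main | beam.Map(lambda x, s: (x, s),
                             beam.pvalue.AsSingleton(side))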
Example #11
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_topic',
                        required=True,
                        help=('Input PubSub topic of the form '
                              '"projects/<PROJECT>/topics/<TOPIC>".'))
    parser.add_argument('--output_topic',
                        required=True,
                        help=('Output PubSub topic of the form '
                              '"projects/<PROJECT>/topic/<TOPIC>".'))
    known_args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=options) as p:

        # Read from PubSub into a PCollection.
        lines = p | beam.io.ReadStringsFromPubSub(known_args.input_topic)

        # Split each line into words and count occurrences per window.
        transformed = (
            lines
            # Use a pre-defined function that imports the re package.
            | 'Split' >> (beam.FlatMap(split_fn).with_output_types(str))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | beam.WindowInto(window.FixedWindows(15, 0))
            | 'Group' >> beam.GroupByKey()
            | 'Count' >> beam.Map(
                lambda word_ones: (word_ones[0], sum(word_ones[1])))
            | 'Format' >> beam.Map(lambda tup: '%s: %d' % tup))

        # Write to PubSub.
        # pylint: disable=expression-not-assigned
        transformed | beam.io.WriteStringsToPubSub(known_args.output_topic)
Example #12
def run(argv=None):
    parser = argparse.ArgumentParser()

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    offer_stat_pipeline_options = pipeline_options.view_as(
        OfferStatPipelineOptions)

    p = beam.Pipeline(options=pipeline_options)

    p | "Read account offer from PS" >> beam.io.ReadFromPubSub(topic=offer_stat_pipeline_options.account_offers_topic) \
    | "Parse message" >> beam.ParDo(PubsubMessageParser()) \
    | "Windowing" >> beam.WindowInto(window.FixedWindows(60),
                                     trigger=trigger.AfterWatermark(early=trigger.AfterProcessingTime(20)),
                                     accumulation_mode=AccumulationMode.ACCUMULATING) \
    | "WithKeys" >> beam.Map(lambda account_offer: ((account_offer['offer_id']), account_offer)) \
    | beam.GroupByKey() \
    | 'Count distinct accounts' >> beam.ParDo(DistinctAccountCount()) \
    | 'Map to BQ row' >> beam.ParDo(ConvertStatToBQRow()) \
    | 'Writing offers to BQ' >> beam.io.WriteToBigQuery(table=offer_stat_pipeline_options.offer_stat_bq_table,
                                                        create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                                                        write_disposition=BigQueryDisposition.WRITE_APPEND,
                                                        schema=OFFER_STAT_BQ_SCHEMA)

    result = p.run()
    result.wait_until_finish()
Example #13
File: sync.py  Project: sambvfx/rillbeam
def main_without_pubsub(options):
    from rillbeam.transforms import SleepFn

    with beam.Pipeline(options=options) as pipe:

        # FIXME: still can't "fake" timestamp data like we get from pubsub...
        graph = (
            pipe
            | 'start' >> beam.Create([(k, k) for k in range(5)])
            # The purpose of the WindowInto transform is to establish a
            # FixedWindows windowing function for the PCollection.
            # It does not bucket elements into windows, since the timestamps
            # from Create are not spread out; most likely they all fall
            # into the same window.
            | 'w' >> beam.WindowInto(window.FixedWindows(5))
            # Generate timestamped values, using the values as timestamps.
            # The values now have distinct timestamps, and since Map
            # propagates the windowing function from input to output, the
            # output PCollection can have elements falling into different
            # 5-second windows.
            | beam.Map(lambda x_t2: window.TimestampedValue(x_t2[0], x_t2[1])))

        b1 = (graph
              | 'AsInt' >> beam.Map(lambda x: int(x))
              | 'LogInt' >> Log())

        b2 = (graph
              | 'AsStr' >> beam.Map(lambda x: str(x))
              | 'LogStr' >> Log())

        b3 = (b1
              | 'Sleep' >> beam.ParDo(SleepFn(), duration=0.2)
              | 'AsFloat' >> beam.Map(lambda x: float(x))
              | 'LogFloat' >> Log())

        ((b1, b2, b3) | Sync() | 'SyncLog' >> Log())
Example #14
def run(argv=None):
  """Build and run the pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_topic', required=True,
      help='Input PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
  parser.add_argument(
      '--output_topic', required=True,
      help='Output PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = beam.Pipeline(argv=pipeline_args)

  # Read from PubSub into a PCollection.
  lines = p | beam.io.Read(
      'read', beam.io.PubSubSource(known_args.input_topic))

  # Split each line into words and count occurrences per window.
  transformed = (lines
                 | 'Split' >> (
                     beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
                      .with_output_types(str))
                 | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                 | beam.WindowInto(window.FixedWindows(15, 0))
                 | 'Group' >> beam.GroupByKey()
                 | 'Count' >> beam.Map(
                     lambda word_ones: (word_ones[0], sum(word_ones[1])))
                 | 'Format' >> beam.Map(lambda tup: '%s: %d' % tup))

  # Write to PubSub.
  # pylint: disable=expression-not-assigned
  transformed | beam.io.Write(
      'pubsub_write', beam.io.PubSubSink(known_args.output_topic))

  p.run().wait_until_finish()
Example #15
    def test_timer_output_timestamp_and_window(self):
        class TimerEmittingStatefulDoFn(DoFn):
            EMIT_TIMER_1 = TimerSpec('emit1', TimeDomain.WATERMARK)

            def process(self, element, timer1=DoFn.TimerParam(EMIT_TIMER_1)):
                timer1.set(10)

            @on_timer(EMIT_TIMER_1)
            def emit_callback_1(self,
                                window=DoFn.WindowParam,
                                ts=DoFn.TimestampParam,
                                key=DoFn.KeyParam):
                yield ('timer1-{key}'.format(key=key), int(ts),
                       int(window.start), int(window.end))

        pipeline_options = PipelineOptions()
        with TestPipeline(options=pipeline_options) as p:
            test_stream = (TestStream().advance_watermark_to(10).add_elements(
                [1]))
            (p
             | test_stream
             | beam.Map(lambda x: ('mykey', x))
             | "window_into" >> beam.WindowInto(
                 window.FixedWindows(5),
                 accumulation_mode=trigger.AccumulationMode.DISCARDING)
             | beam.ParDo(TimerEmittingStatefulDoFn())
             | beam.ParDo(self.record_dofn()))

        self.assertEqual([('timer1-mykey', 10, 10, 15)],
                         sorted(StatefulDoFnOnDirectRunnerTest.all_records))
Example #16
    def test_stateful_set_state_clean_portably(self):
        class SetStateClearingStatefulDoFn(beam.DoFn):

            SET_STATE = SetStateSpec('buffer', VarIntCoder())
            EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.WATERMARK)

            def process(self,
                        element,
                        set_state=beam.DoFn.StateParam(SET_STATE),
                        emit_timer=beam.DoFn.TimerParam(EMIT_TIMER)):
                _, value = element
                set_state.add(value)

                all_elements = [element for element in set_state.read()]

                if len(all_elements) == 5:
                    set_state.clear()
                    set_state.add(100)
                    emit_timer.set(1)

            @on_timer(EMIT_TIMER)
            def emit_values(self, set_state=beam.DoFn.StateParam(SET_STATE)):
                yield sorted(set_state.read())

        with TestPipeline() as p:
            values = p | beam.Create([('key', 1), ('key', 2), ('key', 3),
                                      ('key', 4), ('key', 5)])
            actual_values = (values
                             |
                             beam.Map(lambda t: window.TimestampedValue(t, 1))
                             | beam.WindowInto(window.FixedWindows(1))
                             | beam.ParDo(SetStateClearingStatefulDoFn()))

            assert_that(actual_values, equal_to([[100]]))
Example #17
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      required=True,
                      help='Input Pub/Sub subscription to read from.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output BigQuery table to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)

  # Read from Pub/Sub into a PCollection and run the count pipeline.
  (p | 'read' >> ReadFromPubSub(subscription=known_args.input)
     | 'extract words' >> beam.FlatMap(extract_words)
     | 'transform to kv' >> beam.Map(lambda x: (x,1))
     | 'window into fixed intervals' >> beam.WindowInto(
                                window.FixedWindows(5),
                                trigger=trigger.AfterProcessingTime(delay=10),
                                accumulation_mode=trigger.AccumulationMode.DISCARDING)
     | 'group by words' >> beam.GroupByKey()
     | 'count ones' >> beam.Map(count_ones)
     | 'format for bq' >> beam.Map(format_for_bigquery)
     | 'write to bigquery' >> WriteToBigQuery(table=known_args.output))

  result = p.run()
  result.wait_until_finish()
Example #18
 def expand(self, p):
     return (p
             |
             'window' >> beam.WindowInto(window.FixedWindows(self.duration))
             | 'filter_spammers' >> beam.ParDo(FilterSpammers(),
                                               spammers=self.spammers)
             | 'extract_team_score' >> ExtractAndSumScore('team'))
Example #19
    def test_combiner_latest(self):
        """Test TimestampCombiner with LATEST."""
        options = PipelineOptions(streaming=True)
        with TestPipeline(options=options) as p:
            result = (
                p
                | TestStream()
                    .add_elements([window.TimestampedValue(('k', 100), 2)])
                    .add_elements([window.TimestampedValue(('k', 400), 7)])
                    .advance_watermark_to_infinity()
                | beam.WindowInto(
                    window.FixedWindows(10),
                    timestamp_combiner=TimestampCombiner.OUTPUT_AT_LATEST)
                | beam.CombinePerKey(sum))

            records = (
                result
                | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

            # All KV pairs for the same key in a window are combined, and
            # the result is assigned the LATEST input timestamp.
            expected_window_to_elements = {
                window.IntervalWindow(0, 10): [
                    (('k', 500), Timestamp(7)),
                ],
            }

            assert_that(records,
                        equal_to_per_window(expected_window_to_elements),
                        use_global_window=False,
                        label='assert per window')
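For contrast, a hedged variant that swaps in OUTPUT_AT_EARLIEST (only the timestamp_combiner argument changes; everything else is assumed identical) would emit ('k', 500) at Timestamp(2), the earliest input timestamp in the window:

                | beam.WindowInto(
                    window.FixedWindows(10),
                    timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)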
Example #20
 def test_fixed_global_window(self):
     self.run_windowed_side_inputs([1, 2, 11],
                                   window.FixedWindows(10),
                                   window.GlobalWindows(),
                                   expected=[(1, [1, 2, 11]),
                                             (2, [1, 2, 11]),
                                             (11, [1, 2, 11])])
Example #21
 def expand(self, pcoll):
     return (
         pcoll
         # Assigns window info to each Pub/Sub message based on its publish timestamp.
         | "window_into" >> beam.WindowInto(
             window.FixedWindows(self.window_size))
         | "parse_message" >> beam.Map(ProcessMessages.transform))
Example #22
def run(argv=None):
    """Build and run the pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_subscription',
        required=True,
        help=
        'Input PubSub subscription of the form "projects/<project>/subscriptions/<subscription_name>".'
    )
    parser.add_argument(
        '--output_table',
        required=True,
        help=
        ('Output BigQuery table for results specified as: PROJECT:DATASET.TABLE '
         'or DATASET.TABLE.'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:

        # Read the text from PubSub messages.
        lines = p | beam.io.ReadFromPubSub(
            subscription=known_args.input_subscription)
        transformed = (lines
                       | 'Split' >> (beam.FlatMap(find_msg))
                       | 'window' >> beam.WindowInto(window.FixedWindows(60))
                       | 'append' >> beam.CombineGlobally(
                           ToListCombineFn()).without_defaults()
                       | 'Format' >> beam.ParDo(FormDoFn()))

        transformed | 'Write' >> beam.io.WriteToBigQuery(
            known_args.output_table)
Example #23
 def test_serialize_windowing_strategy(self):
     # This just tests the basic path; more complete tests
     # are in window_test.py.
     strategy = Windowing(window.FixedWindows(10))
     self.assertEqual(
         strategy,
         DataflowRunner.deserialize_windowing_strategy(
             DataflowRunner.serialize_windowing_strategy(strategy, None)))
Example #24
 def apply_window(self, data):
     """Function used to apply the window to the data.
     Currently this is FIXED since OHLCV data is calculated using a fixed window.
     :param data: PCollection being processed
     :return: PCollection with applied window depending on window size.
     """
     return data | 'Resampler - Divide data to windows' >> beam.WindowInto(
         window.FixedWindows(self.window_size))
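A hedged usage note: assuming an instance with window_size in seconds and a PCollection of OHLCV rows (both names below are hypothetical), the helper is just a labeled WindowInto:

windowed = resampler.apply_window(prices)  # hypothetical instance and input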
Example #25
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('input_topic',
                        type=str,
                        help="Input Pub/Sub topic name.")
    parser.add_argument(
        'output_table',
        type=str,
        help="Output BigQuery table name. Example: project.db.name")
    parser.add_argument('--model_project',
                        type=str,
                        help="Google Project ID with model.")
    parser.add_argument('--model_name',
                        type=str,
                        help="Name of the Google AI Platform model name.")
    parser.add_argument('--model_region',
                        type=str,
                        help="AI Platform region name.")
    parser.add_argument('--model_version',
                        type=str,
                        help="AI Platform model version.")

    known_args, pipeline_args = parser.parse_known_args(argv)

    _topic_comp = known_args.input_topic.split('/')
    if len(_topic_comp) != 4 or _topic_comp[0] != 'projects' or _topic_comp[
            2] != 'topics':
        raise ValueError("Table topic name has inappropriate format.")

    if len(known_args.output_table.split('.')) != 2:
        raise ValueError("Table name has inappropriate format.")

    inf_args = [
        known_args.model_project, known_args.model_name,
        known_args.model_region, known_args.model_version
    ]
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = Pipeline(options=options)
    _ = (p | 'read from pub/sub' >> ReadFromPubSub(
        known_args.input_topic).with_output_types(bytes)
         | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
         | 'convert to dict' >> Map(json.loads)
         | 'pre processing' >> PreProcessing()
         | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
         | 'format message' >> Map(formatter)
         | 'write to BQ' >> WriteToBigQuery(
             table=known_args.output_table,
             schema=build_bq_schema(),
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_APPEND))
    if os.environ.get('DEPLOY'):
        # Use p.run() instead of the `with Pipeline() as p` context manager
        # because the process needs to exit right after submitting the job.
        p.run()
    else:
        p.run().wait_until_finish()
Example #26
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_topic',
                        required=True,
                        help=('Output PubSub topic of the form '
                              '"projects/<PROJECT>/topic/<TOPIC>".'))
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--input_topic',
                       help=('Input PubSub topic of the form '
                             '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
        lines = p | beam.io.ReadFromPubSub(
            subscription=known_args.input_subscription)
    else:
        lines = p | beam.io.ReadFromPubSub(topic=known_args.input_topic)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (
        lines
        | 'split' >>
        (beam.ParDo(WordExtractingDoFn()).with_output_types(six.text_type))
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | beam.WindowInto(window.FixedWindows(15, 0))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    output | beam.io.WriteToPubSub(known_args.output_topic)

    result = p.run()
    result.wait_until_finish()
Example #27
def run(argv=None, save_main_session=True):
  """Build and run the pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic',
      required=True,
      help=(
          'Output PubSub topic of the form '
          '"projects/<PROJECT>/topics/<TOPIC>".'))
  parser.add_argument(
      '--input_topic',
      required=True,
      help=(
          'Input PubSub topic of the form '
          '"projects/<PROJECT>/topics/<TOPIC>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  pipeline_options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=pipeline_options) as p:

    messages = (
        p
        | beam.io.ReadFromPubSub(topic=known_args.input_topic).
        with_output_types(bytes))

    lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))

    # Count the occurrences of each word.
    def count_ones(word_ones):
      (word, ones) = word_ones
      return (word, sum(ones))

    counts = (
        lines
        | 'split' >>
        (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | beam.WindowInto(window.FixedWindows(15, 0))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
      (word, count) = word_count
      return '%s: %d' % (word, count)

    output = (
        counts
        | 'format' >> beam.Map(format_result)
        | 'encode' >>
        beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes))

    # Write to PubSub.
    output | beam.io.WriteToPubSub(known_args.output_topic)
Example #28
 def expand(self, pcoll):
     return (
         pcoll
         | 'Add Timestamps' >>
         beam.Map(lambda x: beam.window.TimestampedValue(x, time.time()))
         | "Window into Fixed Intervals" >> beam.WindowInto(
             window.FixedWindows(self.window_size))
         | "Groupby" >> beam.GroupByKey()
         | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val))
Example #29
def run(argv=None):
  """Build and run the pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output', required=True,
      help=('Output BigQuery table for results specified as: PROJECT:DATASET.TABLE '
       'or DATASET.TABLE.'))
  parser.add_argument(
      '--input_subscription', required=True,
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  p = beam.Pipeline(options=pipeline_options)

  # Read from PubSub into a PCollection.
  # messages = (p
  #               | beam.io.ReadFromPubSub(
  #                   subscription=known_args.input_subscription)
  #               .with_output_types(bytes))

  messages = (p
                | beam.io.ReadFromText(messages_path))

  # ReadFromPubSub yields bytes while ReadFromText yields str, so decode
  # only when the element is bytes.
  lines = messages | 'decode' >> beam.Map(
      lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

  tweets = lines | 'extract tweets' >> (beam.ParDo(JSONToTweetDoFn()))

  tweets_with_ts = tweets | 'set timestamp' >> beam.ParDo(AddTimestampFn())

  # records = tweets | 'tweets to records' >> (beam.Map(tweet_to_bqrecord.tweet_to_bqrecord))

  def count(element):
    (w, ones) = element
    total = sum(ones)
    print(w, total)
    return (w, total)

  languages = (tweets_with_ts
                | 'extract language' >> (beam.Map(lambda x: (x.language, 1)))
                | beam.WindowInto(window.FixedWindows(1, 0))
                | 'group' >> beam.GroupByKey()
                | 'count' >> beam.Map(count)
                )

  # records | 'write' >> beam.io.Write(
  #     beam.io.BigQuerySink(
  #         known_args.output,
  #         schema=tweet_schema.table_schema,
  #         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
  #         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

  result = p.run()
  result.wait_until_finish()
Example #30
  def _run_pardo_state_timers(self, windowed):
    state_spec = userstate.BagStateSpec('state', beam.coders.StrUtf8Coder())
    timer_spec = userstate.TimerSpec('timer', userstate.TimeDomain.WATERMARK)
    elements = list('abcdefgh')
    buffer_size = 3

    class BufferDoFn(beam.DoFn):
      def process(self,
                  kv,
                  ts=beam.DoFn.TimestampParam,
                  timer=beam.DoFn.TimerParam(timer_spec),
                  state=beam.DoFn.StateParam(state_spec)):
        _, element = kv
        state.add(element)
        buffer = state.read()
        # For real use, we'd keep track of this size separately.
        if len(list(buffer)) >= buffer_size:
          state.clear()
          yield buffer
        else:
          timer.set(ts + 1)

      @userstate.on_timer(timer_spec)
      def process_timer(self, state=beam.DoFn.StateParam(state_spec)):
        buffer = state.read()
        state.clear()
        yield buffer

    def is_buffered_correctly(actual):
      # Pickling self in the closure for asserts gives errors (only on jenkins).
      self = FnApiRunnerTest('__init__')
      # Actual should be a grouping of the inputs into batches of size
      # at most buffer_size, but the exact batching is nondeterministic,
      # depending on ordering and trigger firing timing.
      self.assertEqual(sorted(sum((list(b) for b in actual), [])), elements)
      self.assertEqual(max(len(list(buffer)) for buffer in actual), buffer_size)
      if windowed:
        # Elements were assigned to windows based on their parity.
        # Assert that each grouping consists of elements belonging to the
        # same window to ensure states and timers were properly partitioned.
        for b in actual:
          parity = set(ord(e) % 2 for e in b)
          self.assertEqual(1, len(parity), b)

    with self.create_pipeline() as p:
      actual = (
          p
          | beam.Create(elements)
          # Send even and odd elements to different windows.
          | beam.Map(lambda e: window.TimestampedValue(e, ord(e) % 2))
          | beam.WindowInto(window.FixedWindows(1) if windowed
                            else window.GlobalWindows())
          | beam.Map(lambda x: ('key', x))
          | beam.ParDo(BufferDoFn()))

      assert_that(actual, is_buffered_correctly)
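A hedged usage note: the windowed flag is typically exercised through thin wrappers; under the pattern above they would look roughly like the following (method names are assumptions, not confirmed by this snippet):

  def test_pardo_state_timers(self):
    # Global windows: one state/timer partition per key.
    self._run_pardo_state_timers(windowed=False)

  def test_windowed_pardo_state_timers(self):
    # Fixed windows: state and timers are partitioned per key and window.
    self._run_pardo_state_timers(windowed=True)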