    def _window_fn(self):
        """Set the correct WindowInto PTransform"""

        # The user-supplied triggering_frequency is often chosen to control how
        # many BigQuery load jobs are triggered, to prevent going over BigQuery's
        # daily quota for load jobs. If this is set to a large value, currently we
        # have to buffer all the data until the trigger fires. Instead we ensure
        # that the files are written if a threshold number of records are ready.
        # We use only the user-supplied trigger on the actual BigQuery load.
        # This allows us to offload the data to the filesystem.
        #
        # In the case of dynamic sharding, however, we use a default trigger,
        # since the transform that performs the sharding also batches elements
        # to avoid generating too many tiny files. The user-supplied trigger is
        # applied right after the writes to limit the number of load jobs.
        if self.is_streaming_pipeline and not self.with_auto_sharding:
            return beam.WindowInto(beam.window.GlobalWindows(),
                                   trigger=trigger.Repeatedly(
                                       trigger.AfterAny(
                                           trigger.AfterProcessingTime(
                                               self.triggering_frequency),
                                           trigger.AfterCount(
                                               _FILE_TRIGGERING_RECORD_COUNT))),
                                   accumulation_mode=trigger.AccumulationMode.DISCARDING)
        else:
            return beam.WindowInto(beam.window.GlobalWindows())
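Below is a minimal, self-contained sketch of the same flush pattern, shown outside the BigQuery sink; the 60-second frequency and 500-record threshold are placeholder values, not taken from the snippet above. A pane is emitted as soon as either the processing-time delay (measured from the first element in the pane) elapses or the record count is reached, and Repeatedly keeps the trigger active afterwards.

import apache_beam as beam
from apache_beam.transforms import trigger

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('key', i) for i in range(10)])
        # Fire whenever 60s of processing time has passed since the first
        # element in the pane OR 500 elements have arrived, whichever comes
        # first, then reset and wait for the next firing (Repeatedly).
        | beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAny(
                    trigger.AfterProcessingTime(60),   # assumed frequency
                    trigger.AfterCount(500))),         # assumed threshold
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | beam.GroupByKey())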
Example #2
File: query10.py  Project: mahak/beam
def load(events, metadata=None, pipeline_options=None):
  return (
      events
      | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
      # The trigger fires when each sub-trigger (executed in order) fires:
      # Repeatedly 1. after at least maxLogEvents elements in the pane
      #            2. or finally when the watermark passes the end of the window;
      # then repeatedly 1. after at least maxLogEvents elements in the pane
      #                 2. or when processing time passes the first element in
      #                    the pane + delay
      | 'query10_fix_window' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          trigger=trigger.AfterEach(
              trigger.OrFinally(
                  trigger.Repeatedly(
                      trigger.AfterCount(metadata.get('max_log_events'))),
                  trigger.AfterWatermark()),
              trigger.Repeatedly(
                  trigger.AfterAny(
                      trigger.AfterCount(metadata.get('max_log_events')),
                      trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          # Use a 1 day allowed lateness so that any forgotten hold will stall
          # the pipeline for that period and be very noticeable.
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk' >> beam.GroupByKey()
      | 'query10_write_event' >> beam.ParDo(WriteEventDoFn(), pipeline_options)
      | 'query10_window_log_files' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk_2' >> beam.GroupByKey()
      | 'query10_write_index' >> beam.ParDo(WriteIndexDoFn(), pipeline_options))
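For readability, the compound trigger above can be decomposed into named parts. The sketch below is purely illustrative; the two constants are assumed stand-ins for metadata.get('max_log_events') and LATE_BATCHING_PERIOD, whose actual values are not shown in this snippet.

from apache_beam.transforms import trigger

MAX_LOG_EVENTS = 100000      # assumed stand-in for metadata.get('max_log_events')
LATE_BATCHING_PERIOD = 10    # assumed stand-in, in seconds

# Sub-trigger 1: fire every time the pane accumulates MAX_LOG_EVENTS elements,
# and finish once the watermark passes the end of the window.
on_time_firings = trigger.OrFinally(
    trigger.Repeatedly(trigger.AfterCount(MAX_LOG_EVENTS)),
    trigger.AfterWatermark())

# Sub-trigger 2: once sub-trigger 1 has finished, fire whenever the pane
# reaches MAX_LOG_EVENTS elements or LATE_BATCHING_PERIOD seconds of
# processing time have passed since the first element in the pane.
late_firings = trigger.Repeatedly(
    trigger.AfterAny(
        trigger.AfterCount(MAX_LOG_EVENTS),
        trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))

# AfterEach executes the sub-triggers in order, moving on to the next one
# only after the previous one has finished.
compound_trigger = trigger.AfterEach(on_time_firings, late_firings)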
Example #3
    def test_on_pane_watermark_hold_no_pipeline_stall(self):
        """A regression test added for
    https://issues.apache.org/jira/browse/BEAM-10054."""
        START_TIMESTAMP = 1534842000

        test_stream = TestStream()
        test_stream.add_elements(['a'])
        test_stream.advance_processing_time(START_TIMESTAMP + 1)
        test_stream.advance_watermark_to(START_TIMESTAMP + 1)
        test_stream.add_elements(['b'])
        test_stream.advance_processing_time(START_TIMESTAMP + 2)
        test_stream.advance_watermark_to(START_TIMESTAMP + 2)

        with TestPipeline(options=PipelineOptions(['--streaming'])) as p:
            # pylint: disable=expression-not-assigned
            (p
             | 'TestStream' >> test_stream
             | 'timestamp' >> beam.Map(
                 lambda x: beam.window.TimestampedValue(x, START_TIMESTAMP))
             | 'kv' >> beam.Map(lambda x: (x, x))
             | 'window_1m' >> beam.WindowInto(
                 beam.window.FixedWindows(60),
                 trigger=trigger.AfterAny(trigger.AfterProcessingTime(3600),
                                          trigger.AfterWatermark()),
                 accumulation_mode=trigger.AccumulationMode.DISCARDING)
             | 'group_by_key' >> beam.GroupByKey()
             | 'filter' >> beam.Map(lambda x: x))
Example #4
    def _window_fn(self):
        """Set the correct WindowInto PTransform"""

        # The user-supplied triggering_frequency is often chosen to control how
        # many BigQuery load jobs are triggered, to prevent going over BigQuery's
        # daily quota for load jobs. If this is set to a large value, currently we
        # have to buffer all the data until the trigger fires. Instead we ensure
        # that the files are written if a threshold number of records are ready.
        # We use only the user-supplied trigger on the actual BigQuery load.
        # This allows us to offload the data to the filesystem.
        if self.is_streaming_pipeline:
            return beam.WindowInto(beam.window.GlobalWindows(),
                                   trigger=trigger.Repeatedly(
                                       trigger.AfterAny(
                                           trigger.AfterProcessingTime(
                                               self.triggering_frequency),
                                           trigger.AfterCount(
                                               _FILE_TRIGGERING_RECORD_COUNT))),
                                   accumulation_mode=trigger.AccumulationMode.DISCARDING)
        else:
            return beam.WindowInto(beam.window.GlobalWindows())
Example #5
def run(argv=None):
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument(
                '--input',
                default='projects/notbanana-7f869/topics/rsvps_source')
            parser.add_argument(
                '--output',
                default='projects/notbanana-7f869/topics/rsvps_out')

    options = PipelineOptions(flags=argv)

    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'notbanana-7f869'
    google_cloud_options.staging_location = 'gs://notbanana-7f869.appspot.com/staging'
    google_cloud_options.temp_location = 'gs://notbanana-7f869.appspot.com/temp'
    google_cloud_options.job_name = 'demo-job'
    """
    -> Run the pipeline on the Cloud Dataflow runner.
    $ python pipelines/main.py --setup_file path/to/setup.py
    """
    # options.view_as(StandardOptions).runner = 'DataflowRunner'

    with beam.Pipeline(options=options) as p:
        my_options = options.view_as(MyOptions)
        input_topic = my_options.input
        output_topic = my_options.output
        """
        -> Consumes/collects events sent by the input Pub/Sub topic.
        @: id_label argument is a unique identifier used by the pipeline to
        deduplicate events : Exactly-once semantic.
        """
        inputs = \
            (p
             | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
                 topic=input_topic,
                 # id_label='event_id'
             ).with_output_types(six.binary_type)
             | 'Decode Binary' >> beam.Map(lambda element: element.decode('utf-8'))
             | 'Transform Json To Dict' >> beam.Map(lambda element: json.loads(element))
             | 'Filter noVenue' >> beam.ParDo(FilterNoVenueEventsFn()))
        """ 
        -> Outputs the total number of events globally processed by the pipeline.
        Triggering early results from the window every X seconds (processing time trigger)
        or triggering when the current pane has collected at least N elements (data-driven trigger)
        Values used are for testing purposes.
        """
        (inputs
         | 'Apply Global Window' >> beam.WindowInto(
             beam.window.GlobalWindows(),
             trigger=trigger.Repeatedly(
                 trigger.AfterAny(
                     trigger.AfterCount(2),
                     # AfterProcessingTime is experimental.
                     # Not implemented yet.
                     trigger.AfterProcessingTime(30))),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'Count events globally' >> beam.CombineGlobally(
             beam.combiners.CountCombineFn()).without_defaults()
         | 'Publish %s' % 'Events' >> WriteToPubSub(
             topic=output_topic, category=Category.GLOBAL_EVENTS))
        """
        -> Outputs the top 10 hottest topics within a Fixed Window of X seconds. 
        Values used are for testing purposes.
        NB: Using a custom TopFn that will deduplicate k/v pairs
        when using an accumulation strategy: SO - 56616576 @guillem-xercavins
        """
        (inputs
         | 'Apply Window of time %s' % 'Topics' >> beam.WindowInto(
             beam.window.FixedWindows(size=10 * 60),
             trigger=trigger.Repeatedly(trigger.AfterCount(5)),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | beam.Map(lambda element: element['group'])
         | beam.ParDo(PairTopicWithOneFn())
         | beam.CombinePerKey(sum)
         | 'Top 10 Topics' >> beam.CombineGlobally(
             TopDistinctFn(
                 n=10, compare=lambda a, b: a[1] < b[1])).without_defaults()
         | 'DictFormat %s' % 'Topics' >> beam.ParDo(FormatTopTopicFn())
         | 'Publish %s' % 'Topics' >> WriteToPubSub(
             topic=output_topic, category=Category.HOT_TOPICS))
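Finally, the run() function above would typically be wired up with a standard entry point; the closing stub below is not part of the original snippet, only a common convention shown for completeness.

if __name__ == '__main__':
    import logging

    # Surface pipeline logs when running locally or on a remote runner.
    logging.getLogger().setLevel(logging.INFO)
    run()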