def _window_fn(self):
  """Set the correct WindowInto PTransform."""

  # The user-supplied triggering_frequency is often chosen to control how
  # many BigQuery load jobs are triggered, to prevent going over BigQuery's
  # daily quota for load jobs. If this is set to a large value, currently we
  # have to buffer all the data until the trigger fires. Instead we ensure
  # that the files are written if a threshold number of records are ready.
  # We use only the user-supplied trigger on the actual BigQuery load.
  # This allows us to offload the data to the filesystem.
  #
  # In the case of dynamic sharding, however, we use a default trigger since
  # the transform that performs the sharding also batches elements to avoid
  # generating too many tiny files. The user trigger is applied right after
  # the writes to limit the number of load jobs.
  if self.is_streaming_pipeline and not self.with_auto_sharding:
    return beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=trigger.Repeatedly(
            trigger.AfterAny(
                trigger.AfterProcessingTime(self.triggering_frequency),
                trigger.AfterCount(_FILE_TRIGGERING_RECORD_COUNT))),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
  else:
    return beam.WindowInto(beam.window.GlobalWindows())
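A minimal, self-contained sketch (not part of the Beam source above; the two constants are illustrative stand-ins for triggering_frequency and _FILE_TRIGGERING_RECORD_COUNT) showing how the Repeatedly(AfterAny(...)) composite behaves on a TestStream: a pane fires as soon as either the count threshold or the processing-time interval is reached, whichever comes first.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.testing.test_stream import TestStream
from apache_beam.transforms import trigger

FLUSH_INTERVAL_SECS = 10  # illustrative stand-in for triggering_frequency
THRESHOLD_COUNT = 2       # illustrative stand-in for _FILE_TRIGGERING_RECORD_COUNT

stream = (
    TestStream()
    .add_elements(['a', 'b'])                         # count threshold reached: a pane fires
    .advance_processing_time(FLUSH_INTERVAL_SECS + 1)  # interval elapsed: another pane may fire
    .advance_watermark_to_infinity())

options = PipelineOptions()
options.view_as(StandardOptions).streaming = True
with beam.Pipeline(options=options) as p:
  _ = (
      p
      | stream
      | beam.WindowInto(
          beam.window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterAny(
                  trigger.AfterProcessingTime(FLUSH_INTERVAL_SECS),
                  trigger.AfterCount(THRESHOLD_COUNT))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.Map(lambda x: ('k', x))
      | beam.GroupByKey()
      | beam.Map(print))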
def test_gbk_execution_after_processing_trigger_fired(self):
  """Advance TestClock to (X + delta) and see that the pipeline finishes."""
  # TODO(mariagh): Add test_gbk_execution_after_processing_trigger_unfired,
  # possibly in the framework trigger_transcripts.yaml.
  test_stream = (
      TestStream()
      .advance_watermark_to(10)
      .add_elements(['a'])
      .advance_processing_time(5.1)
      .advance_watermark_to_infinity())

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (
      p
      | test_stream
      | beam.WindowInto(
          beam.window.FixedWindows(15),
          trigger=trigger.AfterProcessingTime(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.Map(lambda x: ('k', x))
      | beam.GroupByKey())

  # TODO(BEAM-2519): timestamp assignment for elements coming out of a GBK
  # should respect the TimestampCombiner. The test below should also verify
  # the timestamps of the output elements once this is implemented.
  expected_window_to_elements = {
      window.IntervalWindow(0, 15): [('k', ['a'])],
  }
  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      label='assert per window')

  p.run()
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the streaming wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      required=True,
      help='Input Pub/Sub subscription to read from.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output BigQuery table to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)

  # Read messages from the Pub/Sub subscription into a PCollection.
  (p
   | 'read' >> ReadFromPubSub(subscription=known_args.input)
   | 'extract words' >> beam.FlatMap(extract_words)
   | 'transform to kv' >> beam.Map(lambda x: (x, 1))
   | 'window per minute' >> beam.WindowInto(
       window.FixedWindows(5),
       trigger=trigger.AfterProcessingTime(delay=10),
       accumulation_mode=trigger.AccumulationMode.DISCARDING)
   | 'group by words' >> beam.GroupByKey()
   | 'count ones' >> beam.Map(count_ones)
   | 'format for bq' >> beam.Map(format_for_bigquery)
   | 'write to bigquery' >> WriteToBigQuery(table=known_args.output))

  result = p.run()
  result.wait_until_finish()
def load(events, metadata=None, pipeline_options=None):
  return (
      events
      | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
      # The composite trigger fires each sub-trigger in order:
      # 1. repeatedly, after at least max_log_events elements in the pane,
      #    or finally when the watermark passes the end of the window;
      # 2. then repeatedly, after at least max_log_events elements in the
      #    pane, or after processing time passes the first element in the
      #    pane plus LATE_BATCHING_PERIOD.
      | 'query10_fix_window' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          trigger=trigger.AfterEach(
              trigger.OrFinally(
                  trigger.Repeatedly(
                      trigger.AfterCount(metadata.get('max_log_events'))),
                  trigger.AfterWatermark()),
              trigger.Repeatedly(
                  trigger.AfterAny(
                      trigger.AfterCount(metadata.get('max_log_events')),
                      trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          # Use a 1 day allowed lateness so that any forgotten hold will stall
          # the pipeline for that period and be very noticeable.
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk' >> beam.GroupByKey()
      | 'query10_write_event' >> beam.ParDo(WriteEventDoFn(), pipeline_options)
      | 'query10_window_log_files' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk_2' >> beam.GroupByKey()
      | 'query10_write_index' >> beam.ParDo(WriteIndexDoFn(), pipeline_options))
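The nested composite above is dense. A hedged decomposition (the intermediate names and constant values are illustrative, not from the Nexmark source) makes the two phases and their ordering explicit:

from apache_beam.transforms import trigger

MAX_LOG_EVENTS = 100000   # illustrative stand-in for metadata['max_log_events']
LATE_BATCHING_PERIOD = 10  # seconds; matches the constant used above

# Phase 1: keep firing every MAX_LOG_EVENTS elements until the watermark
# passes the end of the window, which fires one final on-time pane.
on_time_phase = trigger.OrFinally(
    trigger.Repeatedly(trigger.AfterCount(MAX_LOG_EVENTS)),
    trigger.AfterWatermark())

# Phase 2 (late data): fire whenever MAX_LOG_EVENTS elements accumulate or
# LATE_BATCHING_PERIOD of processing time passes since the first element.
late_phase = trigger.Repeatedly(
    trigger.AfterAny(
        trigger.AfterCount(MAX_LOG_EVENTS),
        trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))

# AfterEach runs the sub-triggers in order: phase 2 only starts firing once
# phase 1 has finished.
composite = trigger.AfterEach(on_time_phase, late_phase)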
def test_on_pane_watermark_hold_no_pipeline_stall(self):
  """A regression test added for
  https://issues.apache.org/jira/browse/BEAM-10054."""
  START_TIMESTAMP = 1534842000

  test_stream = TestStream()
  test_stream.add_elements(['a'])
  test_stream.advance_processing_time(START_TIMESTAMP + 1)
  test_stream.advance_watermark_to(START_TIMESTAMP + 1)
  test_stream.add_elements(['b'])
  test_stream.advance_processing_time(START_TIMESTAMP + 2)
  test_stream.advance_watermark_to(START_TIMESTAMP + 2)

  with TestPipeline(options=PipelineOptions(['--streaming'])) as p:
    # pylint: disable=expression-not-assigned
    (p
     | 'TestStream' >> test_stream
     | 'timestamp' >> beam.Map(
         lambda x: beam.window.TimestampedValue(x, START_TIMESTAMP))
     | 'kv' >> beam.Map(lambda x: (x, x))
     | 'window_1m' >> beam.WindowInto(
         beam.window.FixedWindows(60),
         trigger=trigger.AfterAny(
             trigger.AfterProcessingTime(3600), trigger.AfterWatermark()),
         accumulation_mode=trigger.AccumulationMode.DISCARDING)
     | 'group_by_key' >> beam.GroupByKey()
     | 'filter' >> beam.Map(lambda x: x))
def run(argv=None):
  parser = argparse.ArgumentParser()
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  offer_stat_pipeline_options = pipeline_options.view_as(
      OfferStatPipelineOptions)

  p = beam.Pipeline(options=pipeline_options)

  (p
   | "Read account offer from PS" >> beam.io.ReadFromPubSub(
       topic=offer_stat_pipeline_options.account_offers_topic)
   | "Parse message" >> beam.ParDo(PubsubMessageParser())
   | "Windowing" >> beam.WindowInto(
       window.FixedWindows(60),
       trigger=trigger.AfterWatermark(early=trigger.AfterProcessingTime(20)),
       accumulation_mode=AccumulationMode.ACCUMULATING)
   | "WithKeys" >> beam.Map(
       lambda account_offer: (account_offer['offer_id'], account_offer))
   | beam.GroupByKey()
   | 'Count distinct accounts' >> beam.ParDo(DistinctAccountCount())
   | 'Map to BQ row' >> beam.ParDo(ConvertStatToBQRow())
   | 'Writing offers to BQ' >> beam.io.WriteToBigQuery(
       table=offer_stat_pipeline_options.offer_stat_bq_table,
       create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=BigQueryDisposition.WRITE_APPEND,
       schema=OFFER_STAT_BQ_SCHEMA))

  result = p.run()
  result.wait_until_finish()
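The AfterWatermark(early=...) pattern above emits speculative panes every 20 seconds of processing time, then a final on-time pane when the watermark passes the end of the window; with ACCUMULATING mode, each pane contains everything seen so far. A hedged, self-contained sketch under a TestStream (values and elements are illustrative; exact pane timing depends on the runner):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.testing.test_stream import TestStream
from apache_beam.transforms import trigger, window

stream = (
    TestStream()
    .advance_watermark_to(0)
    .add_elements([('offer', 1)])
    .advance_processing_time(20)   # an early pane may fire here: [1]
    .add_elements([('offer', 2)])
    .advance_watermark_to(60)      # on-time pane: [1, 2] (accumulating)
    .advance_watermark_to_infinity())

options = PipelineOptions()
options.view_as(StandardOptions).streaming = True
with beam.Pipeline(options=options) as p:
  _ = (
      p
      | stream
      | beam.WindowInto(
          window.FixedWindows(60),
          trigger=trigger.AfterWatermark(
              early=trigger.AfterProcessingTime(20)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      | beam.GroupByKey()
      | beam.Map(print))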
def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
  outputs = (
      destination_data_kv_pc
      | beam.ParDo(
          WriteRecordsToFile(
              schema=self.schema,
              max_files_per_bundle=self.max_files_per_bundle,
              max_file_size=self.max_file_size,
              file_format=self._temp_file_format),
          file_prefix_pcv,
          *self.schema_side_inputs).with_outputs(
              WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
              WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with records,
  # and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded,
  # grouped, and all records for each destination-shard are written to files.
  # This PCollection is necessary because not all records can be written into
  # files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(
              schema=self.schema, file_format=self._temp_file_format),
          file_prefix_pcv,
          *self.schema_side_inputs))

  # TODO(BEAM-9494): Remove the identity transform. We flatten both
  # PCollection paths and use an identity function to work around a
  # flatten optimization issue where the wrong coder is being used.
  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten()
      | "IdentityWorkaround" >> beam.Map(lambda x: x))

  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    all_destination_file_pairs_pc = (
        all_destination_file_pairs_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  return all_destination_file_pairs_pc
def run(argv=None):
  # Use the argparse module to parse custom arguments.
  parser = argparse.ArgumentParser()
  parser.add_argument('--network')
  parser.add_argument('--input', dest='input', help='Input file to process.')
  parser.add_argument(
      '--output', dest='output', help='Output file to write results to.')
  parser.add_argument(
      '--output_topic',
      dest='out_topic',
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
  parser.add_argument(
      '--input_topic',
      dest='in_topic',
      help=('Input PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  p_options = PipelineOptions(pipeline_args)
  google_cloud_options = p_options.view_as(GoogleCloudOptions)
  google_cloud_options.region = 'europe-west1'
  google_cloud_options.project = 'smartlive'
  # google_cloud_options.job_name = 'dataflow-job-{}'.format(
  #     datetime.datetime.now().strftime("%Y-%m-%d%H%M%S"))
  google_cloud_options.staging_location = 'gs://rim-bucket/binaries'
  google_cloud_options.temp_location = 'gs://rim-bucket/temp'

  p_options.view_as(StandardOptions).runner = 'DirectRunner'
  p_options.view_as(SetupOptions).save_main_session = True
  p_options.view_as(StandardOptions).streaming = True
  p_options.view_as(WorkerOptions).subnetwork = (
      'regions/europe-west1/subnetworks/test')

  p = beam.Pipeline(options=p_options)

  lines = (
      p
      | 'receive_data' >> beam.io.ReadFromPubSub(
          subscription=known_args.in_topic).with_input_types(str)
      | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
      | 'jsonload' >> beam.Map(lambda x: json.loads(x)))

  # ----- fixed window + AfterProcessingTime trigger + discarding mode ----- #
  (lines
   | 'window' >> beam.WindowInto(
       window.FixedWindows(10),
       trigger=trigger.AfterProcessingTime(30),
       accumulation_mode=trigger.AccumulationMode.DISCARDING)
   | 'CountGlobally' >> beam.CombineGlobally(
       beam.combiners.CountCombineFn()).without_defaults()
   | 'printnbrarticles' >> beam.ParDo(PrintFn())
   | 'jsondumps' >> beam.Map(lambda x: json.dumps(x))
   | 'encode' >> beam.Map(lambda x: x.encode('utf-8'))
   | 'send_to_Pub/Sub' >> beam.io.WriteToPubSub(known_args.out_topic))

  p.run().wait_until_finish()
def _maybe_apply_user_trigger(self, destination_file_kv_pc):
  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    return (
        destination_file_kv_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  else:
    return destination_file_kv_pc
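Note the contrast with the AfterAny composite in _window_fn: AfterAll requires both conditions, so a pane fires only when at least one element is buffered and triggering_frequency of processing time has elapsed, which bounds the rate of load jobs. A hedged sketch of just the trigger construction (the constant is an illustrative stand-in for the user-supplied value):

from apache_beam.transforms import trigger

TRIGGERING_FREQUENCY_SECS = 600  # illustrative user-supplied value

# Fires at most once per TRIGGERING_FREQUENCY_SECS, and only if the pane is
# non-empty (AfterCount(1)); Repeatedly re-arms the trigger after each firing.
user_trigger = trigger.Repeatedly(
    trigger.AfterAll(
        trigger.AfterProcessingTime(TRIGGERING_FREQUENCY_SECS),
        trigger.AfterCount(1)))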
def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
  outputs = (
      destination_data_kv_pc
      | beam.ParDo(
          WriteRecordsToFile(
              max_files_per_bundle=self.max_files_per_bundle,
              max_file_size=self.max_file_size,
              coder=self.coder),
          file_prefix=file_prefix_pcv).with_outputs(
              WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
              WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with records,
  # and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded,
  # grouped, and all records for each destination-shard are written to files.
  # This PCollection is necessary because not all records can be written into
  # files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(coder=self.coder),
          file_prefix=file_prefix_pcv))

  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten())

  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    all_destination_file_pairs_pc = (
        all_destination_file_pairs_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  return all_destination_file_pairs_pc
def load(events, metadata=None):
  return (
      events
      | nexmark_query_util.JustBids()
      | 'query12_extract_bidder' >> beam.Map(lambda bid: bid.bidder)
      # Windowing with a processing-time trigger; currently not supported
      # in batch.
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterProcessingTime(metadata.get('window_size_sec'))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=0)
      | 'query12_bid_count' >> beam.combiners.Count.PerElement()
      | 'query12_output' >> beam.Map(
          lambda t: {
              ResultNames.BIDDER_ID: t[0], ResultNames.BID_COUNT: t[1]
          }))
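For intuition, a hedged standalone sketch of the same pattern: in a global window whose only trigger is Repeatedly(AfterProcessingTime(N)), a pane of per-element counts is emitted roughly every N seconds of processing time, and DISCARDING mode resets the counts between panes. The topic name and period below are illustrative, not from the Nexmark source.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms import trigger, window

PERIOD_SECS = 10  # illustrative stand-in for metadata['window_size_sec']

options = PipelineOptions(['--streaming'])
with beam.Pipeline(options=options) as p:
  _ = (
      p
      # Hypothetical topic, for illustration only.
      | beam.io.ReadFromPubSub(topic='projects/my-project/topics/bids')
      | beam.Map(lambda msg: msg.decode('utf-8'))
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterProcessingTime(PERIOD_SECS)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.combiners.Count.PerElement()  # (element, count) per pane
      | beam.Map(print))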
def test_gbk_execution_after_processing_trigger_fired(self):
  """Advance TestClock to (X + delta) and see that the pipeline finishes."""
  # TODO(mariagh): Add test_gbk_execution_after_processing_trigger_unfired,
  # possibly in the framework trigger_transcripts.yaml.
  test_stream = (
      TestStream()
      .advance_watermark_to(10)
      .add_elements(['a'])
      .advance_processing_time(5.1))

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  global result  # pylint: disable=global-variable-undefined
  result = []

  def fired_elements(elem):
    result.append(elem)
    return elem

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (
      p
      | test_stream
      | beam.WindowInto(
          beam.window.FixedWindows(15),
          trigger=trigger.AfterProcessingTime(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.Map(lambda x: ('k', x))
      | beam.GroupByKey()
      | beam.Map(fired_elements))

  # TODO(BEAM-2519): timestamp assignment for elements coming out of a GBK
  # should respect the TimestampCombiner. The test below should also verify
  # the timestamps of the output elements once this is implemented.
  # TODO(BEAM-3377): Reinstate after assert_that in streaming is fixed.
  assert_that(records, equal_to([('k', ['a'])]))
  p.run()

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  self.assertEqual([('k', ['a'])], result)
def _window_fn(self):
  """Set the correct WindowInto PTransform."""

  # The user-supplied triggering_frequency is often chosen to control how
  # many BigQuery load jobs are triggered, to prevent going over BigQuery's
  # daily quota for load jobs. If this is set to a large value, currently we
  # have to buffer all the data until the trigger fires. Instead we ensure
  # that the files are written if a threshold number of records are ready.
  # We use only the user-supplied trigger on the actual BigQuery load.
  # This allows us to offload the data to the filesystem.
  if self.is_streaming_pipeline:
    return beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=trigger.Repeatedly(
            trigger.AfterAny(
                trigger.AfterProcessingTime(self.triggering_frequency),
                trigger.AfterCount(_FILE_TRIGGERING_RECORD_COUNT))),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
  else:
    return beam.WindowInto(beam.window.GlobalWindows())
def run(argv=None):
  class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument(
          '--input', default='projects/notbanana-7f869/topics/rsvps_source')
      parser.add_argument(
          '--output', default='projects/notbanana-7f869/topics/rsvps_out')

  options = PipelineOptions(flags=argv)
  options.view_as(SetupOptions).save_main_session = True
  options.view_as(StandardOptions).streaming = True

  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'notbanana-7f869'
  google_cloud_options.staging_location = 'gs://notbanana-7f869.appspot.com/staging'
  google_cloud_options.temp_location = 'gs://notbanana-7f869.appspot.com/temp'
  google_cloud_options.job_name = 'demo-job'

  # To run the pipeline on the Cloud Dataflow runner:
  # $ python pipelines/main.py --setup_file path/to/setup.py
  # options.view_as(StandardOptions).runner = 'DataflowRunner'

  with beam.Pipeline(options=options) as p:
    my_options = options.view_as(MyOptions)
    input_topic = my_options.input
    output_topic = my_options.output

    # Consume events sent to the input Pub/Sub topic. The id_label argument
    # is a unique identifier used by the pipeline to deduplicate events
    # (exactly-once semantics).
    inputs = (
        p
        | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
            topic=input_topic,
            # id_label='event_id'
        ).with_output_types(six.binary_type)
        | 'Decode Binary' >> beam.Map(lambda element: element.decode('utf-8'))
        | 'Transform Json To Dict' >> beam.Map(
            lambda element: json.loads(element))
        | 'Filter noVenue' >> beam.ParDo(FilterNoVenueEventsFn()))

    # Output the total number of events processed by the pipeline, triggering
    # early results from the global window every X seconds of processing time
    # or whenever the current pane has collected at least N elements
    # (data-driven trigger). The values used here are for testing purposes.
    (inputs
     | 'Apply Global Window' >> beam.WindowInto(
         beam.window.GlobalWindows(),
         trigger=trigger.Repeatedly(
             trigger.AfterAny(
                 trigger.AfterCount(2),
                 # AfterProcessingTime is experimental.
                 # Not implemented yet.
                 trigger.AfterProcessingTime(30))),
         accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
     | 'Count events globally' >> beam.CombineGlobally(
         beam.combiners.CountCombineFn()).without_defaults()
     | 'Publish %s' % 'Events' >> WriteToPubSub(
         topic=output_topic, category=Category.GLOBAL_EVENTS))

    # Output the top 10 hottest topics within a fixed window of X seconds.
    # The values used here are for testing purposes.
    # NB: uses a custom TopFn that deduplicates k/v pairs when using an
    # accumulating strategy: SO - 56616576 @guillem-xercavins
    (inputs
     | 'Apply Window of time %s' % 'Topics' >> beam.WindowInto(
         beam.window.FixedWindows(size=10 * 60),
         trigger=trigger.Repeatedly(trigger.AfterCount(5)),
         accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
     | beam.Map(lambda element: element['group'])
     | beam.ParDo(PairTopicWithOneFn())
     | beam.CombinePerKey(sum)
     | 'Top 10 Topics' >> beam.CombineGlobally(
         TopDistinctFn(
             n=10, compare=lambda a, b: a[1] < b[1])).without_defaults()
     | 'DictFormat %s' % 'Topics' >> beam.ParDo(FormatTopTopicFn())
     | 'Publish %s' % 'Topics' >> WriteToPubSub(
         topic=output_topic, category=Category.HOT_TOPICS))
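A hedged sketch isolating the data-driven half of the trigger above: Repeatedly(AfterCount(2)) fires a pane each time two elements accumulate, and ACCUMULATING mode makes each pane a running total. Element values are illustrative.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.testing.test_stream import TestStream
from apache_beam.transforms import trigger

stream = (
    TestStream()
    .advance_watermark_to(0)
    .add_elements(['e1', 'e2'])   # first pane: count reaches 2
    .add_elements(['e3', 'e4'])   # second pane: count reaches 2 again
    .advance_watermark_to_infinity())

options = PipelineOptions()
options.view_as(StandardOptions).streaming = True
with beam.Pipeline(options=options) as p:
  _ = (
      p
      | stream
      | beam.WindowInto(
          beam.window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(2)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      | beam.CombineGlobally(
          beam.combiners.CountCombineFn()).without_defaults()
      # With ACCUMULATING, each pane prints a running total.
      | beam.Map(print))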
def main(argv=None):
  def json_parser(x):
    parsed = json.loads(x)
    return parsed

  def bye(x):
    logging.info('outing: %s', x)
    return x

  parser = argparse.ArgumentParser()
  parser.add_argument("--input_topic")
  parser.add_argument("--output_topic")
  known_args = parser.parse_known_args(argv)

  p = beam.Pipeline(options=PipelineOptions())

  data = (
      p
      | 'ReadData' >> beam.io.ReadFromPubSub(
          topic=READ_TOPIC).with_output_types(bytes)
      | "JSONParse" >> beam.Map(json_parser))

  (data
   | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
   | "Windowing" >> beam.WindowInto(
       window.Sessions(60),
       trigger=tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
       accumulation_mode=tr.AccumulationMode.DISCARDING,
       allowed_lateness=0)
   | 'ToBytes' >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye' >> beam.Map(bye)
   | 'WriteToPubSub' >> beam.io.WriteToPubSub(TOPIC))

  (data
   | "SlidWindowing" >> beam.WindowInto(
       window.FixedWindows(60),
       trigger=(tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
           late=tr.Repeatedly(tr.AfterCount(1)))),
       allowed_lateness=300,
       accumulation_mode=tr.AccumulationMode.ACCUMULATING)
   | "Extract" >> beam.Map(lambda x: x["meter_increment"])
   | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
   | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
   | "Enrich with time data" >> beam.ParDo(Enrich())
   | "ToBytesCount" >> beam.Map(
       lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye2' >> beam.Map(bye)
   | "WriteCount" >> beam.io.WriteToPubSub(TOPIC))

  (data
   | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
   | "SessionWindowing" >> beam.WindowInto(
       window.Sessions(60),
       trigger=tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
       accumulation_mode=tr.AccumulationMode.ACCUMULATING,
       allowed_lateness=0)
   | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
   | "Discarding Key" >> beam.Map(lambda x: x[1])
   | "Filter not pickup" >> beam.Map(
       lambda x: x if str(x["ride_status"]) == "pickup" else None)
   | "ToBytesPickup" >> beam.Map(
       lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye3' >> beam.Map(bye)
   | "WritePickup" >> beam.io.WriteToPubSub(TOPIC))

  result = p.run()
  result.wait_until_finish()
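The "SlidWindowing" branch above is the only one that handles late data. A hedged sketch of just that windowing strategy (values as above, names illustrative): early panes fire while the window is open, the on-time pane fires when the watermark passes the window end, and late panes fire once per late element for up to 300 seconds of allowed lateness.

import apache_beam as beam
from apache_beam.transforms import trigger as tr
from apache_beam.transforms import window

# Early: fire once a second of processing time after each new element arrives.
# On-time: fire when the watermark passes the end of the 60s window.
# Late: fire for every element that arrives within the 300s lateness horizon.
windowing = beam.WindowInto(
    window.FixedWindows(60),
    trigger=tr.AfterWatermark(
        early=tr.Repeatedly(
            tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
        late=tr.Repeatedly(tr.AfterCount(1))),
    allowed_lateness=300,
    accumulation_mode=tr.AccumulationMode.ACCUMULATING)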