Example #1
def load(events, metadata=None, pipeline_options=None):
  return (
      events
      | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
      # The trigger fires when each sub-trigger (executed in order) fires:
      # repeatedly 1. after at least maxLogEvents elements in the pane
      #            2. or finally when the watermark passes the end of the window;
      # then repeatedly 1. after at least maxLogEvents elements in the pane
      #                 2. or when processing time passes the first element in the pane plus a delay
      | 'query10_fix_window' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          trigger=trigger.AfterEach(
              trigger.OrFinally(
                  trigger.Repeatedly(
                      trigger.AfterCount(metadata.get('max_log_events'))),
                  trigger.AfterWatermark()),
              trigger.Repeatedly(
                  trigger.AfterAny(
                      trigger.AfterCount(metadata.get('max_log_events')),
                      trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          # Use a 1 day allowed lateness so that any forgotten hold will stall
          # the pipeline for that period and be very noticeable.
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk' >> beam.GroupByKey()
      | 'query10_write_event' >> beam.ParDo(WriteEventDoFn(), pipeline_options)
      | 'query10_window_log_files' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk_2' >> beam.GroupByKey()
      | 'query10_write_index' >> beam.ParDo(WriteIndexDoFn(), pipeline_options))
Example #2
 def expand(self, pcoll):
     return (pcoll
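             # Fire early panes after every 10 elements, an on-time pane when
             # the watermark passes the end of the window, and late panes after
             # every 20 late elements; ACCUMULATING panes include prior data.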
             | 'LeaderboardTeamFixedWindows' >> beam.WindowInto(
                 beam.window.FixedWindows(self.team_window_duration),
                 trigger=trigger.AfterWatermark(trigger.AfterCount(10),
                                                trigger.AfterCount(20)),
                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
             | 'ExtractAndSumScore' >> ExtractAndSumScore('team'))
Example #3
 def expand(self, pcoll):
     return (
         pcoll
         # We will get early (speculative) results as well as cumulative
         # processing of late data.
         | 'LeaderboardTeamFixedWindows' >> beam.WindowInto(
             beam.window.FixedWindows(self.team_window_duration),
             trigger=trigger.AfterWatermark(trigger.AfterCount(10),
                                            trigger.AfterCount(20)),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
             allowed_lateness=self.allowed_lateness_seconds)
         # Extract and sum teamname/score pairs from the event data.
         | 'ExtractAndSumScore' >> ExtractAndSumScore('team'))
Example #4
    def expand(self, pcoll):
        logging.info("Calculate group values: {}".format(pcoll))

        return (
            pcoll
            # We will get early (speculative) results as well as cumulative
            # processing of late data.
            | 'HighValueGroupFixedWindows' >> beam.WindowInto(
                beam.window.FixedWindows(self.group_window_duration),
                trigger=trigger.AfterWatermark(trigger.AfterCount(10),
                                               trigger.AfterCount(20)),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
            # Extract and sum group/value pairs from the event data.
            | 'ExtractAndSumValue' >> ExtractAndSumValue('group'))
Example #5
 def expand(self, pcoll):
     # NOTE: the behavior does not exactly match the Java example
     # TODO: allowed_lateness not implemented yet in FixedWindows
     # TODO: AfterProcessingTime not implemented yet, replace AfterCount
     return (
         pcoll
         # We will get early (speculative) results as well as cumulative
         # processing of late data.
         | 'LeaderboardTeamFixedWindows' >> beam.WindowInto(
             beam.window.FixedWindows(self.team_window_duration),
             trigger=trigger.AfterWatermark(trigger.AfterCount(10),
                                            trigger.AfterCount(20)),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         # Extract and sum teamname/score pairs from the event data.
         | 'ExtractAndSumScore' >> ExtractAndSumScore('team'))
Example #6
def run_combine(pipeline, input_elements=5, lift_combiners=True):
    # Calculate the expected result, which is the sum of an arithmetic sequence.
    # By default, this is equal to: 0 + 1 + 2 + 3 + 4 = 10
    expected_result = input_elements * (input_elements - 1) / 2

    # Enable runtime type checking in order to cover TypeCheckCombineFn by
    # the test.
    pipeline.get_pipeline_options().view_as(
        TypeOptions).runtime_type_check = True
    pipeline.get_pipeline_options().view_as(
        TypeOptions).allow_unsafe_triggers = True

    with pipeline as p:
        pcoll = p | 'Start' >> beam.Create(range(input_elements))

        # Certain triggers, such as AfterCount, are incompatible with combiner
        # lifting. We can use that fact to prevent combiners from being lifted.
        if not lift_combiners:
            pcoll |= beam.WindowInto(
                window.GlobalWindows(),
                trigger=trigger.AfterCount(input_elements),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)

        # Pass an additional 'None' in order to cover _CurriedFn by the test.
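        # with_fanout(1) routes the global combine through the two-stage
        # (hot-key fanout) path without changing the result, presumably to
        # exercise that code path in the test as well.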
        pcoll |= 'Do' >> beam.CombineGlobally(
            combiners.SingleInputTupleCombineFn(
                CallSequenceEnforcingCombineFn(),
                CallSequenceEnforcingCombineFn()), None).with_fanout(fanout=1)
        assert_that(pcoll, equal_to([(expected_result, expected_result)]))
Example #7
    def test_multi_triggered_gbk_side_input(self):
        """Test a GBK sideinput, with multiple triggering."""
        # TODO(BEAM-9322): Remove use of this experiment.
        # This flag is only necessary when using the multi-output TestStream b/c
        # it relies on using the PCollection output tags as the PCollection output
        # ids.
        p = TestPipeline(additional_pipeline_args=[
            '--experiments=' + 'passthrough_pcollection_output_ids'
        ])

        test_stream = (
            p
            | 'Mixed TestStream' >> TestStream().advance_watermark_to(
                3, tag='main').add_elements(
                    ['a1'], tag='main').advance_watermark_to(
                        8, tag='main').add_elements(['a2'], tag='main').
            add_elements([window.TimestampedValue(
                ('k', 100), 2)], tag='side').add_elements(
                    [window.TimestampedValue(('k', 400), 7)],
                    tag='side').advance_watermark_to_infinity(
                        tag='main').advance_watermark_to_infinity(tag='side'))

        main_data = (
            test_stream['main']
            | 'Main windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                accumulation_mode=trigger.AccumulationMode.DISCARDING))

        side_data = (
            test_stream['side']
            | 'Side windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.CombinePerKey(sum)
            | 'Values' >> Map(lambda k_vs: k_vs[1]))

        class RecordFn(beam.DoFn):
            def process(self,
                        elm=beam.DoFn.ElementParam,
                        ts=beam.DoFn.TimestampParam,
                        side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        records = (main_data
                   | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))
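        # Per window, the AsList side input contains the early pane's sum and a
        # 0 from the empty on-time pane (DISCARDING mode), hence the [100, 0]
        # and [400, 0] values expected below.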

        expected_window_to_elements = {
            window.IntervalWindow(0, 5): [
                ('a1', Timestamp(3), [100, 0]),
            ],
            window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
        }

        assert_that(records,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='assert per window')

        p.run()
Example #8
def load(events, metadata=None, pipeline_options=None):
  num_events_in_pane = 30
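  # Put all events into the global window and fire a discarding pane after
  # every 30 events so the join below emits results periodically on an
  # unbounded stream.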
  windowed_events = (
      events
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(num_events_in_pane)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))
  auction_by_seller_id = (
      windowed_events
      | nexmark_query_util.JustAuctions()
      | 'query3_filter_category' >> beam.Filter(lambda auc: auc.category == 10)
      | 'query3_key_by_seller' >> beam.ParDo(
          nexmark_query_util.AuctionBySellerFn()))
  person_by_id = (
      windowed_events
      | nexmark_query_util.JustPerson()
      | 'query3_filter_region' >>
      beam.Filter(lambda person: person.state in ['OR', 'ID', 'CA'])
      | 'query3_key_by_person_id' >> beam.ParDo(
          nexmark_query_util.PersonByIdFn()))
  return ({
      nexmark_query_util.AUCTION_TAG: auction_by_seller_id,
      nexmark_query_util.PERSON_TAG: person_by_id,
  }
          | beam.CoGroupByKey()
          | 'query3_join' >> beam.ParDo(
              JoinFn(metadata.get('max_auction_waiting_time')))
          | 'query3_output' >> beam.Map(
              lambda t: {
                  ResultNames.NAME: t[1].name,
                  ResultNames.CITY: t[1].city,
                  ResultNames.STATE: t[1].state,
                  ResultNames.AUCTION_ID: t[0].id
              }))
Example #9
    def _window_fn(self):
        """Set the correct WindowInto PTransform"""

        # The user-supplied triggering_frequency is often chosen to control how
        # many BigQuery load jobs are triggered, to prevent going over BigQuery's
        # daily quota for load jobs. If this is set to a large value, currently we
        # have to buffer all the data until the trigger fires. Instead we ensure
        # that the files are written if a threshold number of records are ready.
        # We use only the user-supplied trigger on the actual BigQuery load.
        # This allows us to offload the data to the filesystem.
        #
        # In the case of dynamic sharding, however, we use a default trigger, since
        # the transform that performs the sharding also batches elements to avoid
        # generating too many tiny files. The user trigger is applied right after the
        # writes to limit the number of load jobs.
        if self.is_streaming_pipeline and not self.with_auto_sharding:
            return beam.WindowInto(beam.window.GlobalWindows(),
                                   trigger=trigger.Repeatedly(
                                       trigger.AfterAny(
                                           trigger.AfterProcessingTime(
                                               self.triggering_frequency),
                                           trigger.AfterCount(
                                               _FILE_TRIGGERING_RECORD_COUNT))),
                                   accumulation_mode=trigger.AccumulationMode\
                                       .DISCARDING)
        else:
            return beam.WindowInto(beam.window.GlobalWindows())
Example #10
 def expand(self, pcoll):
     return (pcoll
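             # Get periodic results every ten events.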
             | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
                 beam.window.GlobalWindows(),
                 trigger=trigger.Repeatedly(trigger.AfterCount(10)),
                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
             | 'ExtractAndSumScore' >> ExtractAndSumScore('user'))
Example #11
    def test_gbk_execution_after_watermark_trigger(self):
        test_stream = (TestStream()
            .advance_watermark_to(10)
            .add_elements([TimestampedValue('a', 11)])
            .advance_watermark_to(20)
            .add_elements([TimestampedValue('b', 21)])
            .advance_watermark_to_infinity())  # yapf: disable

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        p = TestPipeline(options=options)
        records = (
            p  # pylint: disable=unused-variable
            | test_stream
            | beam.WindowInto(
                FixedWindows(15),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.Map(lambda x: ('k', x))
            | beam.GroupByKey())

        # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
        # respect the TimestampCombiner.  The test below should also verify the
        # timestamps of the outputted elements once this is implemented.

        # assert per window
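        # Each window fires an early pane containing its element and then an
        # empty on-time pane, since DISCARDING mode drops already-emitted data.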
        expected_window_to_elements = {
            window.IntervalWindow(0, 15): [('k', ['a']), ('k', [])],
            window.IntervalWindow(15, 30): [('k', ['b']), ('k', [])],
        }
        assert_that(records,
                    equal_to_per_window(expected_window_to_elements),
                    label='assert per window')

        p.run()
Example #12
 def expand(self, pcoll):
     return (pcoll
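             # Emit an updated (accumulating) result after every 50 tweets.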
             | 'TweetGlobalWindows' >> beam.WindowInto(
                 beam.window.GlobalWindows(),
                 trigger=trigger.Repeatedly(trigger.AfterCount(50)),
                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                 allowed_lateness=self.allowed_lateness_seconds)
             # Extract and sum username/score pairs from the event data.
             | 'ExtractTweets' >> ExtractTweets('user_id'))
Example #13
 def expand(self, p):
     # NOTE: allowed_lateness is not yet available in Python FixedWindows.
     # NOTE: AfterProcessingTime not yet available in Python.
     return (
         p
         | 'window' >> beam.WindowInto(
             beam.window.GlobalWindows(),
             trigger=trigger.AfterWatermark(early=trigger.AfterCount(100)),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'extract_user_score' >> ExtractAndSumScore('user'))
Example #14
 def expand(self, pcoll):
     return (pcoll
             # Get periodic results every ten events.
             | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
                 beam.window.GlobalWindows(),
                 trigger=trigger.Repeatedly(trigger.AfterCount(10)),
                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                 allowed_lateness=self.allowed_lateness_seconds)
             # Extract and sum username/score pairs from the event data.
             | 'ExtractAndSumScore' >> ExtractAndSumScore('user'))
Example #15
    def test_multi_triggered_gbk_side_input(self):
        """Test a GBK sideinput, with multiple triggering."""
        options = StandardOptions(streaming=True)
        p = TestPipeline(options=options)

        test_stream = (
            p
            | 'Mixed TestStream' >> TestStream().advance_watermark_to(
                3, tag='main').add_elements(
                    ['a1'], tag='main').advance_watermark_to(
                        8, tag='main').add_elements(['a2'], tag='main').
            add_elements([window.TimestampedValue(
                ('k', 100), 2)], tag='side').add_elements(
                    [window.TimestampedValue(('k', 400), 7)],
                    tag='side').advance_watermark_to_infinity(
                        tag='main').advance_watermark_to_infinity(tag='side'))

        main_data = (
            test_stream['main']
            | 'Main windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                accumulation_mode=trigger.AccumulationMode.DISCARDING))

        side_data = (
            test_stream['side']
            | 'Side windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.CombinePerKey(sum)
            | 'Values' >> Map(lambda k_vs: k_vs[1]))

        class RecordFn(beam.DoFn):
            def process(self,
                        elm=beam.DoFn.ElementParam,
                        ts=beam.DoFn.TimestampParam,
                        side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        records = (main_data
                   | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))
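        # Per window, the AsList side input contains the early pane's sum and a
        # 0 from the empty on-time pane (DISCARDING mode), hence the [100, 0]
        # and [400, 0] values expected below.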

        expected_window_to_elements = {
            window.IntervalWindow(0, 5): [
                ('a1', Timestamp(3), [100, 0]),
            ],
            window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
        }

        assert_that(records,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='assert per window')

        p.run()
Example #16
  def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
    outputs = (
        destination_data_kv_pc
        | beam.ParDo(
            WriteRecordsToFile(
                schema=self.schema,
                max_files_per_bundle=self.max_files_per_bundle,
                max_file_size=self.max_file_size,
                file_format=self._temp_file_format),
            file_prefix_pcv,
            *self.schema_side_inputs).with_outputs(
                WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                WriteRecordsToFile.WRITTEN_FILE_TAG))

    # A PCollection of (destination, file) tuples. It lists files with records,
    # and the destination each file is meant to be imported into.
    destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

    # A PCollection of (destination, record) tuples. These are later sharded and
    # grouped, and all the records for each destination-shard are written to files.
    # This PCollection is necessary because not all records can be written into
    # files in ``WriteRecordsToFile``.
    unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

    more_destination_files_kv_pc = (
        unwritten_records_pc
        | beam.ParDo(_ShardDestinations())
        | "GroupShardedRows" >> beam.GroupByKey()
        | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
        | "WriteGroupedRecordsToFile" >> beam.ParDo(
            WriteGroupedRecordsToFile(
                schema=self.schema, file_format=self._temp_file_format),
            file_prefix_pcv,
            *self.schema_side_inputs))

    # TODO(BEAM-9494): Remove the identity transform. We flatten both
    # PCollection paths and use an identity function to work around a
    # flatten optimization issue where the wrong coder is being used.
    all_destination_file_pairs_pc = (
        (destination_files_kv_pc, more_destination_files_kv_pc)
        | "DestinationFilesUnion" >> beam.Flatten()
        | "IdentityWorkaround" >> beam.Map(lambda x: x))

    if self.is_streaming_pipeline:
      # Apply the user's trigger back before we start triggering load jobs
      all_destination_file_pairs_pc = (
          all_destination_file_pairs_pc
          | "ApplyUserTrigger" >> beam.WindowInto(
              beam.window.GlobalWindows(),
              trigger=trigger.Repeatedly(
                  trigger.AfterAll(
                      trigger.AfterProcessingTime(self.triggering_frequency),
                      trigger.AfterCount(1))),
              accumulation_mode=trigger.AccumulationMode.DISCARDING))
    return all_destination_file_pairs_pc
Example #17
    def expand(self, pcoll):
        logging.info("Calculate user values: {}".format(pcoll))

        return (pcoll
                # Get periodic results every ten events.
                | 'HighValueUserGlobalWindows' >> beam.WindowInto(
                    beam.window.GlobalWindows(),
                    trigger=trigger.Repeatedly(trigger.AfterCount(10)),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
                # Extract and sum username/value pairs from the event data.
                | 'ExtractAndSumValue' >> ExtractAndSumValue('user'))
Example #18
 def expand(self, pcoll):
     # NOTE: the behavior does not exactly match the Java example
     # TODO: allowed_lateness not implemented yet in FixedWindows
     # TODO: AfterProcessingTime not implemented yet, replace AfterCount
     return (pcoll
             # Get periodic results every ten events.
             | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
                 beam.window.GlobalWindows(),
                 trigger=trigger.Repeatedly(trigger.AfterCount(10)),
                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
             # Extract and sum username/score pairs from the event data.
             | 'ExtractAndSumScore' >> ExtractAndSumScore('user'))
Example #19
 def _maybe_apply_user_trigger(self, destination_file_kv_pc):
     if self.is_streaming_pipeline:
         # Apply the user's trigger back before we start triggering load jobs
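         # The AfterAll trigger fires once the processing-time delay has
         # elapsed and at least one element is buffered; Repeatedly re-arms it
         # for each subsequent pane.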
         return (destination_file_kv_pc
                 | "ApplyUserTrigger" >> beam.WindowInto(
                     beam.window.GlobalWindows(),
                     trigger=trigger.Repeatedly(
                         trigger.AfterAll(
                             trigger.AfterProcessingTime(
                                 self.triggering_frequency),
                             trigger.AfterCount(1))),
                     accumulation_mode=trigger.AccumulationMode.DISCARDING))
     else:
         return destination_file_kv_pc
Example #20
    def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
        outputs = (destination_data_kv_pc
                   | beam.ParDo(WriteRecordsToFile(
                       max_files_per_bundle=self.max_files_per_bundle,
                       max_file_size=self.max_file_size,
                       coder=self.coder),
                                file_prefix=file_prefix_pcv).with_outputs(
                                    WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                                    WriteRecordsToFile.WRITTEN_FILE_TAG))

        # A PCollection of (destination, file) tuples. It lists files with records,
        # and the destination each file is meant to be imported into.
        destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

        # A PCollection of (destination, record) tuples. These are later sharded and
        # grouped, and all the records for each destination-shard are written to files.
        # This PCollection is necessary because not all records can be written into
        # files in ``WriteRecordsToFile``.
        unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

        more_destination_files_kv_pc = (
            unwritten_records_pc
            | beam.ParDo(_ShardDestinations())
            | "GroupShardedRows" >> beam.GroupByKey()
            | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
            | "WriteGroupedRecordsToFile" >> beam.ParDo(
                WriteGroupedRecordsToFile(coder=self.coder),
                file_prefix=file_prefix_pcv))

        all_destination_file_pairs_pc = (
            (destination_files_kv_pc, more_destination_files_kv_pc)
            | "DestinationFilesUnion" >> beam.Flatten())

        if self.is_streaming_pipeline:
            # Apply the user's trigger back before we start triggering load jobs
            all_destination_file_pairs_pc = (
                all_destination_file_pairs_pc
                | "ApplyUserTrigger" >> beam.WindowInto(
                    beam.window.GlobalWindows(),
                    trigger=trigger.Repeatedly(
                        trigger.AfterAll(
                            trigger.AfterProcessingTime(
                                self.triggering_frequency),
                            trigger.AfterCount(1))),
                    accumulation_mode=trigger.AccumulationMode.DISCARDING))
        return all_destination_file_pairs_pc
Example #21
def load(events, metadata=None):
    # find winning bids for each closed auction
    return (events
            # find winning bids
            | beam.Filter(nexmark_query_util.auction_or_bid)
            | winning_bids.WinningBids()
            # auction_bids -> (auction.seller, bid)
            | beam.Map(lambda auc_bid: (auc_bid.auction.seller, auc_bid.bid))
            # calculate and output mean as data arrives
            | beam.WindowInto(
                window.GlobalWindows(),
                trigger=trigger.Repeatedly(trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                allowed_lateness=0)
            | beam.CombinePerKey(MovingMeanSellingPriceFn(10))
            | beam.Map(lambda t: {
                ResultNames.SELLER: t[0],
                ResultNames.PRICE: t[1]
            }))
Example #22
def load(events, metadata=None, pipeline_options=None):

    return (events
            # filter to get only bids and then extract bidder id
            | nexmark_query_util.JustBids()
            | 'query11_extract_bidder' >> beam.Map(lambda bid: bid.bidder)
            # window the bidder ids into sessions
            | 'query11_session_window' >> beam.WindowInto(
                window.Sessions(metadata.get('window_size_sec')),
                trigger=trigger.AfterWatermark(
                    early=trigger.AfterCount(metadata.get('max_log_events'))),
                accumulation_mode=trigger.AccumulationMode.DISCARDING,
                allowed_lateness=metadata.get('occasional_delay_sec') // 2)
            # count per bidder
            | beam.combiners.Count.PerElement()
            | beam.Map(
                lambda bidder_count: {
                    ResultNames.BIDDER_ID: bidder_count[0],
                    ResultNames.BID_COUNT: bidder_count[1]
                }))
Example #23
 def test(self):
   _ = (
       self.pipeline
       | 'Read from pubsub' >> ReadFromPubSub(
           subscription=self.read_sub_name,
           with_attributes=True,
           id_label='id',
       )
       | beam.Map(lambda x: bytes(1)).with_output_types(bytes)
       | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
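        # Keep everything in the global window and fire once the expected
        # number of messages has arrived, so a single count is produced below.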
       | 'Window' >> beam.WindowInto(
           window.GlobalWindows(),
           trigger=trigger.Repeatedly(
               trigger.AfterCount(self.num_of_messages)),
           accumulation_mode=trigger.AccumulationMode.DISCARDING)
       | 'Count messages' >> beam.CombineGlobally(
           beam.combiners.CountCombineFn()).without_defaults().
       with_output_types(int)
       | 'Convert to bytes' >>
       beam.Map(lambda count: str(count).encode('utf-8'))
       | 'Write to Pubsub' >> beam.io.WriteToPubSub(self.matcher_topic_name))
Example #24
    def _window_fn(self):
        """Set the correct WindowInto PTransform"""

        # The user-supplied triggering_frequency is often chosen to control how
        # many BigQuery load jobs are triggered, to prevent going over BigQuery's
        # daily quota for load jobs. If this is set to a large value, currently we
        # have to buffer all the data until the trigger fires. Instead we ensure
        # that the files are written if a threshold number of records are ready.
        # We use only the user-supplied trigger on the actual BigQuery load.
        # This allows us to offload the data to the filesystem.
        if self.is_streaming_pipeline:
            return beam.WindowInto(beam.window.GlobalWindows(),
                                   trigger=trigger.Repeatedly(
                                       trigger.AfterAny(
                                           trigger.AfterProcessingTime(
                                               self.triggering_frequency),
                                           trigger.AfterCount(
                                               _FILE_TRIGGERING_RECORD_COUNT))),
                                   accumulation_mode=trigger.AccumulationMode\
                                       .DISCARDING)
        else:
            return beam.WindowInto(beam.window.GlobalWindows())
Example #25
def run():
    options = PipelineOptions([
        "--runner=PortableRunner", "--job_endpoint=localhost:8099",
        "--environment_type=LOOPBACK"
    ])
    # options = PipelineOptions([
    #     "--runner=FlinkRunner",
    #     "--flink_master=localhost:8081",
    # ])
    with beam.Pipeline(options=options) as p:
        (p | 'ReadFromKafka' >> ReadFromKafka(
            consumer_config={"bootstrap.servers": "localhost:9092"},
            topics=["beam-input"])
         | 'ExtractWords' >>
         beam.FlatMap(lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
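         # Re-window into the global window and fire after every element so
         # word counts are emitted continuously from the unbounded Kafka source.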
         | 'Window' >> beam.WindowInto(
             window.GlobalWindows(),
             trigger=trigger.Repeatedly(trigger.AfterCount(1)),
             accumulation_mode=AccumulationMode.ACCUMULATING)
         | 'Count' >> beam.combiners.Count.PerElement()
         | 'Format' >> beam.Map(lambda word_count: '%s: %s' %
                                (word_count[0], word_count[1]))
         | 'Log' >> beam.ParDo(LoggingDoFn()))
Example #26
  def test_gbk_execution_after_watermark_trigger(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a'])
                   .advance_watermark_to(20))

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result   # pylint: disable=global-variable-undefined
    result = []

    def fired_elements(elem):
      result.append(elem)
      return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p            # pylint: disable=unused-variable
               | test_stream
               | beam.WindowInto(
                   FixedWindows(15),
                   trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                   accumulation_mode=trigger.AccumulationMode.DISCARDING)
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey()
               | beam.Map(fired_elements))
    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # TODO(BEAM-3377): Reinstate after assert_that in streaming is fixed.
    # assert_that(records, equal_to([
    #     ('k', ['a']), ('k', [])]))

    p.run()
    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    self.assertEqual([('k', ['a']), ('k', [])], result)
Example #27
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    (p | 'ReadFromKafka' >> ReadFromKafka(
        consumer_config={"bootstrap.servers": "localhost:9092"},
        topics=["beam-input"])
     | 'ExtractWords' >>
     beam.FlatMap(lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
     | 'Window' >> beam.WindowInto(
         window.GlobalWindows(),
         trigger=trigger.Repeatedly(trigger.AfterCount(1)),
         accumulation_mode=AccumulationMode.ACCUMULATING)
     | 'Count' >> beam.combiners.Count.PerElement()
     | 'Format' >> beam.Map(lambda word_count: '%s: %s' %
                            (word_count[0], word_count[1]))
     | 'Log' >> beam.ParDo(LoggingDoFn()))

    result = p.run()
    result.wait_until_finish()
Example #28
def run(argv=None):
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input', default=TW_INPUT)
            parser.add_argument('--output', default=TW_OUTPUT)

    options = PipelineOptions(flags=argv)

    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.staging_location = STAGING_LOCATION
    google_cloud_options.temp_location = TEMP_LOCATION
    google_cloud_options.flexrs_goal = 'COST_OPTIMIZED'
    # google_cloud_options.job_name = 'hashtags-battle-job'
    """
    -> Uncomment this to run the pipeline on the Cloud Dataflow runner.
    $ python main.py --setup_file ./setup.py --machine_type=n1-standard-2 --max_num_workers=2 --disk_size_gb=30
    """
    # options.view_as(StandardOptions).runner = 'DataflowRunner'

    with beam.Pipeline(options=options) as p:
        my_options = options.view_as(MyOptions)
        input_topic = my_options.input
        output_topic = my_options.output
        """
        -> Consumes/collects events sent by the input Pub/Sub topic.
        The id_label argument is a unique identifier used by the pipeline to
        deduplicate events (exactly-once semantics).
        """
        inputs = \
            (p
             | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
                            topic=input_topic,
                            # id_label='event_id'
                    ).with_output_types(six.binary_type)
             | 'Decode Binary' >> beam.Map(lambda element: element.decode('utf-8'))
             | 'Transform Json To Dict' >> beam.Map(lambda element: json.loads(element)))
        # | 'Add Event Time' >> beam.ParDo(AddTimestampFn())
        """
        -> Extracts the hashtags array from each event object.
        """
        hashtags = \
            (inputs
             | 'Get Hashtags' >> beam.Map(lambda element: element['hashtags'])
             | 'Explode Hashtags' >> beam.FlatMap(lambda element: element))
        """
        -> Outputs a batch of pre-aggregated hashtags.
        Early results are triggered from the window every X seconds (processing-time trigger)
        or when the current pane has collected at least N elements (data-driven trigger).
        The values used are for testing purposes.
        """
        (hashtags
         | 'Apply Daily Window' >> beam.WindowInto(
             beam.window.FixedWindows(SECONDS_IN_1_DAY),
             trigger=trigger.Repeatedly(trigger.AfterCount(10)),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'Grouping Hashtags' >> PairWithOneCombine()
         | 'Format Hashtags' >> beam.ParDo(FormatHashtagFn())
         | 'Batch Hashtags' >> beam.BatchElements(min_batch_size=49,
                                                  max_batch_size=50)
         | 'Publish Hashtags' >> WriteToPubSub(
             topic=output_topic, category=Category.DAILY_HASHTAGS))
        """
        -> Outputs the sum of processed events for a given fixed-time window.
        """
        (hashtags
         | 'Apply 5 Minutes' >> beam.WindowInto(
             beam.window.FixedWindows(size=5 * 60),
             trigger=trigger.Repeatedly(trigger.AfterCount(20)),
             accumulation_mode=trigger.AccumulationMode.DISCARDING)
         | 'CG+CC' >> beam.CombineGlobally(
             beam.combiners.CountCombineFn()).without_defaults()
         | 'Publish Events Sum' >> WriteToPubSub(
             topic=output_topic, category=Category.GLOBAL_EVENTS))
        """
        -> Outputs the top 5 trending hashtags within a given fixed-time window.
        """
        (hashtags
         | 'Apply %s Min FW' % '30' >> beam.WindowInto(
             beam.window.FixedWindows(size=SECONDS_IN_HALF_HOUR),
             trigger=trigger.Repeatedly(trigger.AfterCount(2)),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'Grouping Trends' >> PairWithOneCombine()
         | '%s Trending Hashtags' % TRENDING_HASHTAGS_LIMIT >>
         beam.CombineGlobally(
             TopDistinctFn(
                 n=TRENDING_HASHTAGS_LIMIT,
                 compare=lambda a, b: a[1] < b[1])).without_defaults()
         | 'Format Trending Hashtags' >> beam.ParDo(FormatHashtagsFn())
         | 'Publish Trending Hashtags' >> WriteToPubSub(
             topic=output_topic, category=Category.TRENDING_HASHTAGS))
Example #29
    def test_multiple_outputs_with_watermark_advancement(self):
        """Tests that the TestStream can independently control output watermarks."""

        # Purposely set the watermark of numbers to 20 then letters to 5 to test
        # that the watermark advancement is per PCollection.
        #
        # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
        # emitted at different times so that they will have different windows. The
        # watermark advancement is checked by checking their windows. If the
        # watermark does not advance, then the windows will be [-inf, -inf). If the
        # windows do not advance separately, then both PCollections will be
        # windowed into [15, 30).
        letters_elements = [
            TimestampedValue('a', 6),
            TimestampedValue('b', 7),
            TimestampedValue('c', 8),
        ]
        numbers_elements = [
            TimestampedValue('1', 21),
            TimestampedValue('2', 22),
            TimestampedValue('3', 23),
        ]
        test_stream = (TestStream().advance_watermark_to(
            0, tag='letters').advance_watermark_to(
                0, tag='numbers').advance_watermark_to(
                    20, tag='numbers').advance_watermark_to(
                        5, tag='letters').add_elements(
                            letters_elements,
                            tag='letters').advance_watermark_to(
                                10, tag='letters').add_elements(
                                    numbers_elements,
                                    tag='numbers').advance_watermark_to(
                                        30, tag='numbers'))

        options = StandardOptions(streaming=True)
        p = TestPipeline(is_integration_test=True, options=options)

        main = p | test_stream

        # Use an AfterWatermark trigger with an early firing to test that the
        # watermark is advancing properly and that the element is being emitted in
        # the correct window.
        letters = (
            main['letters']
            | 'letter windows' >> beam.WindowInto(
                FixedWindows(15),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | 'letter with key' >> beam.Map(lambda x: ('k', x))
            | 'letter gbk' >> beam.GroupByKey())

        numbers = (
            main['numbers']
            | 'number windows' >> beam.WindowInto(
                FixedWindows(15),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | 'number with key' >> beam.Map(lambda x: ('k', x))
            | 'number gbk' >> beam.GroupByKey())

        # The letters were emitted when the watermark was at 5, thus we expect to
        # see the elements in the [0, 15) window. We used an early trigger to make
        # sure that the ON_TIME empty pane was also emitted with a TestStream.
        # This pane has no data because the early trigger causes the elements to
        # fire before the end of the window and because the accumulation mode
        # discards any data after the trigger fired.
        expected_letters = {
            window.IntervalWindow(0, 15): [
                ('k', ['a', 'b', 'c']),
                ('k', []),
            ],
        }

        # Same here, except the numbers were emitted at watermark = 20, thus they
        # are in the [15, 30) window.
        expected_numbers = {
            window.IntervalWindow(15, 30): [
                ('k', ['1', '2', '3']),
                ('k', []),
            ],
        }
        assert_that(letters,
                    equal_to_per_window(expected_letters),
                    label='letters assert per window')
        assert_that(numbers,
                    equal_to_per_window(expected_numbers),
                    label='numbers assert per window')

        p.run()
Example #30
def run(argv=None):
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument(
                '--input',
                default='projects/notbanana-7f869/topics/rsvps_source')
            parser.add_argument(
                '--output',
                default='projects/notbanana-7f869/topics/rsvps_out')

    options = PipelineOptions(flags=argv)

    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'notbanana-7f869'
    google_cloud_options.staging_location = 'gs://notbanana-7f869.appspot.com/staging'
    google_cloud_options.temp_location = 'gs://notbanana-7f869.appspot.com/temp'
    google_cloud_options.job_name = 'demo-job'
    """
    -> Run the pipeline on the Cloud Dataflow runner.
    $ python pipelines/main.py --setup_file path/to/setup.py
    """
    # options.view_as(StandardOptions).runner = 'DataflowRunner'

    with beam.Pipeline(options=options) as p:
        my_options = options.view_as(MyOptions)
        input_topic = my_options.input
        output_topic = my_options.output
        """
        -> Consumes/collects events sent by the input Pub/Sub topic.
        The id_label argument is a unique identifier used by the pipeline to
        deduplicate events (exactly-once semantics).
        """
        inputs = \
            (p
             | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
                            topic=input_topic,
                            # id_label='event_id'
                    ).with_output_types(six.binary_type)
             | 'Decode Binary' >> beam.Map(lambda element: element.decode('utf-8'))
             | 'Transform Json To Dict' >> beam.Map(lambda element: json.loads(element))
             | 'Filter noVenue' >> beam.ParDo(FilterNoVenueEventsFn()))
        """ 
        -> Outputs the total number of events globally processed by the pipeline.
        Early results are triggered from the window every X seconds (processing-time trigger)
        or when the current pane has collected at least N elements (data-driven trigger).
        The values used are for testing purposes.
        """
        (inputs
         | 'Apply Global Window' >> beam.WindowInto(
             beam.window.GlobalWindows(),
             trigger=trigger.Repeatedly(
                 trigger.AfterAny(
                     trigger.AfterCount(2),
                     # AfterProcessingTime is experimental.
                     # Not implemented yet.
                     trigger.AfterProcessingTime(30))),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'Count events globally' >> beam.CombineGlobally(
             beam.combiners.CountCombineFn()).without_defaults()
         | 'Publish %s' % 'Events' >> WriteToPubSub(
             topic=output_topic, category=Category.GLOBAL_EVENTS))
        """
        -> Outputs the top 10 hottest topics within a fixed window of X seconds.
        The values used are for testing purposes.
        NB: uses a custom TopDistinctFn that deduplicates k/v pairs when using an
        accumulating strategy (see SO 56616576 by @guillem-xercavins).
        """
        (inputs
         | 'Apply Window of time %s' % 'Topics' >> beam.WindowInto(
             beam.window.FixedWindows(size=10 * 60),
             trigger=trigger.Repeatedly(trigger.AfterCount(5)),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | beam.Map(lambda element: element['group'])
         | beam.ParDo(PairTopicWithOneFn())
         | beam.CombinePerKey(sum)
         | 'Top 10 Topics' >> beam.CombineGlobally(
             TopDistinctFn(
                 n=10, compare=lambda a, b: a[1] < b[1])).without_defaults()
         | 'DictFormat %s' % 'Topics' >> beam.ParDo(FormatTopTopicFn())
         | 'Publish %s' % 'Topics' >> WriteToPubSub(
             topic=output_topic, category=Category.HOT_TOPICS))