def load(events, metadata=None, pipeline_options=None):
    """Assemble query 10: shard events, write them, then write an index.

    Args:
      events: input PCollection.
      metadata: mapping read with ``.get`` for ``'window_size_sec'`` and
        ``'max_log_events'``. It is dereferenced unconditionally, so the
        ``None`` default is not usable in practice.
      pipeline_options: forwarded as an extra argument to ``WriteEventDoFn``
        and ``WriteIndexDoFn``.

    Returns:
      The output of the final ``query10_write_index`` step.
    """
    window_size_sec = metadata.get('window_size_sec')
    max_log_events = metadata.get('max_log_events')
    # Use a 1 day allowed lateness so that any forgotten hold will stall
    # the pipeline for that period and be very noticeable.
    one_day_sec = 1 * 24 * 60 * 60

    event_windows = window.FixedWindows(window_size_sec)
    # The composite trigger runs its sub-triggers in order.
    # First sub-trigger: fire repeatedly once at least max_log_events elements
    # are in the pane, finishing when the watermark passes the end of the
    # window.
    until_watermark = trigger.OrFinally(
        trigger.Repeatedly(trigger.AfterCount(max_log_events)),
        trigger.AfterWatermark())
    # Second sub-trigger: fire repeatedly once at least max_log_events
    # elements are in the pane, or once processing time passes the first
    # element in the pane plus the delay.
    past_watermark = trigger.Repeatedly(
        trigger.AfterAny(
            trigger.AfterCount(max_log_events),
            trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))

    sharded = events | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
    windowed = sharded | 'query10_fix_window' >> beam.WindowInto(
        event_windows,
        trigger=trigger.AfterEach(until_watermark, past_watermark),
        accumulation_mode=trigger.AccumulationMode.DISCARDING,
        allowed_lateness=Duration.of(one_day_sec))
    written_events = (
        windowed
        | 'query10_gbk' >> beam.GroupByKey()
        | 'query10_write_event' >> beam.ParDo(
            WriteEventDoFn(), pipeline_options))

    log_files = written_events | 'query10_window_log_files' >> beam.WindowInto(
        window.FixedWindows(window_size_sec),
        accumulation_mode=trigger.AccumulationMode.DISCARDING,
        allowed_lateness=Duration.of(one_day_sec))
    return (
        log_files
        | 'query10_gbk_2' >> beam.GroupByKey()
        | 'query10_write_index' >> beam.ParDo(
            WriteIndexDoFn(), pipeline_options))
def load(events, metadata=None, pipeline_options=None):
    """Assemble query 3: join category-10 auctions with OR/ID/CA persons.

    Args:
      events: input PCollection.
      metadata: mapping read with ``.get`` for ``'max_auction_waiting_time'``,
        which is handed to ``JoinFn``.
      pipeline_options: accepted but not used by this function.

    Returns:
      A PCollection of dicts keyed by ``ResultNames.NAME``, ``CITY``,
      ``STATE`` and ``AUCTION_ID``.
    """
    pane_size = 30
    # Global window that emits a discarding pane every `pane_size` elements.
    in_global_window = events | beam.WindowInto(
        window.GlobalWindows(),
        trigger=trigger.Repeatedly(trigger.AfterCount(pane_size)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)

    auctions_keyed_by_seller = (
        in_global_window
        | nexmark_query_util.JustAuctions()
        | 'query3_filter_category' >> beam.Filter(
            lambda auction: auction.category == 10)
        | 'query3_key_by_seller' >> beam.ParDo(
            nexmark_query_util.AuctionBySellerFn()))

    persons_keyed_by_id = (
        in_global_window
        | nexmark_query_util.JustPerson()
        | 'query3_filter_region' >> beam.Filter(
            lambda person: person.state in ['OR', 'ID', 'CA'])
        | 'query3_key_by_person_id' >> beam.ParDo(
            nexmark_query_util.PersonByIdFn()))

    tagged_inputs = {
        nexmark_query_util.AUCTION_TAG: auctions_keyed_by_seller,
        nexmark_query_util.PERSON_TAG: persons_keyed_by_id,
    }
    cogrouped = tagged_inputs | beam.CoGroupByKey()
    joined = cogrouped | 'query3_join' >> beam.ParDo(
        JoinFn(metadata.get('max_auction_waiting_time')))
    # Each joined element is indexed as (auction-like, person-like): the
    # auction id comes from element [0], the person fields from element [1].
    return joined | 'query3_output' >> beam.Map(
        lambda pair: {
            ResultNames.NAME: pair[1].name,
            ResultNames.CITY: pair[1].city,
            ResultNames.STATE: pair[1].state,
            ResultNames.AUCTION_ID: pair[0].id
        })
def _window_fn(self):
    """Return the WindowInto PTransform to apply before writing files.

    Batch pipelines, and streaming pipelines that use auto-sharding, get a
    plain global window. Other streaming pipelines get a global window that
    fires on either the user's triggering frequency or a record-count
    threshold, whichever comes first.
    """
    # With dynamic sharding the default trigger is kept: the sharding
    # transform already batches elements to avoid producing many tiny files,
    # and the user trigger is applied right after the writes to limit the
    # number of load jobs.
    if not self.is_streaming_pipeline or self.with_auto_sharding:
        return beam.WindowInto(beam.window.GlobalWindows())

    # The user-supplied triggering_frequency is usually chosen to limit how
    # many BigQuery load jobs run (daily quota). If it is large, waiting on it
    # alone would buffer all data until it fires, so files are also flushed
    # once a threshold number of records is ready. Only the user's trigger is
    # used for the actual BigQuery load; this lets the data be offloaded to
    # the filesystem in the meantime.
    time_or_count = trigger.AfterAny(
        trigger.AfterProcessingTime(self.triggering_frequency),
        trigger.AfterCount(_FILE_TRIGGERING_RECORD_COUNT))
    return beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=trigger.Repeatedly(time_or_count),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
def expand(self, pcoll):
    """Re-window `pcoll` globally and apply ``ExtractAndSumScore('user')``.

    The global window fires an accumulating pane every 10 elements.
    """
    every_ten_elements = trigger.Repeatedly(trigger.AfterCount(10))
    globally_windowed = pcoll | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=every_ten_elements,
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
    return globally_windowed | 'ExtractAndSumScore' >> ExtractAndSumScore('user')
def expand(self, pcoll):
    """Re-window `pcoll` globally and apply ``ExtractTweets('user_id')``.

    The global window fires an accumulating pane every 50 elements and uses
    ``self.allowed_lateness_seconds`` as its allowed lateness.
    """
    every_fifty_elements = trigger.Repeatedly(trigger.AfterCount(50))
    globally_windowed = pcoll | 'TweetGlobalWindows' >> beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=every_fifty_elements,
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
        allowed_lateness=self.allowed_lateness_seconds)
    # Hand the windowed events to the tweet extraction transform.
    return globally_windowed | 'ExtractTweets' >> ExtractTweets('user_id')
def expand(self, pcoll):
    """Re-window `pcoll` globally and apply ``ExtractAndSumScore('user')``.

    The global window fires an accumulating pane every 10 elements and uses
    ``self.allowed_lateness_seconds`` as its allowed lateness.
    """
    # Periodic results: one pane per ten events.
    every_ten_elements = trigger.Repeatedly(trigger.AfterCount(10))
    globally_windowed = pcoll | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=every_ten_elements,
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
        allowed_lateness=self.allowed_lateness_seconds)
    # Extract and sum username/score pairs from the event data.
    return globally_windowed | 'ExtractAndSumScore' >> ExtractAndSumScore('user')
def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
    """Write (destination, record) pairs to files and return the file list.

    Args:
      destination_data_kv_pc: PCollection fed to ``WriteRecordsToFile``.
      file_prefix_pcv: passed as an extra argument to both file-writing
        DoFns.

    Returns:
      A PCollection combining the files written directly with the files
      written for the records that had to be sharded and grouped first. In a
      streaming pipeline it is additionally re-windowed with the user's
      trigger.
    """
    direct_writer = beam.ParDo(
        WriteRecordsToFile(
            schema=self.schema,
            max_files_per_bundle=self.max_files_per_bundle,
            max_file_size=self.max_file_size,
            file_format=self._temp_file_format),
        file_prefix_pcv,
        *self.schema_side_inputs)
    tagged = destination_data_kv_pc | direct_writer.with_outputs(
        WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
        WriteRecordsToFile.WRITTEN_FILE_TAG)

    # (destination, file) tuples: files that already hold records, paired
    # with the destination each one is meant to be imported into.
    written_files = tagged[WriteRecordsToFile.WRITTEN_FILE_TAG]

    # (destination, record) tuples that ``WriteRecordsToFile`` could not put
    # into a file. They are sharded and grouped below, and every
    # destination-shard group is then written to files.
    leftover_records = tagged[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

    grouped_writer = beam.ParDo(
        WriteGroupedRecordsToFile(
            schema=self.schema, file_format=self._temp_file_format),
        file_prefix_pcv,
        *self.schema_side_inputs)
    grouped_files = (
        leftover_records
        | beam.ParDo(_ShardDestinations())
        | "GroupShardedRows" >> beam.GroupByKey()
        # The key is indexed as key[0], dropping the shard number.
        | "DropShardNumber" >> beam.Map(lambda keyed: (keyed[0][0], keyed[1]))
        | "WriteGroupedRecordsToFile" >> grouped_writer)

    # TODO(BEAM-9494): Remove the identity transform. Both paths are
    # flattened and then passed through an identity function to work around
    # a flatten optimization issue where the wrong coder is being used.
    all_files = (
        (written_files, grouped_files)
        | "DestinationFilesUnion" >> beam.Flatten()
        | "IdentityWorkaround" >> beam.Map(lambda pair: pair))

    if not self.is_streaming_pipeline:
        return all_files

    # Apply the user's trigger back before we start triggering load jobs.
    user_trigger = trigger.Repeatedly(
        trigger.AfterAll(
            trigger.AfterProcessingTime(self.triggering_frequency),
            trigger.AfterCount(1)))
    return all_files | "ApplyUserTrigger" >> beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=user_trigger,
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
def expand(self, pcoll):
    """Re-window `pcoll` globally and apply ``ExtractAndSumValue('user')``.

    Logs the incoming PCollection first. The global window fires an
    accumulating pane every 10 elements.
    """
    logging.info("Calculate user values: {}".format(pcoll))
    # Periodic results: one pane per ten events.
    every_ten_elements = trigger.Repeatedly(trigger.AfterCount(10))
    globally_windowed = pcoll | 'HighValueUserGlobalWindows' >> beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=every_ten_elements,
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
    # Extract and sum username/value pairs from the event data.
    return globally_windowed | 'ExtractAndSumValue' >> ExtractAndSumValue('user')
def expand(self, pcoll):
    """Re-window `pcoll` globally and apply ``ExtractAndSumScore('user')``.

    The global window fires an accumulating pane every 10 elements.
    """
    # NOTE: the behavior does not exactly match the Java example.
    # TODO: allowed_lateness not implemented yet in FixedWindows
    # TODO: AfterProcessingTime not implemented yet, replace AfterCount
    every_ten_elements = trigger.Repeatedly(trigger.AfterCount(10))
    # Periodic results: one pane per ten events.
    globally_windowed = pcoll | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=every_ten_elements,
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
    # Extract and sum username/score pairs from the event data.
    return globally_windowed | 'ExtractAndSumScore' >> ExtractAndSumScore('user')
def _maybe_apply_user_trigger(self, destination_file_kv_pc):
    """Re-window with the user's trigger when streaming; otherwise pass through.

    Args:
      destination_file_kv_pc: PCollection to (optionally) re-window.

    Returns:
      The input unchanged for non-streaming pipelines. For streaming
      pipelines, the input in a global window whose discarding trigger
      repeatedly requires both the processing-time delay
      ``self.triggering_frequency`` and at least one element.
    """
    if not self.is_streaming_pipeline:
        return destination_file_kv_pc

    # Apply the user's trigger back before we start triggering load jobs.
    user_trigger = trigger.Repeatedly(
        trigger.AfterAll(
            trigger.AfterProcessingTime(self.triggering_frequency),
            trigger.AfterCount(1)))
    return destination_file_kv_pc | "ApplyUserTrigger" >> beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=user_trigger,
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
def load(events, metadata=None):
    """Assemble query 12: count bids per bidder on a processing-time trigger.

    Args:
      events: input PCollection.
      metadata: mapping read with ``.get`` for ``'window_size_sec'``, used as
        the processing-time trigger delay.

    Returns:
      A PCollection of dicts keyed by ``ResultNames.BIDDER_ID`` and
      ``ResultNames.BID_COUNT``.
    """
    bidders = (
        events
        | nexmark_query_util.JustBids()
        | 'query12_extract_bidder' >> beam.Map(lambda bid: bid.bidder))

    # Windowing with a processing-time trigger; currently not supported in
    # batch.
    processing_time_trigger = trigger.Repeatedly(
        trigger.AfterProcessingTime(metadata.get('window_size_sec')))
    windowed_bidders = bidders | beam.WindowInto(
        window.GlobalWindows(),
        trigger=processing_time_trigger,
        accumulation_mode=trigger.AccumulationMode.DISCARDING,
        allowed_lateness=0)

    counts = windowed_bidders | 'query12_bid_count' >> beam.combiners.Count.PerElement()
    return counts | 'query12_output' >> beam.Map(
        lambda bidder_count: {
            ResultNames.BIDDER_ID: bidder_count[0],
            ResultNames.BID_COUNT: bidder_count[1]
        })
def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
    """Write (destination, record) pairs to files and return the file list.

    Args:
      destination_data_kv_pc: PCollection fed to ``WriteRecordsToFile``.
      file_prefix_pcv: passed as the ``file_prefix`` keyword argument to both
        file-writing DoFns.

    Returns:
      A PCollection combining the files written directly with the files
      written for the records that had to be sharded and grouped first. In a
      streaming pipeline it is additionally re-windowed with the user's
      trigger.
    """
    direct_writer = beam.ParDo(
        WriteRecordsToFile(
            max_files_per_bundle=self.max_files_per_bundle,
            max_file_size=self.max_file_size,
            coder=self.coder),
        file_prefix=file_prefix_pcv)
    tagged = destination_data_kv_pc | direct_writer.with_outputs(
        WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
        WriteRecordsToFile.WRITTEN_FILE_TAG)

    # (destination, file) tuples: files that already hold records, paired
    # with the destination each one is meant to be imported into.
    written_files = tagged[WriteRecordsToFile.WRITTEN_FILE_TAG]

    # (destination, record) tuples that ``WriteRecordsToFile`` could not put
    # into a file. They are sharded and grouped below, and every
    # destination-shard group is then written to files.
    leftover_records = tagged[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

    grouped_files = (
        leftover_records
        | beam.ParDo(_ShardDestinations())
        | "GroupShardedRows" >> beam.GroupByKey()
        # The key is indexed as key[0], dropping the shard number.
        | "DropShardNumber" >> beam.Map(lambda keyed: (keyed[0][0], keyed[1]))
        | "WriteGroupedRecordsToFile" >> beam.ParDo(
            WriteGroupedRecordsToFile(coder=self.coder),
            file_prefix=file_prefix_pcv))

    all_files = (
        (written_files, grouped_files)
        | "DestinationFilesUnion" >> beam.Flatten())

    if not self.is_streaming_pipeline:
        return all_files

    # Apply the user's trigger back before we start triggering load jobs.
    user_trigger = trigger.Repeatedly(
        trigger.AfterAll(
            trigger.AfterProcessingTime(self.triggering_frequency),
            trigger.AfterCount(1)))
    return all_files | "ApplyUserTrigger" >> beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=user_trigger,
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
def load(events, metadata=None):
    """Emit a per-seller selling-price aggregate for winning bids.

    Args:
      events: input PCollection.
      metadata: accepted but not used by this function.

    Returns:
      A PCollection of dicts keyed by ``ResultNames.SELLER`` and
      ``ResultNames.PRICE``.
    """
    # Find the winning bid for each closed auction.
    auction_bids = (
        events
        | beam.Filter(nexmark_query_util.auction_or_bid)
        | winning_bids.WinningBids())

    # auction_bid -> (auction.seller, bid)
    bids_by_seller = auction_bids | beam.Map(
        lambda auction_bid: (auction_bid.auction.seller, auction_bid.bid))

    # Fire on every element so the aggregate is emitted as data arrives.
    windowed = bids_by_seller | beam.WindowInto(
        window.GlobalWindows(),
        trigger=trigger.Repeatedly(trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
        allowed_lateness=0)

    price_by_seller = windowed | beam.CombinePerKey(MovingMeanSellingPriceFn(10))
    return price_by_seller | beam.Map(
        lambda seller_price: {
            ResultNames.SELLER: seller_price[0],
            ResultNames.PRICE: seller_price[1]
        })
def test(self):
    """Build the Pub/Sub read pipeline that publishes a message count.

    Reads from ``self.read_sub_name``, records timing under
    ``self.metrics_namespace``, counts elements in panes of
    ``self.num_of_messages`` and writes each count, UTF-8 encoded, to
    ``self.matcher_topic_name``.
    """
    messages = self.pipeline | 'Read from pubsub' >> ReadFromPubSub(
        subscription=self.read_sub_name,
        with_attributes=True,
        id_label='id',
    )
    # Every message is replaced by the constant ``bytes(1)`` (a single zero
    # byte); only the number of elements matters downstream.
    placeholders = messages | beam.Map(lambda x: bytes(1)).with_output_types(bytes)
    timed = placeholders | 'Measure time' >> beam.ParDo(
        MeasureTime(self.metrics_namespace))

    # One discarding pane per `num_of_messages` elements.
    windowed = timed | 'Window' >> beam.WindowInto(
        window.GlobalWindows(),
        trigger=trigger.Repeatedly(trigger.AfterCount(self.num_of_messages)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)

    count_messages = beam.CombineGlobally(
        beam.combiners.CountCombineFn()).without_defaults().with_output_types(int)
    _ = (
        windowed
        | 'Count messages' >> count_messages
        | 'Convert to bytes' >> beam.Map(lambda count: str(count).encode('utf-8'))
        | 'Write to Pubsub' >> beam.io.WriteToPubSub(self.matcher_topic_name))
def _window_fn(self):
    """Return the WindowInto PTransform to apply before writing files.

    Batch pipelines get a plain global window. Streaming pipelines get a
    global window that fires on either the user's triggering frequency or a
    record-count threshold, whichever comes first.
    """
    if not self.is_streaming_pipeline:
        return beam.WindowInto(beam.window.GlobalWindows())

    # The user-supplied triggering_frequency is usually chosen to limit how
    # many BigQuery load jobs run (daily quota). If it is large, waiting on it
    # alone would buffer all data until it fires, so files are also flushed
    # once a threshold number of records is ready. Only the user's trigger is
    # used for the actual BigQuery load; this lets the data be offloaded to
    # the filesystem in the meantime.
    time_or_count = trigger.AfterAny(
        trigger.AfterProcessingTime(self.triggering_frequency),
        trigger.AfterCount(_FILE_TRIGGERING_RECORD_COUNT))
    return beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=trigger.Repeatedly(time_or_count),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
def run():
    """Run a streaming word count over the ``beam-input`` Kafka topic.

    Uses the portable runner against ``localhost:8099`` in LOOPBACK mode,
    emits an updated count on every element and hands each formatted
    ``word: count`` string to ``LoggingDoFn``.
    """
    runner_flags = [
        "--runner=PortableRunner",
        "--job_endpoint=localhost:8099",
        "--environment_type=LOOPBACK"
    ]
    # Alternative: run directly on Flink.
    # runner_flags = [
    #     "--runner=FlinkRunner",
    #     "--flink_master=localhost:8081",
    # ]
    pipeline_options = PipelineOptions(runner_flags)

    with beam.Pipeline(options=pipeline_options) as pipeline:
        records = pipeline | 'ReadFromKafka' >> ReadFromKafka(
            consumer_config={"bootstrap.servers": "localhost:9092"},
            topics=["beam-input"])
        # Each record is indexed as a pair; the words come from element [1].
        words = records | 'ExtractWords' >> beam.FlatMap(
            lambda record: re.findall(r'[A-Za-z\']+', record[1]))
        windowed_words = words | 'Window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(trigger.AfterCount(1)),
            accumulation_mode=AccumulationMode.ACCUMULATING)
        (
            windowed_words
            | 'Count' >> beam.combiners.Count.PerElement()
            | 'Format' >> beam.Map(
                lambda counted: '%s: %s' % (counted[0], counted[1]))
            | 'Log' >> beam.ParDo(LoggingDoFn()))
def run(argv=None):
    """Run a streaming word count over the ``beam-input`` Kafka topic.

    Args:
      argv: command-line arguments; ``None`` lets argparse fall back to
        ``sys.argv``. No script-specific flags are defined, so everything
        argparse does not consume is forwarded to ``PipelineOptions``.

    The pipeline emits an updated count on every element, hands each
    formatted ``word: count`` string to ``LoggingDoFn`` and blocks until the
    pipeline finishes.
    """
    parser = argparse.ArgumentParser()
    _, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    (
        p
        | 'ReadFromKafka' >> ReadFromKafka(
            consumer_config={"bootstrap.servers": "localhost:9092"},
            topics=["beam-input"])
        # Tuple parameters (``lambda (k, v): ...``) were removed in Python 3
        # (PEP 3113), so the key/value pair is indexed instead of unpacked in
        # the signature. Only the value, element [1], is tokenized.
        | 'ExtractWords' >> beam.FlatMap(
            lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
        | 'Window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(trigger.AfterCount(1)),
            accumulation_mode=AccumulationMode.ACCUMULATING)
        | 'Count' >> beam.combiners.Count.PerElement()
        | 'Format' >> beam.Map(
            lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
        | 'Log' >> beam.ParDo(LoggingDoFn()))

    result = p.run()
    result.wait_until_finish()
def main(argv=None):
    """Run three streaming branches over ride events read from Pub/Sub.

    Args:
      argv: command-line arguments; ``None`` lets argparse fall back to
        ``sys.argv``. ``--input_topic`` and ``--output_topic`` override the
        module-level ``READ_TOPIC`` and ``TOPIC`` defaults; every other
        argument is forwarded to ``PipelineOptions``.

    Branches, all published to the output topic as indented JSON bytes:
      1. Every event, keyed by ``ride_id``, in 60 s session windows.
      2. The sum of ``meter_increment`` per 60 s fixed window.
      3. Per-ride results of ``PickupFn`` whose ``ride_status`` is "pickup".
    """
    def json_parser(x):
        return json.loads(x)

    def bye(x):
        # Log each outgoing element and pass it through unchanged.
        logging.info('outing: %s', x)
        return x

    def to_json_bytes(x):
        return json.dumps(x, indent=2).encode('utf-8')

    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output_topic")
    # parse_known_args returns (namespace, remaining_args). The original kept
    # the tuple and never used it, so the flags had no effect and `argv` was
    # never forwarded to Beam.
    known_args, pipeline_args = parser.parse_known_args(argv)
    input_topic = known_args.input_topic or READ_TOPIC
    output_topic = known_args.output_topic or TOPIC

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    data = (
        p
        | 'ReadData' >> beam.io.ReadFromPubSub(
            topic=input_topic).with_output_types(bytes)
        | "JSONParse" >> beam.Map(json_parser))

    # Branch 1: pass every event through, keyed by ride and session-windowed.
    (
        data
        | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
        | "Windowing" >> beam.WindowInto(
            window.Sessions(60),
            trigger=tr.AfterWatermark(
                early=tr.Repeatedly(
                    tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
            accumulation_mode=tr.AccumulationMode.DISCARDING,
            allowed_lateness=0)
        | 'ToBytes' >> beam.Map(to_json_bytes)
        | 'Bye' >> beam.Map(bye)
        | 'WriteToPubSub' >> beam.io.WriteToPubSub(output_topic))

    # Branch 2: running sum of meter increments per one-minute fixed window,
    # with early and late firings.
    (
        data
        | "SlidWindowing" >> beam.WindowInto(
            window.FixedWindows(60),
            trigger=tr.AfterWatermark(
                early=tr.Repeatedly(
                    tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
                late=tr.Repeatedly(tr.AfterCount(1))),
            allowed_lateness=300,
            accumulation_mode=tr.AccumulationMode.ACCUMULATING)
        | "Extract" >> beam.Map(lambda x: x["meter_increment"])
        | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
        | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
        | "Enrich with time data" >> beam.ParDo(Enrich())
        | "ToBytesCount" >> beam.Map(to_json_bytes)
        | 'Bye2' >> beam.Map(bye)
        | "WriteCount" >> beam.io.WriteToPubSub(output_topic))

    # Branch 3: per-ride combine, keeping only results in "pickup" status.
    (
        data
        | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
        | "SessionWindowing" >> beam.WindowInto(
            window.Sessions(60),
            trigger=tr.AfterWatermark(
                early=tr.Repeatedly(
                    tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
            accumulation_mode=tr.AccumulationMode.ACCUMULATING,
            allowed_lateness=0)
        | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
        | "Discarding Key" >> beam.Map(lambda x: x[1])
        # A Map returning None does not drop the element: the None flowed on
        # and was published as the JSON literal "null". Filter really removes
        # non-pickup results.
        | "Filter not pickup" >> beam.Filter(
            lambda x: str(x["ride_status"]) == "pickup")
        | "ToBytesPickup" >> beam.Map(to_json_bytes)
        | 'Bye3' >> beam.Map(bye)
        | "WritePickup" >> beam.io.WriteToPubSub(output_topic))

    result = p.run()
    result.wait_until_finish()
def Run(argv=None):
    """Join events with play events and write slow users to BigQuery.

    Args:
      argv: command-line arguments handed to ``ParseArgs``, which must yield
        ``(known_args, pipeline_args)``. ``known_args`` is read for ``topic``,
        ``play_topic``, ``session_gap``, ``output_tablename`` and
        ``output_dataset``.

    Raises:
      ValueError: if ``topic`` or ``play_topic`` is missing.

    Both topics are read with the ``timestamp_ms`` attribute as event time,
    keyed by ``event_id``, session-windowed and co-grouped so that
    ``ComputeLatency`` can run per key. A repeatedly refreshed global mean of
    the latencies is passed as a side input to ``DetectBadUsers``, whose
    output is written to BigQuery. Blocks until the pipeline finishes.
    """
    known_args, pipeline_args = ParseArgs(argv)

    if not known_args.topic or not known_args.play_topic:
        # logging.fatal only logs; without an explicit raise the pipeline
        # kept being built with a missing topic.
        logging.fatal('topic and play_topic are required.')
        raise ValueError('topic and play_topic are required.')

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    # `topic` is guaranteed to be set at this point, so the pipeline is
    # always streaming.
    pipeline_options.view_as(StandardOptions).streaming = True
    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'

    events = (
        p
        | 'read_events' >> ReadFromPubSub(
            topic=known_args.topic, timestamp_attribute=timestamp_attribute)
        | 'parse_events' >> beam.ParDo(ParseEventFn()))

    play_events = (
        p
        | 'read_play_events' >> ReadFromPubSub(
            topic=known_args.play_topic,
            timestamp_attribute=timestamp_attribute)
        | 'parse_play_events' >> beam.ParDo(ParsePlayEventFn()))

    sessionized_events = (
        events
        | 'key_events_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_events' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))

    sessionized_plays = (
        play_events
        | 'key_plays_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_plays' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))

    per_user_latency = (
        {
            'plays': sessionized_plays,
            'events': sessionized_events
        }
        | 'cbk' >> beam.CoGroupByKey()
        | 'compute_latency' >> beam.ParDo(ComputeLatency()))

    # Global mean of the values, re-emitted (accumulating) every 1000
    # elements and exposed as a singleton side input.
    mean_latency = (
        per_user_latency
        | 'extract_latencies' >> beam.Values()
        | 'global_window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(trigger.AfterCount(1000)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        | 'compute_mean' >> beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).with_fanout(16).as_singleton_view())

    _ = (
        per_user_latency
        | 'detect_bad_users' >> beam.ParDo(
            DetectBadUsers(), mean_latency=mean_latency)
        | 'filter_duplicates' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.AfterCount(1),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        | 'to_bq_schema' >> beam.Map(lambda x: {'user': x})
        | 'write_bad_users' >> beam.io.WriteToBigQuery(
            known_args.output_tablename,
            known_args.output_dataset,
            project,
            ('user:string')))

    p.run().wait_until_finish()
def run(argv=None):
    """Run the streaming RSVP pipeline: global event count and top 10 topics.

    Args:
      argv: command-line flags handed to ``PipelineOptions``; ``--input`` and
        ``--output`` select the Pub/Sub topics.
    """
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument(
                '--input',
                default='projects/notbanana-7f869/topics/rsvps_source')
            parser.add_argument(
                '--output',
                default='projects/notbanana-7f869/topics/rsvps_out')

    options = PipelineOptions(flags=argv)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    gcp_options = options.view_as(GoogleCloudOptions)
    gcp_options.project = 'notbanana-7f869'
    gcp_options.staging_location = 'gs://notbanana-7f869.appspot.com/staging'
    gcp_options.temp_location = 'gs://notbanana-7f869.appspot.com/temp'
    gcp_options.job_name = 'demo-job'

    # To run the pipeline on the Cloud Dataflow runner:
    #   $ python pipelines/main.py --setup_file path/to/setup.py
    # options.view_as(StandardOptions).runner = 'DataflowRunner'

    with beam.Pipeline(options=options) as pipeline:
        topic_options = options.view_as(MyOptions)
        source_topic = topic_options.input
        sink_topic = topic_options.output

        # Consume the events sent to the input Pub/Sub topic.
        # The id_label argument is a unique identifier the pipeline can use
        # to deduplicate events (exactly-once semantics); currently disabled.
        raw_messages = pipeline | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
            topic=source_topic,
            # id_label='event_id'
        ).with_output_types(six.binary_type)
        events = (
            raw_messages
            | 'Decode Binary' >> beam.Map(
                lambda payload: payload.decode('utf-8'))
            | 'Transform Json To Dict' >> beam.Map(
                lambda text: json.loads(text))
            | 'Filter noVenue' >> beam.ParDo(FilterNoVenueEventsFn()))

        # Total number of events processed globally by the pipeline.
        # Early results fire every X seconds (processing-time trigger) or as
        # soon as the pane holds at least N elements (data-driven trigger).
        # The values used are for testing purposes.
        count_or_time = trigger.AfterAny(
            trigger.AfterCount(2),
            # AfterProcessingTime is experimental.
            # Not implemented yet.
            trigger.AfterProcessingTime(30))
        (
            events
            | 'Apply Global Window' >> beam.WindowInto(
                beam.window.GlobalWindows(),
                trigger=trigger.Repeatedly(count_or_time),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
            | 'Count events globally' >> beam.CombineGlobally(
                beam.combiners.CountCombineFn()).without_defaults()
            | 'Publish %s' % 'Events' >> WriteToPubSub(
                topic=sink_topic, category=Category.GLOBAL_EVENTS))

        # Top 10 hottest topics within a fixed window of X seconds.
        # The values used are for testing purposes.
        # NB: a custom TopFn deduplicates k/v pairs when an accumulation
        # strategy is used: SO - 56616576 @guillem-xercavins
        (
            events
            | 'Apply Window of time %s' % 'Topics' >> beam.WindowInto(
                beam.window.FixedWindows(size=10 * 60),
                trigger=trigger.Repeatedly(trigger.AfterCount(5)),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
            | beam.Map(lambda event: event['group'])
            | beam.ParDo(PairTopicWithOneFn())
            | beam.CombinePerKey(sum)
            | 'Top 10 Topics' >> beam.CombineGlobally(
                TopDistinctFn(
                    n=10,
                    compare=lambda a, b: a[1] < b[1])).without_defaults()
            | 'DictFormat %s' % 'Topics' >> beam.ParDo(FormatTopTopicFn())
            | 'Publish %s' % 'Topics' >> WriteToPubSub(
                topic=sink_topic, category=Category.HOT_TOPICS))
def run(argv=None):
    """Run the streaming hashtag pipeline with three published aggregates.

    Args:
      argv: command-line flags handed to ``PipelineOptions``; ``--input`` and
        ``--output`` select the Pub/Sub topics.

    Publishes batches of daily hashtag aggregates, an element count per
    5-minute window, and the top trending hashtags per window of
    ``SECONDS_IN_HALF_HOUR``.
    """
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input', default=TW_INPUT)
            parser.add_argument('--output', default=TW_OUTPUT)

    options = PipelineOptions(flags=argv)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    gcp_options = options.view_as(GoogleCloudOptions)
    gcp_options.project = PROJECT_ID
    gcp_options.staging_location = STAGING_LOCATION
    gcp_options.temp_location = TEMP_LOCATION
    gcp_options.flexrs_goal = 'COST_OPTIMIZED'
    # gcp_options.job_name = 'hashtags-battle-job'

    # Uncomment the runner line to run on the Cloud Dataflow runner:
    #   $ python main.py --setup_file ./setup.py --machine_type=n1-standard-2
    #       --max_num_workers=2 --disk_size_gb=30
    # options.view_as(StandardOptions).runner = 'DataflowRunner'

    with beam.Pipeline(options=options) as pipeline:
        topic_options = options.view_as(MyOptions)
        source_topic = topic_options.input
        sink_topic = topic_options.output

        # Consume the events sent to the input Pub/Sub topic.
        # The id_label argument is a unique identifier the pipeline can use
        # to deduplicate events (exactly-once semantics); currently disabled.
        raw_messages = pipeline | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
            topic=source_topic,
            # id_label='event_id'
        ).with_output_types(six.binary_type)
        tweets = (
            raw_messages
            | 'Decode Binary' >> beam.Map(
                lambda payload: payload.decode('utf-8'))
            | 'Transform Json To Dict' >> beam.Map(
                lambda text: json.loads(text)))
        # | 'Add Event Time' >> beam.ParDo(AddTimestampFn())

        # Pull the hashtags array out of each object and flatten it.
        hashtags = (
            tweets
            | 'Get Hashtags' >> beam.Map(lambda tweet: tweet['hashtags'])
            | 'Explode Hashtags' >> beam.FlatMap(lambda tags: tags))

        # Batches of pre-aggregated hashtags.
        # Early results fire every X seconds (processing-time trigger) or as
        # soon as the pane holds at least N elements (data-driven trigger).
        # The values used are for testing purposes.
        (
            hashtags
            | 'Apply Daily Window' >> beam.WindowInto(
                beam.window.FixedWindows(SECONDS_IN_1_DAY),
                trigger=trigger.Repeatedly(trigger.AfterCount(10)),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
            | 'Grouping Hashtags' >> PairWithOneCombine()
            | 'Format Hashtags' >> beam.ParDo(FormatHashtagFn())
            | 'Batch Hashtags' >> beam.BatchElements(
                min_batch_size=49, max_batch_size=50)
            | 'Publish Hashtags' >> WriteToPubSub(
                topic=sink_topic, category=Category.DAILY_HASHTAGS))

        # Sum of processed events for a given fixed-time window.
        (
            hashtags
            | 'Apply 5 Minutes' >> beam.WindowInto(
                beam.window.FixedWindows(size=5 * 60),
                trigger=trigger.Repeatedly(trigger.AfterCount(20)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | 'CG+CC' >> beam.CombineGlobally(
                beam.combiners.CountCombineFn()).without_defaults()
            | 'Publish Events Sum' >> WriteToPubSub(
                topic=sink_topic, category=Category.GLOBAL_EVENTS))

        # Top trending hashtags within a given fixed-time window.
        (
            hashtags
            | 'Apply %s Min FW' % '30' >> beam.WindowInto(
                beam.window.FixedWindows(size=SECONDS_IN_HALF_HOUR),
                trigger=trigger.Repeatedly(trigger.AfterCount(2)),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
            | 'Grouping Trends' >> PairWithOneCombine()
            | '%s Trending Hashtags' % TRENDING_HASHTAGS_LIMIT >> beam.CombineGlobally(
                TopDistinctFn(
                    n=TRENDING_HASHTAGS_LIMIT,
                    compare=lambda a, b: a[1] < b[1])).without_defaults()
            | 'Format Trending Hashtags' >> beam.ParDo(FormatHashtagsFn())
            | 'Publish Trending Hashtags' >> WriteToPubSub(
                topic=sink_topic, category=Category.TRENDING_HASHTAGS))
def run():
    """Run a streaming word count over Pub/Sub messages on the DirectRunner.

    Reads messages (with attributes) from a hard-coded topic, extracts the
    ``data`` text from each JSON payload, counts words in a global window that
    fires on every element, combines the (word, count) pairs into a list and
    logs that list sorted by count, descending. Blocks until the pipeline
    finishes.
    """
    pipeline_options = PipelineOptions(
        ["--runner=DirectRunner", "--streaming"])
    p = beam.Pipeline(options=pipeline_options)

    # read
    topic_path = "projects/qwiklabs-gcp-34125c5e4e40e9e3/topics/pycon30-file"  # replace topic with yours
    lines = (p | 'read' >> beam.io.ReadFromPubSub(topic=topic_path, with_attributes=True))

    # format message
    def format_message(message, timestamp=beam.DoFn.TimestampParam):
        """Decode the JSON in ``message.data`` into a {'data', 'timestamp'} dict.

        The ``timestamp`` parameter is declared but not read in the body; the
        returned 'timestamp' comes from the payload's ``event_time`` field.
        """
        message = json.loads(message.data)
        formatted_message = {
            'data': message.get('data'),
            'timestamp': float(message.get('event_time'))
        }
        return formatted_message

    formatted = lines | beam.Map(format_message)

    # Alternative windowing strategies, kept for reference:
    # windowed = formatted | beam.WindowInto(beam.window.FixedWindows(5))
    # windowed = formatted | beam.WindowInto(beam.window.SlidingWindows(60, 5))
    # Global window that fires an accumulating pane after every element.
    windowed = formatted | beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=trigger.Repeatedly(trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING)

    # split words
    def find_words(element):
        """Return every run of letters/apostrophes found in ``element['data']``."""
        import re
        return re.findall(r'[A-Za-z\']+', element.get('data'))

    words = (windowed | 'split' >> (beam.FlatMap(find_words)))

    # count words
    def count_ones(word_ones):
        """Turn (word, iterable of ones) into (word, total)."""
        (word, ones) = word_ones
        return word, sum(ones)

    counts = (words
              | 'pair' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # aggr to list
    def aggr_to_list(values):
        """Combine callable that folds the incoming values into one list.

        NOTE(review): the branches appear to depend on the shapes in which
        CombineGlobally hands values to a plain callable (empty input, a
        ``_ReiterableChain``, a single item, or a previously built list plus
        one new item) -- confirm against the Beam version in use before
        changing anything here.

        Any exception is caught, the offending input is printed, and the
        function then falls through and returns None.
        """
        try:
            if not values:
                return values
            elif isinstance(values, _ReiterableChain):
                return [x for x in values]
            elif len(values) == 1:
                return values[0]
            else:
                if isinstance(values[0], list):
                    # First item is an already-built list: append the second.
                    return values[0] + [values[1]]
                else:
                    return [x for x in values]
        except Exception:
            print(values)
            pass

    aggred_list = counts | 'sort' >> beam.CombineGlobally(
        aggr_to_list).without_defaults()

    # out
    # Log the combined list ordered by element [1] (the count), largest first.
    aggred_list | 'out' >> beam.Map(
        lambda x: logging.info(sorted(x, key=lambda x: x[1], reverse=True)))

    result = p.run()
    result.wait_until_finish()