def _window_fn(self):
  """Set the correct WindowInto PTransform."""

  # The user-supplied triggering_frequency is often chosen to control how
  # many BigQuery load jobs are triggered, to prevent going over BigQuery's
  # daily quota for load jobs. If this is set to a large value, currently we
  # have to buffer all the data until the trigger fires. Instead we ensure
  # that the files are written if a threshold number of records are ready.
  # We use only the user-supplied trigger on the actual BigQuery load.
  # This allows us to offload the data to the filesystem.
  #
  # In the case of dynamic sharding, however, we use a default trigger since
  # the transform that performs the sharding also batches elements to avoid
  # generating too many tiny files. The user trigger is applied right after
  # the writes to limit the number of load jobs.
  if self.is_streaming_pipeline and not self.with_auto_sharding:
    return beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=trigger.Repeatedly(
            trigger.AfterAny(
                trigger.AfterProcessingTime(self.triggering_frequency),
                trigger.AfterCount(_FILE_TRIGGERING_RECORD_COUNT))),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
  else:
    return beam.WindowInto(beam.window.GlobalWindows())
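A minimal, self-contained sketch (not part of the Beam source above; the two constants are illustrative stand-ins for triggering_frequency and _FILE_TRIGGERING_RECORD_COUNT) showing how the Repeatedly(AfterAny(...)) composite behaves on a TestStream: a pane fires as soon as either the count threshold or the processing-time interval is reached, whichever comes first.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.testing.test_stream import TestStream
from apache_beam.transforms import trigger

FLUSH_INTERVAL_SECS = 10  # illustrative stand-in for triggering_frequency
THRESHOLD_COUNT = 2       # illustrative stand-in for _FILE_TRIGGERING_RECORD_COUNT

stream = (
    TestStream()
    .add_elements(['a', 'b'])                         # count threshold reached: a pane fires
    .advance_processing_time(FLUSH_INTERVAL_SECS + 1)  # interval elapsed: another pane may fire
    .advance_watermark_to_infinity())

options = PipelineOptions()
options.view_as(StandardOptions).streaming = True
with beam.Pipeline(options=options) as p:
  _ = (
      p
      | stream
      | beam.WindowInto(
          beam.window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterAny(
                  trigger.AfterProcessingTime(FLUSH_INTERVAL_SECS),
                  trigger.AfterCount(THRESHOLD_COUNT))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.Map(lambda x: ('k', x))
      | beam.GroupByKey()
      | beam.Map(print))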
def test_gbk_execution_after_processing_trigger_fired(self):
  """Advance TestClock to (X + delta) and see that the pipeline finishes."""
  # TODO(mariagh): Add test_gbk_execution_after_processing_trigger_unfired,
  # possibly in the framework trigger_transcripts.yaml.
  test_stream = (
      TestStream()
      .advance_watermark_to(10)
      .add_elements(['a'])
      .advance_processing_time(5.1)
      .advance_watermark_to_infinity())

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (
      p
      | test_stream
      | beam.WindowInto(
          beam.window.FixedWindows(15),
          trigger=trigger.AfterProcessingTime(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.Map(lambda x: ('k', x))
      | beam.GroupByKey())

  # TODO(BEAM-2519): timestamp assignment for elements coming out of a GBK
  # should respect the TimestampCombiner. The test below should also verify
  # the timestamps of the output elements once this is implemented.
  expected_window_to_elements = {
      window.IntervalWindow(0, 15): [('k', ['a'])],
  }
  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      label='assert per window')

  p.run()
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the streaming wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      required=True,
      help='Input Pub/Sub subscription to read from.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output BigQuery table to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)

  # Read messages from the Pub/Sub subscription into a PCollection.
  (p
   | 'read' >> ReadFromPubSub(subscription=known_args.input)
   | 'extract words' >> beam.FlatMap(extract_words)
   | 'transform to kv' >> beam.Map(lambda x: (x, 1))
   | 'window per minute' >> beam.WindowInto(
       window.FixedWindows(5),
       trigger=trigger.AfterProcessingTime(delay=10),
       accumulation_mode=trigger.AccumulationMode.DISCARDING)
   | 'group by words' >> beam.GroupByKey()
   | 'count ones' >> beam.Map(count_ones)
   | 'format for bq' >> beam.Map(format_for_bigquery)
   | 'write to bigquery' >> WriteToBigQuery(table=known_args.output))

  result = p.run()
  result.wait_until_finish()
def load(events, metadata=None, pipeline_options=None):
  return (
      events
      | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
      # The composite trigger fires each sub-trigger in order:
      # 1. repeatedly, after at least max_log_events elements in the pane,
      #    or finally when the watermark passes the end of the window;
      # 2. then repeatedly, after at least max_log_events elements in the
      #    pane, or after processing time passes the first element in the
      #    pane plus LATE_BATCHING_PERIOD.
      | 'query10_fix_window' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          trigger=trigger.AfterEach(
              trigger.OrFinally(
                  trigger.Repeatedly(
                      trigger.AfterCount(metadata.get('max_log_events'))),
                  trigger.AfterWatermark()),
              trigger.Repeatedly(
                  trigger.AfterAny(
                      trigger.AfterCount(metadata.get('max_log_events')),
                      trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          # Use a 1 day allowed lateness so that any forgotten hold will stall
          # the pipeline for that period and be very noticeable.
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk' >> beam.GroupByKey()
      | 'query10_write_event' >> beam.ParDo(WriteEventDoFn(), pipeline_options)
      | 'query10_window_log_files' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk_2' >> beam.GroupByKey()
      | 'query10_write_index' >> beam.ParDo(WriteIndexDoFn(), pipeline_options))
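The nested composite above is dense. A hedged decomposition (the intermediate names and constant values are illustrative, not from the Nexmark source) makes the two phases and their ordering explicit:

from apache_beam.transforms import trigger

MAX_LOG_EVENTS = 100000   # illustrative stand-in for metadata['max_log_events']
LATE_BATCHING_PERIOD = 10  # seconds; matches the constant used above

# Phase 1: keep firing every MAX_LOG_EVENTS elements until the watermark
# passes the end of the window, which fires one final on-time pane.
on_time_phase = trigger.OrFinally(
    trigger.Repeatedly(trigger.AfterCount(MAX_LOG_EVENTS)),
    trigger.AfterWatermark())

# Phase 2 (late data): fire whenever MAX_LOG_EVENTS elements accumulate or
# LATE_BATCHING_PERIOD of processing time passes since the first element.
late_phase = trigger.Repeatedly(
    trigger.AfterAny(
        trigger.AfterCount(MAX_LOG_EVENTS),
        trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))

# AfterEach runs the sub-triggers in order: phase 2 only starts firing once
# phase 1 has finished.
composite = trigger.AfterEach(on_time_phase, late_phase)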
def test_on_pane_watermark_hold_no_pipeline_stall(self):
  """A regression test added for
  https://issues.apache.org/jira/browse/BEAM-10054."""
  START_TIMESTAMP = 1534842000

  test_stream = TestStream()
  test_stream.add_elements(['a'])
  test_stream.advance_processing_time(START_TIMESTAMP + 1)
  test_stream.advance_watermark_to(START_TIMESTAMP + 1)
  test_stream.add_elements(['b'])
  test_stream.advance_processing_time(START_TIMESTAMP + 2)
  test_stream.advance_watermark_to(START_TIMESTAMP + 2)

  with TestPipeline(options=PipelineOptions(['--streaming'])) as p:
    # pylint: disable=expression-not-assigned
    (p
     | 'TestStream' >> test_stream
     | 'timestamp' >> beam.Map(
         lambda x: beam.window.TimestampedValue(x, START_TIMESTAMP))
     | 'kv' >> beam.Map(lambda x: (x, x))
     | 'window_1m' >> beam.WindowInto(
         beam.window.FixedWindows(60),
         trigger=trigger.AfterAny(
             trigger.AfterProcessingTime(3600), trigger.AfterWatermark()),
         accumulation_mode=trigger.AccumulationMode.DISCARDING)
     | 'group_by_key' >> beam.GroupByKey()
     | 'filter' >> beam.Map(lambda x: x))
def run(argv=None):
  parser = argparse.ArgumentParser()
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  offer_stat_pipeline_options = pipeline_options.view_as(
      OfferStatPipelineOptions)

  p = beam.Pipeline(options=pipeline_options)

  (p
   | "Read account offer from PS" >> beam.io.ReadFromPubSub(
       topic=offer_stat_pipeline_options.account_offers_topic)
   | "Parse message" >> beam.ParDo(PubsubMessageParser())
   | "Windowing" >> beam.WindowInto(
       window.FixedWindows(60),
       trigger=trigger.AfterWatermark(early=trigger.AfterProcessingTime(20)),
       accumulation_mode=AccumulationMode.ACCUMULATING)
   | "WithKeys" >> beam.Map(
       lambda account_offer: (account_offer['offer_id'], account_offer))
   | beam.GroupByKey()
   | 'Count distinct accounts' >> beam.ParDo(DistinctAccountCount())
   | 'Map to BQ row' >> beam.ParDo(ConvertStatToBQRow())
   | 'Writing offers to BQ' >> beam.io.WriteToBigQuery(
       table=offer_stat_pipeline_options.offer_stat_bq_table,
       create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=BigQueryDisposition.WRITE_APPEND,
       schema=OFFER_STAT_BQ_SCHEMA))

  result = p.run()
  result.wait_until_finish()
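The AfterWatermark(early=...) pattern above emits speculative panes every 20 seconds of processing time, then a final on-time pane when the watermark passes the end of the window; with ACCUMULATING mode, each pane contains everything seen so far. A hedged, self-contained sketch under a TestStream (values and elements are illustrative; exact pane timing depends on the runner):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.testing.test_stream import TestStream
from apache_beam.transforms import trigger, window

stream = (
    TestStream()
    .advance_watermark_to(0)
    .add_elements([('offer', 1)])
    .advance_processing_time(20)   # an early pane may fire here: [1]
    .add_elements([('offer', 2)])
    .advance_watermark_to(60)      # on-time pane: [1, 2] (accumulating)
    .advance_watermark_to_infinity())

options = PipelineOptions()
options.view_as(StandardOptions).streaming = True
with beam.Pipeline(options=options) as p:
  _ = (
      p
      | stream
      | beam.WindowInto(
          window.FixedWindows(60),
          trigger=trigger.AfterWatermark(
              early=trigger.AfterProcessingTime(20)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      | beam.GroupByKey()
      | beam.Map(print))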
def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
  outputs = (
      destination_data_kv_pc
      | beam.ParDo(
          WriteRecordsToFile(
              schema=self.schema,
              max_files_per_bundle=self.max_files_per_bundle,
              max_file_size=self.max_file_size,
              file_format=self._temp_file_format),
          file_prefix_pcv,
          *self.schema_side_inputs).with_outputs(
              WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
              WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with records,
  # and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded,
  # grouped, and all records for each destination-shard are written to files.
  # This PCollection is necessary because not all records can be written into
  # files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(
              schema=self.schema, file_format=self._temp_file_format),
          file_prefix_pcv,
          *self.schema_side_inputs))

  # TODO(BEAM-9494): Remove the identity transform. We flatten both
  # PCollection paths and use an identity function to work around a
  # flatten optimization issue where the wrong coder is being used.
  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten()
      | "IdentityWorkaround" >> beam.Map(lambda x: x))

  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    all_destination_file_pairs_pc = (
        all_destination_file_pairs_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  return all_destination_file_pairs_pc
def run(argv=None):
  # Use the argparse module to parse custom arguments.
  parser = argparse.ArgumentParser()
  parser.add_argument('--network')
  parser.add_argument('--input', dest='input', help='Input file to process.')
  parser.add_argument(
      '--output', dest='output', help='Output file to write results to.')
  parser.add_argument(
      '--output_topic',
      dest='out_topic',
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
  parser.add_argument(
      '--input_topic',
      dest='in_topic',
      help=('Input PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  p_options = PipelineOptions(pipeline_args)
  google_cloud_options = p_options.view_as(GoogleCloudOptions)
  google_cloud_options.region = 'europe-west1'
  google_cloud_options.project = 'smartlive'
  # google_cloud_options.job_name = 'dataflow-job-{}'.format(
  #     datetime.datetime.now().strftime("%Y-%m-%d%H%M%S"))
  google_cloud_options.staging_location = 'gs://rim-bucket/binaries'
  google_cloud_options.temp_location = 'gs://rim-bucket/temp'

  p_options.view_as(StandardOptions).runner = 'DirectRunner'
  p_options.view_as(SetupOptions).save_main_session = True
  p_options.view_as(StandardOptions).streaming = True
  p_options.view_as(WorkerOptions).subnetwork = (
      'regions/europe-west1/subnetworks/test')

  p = beam.Pipeline(options=p_options)

  lines = (
      p
      | 'receive_data' >> beam.io.ReadFromPubSub(
          subscription=known_args.in_topic).with_input_types(str)
      | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
      | 'jsonload' >> beam.Map(lambda x: json.loads(x)))

  # ----- fixed window + AfterProcessingTime trigger + discarding mode ----- #
  (lines
   | 'window' >> beam.WindowInto(
       window.FixedWindows(10),
       trigger=trigger.AfterProcessingTime(30),
       accumulation_mode=trigger.AccumulationMode.DISCARDING)
   | 'CountGlobally' >> beam.CombineGlobally(
       beam.combiners.CountCombineFn()).without_defaults()
   | 'printnbrarticles' >> beam.ParDo(PrintFn())
   | 'jsondumps' >> beam.Map(lambda x: json.dumps(x))
   | 'encode' >> beam.Map(lambda x: x.encode('utf-8'))
   | 'send_to_Pub/Sub' >> beam.io.WriteToPubSub(known_args.out_topic))

  p.run().wait_until_finish()
def _maybe_apply_user_trigger(self, destination_file_kv_pc):
  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    return (
        destination_file_kv_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  else:
    return destination_file_kv_pc
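Note the contrast with the AfterAny composite in _window_fn: AfterAll requires both conditions, so a pane fires only when at least one element is buffered and triggering_frequency of processing time has elapsed, which bounds the rate of load jobs. A hedged sketch of just the trigger construction (the constant is an illustrative stand-in for the user-supplied value):

from apache_beam.transforms import trigger

TRIGGERING_FREQUENCY_SECS = 600  # illustrative user-supplied value

# Fires at most once per TRIGGERING_FREQUENCY_SECS, and only if the pane is
# non-empty (AfterCount(1)); Repeatedly re-arms the trigger after each firing.
user_trigger = trigger.Repeatedly(
    trigger.AfterAll(
        trigger.AfterProcessingTime(TRIGGERING_FREQUENCY_SECS),
        trigger.AfterCount(1)))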
def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
  outputs = (
      destination_data_kv_pc
      | beam.ParDo(
          WriteRecordsToFile(
              max_files_per_bundle=self.max_files_per_bundle,
              max_file_size=self.max_file_size,
              coder=self.coder),
          file_prefix=file_prefix_pcv).with_outputs(
              WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
              WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with records,
  # and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded,
  # grouped, and all records for each destination-shard are written to files.
  # This PCollection is necessary because not all records can be written into
  # files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(coder=self.coder),
          file_prefix=file_prefix_pcv))

  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten())

  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    all_destination_file_pairs_pc = (
        all_destination_file_pairs_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  return all_destination_file_pairs_pc
def load(events, metadata=None):
  return (
      events
      | nexmark_query_util.JustBids()
      | 'query12_extract_bidder' >> beam.Map(lambda bid: bid.bidder)
      # Windowing with a processing-time trigger; currently not supported
      # in batch.
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterProcessingTime(metadata.get('window_size_sec'))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=0)
      | 'query12_bid_count' >> beam.combiners.Count.PerElement()
      | 'query12_output' >> beam.Map(
          lambda t: {
              ResultNames.BIDDER_ID: t[0], ResultNames.BID_COUNT: t[1]
          }))
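For intuition, a hedged standalone sketch of the same pattern: in a global window whose only trigger is Repeatedly(AfterProcessingTime(N)), a pane of per-element counts is emitted roughly every N seconds of processing time, and DISCARDING mode resets the counts between panes. The topic name and period below are illustrative, not from the Nexmark source.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms import trigger, window

PERIOD_SECS = 10  # illustrative stand-in for metadata['window_size_sec']

options = PipelineOptions(['--streaming'])
with beam.Pipeline(options=options) as p:
  _ = (
      p
      # Hypothetical topic, for illustration only.
      | beam.io.ReadFromPubSub(topic='projects/my-project/topics/bids')
      | beam.Map(lambda msg: msg.decode('utf-8'))
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterProcessingTime(PERIOD_SECS)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.combiners.Count.PerElement()  # (element, count) per pane
      | beam.Map(print))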
def test_gbk_execution_after_processing_trigger_fired(self):
  """Advance TestClock to (X + delta) and see that the pipeline finishes."""
  # TODO(mariagh): Add test_gbk_execution_after_processing_trigger_unfired,
  # possibly in the framework trigger_transcripts.yaml.
  test_stream = (
      TestStream()
      .advance_watermark_to(10)
      .add_elements(['a'])
      .advance_processing_time(5.1))

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  global result  # pylint: disable=global-variable-undefined
  result = []

  def fired_elements(elem):
    result.append(elem)
    return elem

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (
      p
      | test_stream
      | beam.WindowInto(
          beam.window.FixedWindows(15),
          trigger=trigger.AfterProcessingTime(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.Map(lambda x: ('k', x))
      | beam.GroupByKey()
      | beam.Map(fired_elements))

  # TODO(BEAM-2519): timestamp assignment for elements coming out of a GBK
  # should respect the TimestampCombiner. The test below should also verify
  # the timestamps of the output elements once this is implemented.
  # TODO(BEAM-3377): Reinstate after assert_that in streaming is fixed.
  assert_that(records, equal_to([('k', ['a'])]))
  p.run()

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  self.assertEqual([('k', ['a'])], result)
def _window_fn(self):
  """Set the correct WindowInto PTransform."""

  # The user-supplied triggering_frequency is often chosen to control how
  # many BigQuery load jobs are triggered, to prevent going over BigQuery's
  # daily quota for load jobs. If this is set to a large value, currently we
  # have to buffer all the data until the trigger fires. Instead we ensure
  # that the files are written if a threshold number of records are ready.
  # We use only the user-supplied trigger on the actual BigQuery load.
  # This allows us to offload the data to the filesystem.
  if self.is_streaming_pipeline:
    return beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=trigger.Repeatedly(
            trigger.AfterAny(
                trigger.AfterProcessingTime(self.triggering_frequency),
                trigger.AfterCount(_FILE_TRIGGERING_RECORD_COUNT))),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
  else:
    return beam.WindowInto(beam.window.GlobalWindows())
def run(argv=None):
  class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument(
          '--input', default='projects/notbanana-7f869/topics/rsvps_source')
      parser.add_argument(
          '--output', default='projects/notbanana-7f869/topics/rsvps_out')

  options = PipelineOptions(flags=argv)
  options.view_as(SetupOptions).save_main_session = True
  options.view_as(StandardOptions).streaming = True

  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'notbanana-7f869'
  google_cloud_options.staging_location = 'gs://notbanana-7f869.appspot.com/staging'
  google_cloud_options.temp_location = 'gs://notbanana-7f869.appspot.com/temp'
  google_cloud_options.job_name = 'demo-job'

  # To run the pipeline on the Cloud Dataflow runner:
  # $ python pipelines/main.py --setup_file path/to/setup.py
  # options.view_as(StandardOptions).runner = 'DataflowRunner'

  with beam.Pipeline(options=options) as p:
    my_options = options.view_as(MyOptions)
    input_topic = my_options.input
    output_topic = my_options.output

    # Consume events sent to the input Pub/Sub topic. The id_label argument
    # is a unique identifier used by the pipeline to deduplicate events
    # (exactly-once semantics).
    inputs = (
        p
        | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
            topic=input_topic,
            # id_label='event_id'
        ).with_output_types(six.binary_type)
        | 'Decode Binary' >> beam.Map(lambda element: element.decode('utf-8'))
        | 'Transform Json To Dict' >> beam.Map(
            lambda element: json.loads(element))
        | 'Filter noVenue' >> beam.ParDo(FilterNoVenueEventsFn()))

    # Output the total number of events processed by the pipeline, triggering
    # early results from the global window every X seconds of processing time
    # or whenever the current pane has collected at least N elements
    # (data-driven trigger). The values used here are for testing purposes.
    (inputs
     | 'Apply Global Window' >> beam.WindowInto(
         beam.window.GlobalWindows(),
         trigger=trigger.Repeatedly(
             trigger.AfterAny(
                 trigger.AfterCount(2),
                 # AfterProcessingTime is experimental.
                 # Not implemented yet.
                 trigger.AfterProcessingTime(30))),
         accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
     | 'Count events globally' >> beam.CombineGlobally(
         beam.combiners.CountCombineFn()).without_defaults()
     | 'Publish %s' % 'Events' >> WriteToPubSub(
         topic=output_topic, category=Category.GLOBAL_EVENTS))

    # Output the top 10 hottest topics within a fixed window of X seconds.
    # The values used here are for testing purposes.
    # NB: uses a custom TopFn that deduplicates k/v pairs when using an
    # accumulating strategy: SO - 56616576 @guillem-xercavins
    (inputs
     | 'Apply Window of time %s' % 'Topics' >> beam.WindowInto(
         beam.window.FixedWindows(size=10 * 60),
         trigger=trigger.Repeatedly(trigger.AfterCount(5)),
         accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
     | beam.Map(lambda element: element['group'])
     | beam.ParDo(PairTopicWithOneFn())
     | beam.CombinePerKey(sum)
     | 'Top 10 Topics' >> beam.CombineGlobally(
         TopDistinctFn(
             n=10, compare=lambda a, b: a[1] < b[1])).without_defaults()
     | 'DictFormat %s' % 'Topics' >> beam.ParDo(FormatTopTopicFn())
     | 'Publish %s' % 'Topics' >> WriteToPubSub(
         topic=output_topic, category=Category.HOT_TOPICS))
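A hedged sketch isolating the data-driven half of the trigger above: Repeatedly(AfterCount(2)) fires a pane each time two elements accumulate, and ACCUMULATING mode makes each pane a running total. Element values are illustrative.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.testing.test_stream import TestStream
from apache_beam.transforms import trigger

stream = (
    TestStream()
    .advance_watermark_to(0)
    .add_elements(['e1', 'e2'])   # first pane: count reaches 2
    .add_elements(['e3', 'e4'])   # second pane: count reaches 2 again
    .advance_watermark_to_infinity())

options = PipelineOptions()
options.view_as(StandardOptions).streaming = True
with beam.Pipeline(options=options) as p:
  _ = (
      p
      | stream
      | beam.WindowInto(
          beam.window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(2)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      | beam.CombineGlobally(
          beam.combiners.CountCombineFn()).without_defaults()
      # With ACCUMULATING, each pane prints a running total.
      | beam.Map(print))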
def main(argv=None):
  def json_parser(x):
    parsed = json.loads(x)
    return parsed

  def bye(x):
    logging.info('outing: %s', x)
    return x

  parser = argparse.ArgumentParser()
  parser.add_argument("--input_topic")
  parser.add_argument("--output_topic")
  known_args = parser.parse_known_args(argv)

  p = beam.Pipeline(options=PipelineOptions())

  data = (
      p
      | 'ReadData' >> beam.io.ReadFromPubSub(
          topic=READ_TOPIC).with_output_types(bytes)
      | "JSONParse" >> beam.Map(json_parser))

  (data
   | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
   | "Windowing" >> beam.WindowInto(
       window.Sessions(60),
       trigger=tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
       accumulation_mode=tr.AccumulationMode.DISCARDING,
       allowed_lateness=0)
   | 'ToBytes' >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye' >> beam.Map(bye)
   | 'WriteToPubSub' >> beam.io.WriteToPubSub(TOPIC))

  (data
   | "SlidWindowing" >> beam.WindowInto(
       window.FixedWindows(60),
       trigger=(tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
           late=tr.Repeatedly(tr.AfterCount(1)))),
       allowed_lateness=300,
       accumulation_mode=tr.AccumulationMode.ACCUMULATING)
   | "Extract" >> beam.Map(lambda x: x["meter_increment"])
   | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
   | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
   | "Enrich with time data" >> beam.ParDo(Enrich())
   | "ToBytesCount" >> beam.Map(
       lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye2' >> beam.Map(bye)
   | "WriteCount" >> beam.io.WriteToPubSub(TOPIC))

  (data
   | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
   | "SessionWindowing" >> beam.WindowInto(
       window.Sessions(60),
       trigger=tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
       accumulation_mode=tr.AccumulationMode.ACCUMULATING,
       allowed_lateness=0)
   | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
   | "Discarding Key" >> beam.Map(lambda x: x[1])
   | "Filter not pickup" >> beam.Map(
       lambda x: x if str(x["ride_status"]) == "pickup" else None)
   | "ToBytesPickup" >> beam.Map(
       lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye3' >> beam.Map(bye)
   | "WritePickup" >> beam.io.WriteToPubSub(TOPIC))

  result = p.run()
  result.wait_until_finish()
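The "SlidWindowing" branch above is the only one that handles late data. A hedged sketch of just that windowing strategy (values as above, names illustrative): early panes fire while the window is open, the on-time pane fires when the watermark passes the window end, and late panes fire once per late element for up to 300 seconds of allowed lateness.

import apache_beam as beam
from apache_beam.transforms import trigger as tr
from apache_beam.transforms import window

# Early: fire once a second of processing time after each new element arrives.
# On-time: fire when the watermark passes the end of the 60s window.
# Late: fire for every element that arrives within the 300s lateness horizon.
windowing = beam.WindowInto(
    window.FixedWindows(60),
    trigger=tr.AfterWatermark(
        early=tr.Repeatedly(
            tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
        late=tr.Repeatedly(tr.AfterCount(1))),
    allowed_lateness=300,
    accumulation_mode=tr.AccumulationMode.ACCUMULATING)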