def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic', dest='topic', default=default_topic)
    parser.add_argument('--bucket', dest='bucket', default=default_bucket)
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project={}'.format(project),
        '--streaming',
        '--experiments=allow_non_updatable_job'
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    def add_key(element):
        # Parse the message into a dict and return (key, rest of the dict).
        parsed_dict = yaml.safe_load(element)
        key = parsed_dict[GROUP_BY_KEY]
        parsed_dict.pop(GROUP_BY_KEY)
        return (key, parsed_dict)

    def avg_value(element):
        # element[0] is the key, element[1] is the iterable of grouped dicts.
        df = pd.DataFrame(element[1])
        avg = df[VALUE_TO_AVG].mean()
        return (element[0], avg)

    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | "ReadTopic" >> beam.io.ReadFromPubSub(topic=known_args.topic)
         | "AddKey" >> beam.Map(add_key)
         | "Window" >> beam.WindowInto(
             window.SlidingWindows(size=WINDOW_LENGTH, period=WINDOW_PERIOD))
         | "GroupByKey" >> beam.GroupByKey()
         # Map (not ParDo) so the returned (key, avg) tuple stays one element.
         | "AvgValue" >> beam.Map(avg_value)
         | "WriteToGCS" >> WriteToText(known_args.bucket))
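# The pipeline above relies on module-level imports and constants that are not
# shown. A minimal, illustrative setup that would make it importable follows;
# the constant names, resource names, and values are placeholders, not the
# original configuration.
import argparse

import apache_beam as beam
import pandas as pd
import yaml
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
from apache_beam.transforms import window

project = 'my-gcp-project'                                   # assumption
default_topic = 'projects/my-gcp-project/topics/my-topic'    # assumption
default_bucket = 'gs://my-bucket/output'                     # assumption
GROUP_BY_KEY = 'sensor_id'     # field used as the grouping key (assumption)
VALUE_TO_AVG = 'reading'       # field averaged per window (assumption)
WINDOW_LENGTH = 60             # window size in seconds (assumption)
WINDOW_PERIOD = 30             # new window every 30 seconds (assumption)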
def run():
    # Command line arguments
    parser = argparse.ArgumentParser(description='Demonstrate side inputs')
    parser.add_argument('--bucket', required=True,
                        help='Specify Cloud Storage bucket for output')
    parser.add_argument('--project', required=True,
                        help='Specify Google Cloud project')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--DirectRunner', action='store_true')
    group.add_argument('--DataFlowRunner', action='store_true')
    opts = parser.parse_args()

    if opts.DirectRunner:
        runner = 'DirectRunner'
    if opts.DataFlowRunner:
        runner = 'DataflowRunner'  # Beam's runner class is named DataflowRunner

    bucket = opts.bucket
    project = opts.project

    argv = [
        '--project={0}'.format(project),
        '--job_name=streamingjob',
        '--save_main_session',
        '--staging_location=gs://{0}/staging/'.format(bucket),
        '--temp_location=gs://{0}/staging/'.format(bucket),
        '--runner={0}'.format(runner)
    ]
    pubsubinput = 'projects/{0}/topics/streamdemo'.format(project)
    pubsubschema = 'datetime:TIMESTAMP, num_words:INTEGER'

    p = beam.Pipeline(argv=argv)
    (p
     # ReadStringsFromPubSub is deprecated in newer Beam releases; use
     # ReadFromPubSub and decode the bytes if it is unavailable.
     | 'ReadFromPubSub' >> beam.io.ReadStringsFromPubSub(pubsubinput)
     | 'WordsPerMessage' >> beam.Map(lambda msg: countwords(msg))
     # | 'timestamp' >> beam.ParDo(AddTimestampDoFn())  # Tried adding an instantaneous timestamp
     # | 'window' >> beam.WindowInto(window.FixedWindows(60))  # Tried a fixed window
     | 'Window' >> beam.WindowInto(window.SlidingWindows(1, 15))  # NOTE: period > size leaves gaps between windows
     | 'Combine' >> beam.CombineGlobally(sum).without_defaults()
     | 'WordsInTimeWindow' >> beam.Map(lambda num_words: {
         'datetime': int(datetime.datetime.now().strftime('%s')),
         'num_words': num_words})
     | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
         project=project, dataset='demos', table='streamdemo',
         schema=pubsubschema,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))

    if runner == 'DataflowRunner':
        p.run()
    else:
        p.run().wait_until_finish()


logging.getLogger().setLevel(logging.INFO)
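# The pipeline above calls a countwords helper that is not shown. A minimal
# sketch of what it might look like (the exact implementation is an
# assumption):
def countwords(msg):
    # Hypothetical helper: number of whitespace-separated words in one message.
    return len(msg.split())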
def SportTrackerMotivation(input, shortDuration, longDuration):
    boxed = input | "ComputeMetrics" >> ComputeBoxedMetrics(shortDuration)
    shortAverage = (
        boxed
        | "shortWindow" >> beam.WindowInto(window.FixedWindows(shortDuration))
        | "shortAverage" >> CalculateAveragePace())
    longAverage = (
        boxed
        | "longWindow" >> beam.WindowInto(
            window.SlidingWindows(longDuration, shortDuration))
        | "longAverage" >> CalculateAveragePace()
        | "longIntoFixed" >> beam.WindowInto(window.FixedWindows(shortDuration)))
    return ((shortAverage, longAverage)
            | beam.CoGroupByKey()
            | beam.FlatMap(asMotivation))
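# ComputeBoxedMetrics, CalculateAveragePace, and asMotivation are defined
# elsewhere. The sketch below shows one plausible shape for asMotivation: it
# compares each tracker's short-window pace against its long-window pace and
# emits a motivation flag. The 10% threshold and output shape are assumptions,
# not the original logic.
def asMotivation(element):
    # element is (tracker_id, (short_paces, long_paces)) from CoGroupByKey.
    tracker_id, (short_paces, long_paces) = element
    if not short_paces or not long_paces:
        return  # nothing to compare in this window
    ratio = short_paces[0] / long_paces[0]
    if abs(1.0 - ratio) > 0.1:            # hypothetical 10% threshold
        yield (tracker_id, ratio > 1.0)   # True = speeding up, False = slowing down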
def test_setting_sliding_windows(self):
    p = TestPipeline()
    unkeyed_items = p | beam.Create([2, 16, 23])
    items = (unkeyed_items
             | 'key' >> beam.Map(
                 lambda x: beam.window.TimestampedValue(('k', x), x)))
    # [START setting_sliding_windows]
    from apache_beam import window
    sliding_windowed_items = (
        items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
    # [END setting_sliding_windows]
    summed = (sliding_windowed_items
              | 'group' >> beam.GroupByKey()
              | 'combine' >> beam.CombineValues(sum))
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    # assert_that / equal_to come from apache_beam.testing.util.
    assert_that(unkeyed,
                equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
    p.run()
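# Why those sums: SlidingWindows(30, 5) puts each timestamped element into the
# six 30-second windows whose starts (multiples of 5) lie in (t - 30, t]. The
# standalone sketch below (not part of the test) reproduces the expected list.
size, period = 30, 5
timestamps = [2, 16, 23]
sums = {}
for t in timestamps:
    start = t - (t % period)          # latest window start that still covers t
    while start > t - size:
        sums[start] = sums.get(start, 0) + t   # each value equals its timestamp here
        start -= period
print(sorted(sums.values()))  # [2, 2, 2, 18, 23, 39, 39, 39, 41, 41]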
###################################################################################
pubsub_data = (
    p
    | 'Read from pub sub' >> beam.io.ReadFromPubSub(subscription=input_subscription)
    # STR_2,Maine,PR_265,Cosmetics,8,39,66,1553578219\r\n
    | 'Remove extra chars' >> beam.Map(lambda data: data.strip())
    # STR_2,Maine,PR_265,Cosmetics,8,39,66,1553578219
    | 'Split Row' >> beam.Map(lambda row: row.split(','))
    # [STR_2, Maine, PR_265, Cosmetics, 8, 39, 66, 1553578219]
    | 'Filter By state' >> beam.Filter(
        lambda elements: elements[1] == "Maine" or elements[1] == "Texas")
    | 'Create Profit Column' >> beam.Map(calculateProfit)
    # [STR_2, Maine, PR_265, Cosmetics, 8, 39, 66, 1553578219, 216]
    | 'Form Key Value pair' >> beam.Map(
        lambda elements: (elements[0], int(elements[8])))  # profit is the appended last column
    # ('STR_2', 216)
    | 'Window' >> beam.WindowInto(window.SlidingWindows(30, 10))
    | 'Sum values' >> beam.CombinePerKey(sum)
    # ('STR_2', <sum of profits in the window>)
    | 'Encode to byte string' >> beam.Map(encode_byte_string)
    | 'Write to pub sub' >> beam.io.WriteToPubSub(output_topic)
    # p
    # | 'Read from pub sub' >> beam.io.ReadFromPubSub(subscription=input_subscription)
    # | 'Write to pub sub' >> beam.io.WriteToPubSub(output_topic)
)

result = p.run()
result.wait_until_finish()
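# calculateProfit and encode_byte_string are defined elsewhere. Hypothetical
# sketches consistent with the sample row in the comments above (8 units,
# buy 39, sell 66 -> profit 216); the real implementations may differ.
def calculateProfit(elements):
    # Profit = (sell price - buy rate) * units sold, appended as a new column.
    units = int(elements[4])
    buy_rate = int(elements[5])
    sell_price = int(elements[6])
    elements.append(str((sell_price - buy_rate) * units))
    return elements


def encode_byte_string(element):
    # WriteToPubSub expects bytes; serialize the (store, windowed profit) pair.
    key, value = element
    return '{},{}'.format(key, value).encode('utf-8')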
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--gcp_project', required=True, default='',
                        help='GCP Project ID')
    parser.add_argument('--region', required=True, default='',
                        help='GCP Region')
    parser.add_argument('--job_name', required=True, default='',
                        help='Dataflow Job Name')
    parser.add_argument('--gcp_staging_location', required=True,
                        default='gs://xxxxx/staging',
                        help='Dataflow Staging GCS location')
    parser.add_argument('--gcp_tmp_location', required=True,
                        default='gs://xxxxx/tmp',
                        help='Dataflow tmp GCS location')
    parser.add_argument('--batch_size', required=True, default=10,
                        help='Dataflow Batch Size')
    parser.add_argument(
        '--input_topic', required=True, default='',
        help='Input PubSub Topic: projects/<project_id>/topics/<topic_name>')
    parser.add_argument('--bq_dataset_name', required=True, default='',
                        help='Output BigQuery Dataset')
    parser.add_argument('--bq_table_name', required=True, default='',
                        help='Output BigQuery Table')
    parser.add_argument(
        '--runner', required=True, default='DirectRunner',
        help='Dataflow Runner - DataflowRunner or DirectRunner (local)')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_args.extend([
        '--runner={}'.format(known_args.runner),  # DataflowRunner or DirectRunner (local)
        '--project={}'.format(known_args.gcp_project),
        '--region={}'.format(known_args.region),
        '--staging_location={}'.format(known_args.gcp_staging_location),  # Google Cloud Storage gs:// path
        '--temp_location={}'.format(known_args.gcp_tmp_location),  # Google Cloud Storage gs:// path
        '--job_name=' + str(known_args.job_name),
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    ###################################################################
    # DataFlow Pipeline
    ###################################################################
    with beam.Pipeline(options=pipeline_options) as p:
        logging.info('Ready to process events from PubSub topic: {}'.format(
            known_args.input_topic))

        # Read the pubsub topic into a PCollection.
        events = (p | beam.io.ReadFromPubSub(known_args.input_topic))

        # Parse events
        parsed = (events | beam.Map(parse_pubsub))

        # Transform events
        transformed = (
            parsed
            | beam.Map(extract_map_type)
            | beam.Map(lambda x: (x, 1))
            | beam.WindowInto(window.SlidingWindows(30, 5))  # 30-second windows, a new one every 5 seconds
            | beam.GroupByKey()
            | beam.Map(sum_by_group))

        # Print results to console (for testing/debugging)
        transformed | 'Print aggregated game logs' >> beam.Map(print)

        # Sink/Persist to BigQuery
        parsed | 'Write to bq' >> beam.io.gcp.bigquery.WriteToBigQuery(
            table=known_args.bq_table_name,
            dataset=known_args.bq_dataset_name,
            project=known_args.gcp_project,
            schema=bq_schema,
            batch_size=int(known_args.batch_size))
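# parse_pubsub, extract_map_type, sum_by_group, and bq_schema are defined
# elsewhere. Hypothetical sketches of what they might look like, assuming a
# JSON event payload with a 'map_type' field; the real payload shape and
# schema are not shown above.
import json

bq_schema = 'map_type:STRING,event_time:TIMESTAMP'   # placeholder schema


def parse_pubsub(message):
    # Decode the raw Pub/Sub bytes into a dict.
    return json.loads(message.decode('utf-8'))


def extract_map_type(event):
    # Pull out the field used as the aggregation key.
    return event['map_type']


def sum_by_group(grouped):
    # grouped is (map_type, [1, 1, ...]) after GroupByKey.
    map_type, ones = grouped
    return (map_type, sum(ones))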
def run(argv=None):
    global cnt
    cnt = 0
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic_read', type=str,
                        help='Pub/Sub topic to read from')
    parser.add_argument(
        '--table1', required=True,
        help=('Output BigQuery table1 for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--table2', required=True,
        help=('Output BigQuery table2 for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--table3', required=True,
        help=('Output BigQuery table3 for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--table4', required=True,
        help=('Output BigQuery table4 for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--table5', required=True,
        help=('Output BigQuery table5 for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--table6', required=True,
        help=('Output BigQuery table6 for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--table7', required=True,
        help=('Output BigQuery table7 for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--table8', required=True,
        help=('Output BigQuery table8 for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    # parser.add_argument('--output')
    args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = beam.Pipeline(options=options)
    print(args.table1, args.table2, args.table3, args.table4)

    data = (
        p
        | 'Read from PubSub' >> beam.io.ReadFromPubSub(
            topic=args.topic_read).with_output_types(bytes)
        | 'Create a tuple with thingID as the key' >> beam.Map(
            lambda x: (json.loads(x)['thingID'],
                       (json.loads(x)['volumeDiff'], json.loads(x)['volume']))))

    hourly = (
        data
        | 'window into sliding windows of an hour with a new window every 15 minutes'
        >> beam.WindowInto(window.SlidingWindows(3600, 900))
        | 'group by key hourly' >> beam.GroupByKey())

    hourly_decr = (
        hourly
        | 'calculate hourly avg decrease rate' >> beam.ParDo(avgDecr4Window()))
    write_hourly_decr_2_bigquery = (
        hourly_decr
        | 'write to Hourly_Average_Decrease table' >> beam.io.WriteToBigQuery(
            args.table1,
            schema='thingID:STRING,time:DATETIME,average:FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    daily_decr = (
        hourly_decr
        | 'convert to tuple to agg based on key for daily decr' >> beam.ParDo(converttotuple())
        | 'window into sliding windows of 24 hours with a new window every 3 hours 1'
        >> beam.WindowInto(window.SlidingWindows(3600 * 24, 3600 * 3))
        | 'group by key for daily decr' >> beam.GroupByKey()
        | 'calculate daily decr change rate' >> beam.ParDo(avgDecr4Window()))
    write_daily_decr_2_bigquery = (
        daily_decr
        | 'write to Daily_Average_Decrease table' >> beam.io.WriteToBigQuery(
            args.table3,
            schema='thingID:STRING,time:DATETIME,average:FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    hourly_avg = (
        hourly
        | 'calculate hourly avg change rate' >> beam.ParDo(avg4Window()))
    write_hourly_avg_2_bigquery = (
        hourly_avg
        | 'write to Hourly_Averages table' >> beam.io.WriteToBigQuery(
            args.table2,
            schema='thingID:STRING,time:DATETIME,average:FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    daily_avg = (
        hourly_avg
        | 'convert to tuple for agg based on key for daily avg' >> beam.ParDo(converttotuple())
        | 'window into sliding windows of 24 hours with a new window every 3 hours 2'
        >> beam.WindowInto(window.SlidingWindows(3600 * 24, 3600 * 3))
        | 'group by key for daily avg' >> beam.GroupByKey()
        | 'calculate daily avg change rate' >> beam.ParDo(avg4Window()))
    write_daily_avg_2_bigquery = (
        daily_avg
        | 'write to Daily_Averages table' >> beam.io.WriteToBigQuery(
            args.table4,
            schema='thingID:STRING,time:DATETIME,average:FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    monthly_avg = (
        daily_avg
        | 'convert to tuple to agg based on key for monthly avg' >> beam.ParDo(converttotuple())
        | 'window into sliding windows of 30 days with a new window every 7.5 days 2'
        >> beam.WindowInto(window.SlidingWindows(3600 * 24 * 30, 3600 * 24 * 7.5))
        | 'group by key for monthly avg' >> beam.GroupByKey()
        | 'calculate monthly avg change rate' >> beam.ParDo(avg4Window()))
    write_monthly_avg_2_bigquery = (
        monthly_avg
        | 'Write to Monthly Averages table' >> beam.io.WriteToBigQuery(
            args.table6,
            schema='thingID:STRING,time:DATETIME,average:FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    yearly_avg = (
        monthly_avg
        | 'convert to tuple to agg based on key for yearly avg' >> beam.ParDo(converttotuple())
        | 'window into sliding windows of a year with a new window every 30 days 2'
        >> beam.WindowInto(window.SlidingWindows(3600 * 24 * 365, 3600 * 24 * 30))
        | 'group by key for yearly avg' >> beam.GroupByKey()
        | 'calculate yearly avg change rate' >> beam.ParDo(avg4Window()))
    write_yearly_avg_2_bigquery = (
        yearly_avg
        | 'Write to Yearly Averages table' >> beam.io.WriteToBigQuery(
            args.table8,
            schema='thingID:STRING,time:DATETIME,average:FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    monthly_decr = (
        daily_decr
        | 'convert to tuple to agg based on key for monthly decr' >> beam.ParDo(converttotuple())
        | 'window into sliding windows of 30 days with a new window every 7.5 days 1'
        >> beam.WindowInto(window.SlidingWindows(3600 * 24 * 30, 3600 * 24 * 7.5))
        | 'group by key for monthly decr' >> beam.GroupByKey()
        | 'calculate monthly decr change rate' >> beam.ParDo(avgDecr4Window()))
    write_monthly_decr_2_bigquery = (
        monthly_decr
        | 'Write to Monthly Average Decrease table' >> beam.io.WriteToBigQuery(
            args.table5,
            schema='thingID:STRING,time:DATETIME,average:FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    yearly_decr = (
        monthly_decr
        | 'convert to tuple to agg based on key for yearly decr' >> beam.ParDo(converttotuple())
        | 'window into sliding windows of a year with a new window every 30 days 1'
        >> beam.WindowInto(window.SlidingWindows(3600 * 24 * 365, 3600 * 24 * 30))
        | 'group by key for yearly decr' >> beam.GroupByKey()
        | 'calculate yearly decr change rate' >> beam.ParDo(avgDecr4Window()))
    write_yearly_decr_2_bigquery = (
        yearly_decr
        | 'Write to Yearly Average Decrease table' >> beam.io.WriteToBigQuery(
            args.table7,
            schema='thingID:STRING,time:DATETIME,average:FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    result = p.run()
    result.wait_until_finish()  # for streaming pipeline
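# avgDecr4Window, avg4Window, and converttotuple are DoFns defined elsewhere.
# The sketches below show one plausible shape for them, matching the
# thingID/time/average schema used above; the field handling, the use of the
# processing-time clock, and the decrease filter are all assumptions.
import datetime

import apache_beam as beam


class avg4Window(beam.DoFn):
    # Hypothetical: emit a BigQuery row with the mean of the grouped values.
    # In the hourly stage the values are (volumeDiff, volume) pairs; in later
    # stages (after converttotuple) they are plain floats, so normalize first.
    def process(self, element):
        thing_id, values = element
        nums = [v[1] if isinstance(v, tuple) else v for v in values]
        if nums:
            yield {'thingID': thing_id,
                   'time': datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
                   'average': sum(nums) / len(nums)}


class avgDecr4Window(beam.DoFn):
    # Hypothetical: same shape, but averages only the negative volume
    # differences (decreases) in the window.
    def process(self, element):
        thing_id, values = element
        diffs = [v[0] if isinstance(v, tuple) else v for v in values]
        decreases = [d for d in diffs if d < 0]
        if decreases:
            yield {'thingID': thing_id,
                   'time': datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
                   'average': sum(decreases) / len(decreases)}


class converttotuple(beam.DoFn):
    # Hypothetical: turn a row dict back into (thingID, value) so it can be
    # re-grouped in the next, larger window.
    def process(self, element):
        yield (element['thingID'], element['average'])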
from datetime import datetime

import apache_beam as beam
from apache_beam.transforms import window

file_in = 'tags.csv'
skip_head = "userId,movieId,tag,timestamp"


class ParseNewMovies(beam.DoFn):
    def process(self, element):
        if element != skip_head:
            z = element.split(",")
            y = int(z[3])
            i = datetime.utcfromtimestamp(y)
            x = i.strftime('%Y-%m-%d %H:%M:%S')
            yield z[2], (z[1], x)


with beam.Pipeline() as pipeline:
    item = (
        pipeline
        | 'Read lines' >> beam.io.ReadFromText(file_in)
        | 'Par D1' >> beam.ParDo(ParseNewMovies()))
    x = (
        item
        | 'Par D3' >> beam.WindowInto(window.SlidingWindows(30, 20))
        | 'Par D2' >> beam.combiners.Count.PerKey()
        | 'Par D4' >> beam.Map(print))
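# Note: in this bounded pipeline the elements keep the default timestamp
# assigned by ReadFromText, so SlidingWindows(30, 20) does not actually
# separate the data by the parsed tag time. One way to make the windows
# meaningful is to attach event timestamps from the CSV. The variant below is
# an illustrative sketch, not the original code.
class ParseNewMoviesTimestamped(beam.DoFn):
    def process(self, element):
        if element != skip_head:
            z = element.split(",")
            y = int(z[3])                      # epoch seconds from the CSV
            i = datetime.utcfromtimestamp(y)
            x = i.strftime('%Y-%m-%d %H:%M:%S')
            # Wrap the output so downstream windowing uses the tag's event time.
            yield window.TimestampedValue((z[2], (z[1], x)), y)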