def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic',
                        dest='topic',
                        default=default_topic)
    parser.add_argument('--bucket',
                        dest='bucket',
                        default=default_bucket)

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project={}'.format(project),
        '--streaming',
        '--experiments=allow_non_updatable_job',
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    def add_key(element):
        # Parse the message into a dict and return (key, rest-of-dict).
        parsed_dict = yaml.safe_load(element)
        key = parsed_dict.pop(GROUP_BY_KEY)
        return (key, parsed_dict)

    def avg_value(element):
        # element[0] is the key; element[1] is the iterable of grouped dicts.
        df = pd.DataFrame(element[1])
        avg = df[VALUE_TO_AVG].mean()
        return (element[0], avg)

    with beam.Pipeline(options=pipeline_options) as p:
        (p | "ReadTopic" >> beam.io.ReadFromPubSub(topic=known_args.topic)
         | "AddKey" >> beam.Map(add_key)
         | "Window" >> beam.WindowInto(window.SlidingWindows(size=WINDOW_LENGTH, period=WINDOW_PERIOD))
         | "GroupByKey" >> beam.GroupByKey()
         | "AvgValue" >> beam.Map(avg_value)  # Map, not ParDo: avg_value returns a single (key, avg) pair
         | "WriteToGCS" >> WriteToText(known_args.bucket))
Example #2
def run():

# Command line arguments
  parser = argparse.ArgumentParser(description='Demonstrate side inputs')
  parser.add_argument('--bucket', required=True, help='Specify Cloud Storage bucket for output')
  parser.add_argument('--project', required=True, help='Specify Google Cloud project')
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument('--DirectRunner', action='store_true')
  group.add_argument('--DataFlowRunner', action='store_true')

  opts = parser.parse_args()

  if opts.DirectRunner:
    runner='DirectRunner'
  if opts.DataFlowRunner:
    runner='DataFlowRunner'

  bucket = opts.bucket
  project = opts.project

  argv = [
    '--project={0}'.format(project),
    '--job_name=streamingjob',
    '--save_main_session',
    '--staging_location=gs://{0}/staging/'.format(bucket),
    '--temp_location=gs://{0}/staging/'.format(bucket),
    '--runner={0}'.format(runner)
    ]

  pubsubinput='projects/{0}/topics/streamdemo'.format(project)
  pubsubschema = 'datetime:TIMESTAMP, num_words:INTEGER'
  
  p = beam.Pipeline(argv=argv)
  (p
        | 'ReadFromPubSub' >> beam.io.ReadStringsFromPubSub(pubsubinput)
        | 'WordsPerMessage' >> beam.Map(lambda msg: countwords(msg))
#       | 'timestamp' >> beam.ParDo(AddTimestampDoFn())          # Tried adding an instantaneous timestamp
#       | 'window' >> beam.WindowInto(window.FixedWindows(60))   # Tried fixed window
        | 'Window' >> beam.WindowInto(window.SlidingWindows(1, 15))
        | 'Combine' >> beam.CombineGlobally(sum).without_defaults()
        | 'WordsInTimeWindow' >> beam.Map(lambda num_words: {
              'datetime': int(datetime.datetime.now().strftime('%s')),
              'num_words': num_words})
        | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
              project=project, dataset='demos', table='streamdemo',
              schema=pubsubschema,
              write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
              create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
  )

  logging.getLogger().setLevel(logging.INFO)
  if runner == 'DataFlowRunner':
    p.run()
  else:
    p.run().wait_until_finish()
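This example assumes imports and two helpers that are not shown. A hedged sketch of what they might be; countwords and AddTimestampDoFn below are guesses at the missing definitions, not the originals:

import argparse
import datetime
import logging

import apache_beam as beam
from apache_beam.transforms import window


def countwords(line):
    # Assumed helper: number of words in one Pub/Sub message.
    return len(line.split())


class AddTimestampDoFn(beam.DoFn):
    # Assumed helper for the commented-out 'timestamp' step: stamp each
    # element with the current processing time.
    def process(self, element):
        import time
        yield window.TimestampedValue(element, time.time())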
Example #3
def SportTrackerMotivation(input, shortDuration, longDuration):
    boxed = input | "ComputeMetrics" >> ComputeBoxedMetrics(shortDuration)
    shortAverage = (
        boxed
        | "shortWindow" >> beam.WindowInto(window.FixedWindows(shortDuration))
        | "shortAverage" >> CalculateAveragePace())
    longAverage = (
        boxed
        | "longWindow" >> beam.WindowInto(
            window.SlidingWindows(longDuration, shortDuration))
        | "longAverage" >> CalculateAveragePace()
        |
        "longIntoFixed" >> beam.WindowInto(window.FixedWindows(shortDuration)))
    return ((shortAverage, longAverage)
            | beam.CoGroupByKey()
            | beam.FlatMap(asMotivation))
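SportTrackerMotivation assumes ComputeBoxedMetrics, CalculateAveragePace and asMotivation are defined elsewhere. A hedged sketch of one possible asMotivation, assuming CoGroupByKey over the (shortAverage, longAverage) tuple yields (key, (short_paces, long_paces)):

def asMotivation(element):
    # Hypothetical sketch: emit (key, going_faster_than_usual) only when both
    # the short-term and the long-term average pace are available.
    key, (short_paces, long_paces) = element
    short_paces, long_paces = list(short_paces), list(long_paces)
    if short_paces and long_paces:
        yield key, short_paces[0] > long_paces[0]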
Example #4
def test_setting_sliding_windows(self):
    # assert_that/equal_to live in apache_beam.testing.util in current Beam
    from apache_beam.testing.util import assert_that, equal_to

    p = TestPipeline()
    unkeyed_items = p | beam.Create([2, 16, 23])
    items = (unkeyed_items
             | 'key' >> beam.Map(
                 lambda x: beam.window.TimestampedValue(('k', x), x)))
    # [START setting_sliding_windows]
    from apache_beam import window
    sliding_windowed_items = (
        items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
    # [END setting_sliding_windows]
    summed = (sliding_windowed_items
              | 'group' >> beam.GroupByKey()
              | 'combine' >> beam.CombineValues(sum))
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    assert_that(unkeyed,
                equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
    p.run()
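With SlidingWindows(30, 5), every element falls into size / period = 6 overlapping windows, which is why each input value contributes to several of the expected sums. A rough standalone arithmetic sketch of that assignment (plain Python, no Beam internals):

def sliding_window_starts(t, size=30, period=5):
    # Starts of the sliding windows [start, start + size) that contain time t.
    last_start = t - (t % period)
    return list(range(last_start, last_start - size, -period))

for t in (2, 16, 23):
    print(t, sliding_window_starts(t))  # each timestamp lands in 6 windows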
Example #5
###################################################################################

pubsub_data = (
    p
    | 'Read from pub sub' >> beam.io.ReadFromPubSub(subscription=input_subscription)
    # b'STR_2,Maine,PR_265,Cosmetics,8,39,66,1553578219\r\n'
    | 'Remove extra chars' >> beam.Map(lambda data: data.decode('utf-8').strip())
    # Pub/Sub delivers bytes, hence the decode -> 'STR_2,Maine,PR_265,Cosmetics,8,39,66,1553578219'
    | 'Split Row' >> beam.Map(lambda row: row.split(','))
    # ['STR_2', 'Maine', 'PR_265', 'Cosmetics', '8', '39', '66', '1553578219']
    | 'Filter By state' >> beam.Filter(lambda elements: elements[1] in ('Maine', 'Texas'))
    | 'Create Profit Column' >> beam.Map(calculateProfit)
    # ['STR_2', 'Maine', 'PR_265', 'Cosmetics', '8', '39', '66', '1553578219', '216']
    | 'Form Key Value pair' >> beam.Map(lambda elements: (elements[0], int(elements[8])))
    # ('STR_2', 216)
    | 'Window' >> beam.WindowInto(window.SlidingWindows(30, 10))
    | 'Sum values' >> beam.CombinePerKey(sum)
    # ('STR_2', total profit for the window)
    | 'Encode to byte string' >> beam.Map(encode_byte_string)
    | 'Write to pub sub' >> beam.io.WriteToPubSub(output_topic)
)

result = p.run()
result.wait_until_finish()
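calculateProfit and encode_byte_string are not shown. A hedged sketch consistent with the row layout in the comments (store, state, product, category, quantity, buy rate, sell price, timestamp); the real helpers may differ:

def calculateProfit(elements):
    # Assumed helper: profit = (sell price - buy rate) * quantity,
    # appended as an extra column at index 8.
    quantity = int(elements[4])
    buy_rate = int(elements[5])
    sell_price = int(elements[6])
    elements.append(str((sell_price - buy_rate) * quantity))
    return elements


def encode_byte_string(element):
    # Assumed helper: WriteToPubSub expects bytes.
    key, total = element
    return '{},{}'.format(key, total).encode('utf-8')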
Example #6
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--gcp_project',
                        required=True,
                        default='',
                        help='GCP Project ID')
    parser.add_argument('--region',
                        required=True,
                        default='',
                        help='GCP Region')
    parser.add_argument('--job_name',
                        required=True,
                        default='',
                        help='Dataflow Job Name')
    parser.add_argument('--gcp_staging_location',
                        required=True,
                        default='gs://xxxxx/staging',
                        help='Dataflow Staging GCS location')
    parser.add_argument('--gcp_tmp_location',
                        required=True,
                        default='gs://xxxxx/tmp',
                        help='Dataflow tmp GCS location')
    parser.add_argument('--batch_size',
                        required=True,
                        default=10,
                        help='Dataflow Batch Size')
    parser.add_argument(
        '--input_topic',
        required=True,
        default='',
        help='Input PubSub Topic: projects/<project_id>/topics/<topic_name>')
    parser.add_argument('--bq_dataset_name',
                        required=True,
                        default='',
                        help='Output BigQuery Dataset')
    parser.add_argument('--bq_table_name',
                        required=True,
                        default='',
                        help='Output BigQuery Table')
    parser.add_argument(
        '--runner',
        required=True,
        default='DirectRunner',
        help='Dataflow Runner - DataflowRunner or DirectRunner (local)')

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_args.extend([
        '--runner={}'.format(known_args.runner),  # DataflowRunner or DirectRunner (local)
        '--project={}'.format(known_args.gcp_project),
        '--region={}'.format(known_args.region),
        '--staging_location={}'.format(known_args.gcp_staging_location),  # Google Cloud Storage gs:// path
        '--temp_location={}'.format(known_args.gcp_tmp_location),  # Google Cloud Storage gs:// path
        '--job_name={}'.format(known_args.job_name),
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    ###################################################################
    #   DataFlow Pipeline
    ###################################################################

    with beam.Pipeline(options=pipeline_options) as p:

        logging.info('Ready to process events from PubSub topic: {}'.format(
            known_args.input_topic))

        # Read the pubsub topic into a PCollection.
        events = (p | beam.io.ReadFromPubSub(known_args.input_topic))

        # Parse events
        parsed = (events | beam.Map(parse_pubsub))

        # Transform events
        transformed = (
            parsed | beam.Map(extract_map_type)
            | beam.Map(lambda x: (x, 1))
            | beam.WindowInto(
                window.SlidingWindows(30, 5)
            )  # Window is 30 seconds in length, and a new window begins every five seconds
            | beam.GroupByKey()
            | beam.Map(sum_by_group))

        # Print results to console (for testing/debugging)
        transformed | 'Print aggregated game logs' >> beam.Map(print)

        # Sink/Persist to BigQuery
        parsed | 'Write to bq' >> beam.io.gcp.bigquery.WriteToBigQuery(
            table=known_args.bq_table_name,
            dataset=known_args.bq_dataset_name,
            project=known_args.gcp_project,
            schema=bq_schema,
            batch_size=int(known_args.batch_size))
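parse_pubsub, extract_map_type, sum_by_group and bq_schema are assumed to be defined at module level. A hedged sketch with hypothetical field names and schema (the real ones depend on the events being published):

import json

# Hypothetical schema for the raw events written to BigQuery.
bq_schema = 'user_id:STRING,map_type:STRING,event_time:TIMESTAMP'


def parse_pubsub(message):
    # Pub/Sub delivers bytes; assume each message is a UTF-8 JSON object.
    return json.loads(message.decode('utf-8'))


def extract_map_type(event):
    # Pull out the field whose occurrences are counted per sliding window.
    return event['map_type']


def sum_by_group(kv):
    # After GroupByKey the value is an iterable of 1s; add them up per key.
    key, ones = kv
    return key, sum(ones)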
Example #7
def run(argv=None):
    global cnt
    cnt = 0
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic_read',
                        type=str,
                        help='Pub/Sub topic to read from')
    # Eight output tables, one per aggregation: hourly/daily/monthly/yearly,
    # for both the average and the average-decrease pipelines below.
    for i in range(1, 9):
        parser.add_argument(
            '--table{}'.format(i),
            required=True,
            help='Output BigQuery table{} for results, specified as '
                 'PROJECT:DATASET.TABLE or DATASET.TABLE.'.format(i))

    #parser.add_argument('--output')
    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=options)
    print(args.table1, args.table2, args.table3, args.table4)
    data = (p | 'Read from PubSub' >> beam.io.ReadFromPubSub(
        topic=args.topic_read).with_output_types(bytes)
            | 'Create a tuple with thingID as the key' >> beam.Map(lambda x: (
                (json.loads(x)['thingID'],
                 (json.loads(x)['volumeDiff'], json.loads(x)['volume'])))))
    hourly = (
        data |
        'Window into sliding window of an hour 1 with a new window every 15 minutes'
        >> beam.WindowInto(window.SlidingWindows(3600, 900))
        | 'group by key 1 1' >> beam.GroupByKey())
    hourly_decr = (
        hourly
        | 'calculate hourly avg decrease rate' >> beam.ParDo(avgDecr4Window()))
    write_hourly_decr_2_bigquery = (
        hourly_decr
        | 'write to Hourly_Average_Decrease table' >> beam.io.WriteToBigQuery(
            args.table1,
            schema=' thingID: STRING, time: DATETIME, average: FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
    daily_decr = (
        hourly_decr | 'convert to tuple to agg based on key for daily decr' >>
        beam.ParDo(converttotuple())
        |
        'window into sliding windows of 24 hours with a new window every 3 hours 1'
        >> beam.WindowInto(window.SlidingWindows(3600 * 24, 3600 * 3))
        | 'group by key for daily decr' >> beam.GroupByKey()
        | 'calculate  daily decr change rate' >> beam.ParDo(avgDecr4Window()))

    write_daily_decr_2_bigquery = (
        daily_decr
        | 'write to Daily_Average_Decrease table' >> beam.io.WriteToBigQuery(
            args.table3,
            schema=' thingID: STRING, time: DATETIME, average: FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
    hourly_avg = (
        hourly
        | 'calculate hourly avg change rate' >> beam.ParDo(avg4Window()))
    write_hourly_avg_2_bigquery = (
        hourly_avg
        | 'write to Hourly_Averages table' >> beam.io.WriteToBigQuery(
            args.table2,
            schema=' thingID: STRING, time: DATETIME, average: FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
    daily_avg = (
        hourly_avg | 'convert to tuple for agg based on key for daily avg' >>
        beam.ParDo(converttotuple())
        |
        'window into sliding windows of 24 hours with a new window every 3 hours 2'
        >> beam.WindowInto(window.SlidingWindows(3600 * 24, 3600 * 3))
        | 'group by key for daily avg' >> beam.GroupByKey()
        | 'calculate daily avg change rate' >> beam.ParDo(avg4Window()))
    write_daily_avg_2_bigquery = (
        daily_avg | 'write to Daily_Averages table' >> beam.io.WriteToBigQuery(
            args.table4,
            schema=' thingID: STRING, time: DATETIME, average: FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
    monthly_avg = (
        daily_avg | 'convert to tuple to agg based on key for monthly avg' >>
        beam.ParDo(converttotuple())
        |
        'window into sliding windows of 30 days with a new window every 7.5 days 2'
        >> beam.WindowInto(
            window.SlidingWindows(3600 * 24 * 30, 3600 * 24 * 7.5))
        | 'group by key for monthly avg' >> beam.GroupByKey()
        | 'calculate monthly avg change rate' >> beam.ParDo(avg4Window()))
    write_monthly_avg_2_bigquery = (
        monthly_avg
        | 'Write to Monthly Averages table' >> beam.io.WriteToBigQuery(
            args.table6,
            schema=' thingID: STRING, time: DATETIME, average: FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
    yearly_avg = (
        monthly_avg | 'convert to tuple to agg based on key for yearly avg' >>
        beam.ParDo(converttotuple())
        |
        'window into sliding windows of a year with a new window every 30 days 2'
        >> beam.WindowInto(
            window.SlidingWindows(3600 * 365 * 24, 3600 * 24 * 30))
        | 'group by key for yearly avg' >> beam.GroupByKey()
        | 'calculate yearly avg change rate' >> beam.ParDo(avg4Window()))
    write_yearly_avg_2_bigquery = (
        yearly_avg
        | 'Write to yearly Average table' >> beam.io.WriteToBigQuery(
            args.table8,
            schema=' thingID: STRING, time: DATETIME, average: FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
    monthly_decr = (
        daily_decr | 'convert to tuple to agg based on key for monthly decr' >>
        beam.ParDo(converttotuple())
        |
        'window into sliding windows of 30 days with a new window every 7.5 days 1'
        >> beam.WindowInto(
            window.SlidingWindows(3600 * 24 * 30, 3600 * 24 * 7.5))
        | 'group by key for monthly decr' >> beam.GroupByKey()
        | 'calculate monthly decr change rate' >> beam.ParDo(avgDecr4Window()))
    write_monthly_decr_2_bigquery = (
        monthly_decr
        | 'Write to Monthly Average Decrease table' >> beam.io.WriteToBigQuery(
            args.table5,
            schema=' thingID: STRING, time: DATETIME, average: FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
    yearly_decr = (
        monthly_decr | 'convert to tuple to agg based on key for yearly decr'
        >> beam.ParDo(converttotuple())
        |
        'window into sliding windows of a year with a new window every 30 days 1'
        >> beam.WindowInto(
            window.SlidingWindows(3600 * 365 * 24, 3600 * 24 * 30))
        | 'group by key for yearly decr' >> beam.GroupByKey()
        | 'calculate yearly decr change rate' >> beam.ParDo(avgDecr4Window()))
    write_yearly_decr_2_bigquery = (
        yearly_decr
        | 'Write to yearly Average Decrease table' >> beam.io.WriteToBigQuery(
            args.table7,
            schema=' thingID: STRING, time: DATETIME, average: FLOAT',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    result = p.run()
    result.wait_until_finish()  #for streaming pipeline
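avg4Window, avgDecr4Window and converttotuple are assumed to be defined elsewhere. A hedged sketch of one possible shape, assuming the grouped values are tuples whose first item is the volume difference (the real DoFns may compute the averages differently):

import apache_beam as beam


class avg4Window(beam.DoFn):
    # Hypothetical sketch: average the first value of each reading in the window.
    def process(self, element, window=beam.DoFn.WindowParam):
        thing_id, readings = element
        values = [r[0] for r in readings]
        yield {'thingID': thing_id,
               'time': window.end.to_utc_datetime().strftime('%Y-%m-%d %H:%M:%S'),
               'average': sum(values) / len(values) if values else 0.0}


class avgDecr4Window(beam.DoFn):
    # Hypothetical sketch: like avg4Window, but only over negative (decrease) readings.
    def process(self, element, window=beam.DoFn.WindowParam):
        thing_id, readings = element
        decreases = [r[0] for r in readings if r[0] < 0]
        yield {'thingID': thing_id,
               'time': window.end.to_utc_datetime().strftime('%Y-%m-%d %H:%M:%S'),
               'average': sum(decreases) / len(decreases) if decreases else 0.0}


class converttotuple(beam.DoFn):
    # Hypothetical sketch: re-key a result row so it can be windowed and grouped again.
    def process(self, element):
        yield (element['thingID'], (element['average'],))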
Example #8
from datetime import datetime

import apache_beam as beam
from apache_beam.transforms import window

file_in = 'tags.csv'
skip_head = "userId,movieId,tag,timestamp"


class ParseNewMovies(beam.DoFn):
    def process(self, element):
        # Skip the CSV header, then emit (tag, (movieId, formatted timestamp)).
        if element != skip_head:
            z = element.split(",")
            ts = int(z[3])
            formatted = datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            yield z[2], (z[1], formatted)

with beam.Pipeline() as pipeline:
    item = (
        pipeline
        | 'Read lines' >> beam.io.ReadFromText(file_in)
        | 'Parse rows' >> beam.ParDo(ParseNewMovies())
    )
    x = (
        item
        | 'Window' >> beam.WindowInto(window.SlidingWindows(30, 20))
        | 'Count per tag' >> beam.combiners.Count.PerKey()
        | 'Print' >> beam.Map(print)
    )
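One caveat: ReadFromText does not attach event-time timestamps, so the sliding windows above are not based on the tag timestamps parsed from the file. A hedged sketch (hypothetical class name) of attaching the parsed timestamp as each element's event time instead:

class ParseNewMoviesWithTimestamp(beam.DoFn):
    # Hypothetical variant of ParseNewMovies: same output shape, but the
    # element is stamped with the tag's own timestamp so windowing uses it.
    def process(self, element):
        if element != skip_head:
            z = element.split(",")
            ts = int(z[3])
            formatted = datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            yield window.TimestampedValue((z[2], (z[1], formatted)), ts)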