def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')

    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts, save_main_session=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
        'performance-demo-', time.time_ns())
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    agg_output_path = f"{opts.project}/agg_output/"
    raw_output_path = f"{opts.project}/raw_output/"

    query = """
        SELECT user_id,
        COUNT(*) AS page_views, SUM(num_bytes) as total_bytes,
        MAX(num_bytes) AS max_bytes, MIN(num_bytes) as min_bytes
        FROM PCOLLECTION
        GROUP BY user_id
        """

    # Create the pipeline
    p = beam.Pipeline(options=options)

    logs = (p
            | 'ReadFromGCS' >> beam.io.ReadFromText('./users.csv')
            | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog))

    (logs
     | 'RawToDict' >> beam.Map(lambda row: row._asdict())
     #| 'WriteRawToText' >> beam.io.WriteToText(raw_output_path)
     )

    (logs
     | 'PerUserAggregations' >> SqlTransform(query, dialect='zetasql')
     | 'AggToDict' >> beam.Map(lambda row: row._asdict())
     #| 'WriteAggToText' >> beam.io.WriteToText(agg_output_path)
     )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()

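# `parse_json` and `CommonLog` are defined elsewhere in this pipeline's module.
# A minimal sketch, assuming the JSON events carry at least the user_id and
# num_bytes fields referenced by the SQL query (the real type likely has more
# fields):
import json
import typing

from apache_beam import coders


class CommonLog(typing.NamedTuple):
    user_id: str
    num_bytes: int


coders.registry.register_coder(CommonLog, coders.RowCoder)


def parse_json(line: str) -> CommonLog:
    # Each input line is assumed to be a JSON-encoded event.
    row = json.loads(line)
    return CommonLog(user_id=row['user_id'], num_bytes=int(row['num_bytes']))
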
def apply_sql(query: str,
              output_name: Optional[str],
              found: Dict[str, beam.PCollection]) -> Tuple[str, PValue]:
    """Applies a SqlTransform with the given sql and queried PCollections.

    Args:
        query: The SQL query executed in the magic.
        output_name: (optional) The output variable name in __main__ module.
        found: The PCollections with variable names found to be used in the
            query.

    Returns:
        A Tuple[str, PValue]. First str value is the output variable name in
        __main__ module (auto-generated if not provided). Second PValue is
        most likely a PCollection, depending on the query.
    """
    output_name = _generate_output_name(output_name, query, found)
    query, sql_source = _build_query_components(query, found)
    try:
        output = sql_source | SqlTransform(query)
        # Declare a variable with the output_name and output value in the
        # __main__ module so that the user can use the output smoothly.
        setattr(importlib.import_module('__main__'), output_name, output)
        ib.watch({output_name: output})
        _LOGGER.info(
            "The output PCollection variable is %s with element_type %s",
            output_name,
            pformat_namedtuple(output.element_type))
        return output_name, output
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        on_error('Error when applying the Beam SQL: %s', e)

def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(levelname)-8s: %(message)s')
    parser = argparse.ArgumentParser(description=__doc__.strip())
    parser.add_argument('filename', help='Beancount ledger filename')
    args, pipeline_args = parser.parse_known_args()

    # Read the ledger.
    logging.info("Reading ledger.")
    t1 = time.time()
    entries, errors, options_map = loader.load_file(args.filename)
    # beam.Row accepts only keyword arguments, so each field is named
    # explicitly; the query below refers to the `account` column.
    postings = (beam.Row(account=posting.account,
                         number=posting.units.number,
                         currency=posting.units.currency)
                for entry in data.filter_txns(entries)
                for posting in entry.postings)
    price_map = prices.build_price_map(entries)
    t2 = time.time()
    logging.info("Read ledger in %.1fsecs.", t2 - t1)

    with CreatePipeline(pipeline_args) as pipeline:
        _ = (pipeline
             | beam.Create(postings)
             | SqlTransform("""
                   SELECT account FROM PCOLLECTION
               """, dialect="zetasql")
             | beam.Map(print))

def to_pipeline(self, pipeline: Optional[beam.Pipeline]) -> beam.Pipeline:
    """Converts the chain into an executable pipeline."""
    if pipeline not in self.evaluated:
        # The whole chain should form a single pipeline.
        source = self.source
        if isinstance(self.source, beam.Pipeline):
            if pipeline:  # use the known pipeline
                source = pipeline
            else:  # use the source pipeline
                pipeline = self.source
        else:
            name_to_pcoll = pcoll_by_name()
            if len(self.source) == 1:
                source = name_to_pcoll.get(next(iter(self.source)))
            else:
                source = {s: name_to_pcoll.get(s) for s in self.source}
        if isinstance(source, beam.Pipeline):
            output = source | 'beam_sql_{}_{}'.format(
                self.output_name, self.execution_count) >> SqlTransform(
                    self.query)
        else:
            output = source | 'schema_loaded_beam_sql_{}_{}'.format(
                self.output_name,
                self.execution_count) >> SchemaLoadedSqlTransform(
                    self.output_name, self.query, self.schemas,
                    self.execution_count)
        _ = create_var_in_main(self.output_name, output)
        self.evaluated.add(pipeline)

    if self.next:
        return self.next.to_pipeline(pipeline)
    else:
        return pipeline

def test_generate_data(self):
    with TestPipeline() as p:
        out = p | SqlTransform(
            """SELECT
                 CAST(1 AS INT) AS `id`,
                 CAST('foo' AS VARCHAR) AS `str`,
                 CAST(3.14 AS DOUBLE) AS `flt`""")
        assert_that(out, equal_to([(1, "foo", 3.14)]))

def test_filter(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create(
                [SimpleRow(1, "foo", 3.14), SimpleRow(2, "bar", 1.414)])
            | SqlTransform("SELECT * FROM PCOLLECTION WHERE `str` = 'bar'"))
        assert_that(out, equal_to([(2, "bar", 1.414)]))

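# This and several of the following tests assume a `SimpleRow` schema'd type
# defined elsewhere in the test module. A minimal sketch, consistent with the
# columns the queries reference (`id`, `str`, `flt`); the original definition
# may differ:
import typing

from apache_beam import coders

SimpleRow = typing.NamedTuple(
    'SimpleRow', [('id', int), ('str', str), ('flt', float)])
coders.registry.register_coder(SimpleRow, coders.RowCoder)
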
def test_zetasql_generate_data(self):
    with TestPipeline() as p:
        out = p | SqlTransform(
            """SELECT
                 CAST(1 AS INT64) AS `int`,
                 CAST('foo' AS STRING) AS `str`,
                 CAST(3.14 AS FLOAT64) AS `flt`""",
            dialect="zetasql")
        assert_that(out, equal_to([(1, "foo", 3.14)]))

def test_row(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create([1, 2, 10])
            | beam.Map(lambda x: beam.Row(a=x, b=str(x)))
            | SqlTransform("SELECT a*a as s, LENGTH(b) AS c FROM PCOLLECTION"))
        assert_that(out, equal_to([(1, 1), (4, 1), (100, 2)]))

def test_sql(self):
    with self.create_pipeline() as p:
        output = (
            p
            | 'Create' >> beam.Create([Row(x, str(x)) for x in range(5)])
            | 'Sql' >> SqlTransform(
                """SELECT col1,
                          col2 || '*' || col2 as col2,
                          power(col1, 2) as col3
                   FROM PCOLLECTION
                """,
                expansion_service=self.get_expansion_service()))
        assert_that(
            output,
            equal_to([(x, '{x}*{x}'.format(x=x), x * x) for x in range(5)]))

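# `Row` in the cross-language test above is assumed to be a schema'd NamedTuple
# along these lines (column names follow the query's col1/col2 references; the
# original test file may differ):
import typing

from apache_beam import coders

Row = typing.NamedTuple('Row', [('col1', int), ('col2', str)])
coders.registry.register_coder(Row, coders.RowCoder)
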
def test_map(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create([
                Shopper('bob', {
                    'bananas': 6,
                    'cherries': 3
                }),
                Shopper('alice', {
                    'apples': 2,
                    'bananas': 3
                })
            ]).with_output_types(Shopper)
            | SqlTransform("SELECT * FROM PCOLLECTION WHERE shopper = 'alice'"))
        assert_that(out, equal_to([('alice', {'apples': 2, 'bananas': 3})]))

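# `Shopper` above is assumed to be a schema'd NamedTuple whose cart maps item
# names to counts; a sketch (the original definition may differ):
import typing

from apache_beam import coders

Shopper = typing.NamedTuple(
    'Shopper', [('shopper', str), ('cart', typing.Mapping[str, int])])
coders.registry.register_coder(Shopper, coders.RowCoder)
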
def test_windowing_before_sql(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create([
                SimpleRow(5, "foo", 1.),
                SimpleRow(15, "bar", 2.),
                SimpleRow(25, "baz", 3.)
            ])
            | beam.Map(lambda v: beam.window.TimestampedValue(v, v.id)
                       ).with_output_types(SimpleRow)
            | beam.WindowInto(
                beam.window.FixedWindows(10)).with_output_types(SimpleRow)
            | SqlTransform("SELECT COUNT(*) as `count` FROM PCOLLECTION"))
        assert_that(out, equal_to([(1, ), (1, ), (1, )]))

def run(p, input_file, output_file):
    #pylint: disable=expression-not-assigned
    (p
     | 'read' >> ReadFromText(input_file)
     | 'split' >> beam.FlatMap(str.split)
     | 'row' >> beam.Map(MyRow).with_output_types(MyRow)
     | 'sql!!' >> SqlTransform("""
           SELECT
             word as key,
             COUNT(*) as `count`
           FROM PCOLLECTION
           GROUP BY word""")
     | 'format' >> beam.Map(lambda row: '{}: {}'.format(row.key, row.count))
     | 'write' >> WriteToText(output_file))

    result = p.run()
    result.wait_until_finish()

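# `MyRow` (used here and in the annotated variant below) wraps each word in a
# single-field schema so SqlTransform can query it. A sketch consistent with
# the query's `word` column; the actual example module may define it
# differently:
import typing

from apache_beam import coders

MyRow = typing.NamedTuple('MyRow', [('word', str)])
coders.registry.register_coder(MyRow, coders.RowCoder)
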
def run(p, input_file, output_file):
    #pylint: disable=expression-not-assigned
    (p
     # Read the lines from a text file.
     | 'Read' >> ReadFromText(input_file)
     # Split the line into individual words.
     | 'Split' >> beam.FlatMap(lambda line: re.split(r'\W+', line))
     # Map each word to an instance of MyRow.
     | 'ToRow' >> beam.Map(MyRow).with_output_types(MyRow)
     # SqlTransform yields a PCollection containing elements with attributes
     # based on the output of the query.
     | 'Sql!!' >> SqlTransform("""
           SELECT
             word as key,
             COUNT(*) as `count`
           FROM PCOLLECTION
           GROUP BY word""")
     | 'Format' >> beam.Map(lambda row: '{}: {}'.format(row.key, row.count))
     | 'Write' >> WriteToText(output_file))

def run(output_topic, pipeline_args):
    pipeline_options = PipelineOptions(
        pipeline_args, save_main_session=True, streaming=True)
    with beam.Pipeline(options=pipeline_options) as pipeline:
        _ = (
            pipeline
            | beam.io.ReadFromPubSub(
                topic='projects/pubsub-public-data/topics/taxirides-realtime',
                timestamp_attribute="ts").with_output_types(bytes)
            | "Parse JSON payload" >> beam.Map(json.loads)
            # Use beam.Row to create a schema-aware PCollection
            | "Create beam Row" >> beam.Map(
                lambda x: beam.Row(
                    ride_status=str(x['ride_status']),
                    passenger_count=int(x['passenger_count'])))
            # SqlTransform computes its result within an existing window
            | "15s fixed windows" >> beam.WindowInto(
                beam.window.FixedWindows(15))
            # Aggregate drop offs and pick ups that occur within each 15s window
            | SqlTransform("""
                SELECT
                  ride_status,
                  COUNT(*) AS num_rides,
                  SUM(passenger_count) AS total_passengers
                FROM PCOLLECTION
                WHERE NOT ride_status = 'enroute'
                GROUP BY ride_status""")
            # SqlTransform yields python objects with attributes corresponding
            # to the outputs of the query.
            # Collect those attributes, as well as window information, into a
            # dict
            | "Assemble Dictionary" >> beam.Map(
                lambda row, window=beam.DoFn.WindowParam: {
                    "ride_status": row.ride_status,
                    "num_rides": row.num_rides,
                    "total_passengers": row.total_passengers,
                    "window_start": window.start.to_rfc3339(),
                    "window_end": window.end.to_rfc3339()
                })
            | "Convert to JSON" >> beam.Map(json.dumps)
            | "UTF-8 encode" >> beam.Map(lambda s: s.encode("utf-8"))
            | beam.io.WriteToPubSub(topic=output_topic))

def apply_sql(
        query: str,
        output_name: Optional[str],
        found: Dict[str, beam.PCollection],
        run: bool = True) -> Tuple[str, Union[PValue, SqlNode], SqlChain]:
    """Applies a SqlTransform with the given sql and queried PCollections.

    Args:
        query: The SQL query executed in the magic.
        output_name: (optional) The output variable name in __main__ module.
        found: The PCollections with variable names found to be used in the
            query.
        run: Whether to prepare the SQL pipeline for a local run or not.

    Returns:
        A tuple of values. First str value is the output variable name in
        __main__ module, auto-generated if not provided. Second value: if run,
        it's a PValue; otherwise, a SqlNode tracks the SQL without applying it
        or executing it. Third value: SqlChain is a chain of SqlNodes that have
        been applied.
    """
    output_name = _generate_output_name(output_name, query, found)
    query, sql_source, chain = _build_query_components(
        query, found, output_name, run)
    if run:
        try:
            output = sql_source | SqlTransform(query)
            # Declare a variable with the output_name and output value in the
            # __main__ module so that the user can use the output smoothly.
            output_name, output = create_var_in_main(output_name, output)
            _LOGGER.info(
                "The output PCollection variable is %s with element_type %s",
                output_name,
                pformat_namedtuple(output.element_type))
            return output_name, output, chain
        except (KeyboardInterrupt, SystemExit):
            raise
        except:  # pylint: disable=bare-except
            on_error('Error when applying the Beam SQL: %s',
                     traceback.format_exc())
    else:
        return output_name, chain.current, chain

def test_agg(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create([
                SimpleRow(1, "foo", 1.),
                SimpleRow(1, "foo", 2.),
                SimpleRow(1, "foo", 3.),
                SimpleRow(2, "bar", 1.414),
                SimpleRow(2, "bar", 1.414),
                SimpleRow(2, "bar", 1.414),
                SimpleRow(2, "bar", 1.414),
            ])
            | SqlTransform("""
                SELECT
                  `str`,
                  COUNT(*) AS `count`,
                  SUM(`id`) AS `sum`,
                  AVG(`flt`) AS `avg`
                FROM PCOLLECTION GROUP BY `str`"""))
        assert_that(out, equal_to([("foo", 3, 3, 2), ("bar", 4, 8, 1.414)]))

def test_tagged_join(self):
    with TestPipeline() as p:
        enrich = (
            p | "Create enrich" >> beam.Create(
                [Enrich(1, "a"), Enrich(2, "b"), Enrich(26, "z")]))
        simple = (
            p | "Create simple" >> beam.Create([
                SimpleRow(1, "foo", 3.14),
                SimpleRow(26, "bar", 1.11),
                SimpleRow(1, "baz", 2.34)
            ]))
        out = ({
            'simple': simple,
            'enrich': enrich
        } | SqlTransform("""
              SELECT
                simple.`id` AS `id`,
                enrich.metadata AS metadata
              FROM simple
              JOIN enrich
              ON simple.`id` = enrich.`id`"""))
        assert_that(out, equal_to([(1, "a"), (26, "z"), (1, "a")]))

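# `Enrich` above is assumed to pair an id with a metadata string, matching the
# join key and projected columns in the query; a sketch:
import typing

from apache_beam import coders

Enrich = typing.NamedTuple('Enrich', [('id', int), ('metadata', str)])
coders.registry.register_coder(Enrich, coders.RowCoder)
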
def expand(self, source):
    """Applies the SQL transform.

    If a PCollection uses a schema defined in the main session, use the
    additional DoFn to restore it on the worker.
    """
    if isinstance(source, dict):
        schema_loaded = {
            tag: pcoll | 'load_schemas_{}_tag_{}_{}'.format(
                self.output_name, tag, self.execution_count) >> beam.ParDo(
                    self._SqlTransformDoFn(
                        self.schemas, self.schema_annotations))
            if pcoll.element_type in self.schemas else pcoll
            for tag, pcoll in source.items()
        }
    elif isinstance(source, beam.pvalue.PCollection):
        schema_loaded = source | 'load_schemas_{}_{}'.format(
            self.output_name, self.execution_count) >> beam.ParDo(
                self._SqlTransformDoFn(self.schemas, self.schema_annotations)
            ) if source.element_type in self.schemas else source
    else:
        raise ValueError(
            '{} should be either a single PCollection or a dict of named '
            'PCollections.'.format(source))
    return schema_loaded | 'beam_sql_{}_{}'.format(
        self.output_name, self.execution_count) >> SqlTransform(self.query)

import typing

import apache_beam as beam
from apache_beam import coders
from apache_beam.transforms.sql import SqlTransform

Purchase = typing.NamedTuple(
    'Purchase', [('item_name', str), ('price', float)])
coders.registry.register_coder(Purchase, coders.RowCoder)

# SqlTransform must be applied to a PCollection of Purchase elements inside a
# pipeline, not to the Purchase type itself.
with beam.Pipeline() as p:
    xx = (p
          | beam.Create([  # illustrative sample rows
              Purchase('ball', 5.0),
              Purchase('ball', 3.0)
          ]).with_output_types(Purchase)
          | SqlTransform("""
                SELECT item_name, COUNT(*) AS `count`
                FROM PCOLLECTION
                GROUP BY item_name"""))

def test_project(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create([SimpleRow(1, "foo", 3.14)])
            | SqlTransform("SELECT `id`, `flt` FROM PCOLLECTION"))
        assert_that(out, equal_to([(1, 3.14)]))

def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner',
                        required=True,
                        help='Specify Apache Beam Runner')
    parser.add_argument('--input_path',
                        required=True,
                        help='Path to events.json')
    parser.add_argument('--raw_table_name',
                        required=True,
                        help='BigQuery table for raw data')
    parser.add_argument('--agg_table_name',
                        required=True,
                        help='BigQuery table for aggregated data')

    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts, save_main_session=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
        'batch-user-traffic-pipeline-sql-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.input_path
    agg_table_name = opts.agg_table_name
    raw_table_name = opts.raw_table_name

    # Table schema for BigQuery
    raw_table_schema = {
        "fields": [
            {"name": "ip", "type": "STRING"},
            {"name": "user_id", "type": "STRING"},
            {"name": "lat", "type": "FLOAT"},
            {"name": "lng", "type": "FLOAT"},
            {"name": "timestamp", "type": "STRING"},
            {"name": "http_request", "type": "STRING"},
            {"name": "http_response", "type": "INTEGER"},
            {"name": "num_bytes", "type": "INTEGER"},
            {"name": "user_agent", "type": "STRING"},
        ]
    }

    # Table schema for BigQuery
    agg_table_schema = {
        "fields": [
            {"name": "user_id", "type": "STRING"},
            {"name": "page_views", "type": "INTEGER"},
            {"name": "total_bytes", "type": "INTEGER"},
            {"name": "max_bytes", "type": "INTEGER"},
            {"name": "min_bytes", "type": "INTEGER"},
        ]
    }

    query = """
        SELECT user_id,
        COUNT(*) AS page_views, SUM(num_bytes) as total_bytes,
        MAX(num_bytes) AS max_bytes, MIN(num_bytes) as min_bytes
        FROM PCOLLECTION
        GROUP BY user_id
        """

    # Create the pipeline
    p = beam.Pipeline(options=options)

    logs = (p
            | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
            | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog))

    (logs
     | 'RawToDict' >> beam.Map(lambda row: row._asdict())
     | 'WriteRawToBQ' >> beam.io.WriteToBigQuery(
         raw_table_name,
         schema=raw_table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    (logs
     | 'PerUserAggregations' >> SqlTransform(query, dialect='zetasql')
     | 'AggToDict' >> beam.Map(lambda row: row._asdict())
     | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
         agg_table_name,
         schema=agg_table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()

import typing

import apache_beam as beam
from apache_beam.transforms.sql import SqlTransform


class Transaction(typing.NamedTuple):
    bank: str
    purchase_amount: float


# Running locally in the DirectRunner. Beam transforms can only be applied to
# a PCollection inside a pipeline, so the in-memory list is first turned into
# a PCollection with beam.Create.
input_pc = [{
    "bank": 'Om Sai Ram',
    "purchase_amount": 9999.99
}, {
    "bank": 'Om Sai Ram1',
    "purchase_amount": 99999.99
}]

with beam.Pipeline() as p:
    output_pc = (
        p
        | beam.Create(input_pc)
        | beam.Map(lambda item: beam.Row(
            bank=str(item["bank"]),
            purchase_amount=float(item["purchase_amount"]))))

    # Prints the PCollection object itself, not its elements; append
    # `| beam.Map(print)` to a PCollection to see its contents.
    print(output_pc)

    sql_pc = output_pc | SqlTransform("SELECT * FROM PCOLLECTION")

def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json from Pub/Sub into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner',
                        required=True,
                        help='Specify Apache Beam Runner')
    parser.add_argument('--input_topic',
                        required=True,
                        help='Input Pub/Sub Topic')
    parser.add_argument('--table_name',
                        required=True,
                        help='BigQuery table name for aggregate results')

    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts,
                              save_main_session=True,
                              streaming=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
        'streaming-minute-traffic-sql-pipeline-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_topic = opts.input_topic
    table_name = opts.table_name

    # Table schema for BigQuery
    table_schema = {
        "fields": [
            {"name": "page_views", "type": "INTEGER"},
            {"name": "start_time", "type": "STRING"},
        ]
    }

    query = '''
        SELECT
            COUNT(*) AS page_views,
            STRING(window_start) AS start_time
        FROM TUMBLE(
            (SELECT TIMESTAMP(event_timestamp) AS ts FROM PCOLLECTION),
            DESCRIPTOR(ts),
            'INTERVAL 1 MINUTE')
        GROUP BY window_start
        '''

    # Create the pipeline
    p = beam.Pipeline(options=options)

    (p
     | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
     | 'ParseAndGetEventTimestamp' >>
         ParseAndGetEventTimestamp().with_output_types(CommonLog)
     | 'CountPerMinute' >> SqlTransform(query, dialect='zetasql')
     | 'ConvertToDict' >> beam.Map(to_dict)
     | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
         table_name,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run().wait_until_finish()

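# `ParseAndGetEventTimestamp`, `CommonLog`, and `to_dict` come from the module
# this streaming snippet belongs to. A minimal sketch of `to_dict` only (an
# assumption; the original may do more), turning SqlTransform's schema'd output
# rows into BigQuery-ready dictionaries:
def to_dict(row):
    # Schema'd rows expose _asdict(), whose keys match the table schema above.
    return row._asdict()
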