def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')

    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts, save_main_session=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
        'performance-demo-', time.time_ns())
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    agg_output_path = f"{opts.project}/agg_output/"
    raw_output_path = f"{opts.project}/raw_output/"

    query = """
        SELECT user_id,
        COUNT(*) AS page_views, SUM(num_bytes) as total_bytes,
        MAX(num_bytes) AS max_bytes, MIN(num_bytes) as min_bytes
        FROM PCOLLECTION
        GROUP BY user_id
        """

    # Create the pipeline
    p = beam.Pipeline(options=options)

    logs = (p
            | 'ReadFromGCS' >> beam.io.ReadFromText('./users.csv')
            | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog))

    (logs
     | 'RawToDict' >> beam.Map(lambda row: row._asdict())
     #| 'WriteRawToText' >> beam.io.WriteToText(raw_output_path)
     )

    (logs
     | 'PerUserAggregations' >> SqlTransform(query, dialect='zetasql')
     | 'AggToDict' >> beam.Map(lambda row: row._asdict())
     #| 'WriteAggToText' >> beam.io.WriteToText(agg_output_path)
     )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()

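# `parse_json` and `CommonLog` are defined elsewhere in this pipeline's module.
# A minimal sketch, assuming the JSON events carry at least the user_id and
# num_bytes fields referenced by the SQL query (the real type likely has more
# fields):
import json
import typing

from apache_beam import coders


class CommonLog(typing.NamedTuple):
    user_id: str
    num_bytes: int


coders.registry.register_coder(CommonLog, coders.RowCoder)


def parse_json(line: str) -> CommonLog:
    # Each input line is assumed to be a JSON-encoded event.
    row = json.loads(line)
    return CommonLog(user_id=row['user_id'], num_bytes=int(row['num_bytes']))
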
def apply_sql(query: str,
              output_name: Optional[str],
              found: Dict[str, beam.PCollection]) -> Tuple[str, PValue]:
    """Applies a SqlTransform with the given sql and queried PCollections.

    Args:
        query: The SQL query executed in the magic.
        output_name: (optional) The output variable name in __main__ module.
        found: The PCollections with variable names found to be used in the
            query.

    Returns:
        A Tuple[str, PValue]. First str value is the output variable name in
        __main__ module (auto-generated if not provided). Second PValue is
        most likely a PCollection, depending on the query.
    """
    output_name = _generate_output_name(output_name, query, found)
    query, sql_source = _build_query_components(query, found)
    try:
        output = sql_source | SqlTransform(query)
        # Declare a variable with the output_name and output value in the
        # __main__ module so that the user can use the output smoothly.
        setattr(importlib.import_module('__main__'), output_name, output)
        ib.watch({output_name: output})
        _LOGGER.info(
            "The output PCollection variable is %s with element_type %s",
            output_name,
            pformat_namedtuple(output.element_type))
        return output_name, output
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        on_error('Error when applying the Beam SQL: %s', e)

def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(levelname)-8s: %(message)s')
    parser = argparse.ArgumentParser(description=__doc__.strip())
    parser.add_argument('filename', help='Beancount ledger filename')
    args, pipeline_args = parser.parse_known_args()

    # Read the ledger.
    logging.info("Reading ledger.")
    t1 = time.time()
    entries, errors, options_map = loader.load_file(args.filename)
    # beam.Row accepts only keyword arguments, so each field is named
    # explicitly; the query below refers to the `account` column.
    postings = (beam.Row(account=posting.account,
                         number=posting.units.number,
                         currency=posting.units.currency)
                for entry in data.filter_txns(entries)
                for posting in entry.postings)
    price_map = prices.build_price_map(entries)
    t2 = time.time()
    logging.info("Read ledger in %.1fsecs.", t2 - t1)

    with CreatePipeline(pipeline_args) as pipeline:
        _ = (pipeline
             | beam.Create(postings)
             | SqlTransform("""
                   SELECT account FROM PCOLLECTION
               """, dialect="zetasql")
             | beam.Map(print))

def to_pipeline(self, pipeline: Optional[beam.Pipeline]) -> beam.Pipeline:
    """Converts the chain into an executable pipeline."""
    if pipeline not in self.evaluated:
        # The whole chain should form a single pipeline.
        source = self.source
        if isinstance(self.source, beam.Pipeline):
            if pipeline:  # use the known pipeline
                source = pipeline
            else:  # use the source pipeline
                pipeline = self.source
        else:
            name_to_pcoll = pcoll_by_name()
            if len(self.source) == 1:
                source = name_to_pcoll.get(next(iter(self.source)))
            else:
                source = {s: name_to_pcoll.get(s) for s in self.source}
        if isinstance(source, beam.Pipeline):
            output = source | 'beam_sql_{}_{}'.format(
                self.output_name, self.execution_count) >> SqlTransform(
                    self.query)
        else:
            output = source | 'schema_loaded_beam_sql_{}_{}'.format(
                self.output_name,
                self.execution_count) >> SchemaLoadedSqlTransform(
                    self.output_name, self.query, self.schemas,
                    self.execution_count)
        _ = create_var_in_main(self.output_name, output)
        self.evaluated.add(pipeline)

    if self.next:
        return self.next.to_pipeline(pipeline)
    else:
        return pipeline

def test_generate_data(self):
    with TestPipeline() as p:
        out = p | SqlTransform(
            """SELECT
                 CAST(1 AS INT) AS `id`,
                 CAST('foo' AS VARCHAR) AS `str`,
                 CAST(3.14 AS DOUBLE) AS `flt`""")
        assert_that(out, equal_to([(1, "foo", 3.14)]))

def test_filter(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create(
                [SimpleRow(1, "foo", 3.14), SimpleRow(2, "bar", 1.414)])
            | SqlTransform("SELECT * FROM PCOLLECTION WHERE `str` = 'bar'"))
        assert_that(out, equal_to([(2, "bar", 1.414)]))

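# This and several of the following tests assume a `SimpleRow` schema'd type
# defined elsewhere in the test module. A minimal sketch, consistent with the
# columns the queries reference (`id`, `str`, `flt`); the original definition
# may differ:
import typing

from apache_beam import coders

SimpleRow = typing.NamedTuple(
    'SimpleRow', [('id', int), ('str', str), ('flt', float)])
coders.registry.register_coder(SimpleRow, coders.RowCoder)
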
def test_zetasql_generate_data(self):
    with TestPipeline() as p:
        out = p | SqlTransform(
            """SELECT
                 CAST(1 AS INT64) AS `int`,
                 CAST('foo' AS STRING) AS `str`,
                 CAST(3.14 AS FLOAT64) AS `flt`""",
            dialect="zetasql")
        assert_that(out, equal_to([(1, "foo", 3.14)]))

def test_row(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create([1, 2, 10])
            | beam.Map(lambda x: beam.Row(a=x, b=str(x)))
            | SqlTransform("SELECT a*a as s, LENGTH(b) AS c FROM PCOLLECTION"))
        assert_that(out, equal_to([(1, 1), (4, 1), (100, 2)]))

def test_sql(self):
    with self.create_pipeline() as p:
        output = (
            p
            | 'Create' >> beam.Create([Row(x, str(x)) for x in range(5)])
            | 'Sql' >> SqlTransform(
                """SELECT col1,
                          col2 || '*' || col2 as col2,
                          power(col1, 2) as col3
                   FROM PCOLLECTION
                """,
                expansion_service=self.get_expansion_service()))
        assert_that(
            output,
            equal_to([(x, '{x}*{x}'.format(x=x), x * x) for x in range(5)]))

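# `Row` in the cross-language test above is assumed to be a schema'd NamedTuple
# along these lines (column names follow the query's col1/col2 references; the
# original test file may differ):
import typing

from apache_beam import coders

Row = typing.NamedTuple('Row', [('col1', int), ('col2', str)])
coders.registry.register_coder(Row, coders.RowCoder)
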
def test_map(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create([
                Shopper('bob', {
                    'bananas': 6,
                    'cherries': 3
                }),
                Shopper('alice', {
                    'apples': 2,
                    'bananas': 3
                })
            ]).with_output_types(Shopper)
            | SqlTransform("SELECT * FROM PCOLLECTION WHERE shopper = 'alice'"))
        assert_that(out, equal_to([('alice', {'apples': 2, 'bananas': 3})]))

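# `Shopper` above is assumed to be a schema'd NamedTuple whose cart maps item
# names to counts; a sketch (the original definition may differ):
import typing

from apache_beam import coders

Shopper = typing.NamedTuple(
    'Shopper', [('shopper', str), ('cart', typing.Mapping[str, int])])
coders.registry.register_coder(Shopper, coders.RowCoder)
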
def test_windowing_before_sql(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create([
                SimpleRow(5, "foo", 1.),
                SimpleRow(15, "bar", 2.),
                SimpleRow(25, "baz", 3.)
            ])
            | beam.Map(lambda v: beam.window.TimestampedValue(v, v.id)
                       ).with_output_types(SimpleRow)
            | beam.WindowInto(
                beam.window.FixedWindows(10)).with_output_types(SimpleRow)
            | SqlTransform("SELECT COUNT(*) as `count` FROM PCOLLECTION"))
        assert_that(out, equal_to([(1, ), (1, ), (1, )]))

def run(p, input_file, output_file):
    #pylint: disable=expression-not-assigned
    (p
     | 'read' >> ReadFromText(input_file)
     | 'split' >> beam.FlatMap(str.split)
     | 'row' >> beam.Map(MyRow).with_output_types(MyRow)
     | 'sql!!' >> SqlTransform("""
           SELECT
             word as key,
             COUNT(*) as `count`
           FROM PCOLLECTION
           GROUP BY word""")
     | 'format' >> beam.Map(lambda row: '{}: {}'.format(row.key, row.count))
     | 'write' >> WriteToText(output_file))

    result = p.run()
    result.wait_until_finish()

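# `MyRow` (used here and in the annotated variant below) wraps each word in a
# single-field schema so SqlTransform can query it. A sketch consistent with
# the query's `word` column; the actual example module may define it
# differently:
import typing

from apache_beam import coders

MyRow = typing.NamedTuple('MyRow', [('word', str)])
coders.registry.register_coder(MyRow, coders.RowCoder)
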
def run(p, input_file, output_file):
    #pylint: disable=expression-not-assigned
    (p
     # Read the lines from a text file.
     | 'Read' >> ReadFromText(input_file)
     # Split the line into individual words.
     | 'Split' >> beam.FlatMap(lambda line: re.split(r'\W+', line))
     # Map each word to an instance of MyRow.
     | 'ToRow' >> beam.Map(MyRow).with_output_types(MyRow)
     # SqlTransform yields a PCollection containing elements with attributes
     # based on the output of the query.
     | 'Sql!!' >> SqlTransform("""
           SELECT
             word as key,
             COUNT(*) as `count`
           FROM PCOLLECTION
           GROUP BY word""")
     | 'Format' >> beam.Map(lambda row: '{}: {}'.format(row.key, row.count))
     | 'Write' >> WriteToText(output_file))

def run(output_topic, pipeline_args):
    pipeline_options = PipelineOptions(
        pipeline_args, save_main_session=True, streaming=True)
    with beam.Pipeline(options=pipeline_options) as pipeline:
        _ = (
            pipeline
            | beam.io.ReadFromPubSub(
                topic='projects/pubsub-public-data/topics/taxirides-realtime',
                timestamp_attribute="ts").with_output_types(bytes)
            | "Parse JSON payload" >> beam.Map(json.loads)
            # Use beam.Row to create a schema-aware PCollection
            | "Create beam Row" >> beam.Map(
                lambda x: beam.Row(
                    ride_status=str(x['ride_status']),
                    passenger_count=int(x['passenger_count'])))
            # SqlTransform computes its result within an existing window
            | "15s fixed windows" >> beam.WindowInto(
                beam.window.FixedWindows(15))
            # Aggregate drop offs and pick ups that occur within each 15s window
            | SqlTransform("""
                SELECT
                  ride_status,
                  COUNT(*) AS num_rides,
                  SUM(passenger_count) AS total_passengers
                FROM PCOLLECTION
                WHERE NOT ride_status = 'enroute'
                GROUP BY ride_status""")
            # SqlTransform yields python objects with attributes corresponding
            # to the outputs of the query.
            # Collect those attributes, as well as window information, into a
            # dict
            | "Assemble Dictionary" >> beam.Map(
                lambda row, window=beam.DoFn.WindowParam: {
                    "ride_status": row.ride_status,
                    "num_rides": row.num_rides,
                    "total_passengers": row.total_passengers,
                    "window_start": window.start.to_rfc3339(),
                    "window_end": window.end.to_rfc3339()
                })
            | "Convert to JSON" >> beam.Map(json.dumps)
            | "UTF-8 encode" >> beam.Map(lambda s: s.encode("utf-8"))
            | beam.io.WriteToPubSub(topic=output_topic))

def apply_sql(
        query: str,
        output_name: Optional[str],
        found: Dict[str, beam.PCollection],
        run: bool = True) -> Tuple[str, Union[PValue, SqlNode], SqlChain]:
    """Applies a SqlTransform with the given sql and queried PCollections.

    Args:
        query: The SQL query executed in the magic.
        output_name: (optional) The output variable name in __main__ module.
        found: The PCollections with variable names found to be used in the
            query.
        run: Whether to prepare the SQL pipeline for a local run or not.

    Returns:
        A tuple of values. First str value is the output variable name in
        __main__ module, auto-generated if not provided. Second value: if run,
        it's a PValue; otherwise, a SqlNode tracks the SQL without applying it
        or executing it. Third value: SqlChain is a chain of SqlNodes that have
        been applied.
    """
    output_name = _generate_output_name(output_name, query, found)
    query, sql_source, chain = _build_query_components(
        query, found, output_name, run)
    if run:
        try:
            output = sql_source | SqlTransform(query)
            # Declare a variable with the output_name and output value in the
            # __main__ module so that the user can use the output smoothly.
            output_name, output = create_var_in_main(output_name, output)
            _LOGGER.info(
                "The output PCollection variable is %s with element_type %s",
                output_name,
                pformat_namedtuple(output.element_type))
            return output_name, output, chain
        except (KeyboardInterrupt, SystemExit):
            raise
        except:  # pylint: disable=bare-except
            on_error('Error when applying the Beam SQL: %s',
                     traceback.format_exc())
    else:
        return output_name, chain.current, chain

def test_agg(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create([
                SimpleRow(1, "foo", 1.),
                SimpleRow(1, "foo", 2.),
                SimpleRow(1, "foo", 3.),
                SimpleRow(2, "bar", 1.414),
                SimpleRow(2, "bar", 1.414),
                SimpleRow(2, "bar", 1.414),
                SimpleRow(2, "bar", 1.414),
            ])
            | SqlTransform("""
                SELECT
                  `str`,
                  COUNT(*) AS `count`,
                  SUM(`id`) AS `sum`,
                  AVG(`flt`) AS `avg`
                FROM PCOLLECTION GROUP BY `str`"""))
        assert_that(out, equal_to([("foo", 3, 3, 2), ("bar", 4, 8, 1.414)]))

def test_tagged_join(self):
    with TestPipeline() as p:
        enrich = (
            p | "Create enrich" >> beam.Create(
                [Enrich(1, "a"), Enrich(2, "b"), Enrich(26, "z")]))
        simple = (
            p | "Create simple" >> beam.Create([
                SimpleRow(1, "foo", 3.14),
                SimpleRow(26, "bar", 1.11),
                SimpleRow(1, "baz", 2.34)
            ]))
        out = ({
            'simple': simple,
            'enrich': enrich
        } | SqlTransform("""
              SELECT
                simple.`id` AS `id`,
                enrich.metadata AS metadata
              FROM simple
              JOIN enrich
              ON simple.`id` = enrich.`id`"""))
        assert_that(out, equal_to([(1, "a"), (26, "z"), (1, "a")]))

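# `Enrich` above is assumed to pair an id with a metadata string, matching the
# join key and projected columns in the query; a sketch:
import typing

from apache_beam import coders

Enrich = typing.NamedTuple('Enrich', [('id', int), ('metadata', str)])
coders.registry.register_coder(Enrich, coders.RowCoder)
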
def expand(self, source):
    """Applies the SQL transform.

    If a PCollection uses a schema defined in the main session, use the
    additional DoFn to restore it on the worker.
    """
    if isinstance(source, dict):
        schema_loaded = {
            tag: pcoll | 'load_schemas_{}_tag_{}_{}'.format(
                self.output_name, tag, self.execution_count) >> beam.ParDo(
                    self._SqlTransformDoFn(
                        self.schemas, self.schema_annotations))
            if pcoll.element_type in self.schemas else pcoll
            for tag, pcoll in source.items()
        }
    elif isinstance(source, beam.pvalue.PCollection):
        schema_loaded = source | 'load_schemas_{}_{}'.format(
            self.output_name, self.execution_count) >> beam.ParDo(
                self._SqlTransformDoFn(self.schemas, self.schema_annotations)
            ) if source.element_type in self.schemas else source
    else:
        raise ValueError(
            '{} should be either a single PCollection or a dict of named '
            'PCollections.'.format(source))
    return schema_loaded | 'beam_sql_{}_{}'.format(
        self.output_name, self.execution_count) >> SqlTransform(self.query)

import typing

import apache_beam as beam
from apache_beam import coders
from apache_beam.transforms.sql import SqlTransform

Purchase = typing.NamedTuple(
    'Purchase', [('item_name', str), ('price', float)])
coders.registry.register_coder(Purchase, coders.RowCoder)

# SqlTransform must be applied to a PCollection of Purchase elements inside a
# pipeline, not to the Purchase type itself.
with beam.Pipeline() as p:
    xx = (p
          | beam.Create([  # illustrative sample rows
              Purchase('ball', 5.0),
              Purchase('ball', 3.0)
          ]).with_output_types(Purchase)
          | SqlTransform("""
                SELECT item_name, COUNT(*) AS `count`
                FROM PCOLLECTION
                GROUP BY item_name"""))

def test_project(self):
    with TestPipeline() as p:
        out = (
            p
            | beam.Create([SimpleRow(1, "foo", 3.14)])
            | SqlTransform("SELECT `id`, `flt` FROM PCOLLECTION"))
        assert_that(out, equal_to([(1, 3.14)]))

def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner',
                        required=True,
                        help='Specify Apache Beam Runner')
    parser.add_argument('--input_path',
                        required=True,
                        help='Path to events.json')
    parser.add_argument('--raw_table_name',
                        required=True,
                        help='BigQuery table for raw data')
    parser.add_argument('--agg_table_name',
                        required=True,
                        help='BigQuery table for aggregated data')

    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts, save_main_session=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
        'batch-user-traffic-pipeline-sql-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.input_path
    agg_table_name = opts.agg_table_name
    raw_table_name = opts.raw_table_name

    # Table schema for BigQuery
    raw_table_schema = {
        "fields": [
            {"name": "ip", "type": "STRING"},
            {"name": "user_id", "type": "STRING"},
            {"name": "lat", "type": "FLOAT"},
            {"name": "lng", "type": "FLOAT"},
            {"name": "timestamp", "type": "STRING"},
            {"name": "http_request", "type": "STRING"},
            {"name": "http_response", "type": "INTEGER"},
            {"name": "num_bytes", "type": "INTEGER"},
            {"name": "user_agent", "type": "STRING"},
        ]
    }

    # Table schema for BigQuery
    agg_table_schema = {
        "fields": [
            {"name": "user_id", "type": "STRING"},
            {"name": "page_views", "type": "INTEGER"},
            {"name": "total_bytes", "type": "INTEGER"},
            {"name": "max_bytes", "type": "INTEGER"},
            {"name": "min_bytes", "type": "INTEGER"},
        ]
    }

    query = """
        SELECT user_id,
        COUNT(*) AS page_views, SUM(num_bytes) as total_bytes,
        MAX(num_bytes) AS max_bytes, MIN(num_bytes) as min_bytes
        FROM PCOLLECTION
        GROUP BY user_id
        """

    # Create the pipeline
    p = beam.Pipeline(options=options)

    logs = (p
            | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
            | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog))

    (logs
     | 'RawToDict' >> beam.Map(lambda row: row._asdict())
     | 'WriteRawToBQ' >> beam.io.WriteToBigQuery(
         raw_table_name,
         schema=raw_table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    (logs
     | 'PerUserAggregations' >> SqlTransform(query, dialect='zetasql')
     | 'AggToDict' >> beam.Map(lambda row: row._asdict())
     | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
         agg_table_name,
         schema=agg_table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()

import typing

import apache_beam as beam
from apache_beam.transforms.sql import SqlTransform


class Transaction(typing.NamedTuple):
    bank: str
    purchase_amount: float


# Running locally in the DirectRunner. Beam transforms can only be applied to
# a PCollection inside a pipeline, so the in-memory list is first turned into
# a PCollection with beam.Create.
input_pc = [{
    "bank": 'Om Sai Ram',
    "purchase_amount": 9999.99
}, {
    "bank": 'Om Sai Ram1',
    "purchase_amount": 99999.99
}]

with beam.Pipeline() as p:
    output_pc = (
        p
        | beam.Create(input_pc)
        | beam.Map(lambda item: beam.Row(
            bank=str(item["bank"]),
            purchase_amount=float(item["purchase_amount"]))))

    # Prints the PCollection object itself, not its elements; append
    # `| beam.Map(print)` to a PCollection to see its contents.
    print(output_pc)

    sql_pc = output_pc | SqlTransform("SELECT * FROM PCOLLECTION")

def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json from Pub/Sub into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner',
                        required=True,
                        help='Specify Apache Beam Runner')
    parser.add_argument('--input_topic',
                        required=True,
                        help='Input Pub/Sub Topic')
    parser.add_argument('--table_name',
                        required=True,
                        help='BigQuery table name for aggregate results')

    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts,
                              save_main_session=True,
                              streaming=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
        'streaming-minute-traffic-sql-pipeline-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_topic = opts.input_topic
    table_name = opts.table_name

    # Table schema for BigQuery
    table_schema = {
        "fields": [
            {"name": "page_views", "type": "INTEGER"},
            {"name": "start_time", "type": "STRING"},
        ]
    }

    query = '''
        SELECT
            COUNT(*) AS page_views,
            STRING(window_start) AS start_time
        FROM TUMBLE(
            (SELECT TIMESTAMP(event_timestamp) AS ts FROM PCOLLECTION),
            DESCRIPTOR(ts),
            'INTERVAL 1 MINUTE')
        GROUP BY window_start
        '''

    # Create the pipeline
    p = beam.Pipeline(options=options)

    (p
     | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
     | 'ParseAndGetEventTimestamp' >>
         ParseAndGetEventTimestamp().with_output_types(CommonLog)
     | 'CountPerMinute' >> SqlTransform(query, dialect='zetasql')
     | 'ConvertToDict' >> beam.Map(to_dict)
     | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
         table_name,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run().wait_until_finish()

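# `ParseAndGetEventTimestamp`, `CommonLog`, and `to_dict` come from the module
# this streaming snippet belongs to. A minimal sketch of `to_dict` only (an
# assumption; the original may do more), turning SqlTransform's schema'd output
# rows into BigQuery-ready dictionaries:
def to_dict(row):
    # Schema'd rows expose _asdict(), whose keys match the table schema above.
    return row._asdict()
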