def test(self):
    """Assemble the pipeline that writes synthetic records to Cloud Spanner.

    Records are read from ``SyntheticSource``, passed through the
    ``CountMessages`` DoFn, base64-encoded, wrapped in insert mutations for
    the ``test`` table, passed through the ``MeasureTime`` DoFn and finally
    handed to ``WriteToSpanner``.
    """

    def format_record(record):
        # Only the second field of the record is kept, base64-encoded.
        # NOTE(review): the import is function-local, presumably so the
        # callable stays self-contained when shipped to workers -- confirm.
        import base64
        return base64.b64encode(record[1])

    def make_insert_mutations(element):
        # One insert per element; the row id is a freshly generated UUID1.
        import uuid  # pylint: disable=reimported
        from apache_beam.io.gcp.experimental.spannerio import WriteMutation
        return [
            WriteMutation.insert(
                table='test',
                columns=('id', 'data'),
                values=[(str(uuid.uuid1()), element)])
        ]

    rows = self.pipeline | 'Produce rows' >> Read(
        SyntheticSource(self.parse_synthetic_source_options()))
    counted = rows | 'Count messages' >> ParDo(
        CountMessages(self.metrics_namespace))
    encoded = counted | 'Format' >> Map(format_record)
    mutations = encoded | 'Make mutations' >> FlatMap(make_insert_mutations)
    timed = mutations | 'Measure time' >> ParDo(
        MeasureTime(self.metrics_namespace))

    (  # pylint: disable=expression-not-assigned
        timed
        | 'Write to Spanner' >> WriteToSpanner(
            project_id=self.project,
            instance_id=self.spanner_instance,
            database_id=self.TEST_DATABASE,
            max_batch_size_bytes=5120))
def test(self):
    """Assemble the BigQuery read pipeline and keep its final output.

    Rows are read from ``self.input_dataset`` / ``self.input_table``, passed
    through the ``CountMessages`` and ``MeasureTime`` DoFns, and counted
    globally. The resulting collection is stored on ``self.result``.
    """
    rows = self.pipeline | 'Read from BigQuery' >> Read(
        BigQuerySource(dataset=self.input_dataset, table=self.input_table))
    counted = rows | 'Count messages' >> ParDo(
        CountMessages(self.metrics_namespace))
    timed = counted | 'Measure time' >> ParDo(
        MeasureTime(self.metrics_namespace))
    self.result = timed | 'Count' >> Count.Globally()
def test(self):
    """Assemble the BigQuery read pipeline and assert on the row count.

    Rows are read from ``self.input_dataset`` / ``self.input_table``, passed
    through the ``CountMessages`` and ``MeasureTime`` DoFns, and counted
    globally. The single count is asserted to equal
    ``self.input_options['num_records']``.
    """
    rows = self.pipeline | 'Read from BigQuery' >> Read(
        BigQuerySource(dataset=self.input_dataset, table=self.input_table))
    counted = rows | 'Count messages' >> ParDo(
        CountMessages(self.metrics_namespace))
    timed = counted | 'Measure time' >> ParDo(
        MeasureTime(self.metrics_namespace))
    output = timed | 'Count' >> Count.Globally()

    expected_count = self.input_options['num_records']
    assert_that(output, equal_to([expected_count]))
def run(argv=None):
    """Run a streaming Pub/Sub -> GroupByKey -> Pub/Sub pipeline.

    Messages are read (with attributes) from ``--input_subscription``,
    windowed, turned into ``(key, value)`` string pairs, grouped by key,
    ungrouped again, encoded to bytes and published to ``--output_topic``.
    ``MeasureTime`` DoFns are placed before and after the grouping and a
    ``CountMessages`` DoFn after the first one; all receive
    ``--metrics_namespace``.

    Args:
        argv: Command-line arguments. Arguments not recognised by the local
            parser are forwarded to ``PipelineOptions``.

    Returns:
        The pipeline result, after ``wait_until_finish()`` has returned.
    """

    class MessageParser(beam.DoFn):
        # It is required to parse messages for GBK operation.
        # Otherwise there are encoding problems.
        def process(self, item):
            if item.attributes:
                # Take the same pair ``popitem()`` would return (the last
                # one) without removing it: a DoFn must not modify the
                # element it receives.
                k, v = list(item.attributes.items())[-1]
                yield (str(k), str(v))

    class ParserToBytes(beam.DoFn):
        # Parsing to bytes is required for saving in PubSub.
        def process(self, item):
            _, v = item
            yield bytes(v, encoding='utf8')

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_topic',
        required=True,
        help=('Output PubSub topic of the form '
              '"projects/<PROJECT>/topics/<TOPIC>".'))
    parser.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
    parser.add_argument(
        '--metrics_namespace',
        help=('Namespace of metrics '
              '"string".'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    # Pickle the main session so the DoFns declared above can be restored.
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # pylint: disable=expression-not-assigned
    (
        p
        | ReadFromPubSub(
            subscription=known_args.input_subscription, with_attributes=True)
        # Fixed windows of size 1000 with offset 0.
        | 'Window' >> beam.WindowInto(window.FixedWindows(1000, 0))
        | 'Measure time: Start' >> beam.ParDo(
            MeasureTime(known_args.metrics_namespace))
        | 'Count messages' >> beam.ParDo(
            CountMessages(known_args.metrics_namespace))
        | 'Parse' >> beam.ParDo(MessageParser())
        | 'GroupByKey' >> beam.GroupByKey()
        # Flatten each (key, values) group back into (key, value) pairs.
        | 'Ungroup' >> beam.FlatMap(
            lambda elm: [(elm[0], v) for v in elm[1]])
        | 'Measure time: End' >> beam.ParDo(
            MeasureTime(known_args.metrics_namespace))
        | 'Parse to bytes' >> beam.ParDo(ParserToBytes())
        | 'Write' >> beam.io.WriteToPubSub(topic=known_args.output_topic))

    result = p.run()
    result.wait_until_finish()
    # NOTE(review): the result is logged at ERROR level, presumably so that
    # it shows up under the default logging configuration -- confirm.
    logging.error(result)
    return result
def test(self):
    """Assemble the Spanner read pipeline and assert on the row count.

    Runs ``select data from test_data`` against the configured project,
    instance and database, passes the rows through the ``CountMessages``
    and ``MeasureTime`` DoFns, and counts them globally. The single count
    is asserted to equal ``self.input_options['num_records']``.
    """
    rows = self.pipeline | 'Read from Spanner' >> ReadFromSpanner(
        self.project,
        self.spanner_instance,
        self.spanner_database,
        sql="select data from test_data")
    counted = rows | 'Count messages' >> ParDo(
        CountMessages(self.metrics_namespace))
    timed = counted | 'Measure time' >> ParDo(
        MeasureTime(self.metrics_namespace))
    output = timed | 'Count' >> Count.Globally()

    expected_count = self.input_options['num_records']
    assert_that(output, equal_to([expected_count]))
def test(self):
    """Assemble the pipeline that writes synthetic records to BigQuery.

    Records are read from ``SyntheticSource``, passed through the
    ``CountMessages`` DoFn, converted to single-column rows, passed through
    the ``MeasureTime`` DoFn and written to ``self.output_dataset`` /
    ``self.output_table``. The table is created if needed and truncated on
    write.
    """
    table_schema = parse_table_schema_from_json(
        '{"fields": [{"name": "data", "type": "BYTES"}]}')

    def format_record(record):
        # Only the second field of the record is written, base64-encoded,
        # under the schema's single ``data`` column.
        # NOTE(review): presumably the source yields (key, value) pairs and
        # the key is intentionally dropped -- confirm.
        encoded = base64.b64encode(record[1])
        return {'data': encoded}

    rows = self.pipeline | 'Produce rows' >> Read(
        SyntheticSource(self.parse_synthetic_source_options()))
    counted = rows | 'Count messages' >> ParDo(
        CountMessages(self.metrics_namespace))
    formatted = counted | 'Format' >> Map(format_record)
    timed = formatted | 'Measure time' >> ParDo(
        MeasureTime(self.metrics_namespace))

    (  # pylint: disable=expression-not-assigned
        timed
        | 'Write to BigQuery' >> WriteToBigQuery(
            dataset=self.output_dataset,
            table=self.output_table,
            schema=table_schema,
            create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE))