Example No. 1
def test(self):
    def format_record(record):
        # base64 is imported inside the function so the callable stays
        # self-contained when the runner pickles it.
        import base64
        return base64.b64encode(record[1])

    def make_insert_mutations(element):
        import uuid  # pylint: disable=reimported
        from apache_beam.io.gcp.experimental.spannerio import WriteMutation
        ins_mutation = WriteMutation.insert(table='test',
                                            columns=('id', 'data'),
                                            values=[(str(uuid.uuid1()),
                                                     element)])
        return [ins_mutation]

    (  # pylint: disable=expression-not-assigned
        self.pipeline
        | 'Produce rows' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Format' >> Map(format_record)
        | 'Make mutations' >> FlatMap(make_insert_mutations)
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Write to Spanner' >> WriteToSpanner(
            project_id=self.project,
            instance_id=self.spanner_instance,
            database_id=self.TEST_DATABASE,
            max_batch_size_bytes=5120))
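Here parse_synthetic_source_options builds the SyntheticSource spec from the test's input_options. A minimal sketch of what that dict might look like; the num_records key is confirmed by the assertions in the later examples, while the size keys follow Beam's load-test conventions and are assumptions:

# Hypothetical input options for SyntheticSource-driven tests.
input_options = {
    'num_records': 1000,  # rows to generate (asserted on in later examples)
    'key_size': 16,       # assumed: bytes per synthetic key
    'value_size': 1024,   # assumed: bytes per synthetic value
}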
Example No. 2
def test(self):
    self.result = (
        self.pipeline
        | 'Read from BigQuery' >> Read(BigQuerySource(
            dataset=self.input_dataset, table=self.input_table))
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Count' >> Count.Globally())
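This variant only materializes the count on self.result without asserting on it; the surrounding harness presumably runs the pipeline and inspects the result afterwards. A sketch of such a harness-side step, mirroring the run/wait calls in Example No. 4:

# Hypothetical harness-side step; assumes the framework drives the run.
result = self.pipeline.run()
result.wait_until_finish()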
Example No. 3
def test(self):
    output = (
        self.pipeline
        | 'Read from BigQuery' >> Read(
            BigQuerySource(dataset=self.input_dataset, table=self.input_table))
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Count' >> Count.Globally())
    assert_that(output, equal_to([self.input_options['num_records']]))
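These test snippets rely on helpers imported elsewhere in their module; a sketch of the likely Beam imports (standard locations, but assumptions with respect to this file):

# Standard Beam import locations for the names used in these tests.
from apache_beam import Map, ParDo
from apache_beam.io import Read
from apache_beam.testing.util import assert_that, equal_to
from apache_beam.transforms.combiners import Count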
Example No. 4
def run(argv=None):
    class MessageParser(beam.DoFn):
        # Messages must be parsed into (key, value) string pairs before the
        # GroupByKey step; otherwise encoding problems occur.
        def process(self, item):
            if item.attributes:
                k, v = item.attributes.popitem()
                yield (str(k), str(v))

    class ParserToBytes(beam.DoFn):
        # WriteToPubSub expects bytes, so values are re-encoded here.
        def process(self, item):
            _, v = item
            yield bytes(v, encoding='utf8')

    parser = argparse.ArgumentParser()
    parser.add_argument('--output_topic',
                        required=True,
                        help=('Output PubSub topic of the form '
                              '"projects/<PROJECT>/topics/<TOPIC>".'))
    parser.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
    parser.add_argument('--metrics_namespace',
                        help='Namespace for the published metrics (a string).')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # pylint: disable=expression-not-assigned
    (p
     | ReadFromPubSub(subscription=known_args.input_subscription,
                      with_attributes=True)
     | 'Window' >> beam.WindowInto(window.FixedWindows(1000, 0))
     | 'Measure time: Start' >> beam.ParDo(
         MeasureTime(known_args.metrics_namespace))
     | 'Count messages' >> beam.ParDo(
         CountMessages(known_args.metrics_namespace))
     | 'Parse' >> beam.ParDo(MessageParser())
     | 'GroupByKey' >> beam.GroupByKey()
     | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v) for v in elm[1]])
     | 'Measure time: End' >> beam.ParDo(
         MeasureTime(known_args.metrics_namespace))
     | 'Parse to bytes' >> beam.ParDo(ParserToBytes())
     | 'Write' >> beam.io.WriteToPubSub(topic=known_args.output_topic))

    result = p.run()
    result.wait_until_finish()
    logging.info(result)
    return result
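A typical entry point for such a script; the guard below is an assumption mirroring standard Beam example scripts rather than part of the original:

# Assumed module-level entry point, as in standard Beam examples.
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()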
Example No. 5
def test(self):
    output = (
        self.pipeline
        | 'Read from Spanner' >> ReadFromSpanner(
            self.project,
            self.spanner_instance,
            self.spanner_database,
            sql="select data from test_data")
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Count' >> Count.Globally())
    assert_that(output, equal_to([self.input_options['num_records']]))
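The Spanner transforms used here live in the same experimental module that Example No. 1 imports WriteMutation from; the read and write entry points are presumably:

# Import path inferred from the WriteMutation import in Example No. 1.
from apache_beam.io.gcp.experimental.spannerio import (ReadFromSpanner,
                                                       WriteToSpanner)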
Example No. 6
def test(self):
    SCHEMA = parse_table_schema_from_json(
        '{"fields": [{"name": "data", "type": "BYTES"}]}')

    def format_record(record):
        # SyntheticSource emits (key, value) tuples; keep only the value,
        # base64-encoded so it fits the BYTES column.
        return {'data': base64.b64encode(record[1])}

    (  # pylint: disable=expression-not-assigned
        self.pipeline
        | 'Produce rows' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Format' >> Map(format_record)
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Write to BigQuery' >> WriteToBigQuery(
            dataset=self.output_dataset,
            table=self.output_table,
            schema=SCHEMA,
            create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
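For completeness, the BigQuery helpers used above likely come from Beam's GCP IO packages; the locations below are assumed, matching current Beam layout:

# Assumed import locations for the BigQuery pieces used above.
from apache_beam.io.gcp.bigquery import BigQueryDisposition, WriteToBigQuery
from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json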