Example #1
    def testValueProviderFilters(self):
        self.vp_filters = [
            [(StaticValueProvider(str, 'property_name'),
              StaticValueProvider(str, '='),
              StaticValueProvider(str, 'value'))],
            [(StaticValueProvider(str, 'property_name'),
              StaticValueProvider(str, '='),
              StaticValueProvider(str, 'value')),
             ('property_name', '=', 'value')],
        ]
        self.expected_filters = [
            [('property_name', '=', 'value')],
            [('property_name', '=', 'value'), ('property_name', '=', 'value')],
        ]

        for vp_filter, exp_filter in zip(self.vp_filters,
                                         self.expected_filters):
            q = Query(kind='kind',
                      project=self._PROJECT,
                      namespace=self._NAMESPACE,
                      filters=vp_filter)
            cq = q._to_client_query(self._test_client)
            self.assertEqual(exp_filter, cq.filters)

            logging.info('query: %s', q)  # Test __repr__()
Example #2
def create_multi_datasource_reader(pipeline,
                                   project,
                                   namespace,
                                   kinds,
                                   keys_only=False):
    if not kinds:
        kinds = [None]

    sources = []
    for kind in kinds:
        # If namespace is not specified (i.e. None), the [default] namespace is used
        query = Query(project=project, namespace=namespace, kind=kind)
        if keys_only:
            # see
            # https://beam.apache.org/releases/pydoc/2.14.0/_modules/apache_beam/io/gcp/datastore/v1new/types.html#Query
            # https://google-cloud-python.readthedocs.io/en/0.32.0/_modules/google/cloud/datastore/query.html#Query.keys_only
            query.projection = ['__key__']
        if not kind:
            # When no kind is specified, the query must explicitly order by __key__ asc or it fails
            query.order = ['__key__']

        description = 'ReadFromDatastore kind={}'.format(kind if kind else "*")

        s = pipeline | description >> ReadFromDatastore(query=query)
        sources.append(s)
    return sources
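
A minimal usage sketch for the helper above. The pipeline object p, the project id, and the kind names below are illustrative assumptions; merging the per-kind sources with beam.Flatten() is one reasonable way to get a single PCollection back:

import apache_beam as beam

p = beam.Pipeline()
sources = create_multi_datasource_reader(
    p, 'my-project', None, ['KindA', 'KindB'], keys_only=True)
# Flatten the list of per-kind PCollections into one PCollection of entities
# (key-only entities when keys_only=True).
merged = sources | 'MergeKinds' >> beam.Flatten()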
Example #3
    def testQueryEmptyNamespace(self):
        # Test that we can pass a namespace of None.
        self._test_client.namespace = None
        q = Query(project=self._PROJECT, namespace=None)
        cq = q._to_client_query(self._test_client)
        self.assertEqual(self._test_client.project, cq.project)
        self.assertEqual(None, cq.namespace)
Example #4
    def testQuery(self):
        filters = [('property_name', '=', 'value')]
        projection = ['f1', 'f2']
        order = projection
        distinct_on = projection
        ancestor_key = Key(['kind', 'id'], project=self._PROJECT)
        q = Query(kind='kind',
                  project=self._PROJECT,
                  namespace=self._NAMESPACE,
                  ancestor=ancestor_key,
                  filters=filters,
                  projection=projection,
                  order=order,
                  distinct_on=distinct_on)
        cq = q._to_client_query(self._test_client)
        self.assertEqual(self._PROJECT, cq.project)
        self.assertEqual(self._NAMESPACE, cq.namespace)
        self.assertEqual('kind', cq.kind)
        self.assertEqual(ancestor_key.to_client_key(), cq.ancestor)
        self.assertEqual(filters, cq.filters)
        self.assertEqual(projection, cq.projection)
        self.assertEqual(order, cq.order)
        self.assertEqual(distinct_on, cq.distinct_on)

        logging.info('query: %s', q)  # Test __repr__()
Example #5
  def testValueProviderNamespace(self):
    self.vp_namespace = StaticValueProvider(str, 'vp_namespace')
    self.expected_namespace = 'vp_namespace'

    q = Query(kind='kind', project=self._PROJECT, namespace=self.vp_namespace)
    cq = q._to_client_query(self._test_client)
    self.assertEqual(self.expected_namespace, cq.namespace)

    _LOGGER.info('query: %s', q)  # Test __repr__()
Example #6
def model_datastoreio():
  """Using a Read and Write transform to read/write to Cloud Datastore."""

  import uuid
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
  from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
  from apache_beam.io.gcp.datastore.v1new.types import Entity
  from apache_beam.io.gcp.datastore.v1new.types import Key
  from apache_beam.io.gcp.datastore.v1new.types import Query

  project = 'my_project'
  kind = 'my_kind'
  query = Query(kind, project)

  # [START model_datastoreio_read]
  p = beam.Pipeline(options=PipelineOptions())
  entities = p | 'Read From Datastore' >> ReadFromDatastore(query)
  # [END model_datastoreio_read]

  # [START model_datastoreio_write]
  p = beam.Pipeline(options=PipelineOptions())
  musicians = p | 'Musicians' >> beam.Create(
      ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

  def to_entity(content):
    key = Key([kind, str(uuid.uuid4())])
    entity = Entity(key)
    entity.set_properties({'content': content})
    return entity

  entities = musicians | 'To Entity' >> beam.Map(to_entity)
  entities | 'Write To Datastore' >> WriteToDatastore(project)
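
The snippet above only builds the read and write pipelines; actually executing either one would still need a call along these lines (not part of the original example):

result = p.run()            # submit the pipeline
result.wait_until_finish()  # block until it completes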
Example #7
    def process(self, element):
        """

        :param element: a kind name
        :return: [Query]
        """
        from apache_beam.io.gcp.datastore.v1new.types import Query
        return [Query(kind=element, project=self.project_id)]
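
A hypothetical way to wire this DoFn into a pipeline. Only the process method is shown above, so the class name (QueryFn), its constructor, the pipeline p, the project id, and the kind names here are assumptions:

import apache_beam as beam

# Turn a small list of kind names into one Query object per kind.
queries = (p
           | 'Kinds' >> beam.Create(['KindA', 'KindB'])
           | 'ToQueries' >> beam.ParDo(QueryFn(project_id='my-project')))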
Example #8
def make_ancestor_query(project, kind, namespace, ancestor):
    """Creates a Cloud Datastore ancestor query.

  The returned query will fetch all the entities that have the parent key name
  set to the given `ancestor`.
  """
    ancestor_key = Key([kind, ancestor], project=project, namespace=namespace)
    return Query(kind, project, namespace, ancestor_key)
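
For illustration, a query built by this helper can be passed directly to ReadFromDatastore. The pipeline p and the project, kind, and ancestor values below are made up:

from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore

# Read all 'Task' entities under the ancestor key ['Task', 'root-task'].
q = make_ancestor_query('my-project', 'Task', None, 'root-task')
entities = p | 'ReadDescendants' >> ReadFromDatastore(q)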
Example #9
def main():
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main',
                                               'failed_entity_transforms')

    test_metadata_entities = (
        p
        | 'ReadFromDatastore(TestMetadata)' >> ReadFromDatastore(
            Query(project=project, kind='TestMetadata')))

    # TODO: fetch SparseDiagnostics entities and join with TestMetadata here for
    # additional metadata.

    test_metadata_rows = (
        test_metadata_entities
        | 'ConvertEntityToRow(TestMetadata)' >> beam.FlatMap(
            ConvertEntity(TestMetadataEntityToRowDict, entities_read,
                          failed_entity_transforms)))

    """
  CREATE TABLE `chromeperf.chromeperf_dashboard_data.test_metadata`
  (test STRING NOT NULL,
   internal_only BOOLEAN NOT NULL,
   improvement_direction STRING,
   units STRING,
   has_rows BOOLEAN NOT NULL,
   deprecated BOOLEAN NOT NULL,
   description STRING,
   unescaped_story_name STRING,
   parent STRING,
   bot_group STRING NOT NULL,
   bot STRING NOT NULL,
   measurement STRING NOT NULL,
   )
  CLUSTER BY bot_group, bot, measurement;
  """  # pylint: disable=pointless-string-statement
    bq_testmetadata_schema = {
        'fields': [
            # 'test' corresponds to the same column in the Rows export.
            {
                'name': 'test',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'internal_only',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'improvement_direction',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'units',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'has_rows',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'deprecated',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'description',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'unescaped_story_name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'parent',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            # bot_group, bot, and measurement correspond to same columns in the
            # Rows export.
            {
                'name': 'bot_group',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'bot',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'measurement',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
        ],
    }

    def TableNameFn(unused_element):
        return '{project}:{dataset}.test_metadata{suffix}'.format(
            project=project,
            dataset=bq_export_options.dataset.get(),
            suffix=bq_export_options.table_suffix)

    _ = (
        test_metadata_rows
        | 'WriteToBigQuery(test_metadata)' >> beam.io.WriteToBigQuery(
            TableNameFn,
            schema=bq_testmetadata_schema,
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
            # Cluster by the same columns as the Rows export, so that efficient
            # queries work the same way with this table (and to make efficient
            # joins with that table simpler).
            additional_bq_parameters={
                'clustering': {
                    'fields': ['bot_group', 'bot', 'measurement']
                }
            }))

    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
Example #10
def run(argv=None):
    """Main entry point."""

    parser = argparse.ArgumentParser()

    parser.add_argument('--kind',
                        dest='kind',
                        default='writereadtest',
                        help='Datastore Kind')
    parser.add_argument('--num_entities',
                        dest='num_entities',
                        type=int,
                        required=True,
                        help='Number of entities to write')
    parser.add_argument('--limit',
                        dest='limit',
                        type=int,
                        help='Limit of number of entities to write')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    job_name = gcloud_options.job_name
    kind = known_args.kind
    num_entities = known_args.num_entities
    project = gcloud_options.project

    # Pipeline 1: Create and write the specified number of Entities to the
    # Cloud Datastore.
    ancestor_key = Key([kind, str(uuid.uuid4())], project=project)
    _LOGGER.info('Writing %s entities to %s', num_entities, project)
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-write')
    _ = (
        p
        | 'Input' >> beam.Create(list(range(num_entities)))
        | 'To String' >> beam.Map(str)
        |
        'To Entity' >> beam.Map(EntityWrapper(kind, ancestor_key).make_entity)
        | 'Write to Datastore' >> WriteToDatastore(project))
    p.run()

    query = Query(kind=kind, project=project, ancestor=ancestor_key)
    # Optional Pipeline 2: If a read limit was provided, read it and confirm
    # that the expected entities were read.
    if known_args.limit is not None:
        _LOGGER.info(
            'Querying a limited set of %s entities and verifying count.',
            known_args.limit)
        p = new_pipeline_with_job_name(pipeline_options, job_name,
                                       '-verify-limit')
        query.limit = known_args.limit
        entities = p | 'read from datastore' >> ReadFromDatastore(query)
        assert_that(entities | beam.combiners.Count.Globally(),
                    equal_to([known_args.limit]))

        p.run()
        query.limit = None

    # Pipeline 3: Query the written Entities and verify result.
    _LOGGER.info('Querying entities, asserting they match.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)

    assert_that(entities | beam.combiners.Count.Globally(),
                equal_to([num_entities]))

    p.run()

    # Pipeline 4: Delete Entities.
    _LOGGER.info('Deleting entities.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)
    _ = (entities
         | 'To Keys' >> beam.Map(lambda entity: entity.key)
         | 'delete entities' >> DeleteFromDatastore(project))

    p.run()

    # Pipeline 5: Query the written Entities, verify no results.
    _LOGGER.info(
        'Querying for the entities to make sure there are none present.')
    p = new_pipeline_with_job_name(pipeline_options, job_name,
                                   '-verify-deleted')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)

    assert_that(entities | beam.combiners.Count.Globally(), equal_to([0]))

    p.run()
Example #11
import argparse
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query

parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)
project = pipeline_options.view_as(GoogleCloudOptions).project

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)

data = p | 'Read from Datastore' >> ReadFromDatastore(
    query=Query('natality-guid', project, limit=5))
scored = data | 'Print' >> beam.Map(print)

# run the pipeline
result = p.run()
result.wait_until_finish()