def create_multi_datasource_reader(pipeline, project, namespace, kinds, keys_only=False):
    if not kinds:
        kinds = [None]
    sources = []
    for kind in kinds:
        # If namespace is not specified (== None), the [default] namespace is used.
        query = Query(project=project, namespace=namespace, kind=kind)
        if keys_only:
            # See:
            # https://beam.apache.org/releases/pydoc/2.14.0/_modules/apache_beam/io/gcp/datastore/v1new/types.html#Query
            # https://google-cloud-python.readthedocs.io/en/0.32.0/_modules/google/cloud/datastore/query.html#Query.keys_only
            query.projection = ['__key__']
            if not kind:
                # If kind is not specified, the query fails unless it is
                # explicitly ordered by __key__ ascending.
                query.order = ['__key__']
        description = 'ReadFromDatastore kind={}'.format(kind if kind else "*")
        s = pipeline | description >> ReadFromDatastore(query=query)
        sources.append(s)
    return sources
def check_estimated_size_bytes(self, entity_bytes, timestamp, namespace=None):
    """A helper method to test get_estimated_size_bytes."""
    self._mock_client.namespace = namespace
    self._mock_client.query.return_value = self._mock_query
    self._mock_query.project = self._PROJECT
    self._mock_query.namespace = namespace
    self._mock_query.fetch.side_effect = [
        [{'timestamp': timestamp}],
        [{'entity_bytes': entity_bytes}],
    ]
    self._mock_query.kind = self._KIND

    split_query_fn = ReadFromDatastore._SplitQueryFn(num_splits=0)
    self.assertEqual(entity_bytes,
                     split_query_fn.get_estimated_size_bytes(
                         self._mock_client, self._mock_query))

    if namespace is None:
        ns_keyword = '_'
    else:
        ns_keyword = '_Ns_'
    self._mock_client.query.assert_has_calls([
        call(kind='__Stat%sTotal__' % ns_keyword, order=['-timestamp']),
        call().fetch(limit=1),
        call(kind='__Stat%sKind__' % ns_keyword),
        call().add_filter('kind_name', '=', self._KIND),
        call().add_filter('timestamp', '=', timestamp),
        call().fetch(limit=1),
    ])
def model_datastoreio():
    """Using a Read and Write transform to read/write to Cloud Datastore."""

    import uuid
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
    from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
    from apache_beam.io.gcp.datastore.v1new.types import Entity
    from apache_beam.io.gcp.datastore.v1new.types import Key
    from apache_beam.io.gcp.datastore.v1new.types import Query

    project = 'my_project'
    kind = 'my_kind'
    query = Query(kind, project)

    # [START model_datastoreio_read]
    p = beam.Pipeline(options=PipelineOptions())
    entities = p | 'Read From Datastore' >> ReadFromDatastore(query)
    # [END model_datastoreio_read]

    # [START model_datastoreio_write]
    p = beam.Pipeline(options=PipelineOptions())
    musicians = p | 'Musicians' >> beam.Create(
        ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

    def to_entity(content):
        key = Key([kind, str(uuid.uuid4())])
        entity = Entity(key)
        entity.set_properties({'content': content})
        return entity

    entities = musicians | 'To Entity' >> beam.Map(to_entity)
    entities | 'Write To Datastore' >> WriteToDatastore(project)
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://wordcounttest2/data/datatest.txt',
                        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline
        # on the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging
        # local files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the input into a PCollection.
        lines = p | ReadFromDatastore(known_args.input)
        types = type(lines)

        # Count the occurrences of each word.
        counts = (
            lines
            | 'Split' >> (beam.FlatMap(
                lambda x: re.findall(r'[A-Za-z\']+', x)).with_output_types(unicode))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s, %s' % (word, count, types)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(known_args.output)
def test_SplitQueryFn_with_query_limit(self):
    """A test that verifies no split is performed when the query has a limit."""
    with patch.object(helper, 'get_client', return_value=self._mock_client):
        num_splits = 4
        expected_num_splits = 1
        self._mock_query.limit = 3
        split_query_fn = ReadFromDatastore._SplitQueryFn(num_splits)
        split_queries = split_query_fn.process(self._mock_query)
        self.assertEqual(expected_num_splits, len(split_queries))
def test_SplitQueryFn_with_num_splits(self):
    with patch.object(helper, 'get_client', return_value=self._mock_client):
        num_splits = 23
        expected_num_splits = 23

        def fake_get_splits(unused_client, query, num_splits):
            return [query] * num_splits

        with patch.object(query_splitter, 'get_splits',
                          side_effect=fake_get_splits):
            split_query_fn = ReadFromDatastore._SplitQueryFn(num_splits)
            split_queries = split_query_fn.process(self._mock_query)
            self.assertEqual(expected_num_splits, len(split_queries))
def run(argv=None):
    from apache_beam.io.gcp.bigquery_file_loads import BigQueryBatchFileLoads
    from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
    from datetime import datetime

    options = Ds2bqOptions(flags=argv)
    options.view_as(
        beam.options.pipeline_options.GoogleCloudOptions).region = "asia-northeast1"
    options.view_as(
        beam.options.pipeline_options.WorkerOptions).num_workers = 2
    options.view_as(
        beam.options.pipeline_options.WorkerOptions).disk_size_gb = 50
    # Setup
    options.view_as(
        beam.options.pipeline_options.StandardOptions).runner = 'DataflowRunner'
    options.view_as(
        beam.options.pipeline_options.SetupOptions).setup_file = './setup.py'
    logging.info(options)

    project_id = options.view_as(
        beam.options.pipeline_options.GoogleCloudOptions).project
    gcs_dir = "gs://{}-dataflow/temp/{}".format(
        project_id, datetime.now().strftime("%Y%m%d%H%M%S"))

    with beam.Pipeline(options=options) as p:
        from transform.datastore import convert, CreateQuery, GetKinds
        from transform.bigquery import GetBqTableMap, get_partition_conf

        table_names_dict = beam.pvalue.AsDict(
            p | "Get BigQuery Table Map" >> GetBqTableMap(
                project_id, options.dataset))

        entities = (p
                    | 'Get Kinds' >> GetKinds(project_id)
                    | 'Create Query' >> beam.ParDo(CreateQuery(project_id))
                    | 'Get Entity' >> beam.ParDo(ReadFromDatastore._QueryFn()))

        _ = (entities
             | 'Convert Entity' >> beam.Map(convert)
             | 'BigQuery Load' >> BigQueryBatchFileLoads(
                 destination=lambda row, table_dict: table_dict[
                     row["__key__"]["kind"]],
                 custom_gcs_temp_location=gcs_dir,
                 write_disposition='WRITE_TRUNCATE',
                 table_side_inputs=(table_names_dict, ),
                 additional_bq_parameters=get_partition_conf,
                 schema='SCHEMA_AUTODETECT'))
def test_SplitQueryFn_with_exception(self):
    """A test that verifies that no split is performed when failures occur."""
    with patch.object(helper, 'get_client', return_value=self._mock_client):
        # Force _SplitQueryFn to compute the number of query splits.
        num_splits = 0
        expected_num_splits = 1
        entity_bytes = (
            expected_num_splits * ReadFromDatastore._DEFAULT_BUNDLE_SIZE_BYTES)
        with patch.object(ReadFromDatastore._SplitQueryFn,
                          'get_estimated_size_bytes',
                          return_value=entity_bytes):
            with patch.object(query_splitter, 'get_splits',
                              side_effect=query_splitter.QuerySplitterError(
                                  "Testing query split error")):
                split_query_fn = ReadFromDatastore._SplitQueryFn(num_splits)
                split_queries = split_query_fn.process(self._mock_query)
                self.assertEqual(expected_num_splits, len(split_queries))
                self.assertEqual(self._mock_query, split_queries[0])
def test_SplitQueryFn_without_num_splits(self):
    with patch.object(helper, 'get_client', return_value=self._mock_client):
        # Force _SplitQueryFn to compute the number of query splits.
        num_splits = 0
        expected_num_splits = 23
        entity_bytes = (
            expected_num_splits * ReadFromDatastore._DEFAULT_BUNDLE_SIZE_BYTES)
        with patch.object(ReadFromDatastore._SplitQueryFn,
                          'get_estimated_size_bytes',
                          return_value=entity_bytes):

            def fake_get_splits(unused_client, query, num_splits):
                return [query] * num_splits

            with patch.object(query_splitter, 'get_splits',
                              side_effect=fake_get_splits):
                split_query_fn = ReadFromDatastore._SplitQueryFn(num_splits)
                split_queries = split_query_fn.process(self._mock_query)
                self.assertEqual(expected_num_splits, len(split_queries))
def test_QueryFn_metric_on_failure(self):
    MetricsEnvironment.process_wide_container().reset()
    with patch.object(helper, 'get_client', return_value=self._mock_client):
        self._mock_query.project = self._PROJECT
        self._mock_query.namespace = self._NAMESPACE
        _query_fn = ReadFromDatastore._QueryFn()
        client_query = self._mock_query._to_client_query()

        # Test with exception.
        client_query.fetch.side_effect = [
            exceptions.DeadlineExceeded("Deadline exceed")
        ]
        list(_query_fn.process(self._mock_query))
        self.verify_read_call_metric(self._PROJECT, self._NAMESPACE,
                                     "deadline_exceeded", 1)

        # Test success.
        client_query.fetch.side_effect = [[]]
        list(_query_fn.process(self._mock_query))
        self.verify_read_call_metric(self._PROJECT, self._NAMESPACE, "ok", 1)
def read_from_datastore(project, user_options, pipeline_options):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    query = make_ancestor_query(project, user_options.kind,
                                user_options.namespace, user_options.ancestor)

    # Read entities from Cloud Datastore into a PCollection.
    lines = p | 'read from datastore' >> ReadFromDatastore(query)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return word, sum(ones)

    counts = (lines
              | 'split' >> beam.ParDo(WordExtractingDoFn())
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %s' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> beam.io.WriteToText(
        file_path_prefix=user_options.output,
        num_shards=user_options.num_shards)

    result = p.run()
    # Wait until completion; the main thread accesses the job results after the
    # pipeline finishes.
    result.wait_until_finish()
    return result
def main():
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main', 'failed_entity_transforms')

    test_metadata_entities = (
        p
        | 'ReadFromDatastore(TestMetadata)' >> ReadFromDatastore(
            Query(project=project, kind='TestMetadata')))

    # TODO: fetch SparseDiagnostics entities and join with TestMetadata here for
    # additional metadata.

    test_metadata_rows = (
        test_metadata_entities
        | 'ConvertEntityToRow(TestMetadata)' >> beam.FlatMap(
            ConvertEntity(TestMetadataEntityToRowDict, entities_read,
                          failed_entity_transforms)))

    """
    CREATE TABLE `chromeperf.chromeperf_dashboard_data.test_metadata`
    (test STRING NOT NULL,
     internal_only BOOLEAN NOT NULL,
     improvement_direction STRING,
     units STRING,
     has_rows BOOLEAN NOT NULL,
     deprecated BOOLEAN NOT NULL,
     description STRING,
     unescaped_story_name STRING,
     parent STRING,
     bot_group STRING NOT NULL,
     bot STRING NOT NULL,
     measurement STRING NOT NULL,
    )
    CLUSTER BY bot_group, bot, measurement;
    """  # pylint: disable=pointless-string-statement
    bq_testmetadata_schema = {
        'fields': [
            # 'test' corresponds to the same column in the Rows export.
            {'name': 'test', 'type': 'STRING', 'mode': 'REQUIRED'},
            {'name': 'internal_only', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
            {'name': 'improvement_direction', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'units', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'has_rows', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
            {'name': 'deprecated', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
            {'name': 'description', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'unescaped_story_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'parent', 'type': 'STRING', 'mode': 'NULLABLE'},
            # bot_group, bot, and measurement correspond to the same columns in
            # the Rows export.
            {'name': 'bot_group', 'type': 'STRING', 'mode': 'REQUIRED'},
            {'name': 'bot', 'type': 'STRING', 'mode': 'REQUIRED'},
            {'name': 'measurement', 'type': 'STRING', 'mode': 'REQUIRED'},
        ],
    }

    def TableNameFn(unused_element):
        return '{project}:{dataset}.test_metadata{suffix}'.format(
            project=project,
            dataset=bq_export_options.dataset.get(),
            suffix=bq_export_options.table_suffix)

    _ = (
        test_metadata_rows
        | 'WriteToBigQuery(test_metadata)' >> beam.io.WriteToBigQuery(
            TableNameFn,
            schema=bq_testmetadata_schema,
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
            # Cluster by the same columns as the Rows export, so that efficient
            # queries work the same way with this table (and to make efficient
            # joins with that table simpler).
            additional_bq_parameters={
                'clustering': {'fields': ['bot_group', 'bot', 'measurement']}
            }))

    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
def run(argv=None):
    """Main entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--kind',
                        dest='kind',
                        default='writereadtest',
                        help='Datastore Kind')
    parser.add_argument('--num_entities',
                        dest='num_entities',
                        type=int,
                        required=True,
                        help='Number of entities to write')
    parser.add_argument('--limit',
                        dest='limit',
                        type=int,
                        help='Limit of number of entities to write')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    job_name = gcloud_options.job_name
    kind = known_args.kind
    num_entities = known_args.num_entities
    project = gcloud_options.project

    # Pipeline 1: Create and write the specified number of Entities to the
    # Cloud Datastore.
    ancestor_key = Key([kind, str(uuid.uuid4())], project=project)
    _LOGGER.info('Writing %s entities to %s', num_entities, project)
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-write')
    _ = (
        p
        | 'Input' >> beam.Create(list(range(num_entities)))
        | 'To String' >> beam.Map(str)
        | 'To Entity' >> beam.Map(EntityWrapper(kind, ancestor_key).make_entity)
        | 'Write to Datastore' >> WriteToDatastore(project))
    p.run()

    query = Query(kind=kind, project=project, ancestor=ancestor_key)

    # Optional Pipeline 2: If a read limit was provided, read it and confirm
    # that the expected entities were read.
    if known_args.limit is not None:
        _LOGGER.info(
            'Querying a limited set of %s entities and verifying count.',
            known_args.limit)
        p = new_pipeline_with_job_name(pipeline_options, job_name,
                                       '-verify-limit')
        query.limit = known_args.limit
        entities = p | 'read from datastore' >> ReadFromDatastore(query)
        assert_that(entities | beam.combiners.Count.Globally(),
                    equal_to([known_args.limit]))
        p.run()
        query.limit = None

    # Pipeline 3: Query the written Entities and verify result.
    _LOGGER.info('Querying entities, asserting they match.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)
    assert_that(entities | beam.combiners.Count.Globally(),
                equal_to([num_entities]))
    p.run()

    # Pipeline 4: Delete Entities.
    _LOGGER.info('Deleting entities.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)
    _ = (entities
         | 'To Keys' >> beam.Map(lambda entity: entity.key)
         | 'delete entities' >> DeleteFromDatastore(project))
    p.run()

    # Pipeline 5: Query the written Entities, verify no results.
    _LOGGER.info(
        'Querying for the entities to make sure there are none present.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify-deleted')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)
    assert_that(entities | beam.combiners.Count.Globally(), equal_to([0]))
    p.run()
import argparse

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query

parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)
project = pipeline_options.view_as(GoogleCloudOptions).project

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)
data = p | 'Read from Datastore' >> ReadFromDatastore(
    query=Query('natality-guid', project, limit=5))
scored = data | 'Print' >> beam.Map(print)

# run the pipeline
result = p.run()
result.wait_until_finish()