def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      required=True,
                      help='Input file to process.')
  parser.add_argument('--output',
                      required=False,
                      help='Output file to write results to.')
  parser.add_argument('--project',
                      required=False,
                      help='Project ID for Datastore.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFns in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  with beam.Pipeline(options=pipeline_options) as p:
    (p  # pylint: disable=expression-not-assigned
     | 'read' >> ReadFromText(known_args.input)
     | 'split' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
     | 'TopPerPrefix' >> TopPerPrefix(5, "")
     # | 'format' >> beam.Map(
     #     lambda (prefix, candidates): '%s: %s' % (prefix, candidates))
     | 'create entity' >> beam.Map(
         lambda (prefix, candidates): EntityWrapper().make_entity(
             prefix, candidates))
     | 'write to datastore' >> WriteToDatastore(known_args.project)
     # | 'write' >> WriteToText(known_args.output)
    )
def run(argv=None): """This function contains the pipeline logic.""" parser = argparse.ArgumentParser() known_args, pipeline_args = parser.parse_known_args(argv) # It is preferable to change the job name between runs. pipeline_args.extend([ '--project=' + project_id, '--job_name=datatransfer' + datetime.now().strftime('%Y%m%d%H%M%S%f'), ]) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True # Form an aggregating query. query = """ SELECT CURRENT_DATE() AS date, EXTRACT(HOUR FROM CURRENT_TIME()) AS hour, location, SUM(spend) AS total_spend FROM `my_dataset.stream_data` WHERE EXTRACT(HOUR FROM timestamp) = EXTRACT(HOUR FROM CURRENT_TIME()) GROUP BY date, hour, location """ # Datastore kind of the entities resulting from the query. kind = 'Hourly spend' with beam.Pipeline(options=pipeline_options) as p: (p | 'Read from BigQuery' >> Read(BigQuerySource(project=project_id, query=query, use_standard_sql=True)) | 'Create entity' >> beam.Map(EntityWrapper(kind).make_entity) | 'Write to Datastore' >> WriteToDatastore(project_id))
def read_from_datastore(user_options, pipeline_options):
  """Creates a pipeline that reads entities from Cloud Datastore, processes
  them, and writes the results back to Datastore."""
  p = beam.Pipeline(options=pipeline_options)

  # Create a query to read entities from Datastore.
  query = make_ancestor_query(user_options.inputKind, user_options.namespace,
                              user_options.ancestor)

  # Read entities from Cloud Datastore into a PCollection.
  lines = p | 'read from datastore' >> ReadFromDatastore(
      user_options.project, query, user_options.namespace)

  processedTweets = (
      lines
      | 'split' >> beam.ParDo(processTweet())
      | 'create entity' >> beam.Map(
          EntityWrapper(user_options.namespace, user_options.outputKind,
                        user_options.ancestor).make_entity)
      | 'write to datastore' >> WriteToDatastore(user_options.project))

  result = p.run()
  # Wait until completion; the main thread can then access the
  # post-completion job results.
  result.wait_until_finish()
  return result
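# `make_ancestor_query` is referenced above (and in the write/read test
# further below) but not defined in these snippets. A minimal sketch, modeled
# on the Apache Beam datastore_wordcount cookbook example; the exact helper
# used by this pipeline may differ.
from google.cloud.proto.datastore.v1 import entity_pb2
from google.cloud.proto.datastore.v1 import query_pb2
from googledatastore import helper as datastore_helper, PropertyFilter


def make_ancestor_query(kind, namespace, ancestor):
  """Creates a query that filters entities to children of an ancestor key."""
  ancestor_key = entity_pb2.Key()
  datastore_helper.add_key_path(ancestor_key, kind, ancestor)
  if namespace is not None:
    ancestor_key.partition_id.namespace_id = namespace

  query = query_pb2.Query()
  query.kind.add().name = kind
  datastore_helper.set_property_filter(query.filter, '__key__',
                                       PropertyFilter.HAS_ANCESTOR,
                                       ancestor_key)
  return query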
def dataflow(run_local):
  if run_local:
    input_file_path = 'sample.csv'
  else:
    input_file_path = 'gs://' + BUCKET + '/' + INPUT_FILENAME

  JOB_NAME = 'datastore-upload-{}'.format(
      datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

  pipeline_options = {
      'project': PROJECT,
      'staging_location': 'gs://' + BUCKET + '/staging',
      'runner': 'DataflowRunner',
      'job_name': JOB_NAME,
      'disk_size_gb': 100,
      'temp_location': 'gs://' + BUCKET + '/temp',
      'save_main_session': True
  }
  if run_local:
    pipeline_options['runner'] = 'DirectRunner'

  options = PipelineOptions.from_dictionary(pipeline_options)
  with beam.Pipeline(options=options) as p:
    (p
     | 'Reading input file' >> beam.io.ReadFromText(input_file_path)
     | 'Converting from csv to dict' >> beam.ParDo(CSVtoDict(), [
         'sku', 'name', 'regularPrice', 'salePrice', 'type', 'url', 'image',
         'inStoreAvailability'
     ])
     | 'Create entities' >> beam.ParDo(CreateEntities())
     | 'Write entities into Datastore' >> WriteToDatastore(PROJECT))
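# The CSVtoDict DoFn used above is not shown in this snippet. A hypothetical
# sketch of what it might look like: it pairs each CSV row with the header
# names passed as a side argument to ParDo. The length check is an assumption.
import csv

import apache_beam as beam


class CSVtoDict(beam.DoFn):
  """Converts a CSV line into a dict keyed by the provided header names."""

  def process(self, element, headers):
    # csv.reader correctly handles quoted fields that contain commas.
    for row in csv.reader([element]):
      if len(row) == len(headers):
        yield dict(zip(headers, row))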
def model_datastoreio():
  """Using a Read and Write transform to read/write to Cloud Datastore."""
  import uuid
  from google.cloud.proto.datastore.v1 import entity_pb2
  from google.cloud.proto.datastore.v1 import query_pb2
  import googledatastore
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
  from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore

  project = 'my_project'
  kind = 'my_kind'
  query = query_pb2.Query()
  query.kind.add().name = kind

  # [START model_datastoreio_read]
  p = beam.Pipeline(options=PipelineOptions())
  entities = p | 'Read From Datastore' >> ReadFromDatastore(project, query)
  # [END model_datastoreio_read]

  # [START model_datastoreio_write]
  p = beam.Pipeline(options=PipelineOptions())
  musicians = p | 'Musicians' >> beam.Create(
      ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

  def to_entity(content):
    entity = entity_pb2.Entity()
    googledatastore.helper.add_key_path(entity.key, kind, str(uuid.uuid4()))
    googledatastore.helper.add_properties(entity,
                                          {'content': unicode(content)})
    return entity

  entities = musicians | 'To Entity' >> beam.Map(to_entity)
  entities | 'Write To Datastore' >> WriteToDatastore(project)
def main():
  args, pipe_args = process_pipe_options()
  pipe_args.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipe_args) as p:
    (p
     | 'Read Similarities' >> beam.io.ReadFromText(args.input)
     | "Create Entities" >> beam.Map(
         EntityWrapper(args.kind, args.sim_cap).make_entity)
     | "Write to DS" >> WriteToDatastore(args.project))
def write_to_datastore(user_options, pipeline_options):
  """Creates a pipeline that writes entities to Cloud Datastore."""
  with beam.Pipeline(options=pipeline_options) as p:
    # pylint: disable=expression-not-assigned
    (p
     | 'read' >> ReadFromText(user_options.input)
     | 'create entity' >> beam.Map(
         EntityWrapper(user_options.namespace, user_options.kind,
                       user_options.ancestor).make_entity)
     | 'write to datastore' >> WriteToDatastore(user_options.dataset))
def write_to_datastore(project, user_options, pipeline_options):
  """Creates a pipeline that writes entities to Cloud Datastore."""
  p = beam.Pipeline(options=pipeline_options)

  # pylint: disable=expression-not-assigned
  (p
   | 'read' >> ReadFromText(user_options.input)
   | 'create entity' >> beam.Map(
       EntityWrapper(user_options.namespace, user_options.kind,
                     user_options.ancestor).make_entity)
   | 'write to datastore' >> WriteToDatastore(project))

  # Actually run the pipeline (all operations above are deferred).
  p.run().wait_until_finish()
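# The EntityWrapper used in the two snippets above takes a namespace, kind,
# and ancestor. A sketch of such a wrapper, following the pattern of the Beam
# datastore_wordcount cookbook example (the real class may differ):
import uuid

from google.cloud.proto.datastore.v1 import entity_pb2
from googledatastore import helper as datastore_helper


class EntityWrapper(object):
  """Creates a Cloud Datastore entity from the given string content."""

  def __init__(self, namespace, kind, ancestor):
    self._namespace = namespace
    self._kind = kind
    self._ancestor = ancestor

  def make_entity(self, content):
    entity = entity_pb2.Entity()
    if self._namespace is not None:
      entity.key.partition_id.namespace_id = self._namespace
    # All entities created will have the same ancestor.
    datastore_helper.add_key_path(entity.key, self._kind, self._ancestor,
                                  self._kind, str(uuid.uuid4()))
    datastore_helper.add_properties(entity, {'content': unicode(content)})
    return entity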
def dataflow(argv=None):
  process_options = PipelineOptions().view_as(ProcessOptions)
  p = beam.Pipeline(options=process_options)

  (p
   | 'Read From Text' >> beam.io.ReadFromText(process_options.input,
                                              skip_header_lines=0)
   | 'Process CSV' >> beam.ParDo(ProcessCSV(), ['text', 'label'])
   | 'Build entities' >> beam.ParDo(BuildEntities(), process_options.entity,
                                    process_options.user,
                                    process_options.dataset)
   | 'Write entities into Datastore' >> WriteToDatastore('io-annotator-api'))

  p.run().wait_until_finish()
def test_DatastoreWriteLargeEntities(self):
  """100 entities of 100kB each get split over two Commit RPCs."""
  with patch.object(helper, 'get_datastore',
                    return_value=self._mock_datastore):
    entities = [e.entity for e in fake_datastore.create_entities(100)]

    datastore_write_fn = _Mutate.DatastoreWriteFn(self._PROJECT)
    datastore_write_fn.start_bundle()
    for entity in entities:
      datastore_helper.add_properties(
          entity, {'large': u'A' * 100000}, exclude_from_indexes=True)
      datastore_write_fn.process(WriteToDatastore.to_upsert_mutation(entity))
    datastore_write_fn.finish_bundle()

    self.assertEqual(2, self._mock_datastore.commit.call_count)
def test_DatastoreWriteLargeEntities(self):
  """100 entities of 100kB each get split over two Commit RPCs."""
  with patch.object(helper, 'get_datastore',
                    return_value=self._mock_datastore):
    entities = [e.entity for e in fake_datastore.create_entities(100)]

    datastore_write_fn = _Mutate.DatastoreWriteFn(
        self._PROJECT, fixed_batch_size=_Mutate._WRITE_BATCH_INITIAL_SIZE)
    datastore_write_fn.start_bundle()
    for entity in entities:
      datastore_helper.add_properties(
          entity, {'large': u'A' * 100000}, exclude_from_indexes=True)
      datastore_write_fn.process(WriteToDatastore.to_upsert_mutation(entity))
    datastore_write_fn.finish_bundle()

    self.assertEqual(2, self._mock_datastore.commit.call_count)
def run(pipeline_options, known_args):
  pipeline = beam.Pipeline(options=pipeline_options)
  gcp_project = pipeline_options.get_all_options()['project']

  with impl.Context(known_args.transform_temp_dir):
    articles = (
        pipeline
        | 'Read articles from BigQuery' >> beam.io.Read(
            beam.io.BigQuerySource(project=gcp_project,
                                   query=get_source_query(known_args.limit),
                                   use_standard_sql=True)))

    articles_dataset = (articles, get_metadata())
    embeddings_dataset, _ = (
        articles_dataset
        | 'Extract embeddings' >>
        impl.AnalyzeAndTransformDataset(preprocess_fn))

    embeddings, transformed_metadata = embeddings_dataset

    embeddings | 'Write embeddings to TFRecords' >> (
        beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix='{0}'.format(known_args.output_dir),
            file_name_suffix='.tfrecords',
            coder=tft_coders.example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema),
            num_shards=int(known_args.limit / 25000)))

    (articles
     | 'Convert to entity' >> beam.Map(
         lambda input_features: create_entity(input_features,
                                              known_args.kind))
     | 'Write to Datastore' >> WriteToDatastore(project=gcp_project))

    if known_args.enable_debug:
      embeddings | 'Debug Output' >> beam.io.textio.WriteToText(
          file_path_prefix=known_args.debug_output_prefix,
          file_name_suffix='.txt')

  job = pipeline.run()
  if pipeline_options.get_all_options()['runner'] == 'DirectRunner':
    job.wait_until_finish()
def dataflow(run_local):
  JOB_NAME = 'firestore-upload-{}'.format(
      datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

  pipeline_options = {
      'project': PROJECT,
      'staging_location': 'gs://' + BUCKET + '/staging',
      'runner': 'DataflowRunner',
      'job_name': JOB_NAME,
      'disk_size_gb': 100,
      'temp_location': 'gs://' + BUCKET + '/staging',
      'save_main_session': True,
      'requirements_file': 'requirements.txt'
  }
  if run_local:
    pipeline_options['runner'] = 'DirectRunner'

  options = PipelineOptions.from_dictionary(pipeline_options)
  with beam.Pipeline(options=options) as p:
    (p
     | 'Reading input file' >> beam.Create([1])
     | 'Create entities' >> beam.ParDo(CreateEntities())
     | 'Write entities into Datastore' >> WriteToDatastore(PROJECT))
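# The CreateEntities DoFn above is also not shown. A hypothetical sketch: the
# input element (the constant 1 from beam.Create) only triggers entity
# creation, and the kind name and properties below are illustrative
# assumptions, not the original implementation.
import uuid

import apache_beam as beam
from google.cloud.proto.datastore.v1 import entity_pb2
from googledatastore import helper as datastore_helper


class CreateEntities(beam.DoFn):
  """Yields Datastore entity protos to be written by WriteToDatastore."""

  def process(self, element):
    entity = entity_pb2.Entity()
    datastore_helper.add_key_path(entity.key, 'Item', str(uuid.uuid4()))
    datastore_helper.add_properties(entity, {'created_by': u'dataflow'})
    yield entity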
# Fragment: the enclosing definition is assumed to be a to_entity(line)
# helper that creates an entity_pb2.Entity, matching its use in the pipeline
# below. It converts one CSV line describing a U.S. president into an entity.
def to_entity(line):
  entity = entity_pb2.Entity()

  fields = line.split(',')  # id,president,startYear,endYear,party,homeState,dateOfBirth
  id = fields[0]
  president = fields[1]
  names = president.split(' ')
  firstName = names[0]
  lastName = names[1]
  startYear = fields[2]
  endYear = fields[3]
  party = fields[4]
  homeState = fields[5]
  dateOfBirth = fields[6]

  googledatastore.helper.add_key_path(entity.key, kind, str(id))
  googledatastore.helper.add_properties(
      entity, {
          'firstName': unicode(firstName),
          'lastName': unicode(lastName),
          'startYear': int(startYear),
          'endYear': int(endYear),
          'party': unicode(party),
          'homeState': unicode(homeState),
          'dateOfBirth': datetime.strptime(dateOfBirth, '%Y-%m-%d')
      })
  return entity


entities = lines | 'To Entity' >> beam.Map(to_entity)
entities | 'Write To Datastore' >> WriteToDatastore(project)
# lines | 'Write to Cloud Storage' >> beam.io.WriteToText('gs://[GCLOUD_BUCKET]/out')

p.run().wait_until_finish()
# Fragment: the enclosing definition is assumed to be the gen_key(element)
# helper used in the pipeline below, accumulating results in a list. It builds
# prefix search keys (length >= 2) for each word-suffix of the product name.
def gen_key(element):
  result = []
  key = element['name'].upper()
  while key != '':
    for i in range(2, len(key) + 1):
      result.append(new_elm(element, key[0:i]))
    key = re.sub('\\S+\\s*\\W*', '', key, count=1)
  return result


class EntityWrapper(object):
  def __init__(self, namespace, kind):
    self._namespace = namespace
    self._kind = kind

  def make_entity(self, content):
    entity = entity_pb2.Entity()
    if self._namespace is not None:
      entity.key.partition_id.namespace_id = self._namespace
    helper.add_key_path(entity.key, self._kind, str(uuid.uuid4()))
    helper.add_properties(entity, content)
    return entity


p = beam.Pipeline(options=options)
(p
 | 'query from bq' >> beam.io.Read(
     beam.io.BigQuerySource(query="select * from bestbuy.products"))
 | 'generate key' >> beam.FlatMap(gen_key)
 | 'make entry' >> beam.Map(EntityWrapper(None, 'products3').make_entity)
 | WriteToDatastore("sample-datalab"))
p.run()
def process_datastore_tweets(project, pipeline_options):
  """Creates a pipeline that reads tweets from Cloud Datastore from the last
  N hours. The pipeline finds the top most-used words, the top most-tweeted
  URLs, and ranks word co-occurrences by an 'interestingness' metric
  (similar to tf*idf).
  """
  user_options = pipeline_options.view_as(UserOptions)
  hours = 20

  p = beam.Pipeline(options=pipeline_options)

  # Read entities from Cloud Datastore into a PCollection, then filter to get
  # only the entities from the last `hours` hours.
  lines = (p
           | QueryDatastore(project, hours)
           | beam.ParDo(FilterDate(user_options, hours)))

  # Process the tweets.
  processedTweets = (
      lines
      | 'processTweets' >> beam.ParDo(processTweet(user_options)))

  # Define some inline helper functions.
  def join_cinfo(cooccur, percents):
    """Calculate a co-occurrence ranking."""
    import math

    word1 = cooccur[0][0]
    word2 = cooccur[0][1]
    try:
      word1_percent = percents[word1]
      weight1 = 1 / word1_percent
      word2_percent = percents[word2]
      weight2 = 1 / word2_percent
      return (cooccur[0], cooccur[1],
              cooccur[1] * math.log(min(weight1, weight2)))
    except:
      return 0

  def generate_cooccur_schema():
    """BigQuery schema for the word co-occurrence table."""
    json_str = json.dumps({
        'fields': [
            {'name': 'w1', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'w2', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'log_weight', 'type': 'FLOAT', 'mode': 'NULLABLE'},
            {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'}
        ]
    })
    return parse_table_schema_from_json(json_str)

  def generate_url_schema():
    """BigQuery schema for the URL count table."""
    json_str = json.dumps({
        'fields': [
            {'name': 'url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'}
        ]
    })
    return parse_table_schema_from_json(json_str)

  def generate_wc_schema():
    """BigQuery schema for the word count table."""
    json_str = json.dumps({
        'fields': [
            {'name': 'word', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'percent', 'type': 'FLOAT', 'mode': 'NULLABLE'},
            {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'}
        ]
    })
    return parse_table_schema_from_json(json_str)

  # Write the processed tweets to Cloud Datastore.
  (processedTweets
   | 'create entity' >> beam.Map(
       EntityWrapper("", "processedTweets", "root").make_entity)
   | 'processed tweet write' >> WriteToDatastore(project))

  # Actually run the pipeline.
  return p.run()
def run():
  argv = [
      '--project={0}'.format(PROJECT),
      '--job_name=shq-demo-data-{}'.format(
          datetime.now().strftime('%Y%m%d%H%M%S')),
      '--save_main_session',
      '--requirements_file=requirements.txt',
      '--staging_location=gs://{0}/staging/'.format(BUCKET),
      '--temp_location=gs://{0}/staging/'.format(BUCKET),
      '--runner=DataflowRunner'
  ]

  # Create the pipeline.
  p = beam.Pipeline(argv=argv)

  # Get a PCollection of users: read rows (dicts) from BigQuery and convert
  # each registration offset into an actual date relative to today.
  users = (
      p
      | 'read users from BQ' >> beam.io.Read(
          beam.io.BigQuerySource(
              query='SELECT * FROM [success-hq:datastore.user] '
                    'order by email {}'.format(USER_LIMIT)))
      | 'get users with reg dates' >> beam.Map(get_user_with_regdate))

  # Create a list of companies and reg dates based on the earliest user
  # reg_date.
  companies = (
      users
      | 'get company and reg date from user' >> beam.Map(
          get_company_and_regdate)
      | 'find first reg_date for company' >> beam.CombinePerKey(min))

  # Convert rows into Datastore entities and write them to Datastore.
  (users
   | 'build user entity' >> beam.Map(build_user_entity)
   | 'write user to Datastore' >> WriteToDatastore(PROJECT))

  (companies
   | 'build company entity' >> beam.Map(build_company_entity)
   | 'write company to Datastore' >> WriteToDatastore(PROJECT))

  # Create projects in Datastore.
  (companies
   | 'create project for company' >> beam.Map(build_project_entities)
   | 'write project to Datastore' >> WriteToDatastore(PROJECT))

  # Create trending in Datastore.
  (companies
   | 'create trending for company' >> beam.Map(build_trending_entities)
   | 'write trending to Datastore' >> WriteToDatastore(PROJECT))

  # Create events for each company.
  company_events = (
      companies
      | 'build company events' >> beam.FlatMap(build_company_events)
      | 'expand company events' >> beam.FlatMap(expand_events))

  # Write company events into BigQuery.
  (company_events
   | 'write to BQ table' >> beam.io.Write(
       beam.io.BigQuerySink(
           project=PROJECT,
           dataset=DATASET,
           table='company_events',
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

  # Find purchases for all companies.
  purchases = (
      company_events
      | 'get purchased amounts' >> beam.FlatMap(get_purchased_amounts)
      | 'sum purchased amounts' >> beam.CombinePerKey(sum))

  # Find provisions for all companies.
  provisions = (
      company_events
      | 'get provisioned amounts' >> beam.FlatMap(get_provisioned_amounts)
      | 'sum provisioned amounts' >> beam.CombinePerKey(sum))

  # Combine the purchase and provision PCollections.
  company_updates = {
      'purchased': purchases,
      'provisioned': provisions
  } | beam.CoGroupByKey()

  # Write renewal records to Datastore.
  (company_updates
   | 'create renewal for company' >> beam.Map(build_renewal_entities)
   | 'write renewals to Datastore' >> WriteToDatastore(PROJECT))

  # Create registration events for users.
  reg_events = users | 'build reg events' >> beam.Map(build_reg_event)

  # Create ticket events for users.
  ticket_events = users | 'build ticket events' >> beam.FlatMap(
      build_ticket_events)

  # Create call events for users.
  call_events = (users
                 | 'build call events' >> beam.FlatMap(build_call_events)
                 | 'expand call events' >> beam.FlatMap(expand_events))

  # Combine the event PCollections.
  events = (reg_events, ticket_events, call_events) | beam.Flatten()

  # Take the daily collections and write them into BigQuery.
  (events
   | 'write to bq' >> beam.io.Write(
       beam.io.BigQuerySink(
           '{}:{}.{}'.format(PROJECT, DATASET, TEMP),
           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))

  # Run the pipeline.
  print 'waiting for pipeline to finish, bq partition still to come'
  print 'do not close cloud shell window'
  status = p.run().wait_until_finish()

  # Copy data from the temp table into daily partitions.
  print 'starting bq partition work'
  today = date.today()
  days_past = 182
  bq_client = bigquery.Client(project=PROJECT)
  bq_dataset = bq_client.dataset(DATASET)
  for index in range(0, days_past):
    query_day = (datetime.now() + timedelta(days=1 - index)).date()
    query_start = query_day.strftime('%Y-%m-%d 00:00:00')
    query_end = query_day.strftime('%Y-%m-%d 23:59:59')
    part_string = query_day.strftime('%Y%m%d')
    query = 'SELECT * FROM {}.{} where date >= "{}" and date <= "{}"'.format(
        DATASET, TEMP, query_start, query_end)
    bq_target = bq_dataset.table('user_events${}'.format(part_string))
    job = bq_client.run_async_query(
        'bq_load_{}'.format(datetime.now().strftime('%Y%m%d%H%M%S%f')),
        query)
    job.destination = bq_target
    job.write_disposition = 'WRITE_TRUNCATE'
    job.begin()

  print 'Done! You can close the Cloud Shell window'
def run(argv=None): """Main entry point.""" parser = argparse.ArgumentParser() parser.add_argument('--kind', dest='kind', default='writereadtest', help='Datastore Kind') parser.add_argument('--num_entities', dest='num_entities', type=int, required=True, help='Number of entities to write') parser.add_argument('--limit', dest='limit', type=int, help='Limit of number of entities to write') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True gcloud_options = pipeline_options.view_as(GoogleCloudOptions) job_name = gcloud_options.job_name kind = known_args.kind num_entities = known_args.num_entities project = gcloud_options.project # a random ancesor key ancestor = str(uuid.uuid4()) query = make_ancestor_query(kind, None, ancestor) # Pipeline 1: Create and write the specified number of Entities to the # Cloud Datastore. logging.info('Writing %s entities to %s', num_entities, project) p = new_pipeline_with_job_name(pipeline_options, job_name, '-write') # pylint: disable=expression-not-assigned (p | 'Input' >> beam.Create(list(range(known_args.num_entities))) | 'To String' >> beam.Map(str) | 'To Entity' >> beam.Map(EntityWrapper(kind, None, ancestor).make_entity) | 'Write to Datastore' >> WriteToDatastore(project)) p.run() # Optional Pipeline 2: If a read limit was provided, read it and confirm # that the expected entities were read. if known_args.limit is not None: logging.info( 'Querying a limited set of %s entities and verifying count.', known_args.limit) p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify-limit') query_with_limit = query_pb2.Query() query_with_limit.CopyFrom(query) query_with_limit.limit.value = known_args.limit entities = p | 'read from datastore' >> ReadFromDatastore( project, query_with_limit) assert_that(entities | beam.combiners.Count.Globally(), equal_to([known_args.limit])) p.run() # Pipeline 3: Query the written Entities and verify result. logging.info('Querying entities, asserting they match.') p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify') entities = p | 'read from datastore' >> ReadFromDatastore(project, query) assert_that(entities | beam.combiners.Count.Globally(), equal_to([num_entities])) p.run() # Pipeline 4: Delete Entities. logging.info('Deleting entities.') p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete') entities = p | 'read from datastore' >> ReadFromDatastore(project, query) # pylint: disable=expression-not-assigned (entities | 'To Keys' >> beam.Map(lambda entity: entity.key) | 'Delete keys' >> DeleteFromDatastore(project)) p.run() # Pipeline 5: Query the written Entities, verify no results. logging.info( 'Querying for the entities to make sure there are none present.') p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify-deleted') entities = p | 'read from datastore' >> ReadFromDatastore(project, query) assert_that(entities | beam.combiners.Count.Globally(), equal_to([0])) p.run()
def run(argv=None):
  pipeline_args = [
      '--project={0}'.format(PROJECT),
      '--job_name=majesticmillion1',
      '--save_main_session',
      '--staging_location=gs://{0}/staging/'.format(BUCKET),
      '--temp_location=gs://{0}/temp/'.format(BUCKET),
      '--num_workers=4',
      '--runner=DataflowRunner',
      '--inputFile=gs://{0}/Sample_Data/majestic_million.csv'.format(BUCKET),
      '--template_location=gs://{0}/templates/'
      'majestic_million_template'.format(BUCKET),
      '--zone=australia-southeast1-a'
      # '--region=australia-southeast1',
  ]
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  inbound_options = pipeline_options.view_as(FileLoader)
  input = inbound_options.inputFile

  with beam.Pipeline(options=pipeline_options) as p:
    TLD_Desc = (
        p
        | 'Read TLD Description File' >> beam.io.ReadFromText(TLDFile)
        | 'Parse Descriptions' >> beam.ParDo(combine_TLD())
        | 'Combine Descriptions to Dictionary' >> beam.CombineGlobally(
            combine_pdict))

    excludedTLDs = (
        p
        | 'Read excluded TLD file' >> beam.io.ReadFromText(excludedTLDFile)
        | 'Get list of excluded TLDs' >> beam.ParDo(lambda x: x.split(',')))

    # Extract records as dictionaries.
    records = (
        p
        | 'Read File' >> beam.io.ReadFromText(input, skip_header_lines=1)
        | 'Parse CSV' >> beam.ParDo(Split(), SCHEMA)
        | 'Add Descriptions' >> beam.ParDo(
            AddDTLDDesc(), beam.pvalue.AsSingleton(TLD_Desc)))

    # Write TLD aggregations to BigQuery.
    (records
     | 'Aggregate TLDS' >> CountTLDs(excludedTLDs)
     | 'Write TLDs to BigQuery' >> beam.io.WriteToBigQuery(
         '{0}:{1}.TLDCounts'.format(PROJECT, DATASET),  # Enter your table name.
         schema=TLD_SCHEMA,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    # Write all records to BigQuery.
    (records
     | 'Write Items BQ' >> beam.io.WriteToBigQuery(
         '{0}:{1}.TopSites'.format(PROJECT, DATASET),  # Enter your table name.
         schema=SCHEMA + "," + DESCRIPTIONSCHEMA,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    # Write metadata to Datastore.
    (records
     | 'Get Record Count' >> beam.combiners.Count.Globally()
     | 'Create Metadata' >> beam.ParDo(GetMetaData(inbound_options.inputFile))
     | 'Create DS Entity' >> beam.Map(create_ds_entity)
     | 'Write To DS' >> WriteToDatastore(PROJECT))