def run(argv=None): """Construct the pipeline.""" options = ImportAssetOptions(argv) p = beam.Pipeline(options=options) # Cleanup json documents. sanitized = (p | 'read' >> ReadFromText(options.input, coder=JsonCoder()) | 'produce_resource_json' >> beam.ParDo( ProduceResourceJson(options.group_by)) | 'bigquery_sanitize' >> beam.ParDo(BigQuerySanitize())) # Joining all iam_policy objects with resources of the same name. merged_iam = (sanitized | 'assign_name_key' >> beam.ParDo( AssignGroupByKey('NAME', options.num_shards)) | 'group_by_name' >> beam.GroupByKey() | 'combine_policy' >> beam.ParDo(CombinePolicyResource())) # split into BigQuery tables. keyed_assets = merged_iam | 'assign_group_by_key' >> beam.ParDo( AssignGroupByKey(options.group_by, options.num_shards)) # Generate BigQuery schema for each table. schemas = keyed_assets | 'to_schema' >> core.CombinePerKey( BigQuerySchemaCombineFn()) pvalue_schemas = beam.pvalue.AsDict(schemas) # Write to GCS and load to BigQuery. # pylint: disable=expression-not-assigned (keyed_assets | 'add_load_time' >> beam.ParDo(AddLoadTime(options.load_time)) | 'group_by_key_before_enforce' >> beam.GroupByKey() | 'enforce_schema' >> beam.ParDo(EnforceSchemaDataTypes(), pvalue_schemas) | 'group_by_key_before_write' >> beam.GroupByKey() | 'write_to_gcs' >> beam.ParDo(WriteToGCS(options.stage, options.load_time)) | 'group_written_objects_by_key' >> beam.GroupByKey() | 'delete_tables' >> beam.ParDo( DeleteDataSetTables(options.dataset, options.write_disposition)) | 'load_to_bigquery' >> beam.ParDo( LoadToBigQuery(options.dataset, options.load_time), beam.pvalue.AsDict(schemas))) return p.run()
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(known_args.input) # Count the occurrences of each word. counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(known_args.output) # Actually run the pipeline (all operations above are deferred). result = p.run() result.wait_until_finish() empty_line_values = result.aggregated_values(empty_line_aggregator) logging.info('number of empty lines: %d', sum(empty_line_values.values())) word_length_values = result.aggregated_values(average_word_size_aggregator) logging.info('average word lengths: %s', word_length_values.values())
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True
    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'

    events = None
    if not known_args.topic:
        events = (p
                  | 'read' >> ReadFromText(known_args.input)
                  | 'parse' >> beam.FlatMap(ParseEventFn())
                  | 'add_event_timestamps' >> beam.Map(
                      lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                  | 'read' >> ReadFromPubSub(
                      topic=known_args.topic,
                      timestamp_attribute='timestamp_ms')
                  | 'parse' >> beam.ParDo(ParseEventFn()))

    # [START EXERCISE 6]
    _ = (events
         | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
         # Extract sessions of user data, using known_args.session_gap as the
         # gap duration.
         # https://beam.apache.org/documentation/programming-guide/#provided-windowing-functions
         | 'sessionize' >> ChangeMe()
         | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
         | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
         # Re-window into fixed windows of size user_activity_window in order
         # to compute the mean session duration for that window of activity.
         | 'window_of_sessions' >> ChangeMe()
         | 'session_mean' >> ChangeMe()
         # [END EXERCISE 6]
         | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
         | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             SESSION_SCHEMA))

    p.run().wait_until_finish()
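# One way the three ChangeMe() placeholders above could be filled in (a
# sketch, not this exercise's official solution; it assumes
# known_args.session_gap and known_args.user_activity_window are durations
# in seconds, mirroring Beam's mobile-gaming GameStats example):
#
#     | 'sessionize' >> beam.WindowInto(
#         beam.window.Sessions(known_args.session_gap))
#     ...
#     | 'window_of_sessions' >> beam.WindowInto(
#         beam.window.FixedWindows(known_args.user_activity_window))
#     | 'session_mean' >> beam.CombineGlobally(
#         beam.combiners.MeanCombineFn()).without_defaults()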
def run(argv=None): """Construct the pipeline.""" options = ImportAssetOptions(argv) p = beam.Pipeline(options=options) # Delete bigquery dataset on pipeline start. deleted_tables = ( p | beam.Create([None]) # dummy PCollection to trigger delete tables. | 'delete_tables' >> beam.ParDo( DeleteDataSetTables(options.dataset, options.write_disposition))) # Cleanup json documents. sanitized_assets = ( p | 'read' >> ReadFromText(options.input, coder=JsonCoder()) | 'bigquery_sanitize' >> beam.ParDo(BigQuerySanitize(options.load_time))) # Joining all iam_policy objects with resources of the same name. merged_iam_and_asset = ( sanitized_assets | 'name_key' >> beam.ParDo(AssignGroupByKey('NAME')) | 'group_by_name' >> beam.GroupByKey() | 'combine_policy' >> beam.ParDo(CombinePolicyResource())) # split into BigQuery tables. keyed_assets = merged_iam_and_asset | 'group_by_key' >> beam.ParDo( AssignGroupByKey(options.group_by)) # Generate BigQuery schema for each table. schemas = keyed_assets | 'to_schema' >> core.CombinePerKey( BigQuerySchemaCombineFn()) # Write to GCS and load to BigQuery. # pylint: disable=expression-not-assigned (keyed_assets | 'group_assets_by_key' >> beam.GroupByKey() | 'write_to_gcs' >> beam.ParDo(WriteToGCS(options.stage, options.load_time)) | 'group_written_objets_by_key' >> beam.GroupByKey() | 'load_to_bigquery' >> beam.ParDo( LoadToBigQuery(options.dataset, options.write_disposition, options.load_time), beam.pvalue.AsDict(schemas), beam.pvalue.AsSingleton(deleted_tables))) return p.run()
def run(argv=None):
    p = beam.Pipeline(options=PipelineOptions())

    class Printer(beam.DoFn):
        def process(self, element):
            print(element)

    class Transaction(beam.DoFn):
        def process(self, element):
            # CSV columns: tripduration, starttime, stoptime,
            # start_station_id, start_station_name, start_station_latitude,
            # start_station_longitude, end_station_id, end_station_name,
            # end_station_latitude, end_station_longitude, bikeid, usertype,
            # birth_year, gender.
            t = element.split(',')
            print(len(t))
            # Skip the CSV header row and malformed lines.
            if t[0] != 'tripduration' and len(t) == 15:
                return [{"birth_year": int(t[13]),
                         "birth_year_double": int(t[13]) * 2,
                         "gender": t[14],
                         "gender_reverse": t[14][::-1]}]

    data_from_source = (p
                        | 'Read the source file' >> ReadFromText(
                            'gs://group-2-ross/dataset.csv')
                        | 'Clean the items' >> beam.ParDo(Transaction()))

    project_id = "pe-training"  # replace with your project ID
    dataset_id = 'rstar'  # replace with your dataset ID
    table_id = 'data1'  # replace with your table ID
    table_schema = ('birth_year:INTEGER,birth_year_double:INTEGER,'
                    'gender:STRING,gender_reverse:STRING')

    # Persist to BigQuery.
    # WriteToBigQuery accepts the data as a list of JSON objects.
    data_from_source | 'Write' >> beam.io.WriteToBigQuery(
        table=table_id,
        dataset=dataset_id,
        project=project_id,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        batch_size=100)

    result = p.run()
    result.wait_until_finish()
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # Import this here to avoid pickling the main session. import re # The pipeline will be run on exiting the with block. with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p: # Read the text file[pattern] into a PCollection. lines = p | 'Read' >> ReadFromText(known_args.input) words = ( lines | 'Split' >> beam.FlatMap( lambda line: re.findall(r'[\w]+', line)).with_output_types(str) # Map to Row objects to generate a schema suitable for conversion # to a dataframe. | 'ToRows' >> beam.Map(lambda word: beam.Row(word=word))) df = to_dataframe(words) df['count'] = 1 counted = df.groupby('word').sum() counted.to_csv(known_args.output) # Deferred DataFrames can also be converted back to schema'd PCollections counted_pc = to_pcollection(counted, include_indexes=True) # Print out every word that occurred >50 times _ = (counted_pc | beam.Filter(lambda row: row.count > 50) | beam.Map(lambda row: f'{row.word}: {row.count}') | beam.Map(print))
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        dest="input",
        default="gs://airflow-training-data/land_registry_price_paid_uk/*/*.json",
        help="Input file to process.",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        "--runner=DataflowRunner",
        "--project=gdd-airflow-training",
        "--staging_location=gs://airflow-training-data/dataflow-staging",
        "--temp_location=gs://airflow-training-data/dataflow-temp",
        "--job_name=gcs-gzcomp-to-bq1",
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | "ReadFromGCS" >> ReadFromText(known_args.input, coder=JsonCoder())
         | WriteToBigQuery(
             "result_table",
             dataset="result_dataset",
             project="gdd-airflow-training",
             schema="city:string, "
                    "county:string, "
                    "district:string, "
                    "duration:string, "
                    "locality:string, "
                    "newly_built:boolean, "
                    "paon:string, "
                    "postcode:string, "
                    "ppd_category_type:string, "
                    "price:numeric, "
                    "property_type:string, "
                    "record_status:string, "
                    "saon:string, "
                    "street:string, "
                    "transaction:string, "
                    "transfer_date:numeric",
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
         ))
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_computronix/contractors/{}/{}/{}_contractors_licenses.json'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime('%Y-%m-%d')),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_computronix/contractors/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime('%Y-%m-%d')),
        help='Output directory to write avro files.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened.
    # Use runner=DataflowRunner to run in the GCP environment,
    # DirectRunner to run locally.
    pipeline_args.extend(
        generate_args('computronix-trades-dataflow_scripts',
                      '{}_computronix'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))
    avro_schema = get_schema('contractors_computronix')
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(FormatColumnNames())
                | beam.ParDo(ConvertTypes())
                | beam.io.avroio.WriteToAvro(known_args.avro_output,
                                             schema=avro_schema,
                                             file_name_suffix='.avro',
                                             use_fastavro=True))
def examples_wordcount_templated(renames):
    """Templated WordCount example snippet."""
    import re
    import apache_beam as beam
    from apache_beam.io import ReadFromText
    from apache_beam.io import WriteToText
    from apache_beam.options.pipeline_options import PipelineOptions

    # [START example_wordcount_templated]
    class WordcountTemplatedOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            # Use add_value_provider_argument for arguments to be templatable.
            # Use add_argument as usual for non-templatable arguments.
            parser.add_value_provider_argument(
                '--input', help='Path of the file to read from')
            parser.add_argument(
                '--output', required=True,
                help='Output file to write results to.')

    pipeline_options = PipelineOptions(['--output', 'some/output_path'])
    with beam.Pipeline(options=pipeline_options) as p:
        wordcount_options = pipeline_options.view_as(WordcountTemplatedOptions)
        lines = p | 'Read' >> ReadFromText(wordcount_options.input)
        # [END example_wordcount_templated]

        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        (lines
         | 'ExtractWords' >> beam.FlatMap(
             lambda x: re.findall(r'[A-Za-z\']+', x))
         | 'PairWithOnes' >> beam.Map(lambda x: (x, 1))
         | 'Group' >> beam.GroupByKey()
         | 'Sum' >> beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1])))
         | 'Format' >> beam.Map(format_result)
         | 'Write' >> WriteToText(wordcount_options.output))

        p.visit(SnippetUtils.RenameFiles(renames))
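# Because --input is declared with add_value_provider_argument, its value is
# resolved at template execution time rather than at graph construction. A
# hypothetical invocation of a staged Dataflow template built from this
# snippet (the job name, template path, and input below are placeholders):
#
#   gcloud dataflow jobs run templated-wordcount \
#       --gcs-location gs://my-bucket/templates/wordcount_template \
#       --parameters input=gs://dataflow-samples/shakespeare/kinglear.txt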
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        dest="input",
        help="Input file to process.",
    )
    parser.add_argument(
        "--table",
        dest="table",
        help="Destination BigQuery table",
    )
    parser.add_argument(
        "--dataset",
        dest="dataset",
        help="Destination BigQuery dataset",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | "ReadFromGCS" >> ReadFromText(known_args.input, coder=JsonCoder())
         | WriteToBigQuery(
             known_args.table,
             dataset=known_args.dataset,
             schema="city:string, "
                    "county:string, "
                    "district:string, "
                    "duration:string, "
                    "locality:string, "
                    "newly_built:boolean, "
                    "paon:string, "
                    "postcode:string, "
                    "ppd_category_type:string, "
                    "price:numeric, "
                    "property_type:string, "
                    "record_status:string, "
                    "saon:string, "
                    "street:string, "
                    "transaction:string, "
                    "transfer_date:numeric",
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED))
def run(archivo, mifecha):
    gcs_path = "gs://ct-sensus"  # Root of the bucket.
    gcs_project = "contento-bi"
    mi_runer = ("DirectRunner",
                "DataflowRunner")[socket.gethostname() == "contentobi"]

    pipeline = beam.Pipeline(
        runner=mi_runer,
        argv=[
            "--project", gcs_project,
            "--staging_location", ("%s/dataflow_files/staging_location" % gcs_path),
            "--temp_location", ("%s/dataflow_files/temp" % gcs_path),
            "--output", ("%s/dataflow_files/output" % gcs_path),
            "--setup_file", "./setup.py",
            "--max_num_workers", "5",
            "--subnetwork", "https://www.googleapis.com/compute/v1/projects/contento-bi/regions/us-central1/subnetworks/contento-subnet1"
            # "--num_workers", "30",
            # "--autoscaling_algorithm", "NONE"
        ])

    # Read the input file, skipping the header row.
    lines = pipeline | 'Lectura de Archivo' >> ReadFromText(
        archivo, skip_header_lines=1)

    transformed = (lines
                   | 'Formatear Data' >> beam.ParDo(formatearData(mifecha)))

    transformed | 'Escritura a BigQuery lal' >> beam.io.WriteToBigQuery(
        gcs_project + ":sensus.dcorreo",
        schema=TABLE_SCHEMA,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    jobObject = pipeline.run()
    return ("Corrio Full HD")
def run(argv=None): """Runs the debugging wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: # Read the text file[pattern] into a PCollection, count the occurrences of # each word and filter by a list of words. filtered_words = ( p | 'read' >> ReadFromText(known_args.input) | CountWords() | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach'))) # assert_that is a convenient PTransform that checks a PCollection has an # expected value. Asserts are best used in unit tests with small data sets # but is demonstrated here as a teaching tool. # # Note assert_that does not provide any output and that successful # completion of the Pipeline implies that the expectations were met. Learn # more at https://cloud.google.com/dataflow/pipelines/testing-your-pipeline # on how to best test your pipeline. assert_that(filtered_words, equal_to([('Flourish', 3), ('stomach', 1)])) # Format the counts into a PCollection of strings and write the output using # a "Write" transform that has side effects. # pylint: disable=unused-variable output = (filtered_words | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) | 'write' >> WriteToText(known_args.output))
def run(argv=None):
    known_args, pipeline_options, avro_schema = generate_args(
        job_name='computronix-trades-dataflow',
        bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='trade_licenses_computronix')

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(FormatColumnNames())
                | beam.ParDo(ConvertTypes())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
def run(argv=None): """Main entry point; defines and runs the entity extraction pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', required=True, help='Input JSON to read') parser.add_argument('--output', required=True, help='Output file to write results to') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: records = p | 'ReadRecords' >> ReadFromText(known_args.input) entities = records | 'ExtractEntities' >> beam.ParDo( EntityExtraction()) entities | 'WriteEntities' >> WriteToText( known_args.output, file_name_suffix='entities.json.gz')
def run(argv=None):
    # Argument parser. The pipeline writes to --output_path, so that argument
    # must be declared for known_args to carry it.
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_path',
                        dest='output_path',
                        required=True,
                        help='GCS path to write results to.')

    # Pipeline options, google_cloud_options.
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    p = beam.Pipeline(options=pipeline_options)

    rec = (p
           | "read from GCS" >> ReadFromText(
               "gs://bucket_name/folder_path/file*.csv")
           | "transform" >> beam.Map(lambda x: x)
           | "write to GCS" >> WriteToText(known_args.output_path))

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'read' >> ReadFromText(known_args.input)
         | 'copy' >> WriteToText(known_args.output))
def main(input: str, table: str, project: str, pipeline_args):
    pipeline_args = list(pipeline_args)
    pipeline_args.extend([
        "--runner=DataflowRunner",
        f"--project={project}",
        "--temp_location=gs://recsys2020-challenge-wantedly/temp",
        "--staging_location=gs://recsys2020-challenge-wantedly/staging",
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(input)
        rows = lines | "Parse" >> beam.ParDo(ParseFn(COLUMNS))
        rows | beam.io.gcp.bigquery.WriteToBigQuery(
            table,
            schema=schema(),
            project=project,
        )
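# The schema() helper above is defined elsewhere in the project; one
# hypothetical shape for it, assuming every column in COLUMNS is written as
# a string (illustrative only, not the project's actual definition):
def schema():
    return ', '.join('{}:STRING'.format(name) for name in COLUMNS)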
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', help='Input file to process.', default='../input/smallMat.json') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') parser.add_argument('--runner', dest='runner', default='DirectRunner', help='Runner class to use (DataflowRunner for GCP).') parser.add_argument('--project', dest='project', help='GCP project for Cloud Dataflow') parser.add_argument( '--num-results', dest='num_results', default=1000, type=int, help='Number of results top results to keep per column.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_args.extend([ '--runner={}'.format(known_args.runner), '--project={}'.format(known_args.project), '--staging_location=gs://beam-matmul/staging', '--temp_location=gs://beam-matmul/temp', '--job_name=matmul-side-input', ]) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: files = p | 'Read ls List' >> ReadFromText(known_args.input) mat = files | 'Parse Columns' >> beam.Map(json.loads) (mat | 'Calc Scores' >> beam.Map(calc_scores, beam.pvalue.AsIter(mat), known_args.num_results) | 'Serialize as JSON' >> beam.Map(json.dumps) | 'Write Scores' >> WriteToText(known_args.output))
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-kga369-207722/twitter_data_3.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        default='$BUCKET/output',
        help='Output location, e.g. kga369-207722:MindValley_Project.Twitter')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DirectRunner',
        '--project=$PROJECT',
        '--staging_location=$BUCKET/staging',
        '--temp_location=$BUCKET/temp',
        '--job_name=your-wordcount-job',
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input)

        # Count the occurrences of each hashtag.
        counts = (lines
                  | 'Split' >> (beam.FlatMap(
                      lambda x: re.findall(r'[#]+[A-Za-z\']+', x))
                      .with_output_types(str))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(known_args.output)
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument( '--output', dest='output', # CHANGE 1/5: The Google Cloud Storage path is required # for outputting the results. default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: # Read the text file[pattern] into a PCollection. lines = p | ReadFromText(known_args.input) # Count the occurrences of each word. counts = (lines | 'Split' >> (beam.FlatMap(lambda x: re.findall( r'[A-Za-z\']+', x)).with_output_types(unicode)) | 'PairWithOne' >> beam.Map(lambda x: (x, 1)) | 'GroupAndSum' >> beam.CombinePerKey(sum)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %s' % (word, count) output = counts | 'Format' >> beam.Map(format_result) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | WriteToText(known_args.output)
def run_wordcount_without_save_main_session(argv):
    """Defines and runs a simple version of the wordcount pipeline.

    This pipeline is the same as the wordcount example except that it replaces
    the customized DoFn class with plain transform functions and disables the
    save_main_session option due to BEAM-6158.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://dataflow-samples/shakespeare/kinglear.txt',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    # Parse each line of input text into words.
    def extract_words(line):
        return re.findall(r'[\w\']+', line, re.UNICODE)

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    # extract_words is a plain function, so it is applied with FlatMap
    # (beam.ParDo expects a DoFn instance).
    # pylint: disable=expression-not-assigned
    (p
     | 'read' >> ReadFromText(known_args.input)
     | 'split' >> (beam.FlatMap(extract_words).with_output_types(str))
     | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
     | 'group' >> beam.GroupByKey()
     | 'count' >> beam.Map(count_ones)
     | 'format' >> beam.Map(format_result)
     | 'write' >> WriteToText(known_args.output))

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_finance/{}/{}/{}_registered_businesses.csv'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime('%Y-%m-%d')),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_finance/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime('%Y-%m-%d')),
        help='Output directory to write avro files.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened.
    # Use runner=DataflowRunner to run in the GCP environment,
    # DirectRunner to run locally.
    pipeline_args.extend(
        generate_args('registered-businesses-dataflow_scripts',
                      '{}_finance'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))
    avro_schema = get_schema('registered_businesses')
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, skip_header_lines=1)

        load = (lines
                | beam.ParDo(ConvertToDicts())
                | beam.ParDo(AddNormalizedAddress())
                | beam.io.avroio.WriteToAvro(known_args.avro_output,
                                             schema=avro_schema,
                                             file_name_suffix='.avro',
                                             use_fastavro=True))
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        required=True,
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    # The pipeline will be run on exiting the with block.
    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | 'Read' >> ReadFromText(known_args.input)

        counts = (
            lines
            | 'Split Words' >> (beam.ParDo(WordExtractingDoFn())
                                .with_output_types(str))
            | 'Pair With One' >> beam.Map(lambda x: (x, 1))
            | 'Group And Sum' >> beam.CombinePerKey(sum)
            | 'Find Top 3 Most Frequent Words' >> beam.CombineGlobally(
                beam.combiners.TopCombineFn(
                    n=3, compare=lambda a, b: a[1] < b[1])).without_defaults())

        # Format the counts into a PCollection of strings. Note that each
        # element of `counts` is a single list of up to three (word, count)
        # tuples, which is why the per-tuple Format step below stays commented
        # out and the list is written directly.
        def format_result(word, count):
            return '%s: %d' % (word, count)

        # output = counts | 'Format' >> beam.MapTuple(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output = counts | 'Write' >> WriteToText(known_args.output)
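# A sketch of how the commented-out Format step above could be made to work:
# flatten the single top-3 list before formatting each (word, count) pair
# (assumes the surrounding pipeline from the previous snippet):
#
#   output = (counts
#             | 'Flatten Top 3' >> beam.FlatMap(lambda top_words: top_words)
#             | 'Format' >> beam.MapTuple(format_result)
#             | 'Write' >> WriteToText(known_args.output))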
def run(argv=None): """Main entry point; defines and runs the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', help='Input file to process.') parser.add_argument('--output', dest='output', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: (p # pylint: disable=expression-not-assigned | 'read' >> ReadFromText(known_args.input, coder=JsonCoder()) | 'compute' >> beam.FlatMap(compute) | 'write' >> WriteToText(known_args.output, shard_name_template=''))
def run(argv=None):
    p = beam.Pipeline(options=PipelineOptions())

    class Printer(beam.DoFn):
        def process(self, element):
            print(element)

    class Transaction(beam.DoFn):
        def process(self, element):
            t = element.split(',')
            # Skip the CSV header row.
            if t[0] != 'Place':
                return [{"Place": t[0],
                         "Gender": t[1],
                         "Year": t[2],
                         "Name": t[3][::-1],
                         "Number": int(t[4]) * 22}]

    data_from_source = (p
                        | 'Read the source file' >> ReadFromText(
                            'gs://group4-bucket/Group4Data1.csv')
                        | 'Clean the items' >> beam.ParDo(Transaction()))

    project_id = "pe-training"  # replace with your project ID
    dataset_id = 'group4dataset1'  # replace with your dataset ID
    table_id = 'ayushtable'  # replace with your table ID
    table_schema = ('Place:STRING,Gender:STRING,Year:INTEGER,'
                    'Name:STRING,Number:INTEGER')

    # Persist to BigQuery.
    # WriteToBigQuery accepts the data as a list of JSON objects.
    data_from_source | 'Write' >> beam.io.WriteToBigQuery(
        table=table_id,
        dataset=dataset_id,
        project=project_id,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        batch_size=100)

    result = p.run()
    result.wait_until_finish()
def run():
    pipeline = beam.Pipeline()

    # Read lines from input file.
    lines = pipeline | 'read' >> ReadFromText('data/king_arthur.txt')

    # Count words.
    counts = (lines
              | 'split' >> beam.FlatMap(
                  lambda x: re.findall(r'[A-Za-z\']+', x)).with_output_types(str)
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(
                  lambda word_ones: (word_ones[0], sum(word_ones[1]))))

    # Format and write to output file.
    output = counts | 'format' >> beam.Map(
        lambda word_count: '{}: {}'.format(*word_count))
    output | 'write' >> WriteToText('minimal-wordcount-output', '.txt')

    pipeline.run().wait_until_finish()
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    window_duration = 1 * 60  # 1 minute windows.
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True
    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'

    events = None
    if not known_args.topic:
        events = (
            p
            | 'read' >> ReadFromText(known_args.input)
            | 'parse' >> beam.FlatMap(ParseEventFn())
            | 'add_event_timestamps' >> beam.Map(
                lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                  | 'read' >> ReadFromPubSub(
                      topic=known_args.topic,
                      timestamp_attribute='timestamp_ms')
                  | 'decode' >> beam.ParDo(ParseEventFn()))

    # Window team scores and write them to BigQuery.
    _ = (events
         | 'windowed_team_score' >> WindowedTeamScore(window_duration)
         | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
         | 'write_teams_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename + '_team', known_args.output_dataset,
             project, TEAM_SCHEMA))

    # Write leaderboards to BigQuery.
    _ = (events
         | 'running_user_score' >> RunningUserScores()
         | 'format_user_scores' >> beam.ParDo(FormatUserScoreSum())
         | 'write_users_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename + '_user', known_args.output_dataset,
             project, USER_SCHEMA))

    p.run().wait_until_finish()
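# WindowedTeamScore is defined elsewhere in this exercise; a hypothetical
# sketch of what such a transform typically looks like (modeled on Beam's
# mobile-gaming examples, not necessarily this repo's definition):
class WindowedTeamScore(beam.PTransform):
    """Sums team scores within fixed windows of `duration` seconds."""

    def __init__(self, duration):
        super().__init__()
        self.duration = duration

    def expand(self, pcoll):
        return (pcoll
                | 'window' >> beam.WindowInto(
                    beam.window.FixedWindows(self.duration))
                | 'extract_team_score' >> beam.Map(lambda e: (e.team, e.score))
                | 'sum_scores' >> beam.CombinePerKey(sum))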
def main(argv=None):
    options = PipelineOptions(flags=argv)

    # Setup options control packaging of dependencies for the workers.
    setup_options = options.view_as(SetupOptions)
    # setup_options.setup_file = './setup.py'
    # setup_options.save_main_session = False

    wordcount_options = options.view_as(WordcountOptions)
    with beam.Pipeline(options=options) as p:
        lines = p | 'read' >> ReadFromText(wordcount_options.input)
def run(input_path, output_path, expansion_service_port, pipeline_args):
    pipeline_options = PipelineOptions(pipeline_args)
    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | 'Read' >> ReadFromText(input_path).with_output_types(str)
        words = lines | 'Split' >> (beam.ParDo(WordExtractingDoFn())
                                    .with_output_types(str))

        # Apply a cross-language transform served by a Java expansion service.
        java_output = (words
                       | 'JavaCount' >> beam.ExternalTransform(
                           'beam:transform:org.apache.beam:javacount:v1',
                           None,
                           'localhost:%s' % expansion_service_port))

        def format_kv(kv):
            key, value = kv
            return '%s:%s' % (key, value)

        output = java_output | 'Format' >> beam.Map(format_kv)
        output | 'Write' >> WriteToText(output_path)
def run(argv=None): """pipeline abandoned-carts""" """Definicao argumentos de entrada do pipeline""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', required=True, help='Arquivo de entrada para o processamento.') parser.add_argument( '--output', dest='output', required=True, help='Arquivo de saida com o resultado do processamento.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) """Le arquivo""" lines = p | 'le_arquivo' >> ReadFromText(known_args.input, coder=JsonCoder()) """Filtra todos os clientes que fizeram checkout, e traz o customer como chave""" filtros_checkout = ( lines | 'filtra_pagina_checkout' >> beam.Filter(lambda x: (x['page'] == 'checkout')) | 'gera_chave_customer' >> beam.Map(lambda x: (x['customer'], x))) """Traz o customer como chave para todos os registros""" filtros_todos = ( lines | 'gera_chave_customer_geral' >> beam.Map(lambda x: (x['customer'], x))) """Faz o left join trazendo todos os registros que nao tem checkout""" results = ((filtros_todos, filtros_checkout) | 'left_join' >> beam.CoGroupByKey() | 'filtro_sem_checkout' >> beam.Filter(lambda x: not (x[1][1])) | 'formata_saida' >> beam.Map(lambda x: (x[1][0][-1]))) """Grava o resultado num arquivo json""" results | 'grava_resultado' >> WriteToText( known_args.output, coder=JsonCoder(), file_name_suffix='.json') result_run = p.run() result_run.wait_until_finish()