def run():
    PROJECT_ID = 'electric-spark-266716'  # change to your project id
    BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'country-beam-dataflow'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    # create query to select all elements for cleansing
    sql = 'SELECT dt, AverageTemperature, AverageTemperatureUncertainty, Country FROM kaggle_modeled.Country as x'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    # read desired table from BigQuery
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write inputs to input_country.txt
    query_results | 'Write input' >> WriteToText(DIR_PATH + 'input_country.txt')

    # apply ParDo to filter out dates
    formatted_country_pcoll = query_results | 'Filter Dates' >> beam.ParDo(FilterDateFn())

    # display filtered countries
    formatted_country_pcoll | 'Write filtered dates' >> WriteToText(DIR_PATH + 'output_country.txt')

    # create new table in BigQuery
    dataset_id = 'kaggle_modeled'
    table_id = 'Country_Beam_DF'
    schema_id = 'dt:DATE,AverageTemperature:FLOAT,AverageTemperatureUncertainty:FLOAT,Country:STRING'

    # write PCollection to new BQ table
    formatted_country_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'takes-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = Pipeline(options=options)

    takes_sql = 'SELECT sid, cno, grade FROM college_workflow_modeled.Takes'
    class_sql = 'SELECT cid, cno FROM college_workflow_modeled.Class'

    takes_pcoll = p | 'Read from BQ Takes' >> beam.io.Read(
        beam.io.BigQuerySource(query=takes_sql, use_standard_sql=True))
    class_pcoll = p | 'Read from BQ Class' >> beam.io.Read(
        beam.io.BigQuerySource(query=class_sql, use_standard_sql=True))

    # write PCollections to log files
    takes_pcoll | 'Write log 1' >> WriteToText(DIR_PATH + 'takes_query_results.txt')
    class_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'class_query_results.txt')

    # ParDo with side-input
    norm_takes_pcoll = takes_pcoll | 'Normalize Record' >> beam.ParDo(
        NormalizeDoFn(), beam.pvalue.AsList(class_pcoll))

    # write PCollection to log file
    norm_takes_pcoll | 'Write log 3' >> WriteToText(DIR_PATH + 'norm_takes_pcoll.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Takes_Beam_DF'
    schema_id = 'sid:STRING,cid:STRING,grade:STRING'

    # write PCollection to new BQ table
    norm_takes_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'student-df2'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT sid, fname, lname, dob, status FROM college_workflow_modeled.Student'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # standardize the students' date of birth
    formatted_dob_pcoll = query_results | 'Format DOB' >> beam.ParDo(FormatDOBFn())

    # write PCollection to log file
    formatted_dob_pcoll | 'Write log 1' >> WriteToText(DIR_PATH + 'formatted_dob_pcoll.txt')

    # group students by sid
    grouped_student_pcoll = formatted_dob_pcoll | 'Group by sid' >> beam.GroupByKey()

    # write PCollection to log file
    # grouped_student_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'grouped_student_pcoll.txt')

    # remove duplicate student records
    distinct_student_pcoll = grouped_student_pcoll | 'Dedup student records' >> beam.ParDo(DedupStudentRecordsFn())

    # write PCollection to log file
    distinct_student_pcoll | 'Write log 3' >> WriteToText(DIR_PATH + 'distinct_student_pcoll.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Student_Beam_DF'
    schema_id = 'sid:STRING,fname:STRING,lname:STRING,dob:DATE,status:STRING'

    # write PCollection to new BQ table
    distinct_student_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'teacher-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT tid, instructor, dept FROM college_workflow_modeled.Teacher'
    query_results = p | 'Read from BigQuery' >> beam.io.Read(
        beam.io.BigQuerySource(query=sql, use_standard_sql=True))

    query_results | 'Write log 1' >> WriteToText('query_results.txt')

    teacher_pcoll = query_results | 'Standardize' >> beam.ParDo(StandardizeDoFn())

    teacher_pcoll | 'Write log 2' >> WriteToText('formatted_teacher_pcoll.txt')

    # group records by tid
    grouped_pcoll = teacher_pcoll | 'Group by tid' >> beam.GroupByKey()

    grouped_pcoll | 'Write log 3' >> WriteToText('grouped_teacher.txt')

    # remove duplicates
    distinct_pcoll = grouped_pcoll | 'Dedup' >> beam.ParDo(DedupRecordsDoFn())

    distinct_pcoll | 'Write log 4' >> WriteToText('distinct_teacher.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Teacher_Beam_DF'
    schema_id = 'tid:STRING,fname:STRING,lname:STRING,dept:STRING'

    distinct_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'studied-brand-266702'  # change to your project id
    BUCKET = 'gs://beam_cs327e_project'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'vaccination-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    # run BigQuery query on dataset
    sql = 'SELECT * FROM vaers_modeled.Vaccination'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    input_pcoll = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write input PCollection to input_vaccination.txt
    input_pcoll | 'Write input_pcoll log 1' >> WriteToText(DIR_PATH + 'input_vaccination.txt')

    # standardize vaccination V_FUNDBY, VAX_ROUTE and VAX_SITE unknown/empty attribute
    formatted_vaccination_pcoll = input_pcoll | 'Format Unknown Values' >> beam.ParDo(FormatUnknownFn())

    # write PCollection to log file
    formatted_vaccination_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'formatted_unknown_pcoll.txt')

    # specify id and schema
    dataset_id = 'vaers_modeled'
    table_id = 'Vaccination_Beam_DF'
    schema_id = 'VACCINATION_ID:INTEGER, VAERS_ID:INTEGER, VAX_DATE:DATE, VAX_ID:INTEGER, MANU_ID:INTEGER, V_ADMINBY:STRING, V_FUNDBY:STRING, VAX_ROUTE:STRING, VAX_SITE:STRING'

    # write output PCollection to new BQ table
    formatted_vaccination_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'starry-center-266501'  # change to your project id
    BUCKET = 'gs://imdb-beam'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'format-date-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT * FROM bollywood_modeled.bollywoodTitles'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write PCollection to log file
    query_results | 'Write log 1' >> WriteToText(DIR_PATH + 'query_results.txt')

    # apply ParDo to format each title's release date
    formatDate_pcoll = query_results | 'Format the dates' >> beam.ParDo(FormatDateFn())

    # write PCollection to log file
    formatDate_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'formatDate_pcoll.txt')

    dataset_id = 'bollywood_modeled'
    table_id = 'bollywoodTitles_Beam_DF'
    schema_id = 'title:STRING,releaseDate:DATE,croresGrossed:NUMERIC'

    # write PCollection to new BQ table
    formatDate_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'swift-area-266618'  # change to your project id
    BUCKET = 'gs://nullbusters_data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'student-df5'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT imdb_title_id, title, original_title, year, genre, duration, country, language, director, writer, production_company, actors, description, avg_vote, votes, budget, usa_gross_income, worlwide_gross_income, metascore, reviews_from_users, reviews_from_critics FROM imdb_modeled.Movies WHERE usa_gross_income IS NOT NULL and worlwide_gross_income IS NOT NULL'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    query_results | 'Write log Input' >> WriteToText('input.txt')

    # apply ParDo to format the movies' year columns as ints
    formatted_year_pcoll = query_results | 'Format Years' >> beam.ParDo(FormatYearsFn())

    # write PCollection to log file
    formatted_year_pcoll | 'Write log Output' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Movies_Beam_DF'
    schema_id = 'imdb_title_id:STRING, title:STRING, original_title:STRING, year:INTEGER, genre:STRING, duration:INTEGER, country:STRING, language:STRING, director:STRING, writer:STRING, production_company:STRING, actors:STRING, description:STRING, avg_votes:FLOAT, votes:INTEGER, budget_currency:STRING, budget:INTEGER, usa_gross_income:INTEGER, worlwide_gross_income_currency:STRING, worlwide_gross_income:INTEGER, metascore:FLOAT, reviews_from_users:FLOAT, reviews_from_critics:FLOAT'

    # write PCollection to new BQ table
    formatted_year_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'responsive-cab-267123'  # change to your project id
    BUCKET = 'gs://bmease_cs327e'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'foodmap-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the beam pipeline with the Dataflow options
    p = Pipeline(options=options)

    # select food products, excluding non-food aisles
    sql = "SELECT LOWER(product_name) AS product_name, product_id, p.aisle_id FROM instacart_modeled.Products p WHERE p.product_name not like '%Filters%' and p.aisle_id NOT IN (11,20,22,25,44,55,73,80,109,118,126,127,132,133,10,54,60,74,75,85,87,101,111,114,56,82,102)"
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write PCollection to input file
    query_results | 'Write to input.txt' >> WriteToText(DIR_PATH + 'input.txt')

    # apply ParDo to match each Instacart product to a food id
    nom_match_pcoll = query_results | 'Food and matches from nom' >> beam.ParDo(MatchProductFn())

    # write PCollection to output file
    nom_match_pcoll | 'Write to output.txt' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'USDA_ERS_modeled'
    table_id = 'Food_Map_Beam_DF'
    schema_id = 'food_id:INTEGER,product_id:INTEGER'

    # write PCollection to new BQ table
    nom_match_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'swift-area-266618'  # change to your project id
    BUCKET = 'gs://nullbusters_data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'directors'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT name, birth_name, height, bio, birth_details, birth_year, place_of_birth, death_details, death_year, spouses, divorces, children, known_for_titles, imdb_title_id, director_name_id, category, reason_of_death FROM imdb_modeled.Directors WHERE birth_year IS NOT NULL AND death_year IS NOT NULL'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    query_results | 'Write log Input' >> WriteToText('input.txt')

    # apply ParDo to format directors birth year and death years to be ints
    formatted_year_pcoll = query_results | 'Format Years' >> beam.ParDo(FormatYearsFn())

    # write PCollection to log file
    formatted_year_pcoll | 'Write log Output' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Directors_Beam_DF'
    schema_id = 'name:STRING,birth_name:STRING,height:FLOAT,bio:STRING,birth_details:STRING,birth_year:INTEGER,place_of_birth:STRING,death_details:STRING,death_year:INTEGER,spouses:INTEGER,divorces:INTEGER,children:STRING,known_for_titles:STRING,imdb_title_id:STRING,category:STRING,reason_of_death:STRING,director_name_id:STRING'

    # write PCollection to new BQ table
    formatted_year_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'location'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = Pipeline(options=options)

    sql = 'SELECT * FROM covid_19_modeled.Location_SQL_1'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # extract city from state
    state_pcoll = query_results | 'Format State' >> beam.ParDo(FormatStateFn())

    grouped_pcoll = state_pcoll | 'Group Locations' >> beam.GroupByKey()

    unique_pcoll = grouped_pcoll | 'Remove Duplicates' >> beam.ParDo(RemoveDuplicatesFn())

    dataset_id = 'covid_19_modeled'
    table_id = 'Location_Beam_DF'
    schema_id = 'id:INTEGER,city:STRING,state:STRING,country:STRING,latitude:NUMERIC,longitude:NUMERIC,fips:INTEGER,admin2:STRING,combined_key:STRING'

    # write PCollection to BQ table
    unique_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'event'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = Pipeline(options=options)

    sql = 'SELECT * FROM covid_19_modeled.Event_SQL_1'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # format timestamp
    ts_pcoll = query_results | 'Format Timestamp' >> beam.ParDo(FormatTimestampFn())

    # group by primary key
    grouped_pcoll = ts_pcoll | 'Group by PK' >> beam.GroupByKey()

    # remove duplicate records
    unique_pcoll = grouped_pcoll | 'Remove Duplicates' >> beam.ParDo(RemoveDuplicatesFn())

    # write new PCollection to log file
    unique_pcoll | 'Write log' >> WriteToText(DIR_PATH + 'unique_pcoll.txt')

    dataset_id = 'covid_19_modeled'
    table_id = 'Event_Beam_DF'
    schema_id = 'location_id:INTEGER,last_update:DATETIME,confirmed:INTEGER,deaths:INTEGER,recovered:INTEGER,active:INTEGER'

    # write PCollection to BQ table
    unique_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'spry-cosine-266801'
    BUCKET = 'gs://icyhot-pack_beam'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'location-df'  # dataflow does not like '_' or special characters
    google_cloud_options.staging_location = BUCKET + '/staging'  # req*
    google_cloud_options.temp_location = BUCKET + '/temp'  # req*
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT id, province_state, country_region FROM covid19_jhu_csse_modeled.location_id'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # format US
    formatted_us_pcoll = query_results | 'Format US' >> beam.ParDo(FormatUSFn())

    # write PCollection to log file
    formatted_us_pcoll | 'Write log 1' >> WriteToText(DIR_PATH + 'formatted_us_pcoll.txt')

    dataset_id = 'covid19_jhu_csse_modeled'
    table_id = 'location_id_Beam_DF'
    schema_id = 'id:INTEGER, province_state:STRING, country_region:STRING'

    # write PCollection to new BQ table
    formatted_us_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'starry-center-266501'  # change to your project id
    BUCKET = 'gs://imdb-beam'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'split-characters-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT DISTINCT tConst, nConst, job, characters FROM imdb_modeled.Characters'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # apply ParDo to split the character listings
    # call pardo, pipe query results to pardo
    split_characters_pcoll = query_results | 'Return character dictionaries' >> beam.ParDo(SplitCharactersFn())

    # write PCollection to log file
    split_characters_pcoll | 'Write log 1' >> WriteToText(DIR_PATH + 'split_characters_pcoll.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Characters_Beam_DF'
    schema_id = 'tConst:STRING, nConst:STRING, job:STRING, characters:STRING'

    # write PCollection to new BQ table
    split_characters_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
def run():
    pipeline_options = PipelineOptions()
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = settings['gcp']['project']
    google_cloud_options.job_name = settings['job']['name']
    google_cloud_options.staging_location = f"gs://{settings['gcp']['bucket']}/staging"
    google_cloud_options.temp_location = f"gs://{settings['gcp']['bucket']}/temp"
    pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipe:
        text = pipe | 'Read from text file' >> beam.io.ReadFromText(
            f"gs://twitter_data_etl/stuff/{settings['job']['input']}")

        tokens = (
            text
            | 'Remove RT from text' >> beam.Map(strip_extra, chars='RT')
            | 'Remove twitter handles' >> beam.Regex.replace_all(r'@[a-zA-Z0-9_]+', "")
            | 'Remove all url links' >> beam.Regex.replace_all(r'http[s]*://[a-zA-Z0-9_\.\/]+', '')
            | 'Remove punctuation' >> beam.Map(
                lambda text: text.translate(str.maketrans("", "", string.punctuation)))
            | 'Remove all tabs' >> beam.Map(lambda text: text.replace("\t", ""))
            | 'Remove periods (not removed in punctuation step?)' >> beam.Map(
                lambda text: text.replace(".", ""))
            | 'Make all lowercase' >> beam.Map(lambda text: text.lower())
            | 'Split tweets into words' >> beam.ParDo(ExtractWordFromTweets()))

        words = (
            tokens
            | 'Prepare word tuples' >> beam.Map(lambda word: (word, 1))
            | 'Group and sum the words to get counts' >> beam.CombinePerKey(sum)
            | 'Save to file' >> beam.io.WriteToText(settings['job']['output_words']))

        # This worked locally ... was having issues with getting the worker package installation in dataflow
        # emojis = (tokens | 'Filter to keep emojis only' >> beam.Filter(lambda token: token if token in emoji.UNICODE_EMOJI else False)
        #           | 'Prepare emoji tuples' >> beam.Map(lambda emoji: (emoji, 1))
        #           | 'Group and sum the emojis to get counts' >> beam.CombinePerKey(sum)
        #           | 'Save emojis to text' >> beam.io.WriteToText(settings['job']['output_emojis'])
        #           )

    # The `with` block runs the pipeline and waits for completion on exit,
    # so no explicit pipe.run() / wait_until_finish() call is needed here.
def main(unused_argv):
    PyPIArtifactRegistry.register_artifact('beautifulsoup4', '>=4.9,<5.0')
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p', '--port', type=int, help='port on which to serve the job api')
    parser.add_argument('--fully_qualified_name_glob', default=None)
    options = parser.parse_args()

    global server
    with fully_qualified_named_transform.FullyQualifiedNamedTransform.with_filter(
            options.fully_qualified_name_glob):
        server = grpc.server(thread_pool_executor.shared_unbounded_instance())
        beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
            expansion_service.ExpansionServiceServicer(
                PipelineOptions(
                    ["--experiments", "beam_fn_api", "--sdk_location", "container"])),
            server)
        beam_artifact_api_pb2_grpc.add_ArtifactRetrievalServiceServicer_to_server(
            artifact_service.ArtifactRetrievalService(
                artifact_service.BeamFilesystemHandler(None).file_reader),
            server)
        server.add_insecure_port('localhost:{}'.format(options.port))
        server.start()
        _LOGGER.info('Listening for expansion requests at %d', options.port)

        signal.signal(signal.SIGTERM, cleanup)
        signal.signal(signal.SIGINT, cleanup)
        # blocking main thread forever.
        signal.pause()
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p', '--port', type=int, help='port on which to serve the job api')
    parser.add_argument('--fully_qualified_name_glob', default=None)
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(
        pipeline_args + ["--experiments=beam_fn_api", "--sdk_location=container"])

    with fully_qualified_named_transform.FullyQualifiedNamedTransform.with_filter(
            known_args.fully_qualified_name_glob):
        server = grpc.server(thread_pool_executor.shared_unbounded_instance())
        beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
            expansion_service.ExpansionServiceServicer(pipeline_options), server)
        beam_artifact_api_pb2_grpc.add_ArtifactRetrievalServiceServicer_to_server(
            artifact_service.ArtifactRetrievalService(
                artifact_service.BeamFilesystemHandler(None).file_reader),
            server)
        server.add_insecure_port('localhost:{}'.format(known_args.port))
        server.start()
        _LOGGER.info('Listening for expansion requests at %d', known_args.port)

        def cleanup(unused_signum, unused_frame):
            _LOGGER.info('Shutting down expansion service.')
            server.stop(None)

        signal.signal(signal.SIGTERM, cleanup)
        signal.signal(signal.SIGINT, cleanup)
        # blocking main thread forever.
        signal.pause()
def test_retry_fork_graph(self):
    pipeline_options = PipelineOptions(['--direct_runner_bundle_retry'])
    p = beam.Pipeline(options=pipeline_options)

    # TODO(mariagh): Remove the use of globals from the test.
    global count_b, count_c  # pylint: disable=global-variable-undefined
    count_b, count_c = 0, 0

    def f_b(x):
        global count_b  # pylint: disable=global-variable-undefined
        count_b += 1
        raise Exception('exception in f_b')

    def f_c(x):
        global count_c  # pylint: disable=global-variable-undefined
        count_c += 1
        raise Exception('exception in f_c')

    names = p | 'CreateNodeA' >> beam.Create(['Ann', 'Joe'])

    fork_b = names | 'SendToB' >> beam.Map(f_b)  # pylint: disable=unused-variable
    fork_c = names | 'SendToC' >> beam.Map(f_c)  # pylint: disable=unused-variable

    with self.assertRaises(Exception):
        p.run().wait_until_finish()
    assert count_b == count_c == 4
def main():
    args = parse_args()
    pipeline_options = PipelineOptions(**vars(args))
    pipeline = beam.Pipeline(options=pipeline_options)

    train_files = glob.glob("./mnist_images/train" + os.sep + "*.jpg")
    eval_files = glob.glob("./mnist_images/eval" + os.sep + "*.jpg")

    _ = (
        pipeline
        | 'ListTrainFiles' >> beam.Create(train_files)
        | 'TrainReadFiles' >> beam.Map(lambda path: (read_from_path(path)))
        | 'WriteToTrainTfrecord' >> beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix=path.join("mnist_tfrecords", "train", "train"),
            compression_type=beam.io.filesystems.CompressionTypes.UNCOMPRESSED,
            coder=coders.ExampleProtoCoder(tfrecord_schema()),
            file_name_suffix='.tfrecord'))

    _ = (
        pipeline
        | 'ListEvalFiles' >> beam.Create(eval_files)
        | 'EvalReadFiles' >> beam.Map(lambda path: (read_from_path(path)))
        | 'WriteToEvalTfrecord' >> beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix=path.join("mnist_tfrecords", "eval", "eval"),
            compression_type=beam.io.filesystems.CompressionTypes.UNCOMPRESSED,
            coder=coders.ExampleProtoCoder(tfrecord_schema()),
            file_name_suffix='.tfrecord'))

    pipeline.run().wait_until_finish()
def run():
    PROJECT_ID = 'corvid-276516'
    BUCKET = 'gs://covid-bucket19'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'format-codes--df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    # passing a query. Shouldn't process more than 1000 records w DR
    sql = 'SELECT * FROM covid_staging.googleMobility ORDER BY date, country_region'
    # direct runner is not running in parallel on several workers. DR is local
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    # read results and assign them to a new p-collection
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # call pardo, pipe query results to pardo
    format_alphaCode_pcoll = query_results | 'Change the country code for Greece, the UK, and Hong Kong. Drop Reunion' >> beam.ParDo(format_alphaCodeFn())

    # write PCollection to log file
    format_alphaCode_pcoll | 'Write log 1' >> WriteToText('geodist_beam.txt')

    dataset_id = 'covid_modeled'
    table_id = 'mobility_beam'
    schema_id = 'code:STRING, country:STRING, date:DATE, average_change:INTEGER, retail_and_recreation:INTEGER, grocery_and_pharmacy:INTEGER, parks:INTEGER, transit_stations:INTEGER, workplaces:INTEGER, residential:INTEGER'

    # write PCollection to new BQ table
    format_alphaCode_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
def test_view_as(self):
    generic_options = PipelineOptions(['--slices=3'])
    self.assertEqual(3, generic_options.view_as(Bacon).slices)
    self.assertEqual(3, generic_options.view_as(Breakfast).slices)

    generic_options.view_as(Breakfast).slices = 10
    self.assertEqual(10, generic_options.view_as(Bacon).slices)

    with self.assertRaises(AttributeError):
        generic_options.slices  # pylint: disable=pointless-statement
    with self.assertRaises(AttributeError):
        generic_options.view_as(Eggs).slices  # pylint: disable=expression-not-assigned
def run():
    PROJECT_ID = 'data-lake-290221'
    BUCKET = 'gs://dataflow-log-data'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(
        flags=None,
        runner='DirectRunner',
        project=PROJECT_ID,
        job_name='transpose',
        temp_location=BUCKET + '/temp',
        region='us-central1')
    options.view_as(SetupOptions).save_main_session = True

    p = beam.pipeline.Pipeline(options=options)

    sql = '''select farm_fingerprint(concat(cast(latitude as string), cast(longitude as string))) as location_id, * from covid19_confirmed.raw_cases'''

    # bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True, gcs_location=BUCKET)
    bq_source = BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Transpose' >> beam.ParDo(Transpose())

    # out_pcoll | 'Write to log' >> WriteToText('records.txt')

    dataset_id = 'covid19_confirmed'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'daily_cases'
    schema_id = 'location_id:INTEGER,date:DATE,cases:INTEGER'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def test_view_as(self):
    generic_options = PipelineOptions(['--slices=3'])
    self.assertEquals(3, generic_options.view_as(Bacon).slices)
    self.assertEquals(3, generic_options.view_as(Breakfast).slices)

    generic_options.view_as(Breakfast).slices = 10
    self.assertEquals(10, generic_options.view_as(Bacon).slices)

    with self.assertRaises(AttributeError):
        generic_options.slices  # pylint: disable=pointless-statement
    with self.assertRaises(AttributeError):
        generic_options.view_as(Eggs).slices  # pylint: disable=expression-not-assigned
def main(unused_argv):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p', '--port', type=int, help='port on which to serve the job api')
    options = parser.parse_args()

    global server
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=2))
    beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
        expansion_service.ExpansionServiceServicer(PipelineOptions()), server)
    server.add_insecure_port('localhost:{}'.format(options.port))
    server.start()
    logging.info('Listening for expansion requests at %d', options.port)

    # blocking main thread forever.
    signal.pause()
def main(unused_argv):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p', '--port', type=int, help='port on which to serve the job api')
    options = parser.parse_args()

    global server
    server = grpc.server(UnboundedThreadPoolExecutor())
    beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
        expansion_service.ExpansionServiceServicer(
            PipelineOptions(["--experiments", "beam_fn_api"])),
        server)
    server.add_insecure_port('localhost:{}'.format(options.port))
    server.start()
    _LOGGER.info('Listening for expansion requests at %d', options.port)

    signal.signal(signal.SIGTERM, cleanup)
    signal.signal(signal.SIGINT, cleanup)
    # blocking main thread forever.
    signal.pause()
def main(unused_argv):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p', '--port', type=int, help='port on which to serve the job api')
    options = parser.parse_args()

    global server
    server = grpc.server(UnboundedThreadPoolExecutor())

    # DOCKER SDK Harness
    beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
        expansion_service.ExpansionServiceServicer(
            PipelineOptions(
                ["--experiments", "beam_fn_api", "--sdk_location", "container"])),
        server)

    # PROCESS SDK Harness
    # beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
    #     expansion_service.ExpansionServiceServicer(
    #         PipelineOptions.from_dictionary({
    #             'environment_type': 'PROCESS',
    #             'environment_config': '{"command": "sdks/python/container/build/target/launcher/darwin_amd64/boot"}',
    #             'experiments': 'beam_fn_api',
    #             'sdk_location': 'container',
    #         })
    #     ), server
    # )

    server.add_insecure_port('localhost:{}'.format(options.port))
    server.start()
    _LOGGER.info('Listening for expansion requests at %d', options.port)

    signal.signal(signal.SIGTERM, cleanup)
    signal.signal(signal.SIGINT, cleanup)
    # blocking main thread forever.
    signal.pause()
def run():
    PROJECT_ID = 'electric-spark-266716'  # change to your project id
    BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'population-statistics-beam-dataflow-2'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    # create query to select all elements for cleansing
    sql = 'SELECT * FROM kaggle2_modeled.Population_Statistics'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    # read desired table from BigQuery
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write inputs to input.txt
    query_results | 'Write input' >> WriteToText(DIR_PATH + 'input.txt')

    # apply ParDo to transpose the date columns
    transposed_date_pcoll = query_results | 'Transpose Dates' >> beam.ParDo(TransposeDateFn())

    # write transposed records to transposed.txt
    transposed_date_pcoll | 'Write transpose Dates' >> WriteToText(DIR_PATH + 'transposed.txt')

    # flatten list to get individual records
    flatten_pcoll = transposed_date_pcoll | 'Flatten lists' >> beam.FlatMap(generate_elements)

    # write resulting PCollection to output_final_dates.txt
    flatten_pcoll | 'Write output' >> WriteToText(DIR_PATH + 'output_final_dates.txt')

    # create new table in BigQuery
    dataset_id = 'kaggle2_modeled'
    table_id = 'Population_Statistics_Beam_DF'
    schema_id = 'dt:DATE,countryName:STRING,countryCode:STRING,metric:STRING,metricCode:STRING,statistic:FLOAT'

    # write PCollection to new BQ table
    flatten_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'studied-brand-266702'  # change to your project id
    BUCKET = 'gs://beam_cs327e_project'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'adverse-event-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    # run BigQuery query on dataset
    sql = 'SELECT * FROM vaers_modeled.Adverse_Event'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    input_pcoll = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write input PCollection to input.txt
    input_pcoll | 'Write input_pcoll log 1' >> WriteToText(DIR_PATH + 'input.txt')

    # standardize adverse_event RECOVD attribute values into true or false (boolean)
    formatted_recovd_pcoll = input_pcoll | 'Format RECOVD' >> beam.ParDo(FormatRECOVDFn())

    # write PCollection to log file
    formatted_recovd_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'formatted_recovd_pcoll.txt')

    # standardize adverse_event BIRTH_DEFECT attribute values into true or false (boolean)
    formatted_defect_pcoll = formatted_recovd_pcoll | 'Format BIRTH_DEFECT' >> beam.ParDo(FormatBIRTH_DEFECTFn())

    # write PCollection to log file
    formatted_defect_pcoll | 'Write log 3' >> WriteToText(DIR_PATH + 'formatted_defect_pcoll.txt')

    # standardize boolean attribute values which are null into false
    output_pcoll = formatted_defect_pcoll | 'Format boolean attributes' >> beam.ParDo(FormatBooleanAttributesFn())

    # write output PCollection to output.txt
    output_pcoll | 'Write output_pcoll log 4' >> WriteToText(DIR_PATH + 'output.txt')

    # specify id and schema
    dataset_id = 'vaers_modeled'
    table_id = 'Adverse_Event_Beam_DF'
    # change RECOVD and BIRTH_DEFECT attributes into BOOLEANs
    schema_id = 'VAERS_ID:INTEGER, ONSET_DATE:DATE, RECOVD:BOOLEAN, DIED:BOOLEAN, DATEDIED:DATE, L_THREAT:BOOLEAN, OFC_VISIT:BOOLEAN, ER_VISIT:BOOLEAN, ER_ED_VISIT:BOOLEAN, HOSPITAL:BOOLEAN, HOSPDAYS:INTEGER, X_STAY:BOOLEAN, DISABLE:BOOLEAN, BIRTH_DEFECT:BOOLEAN, OTHER_MEDS:STRING, CUR_ILL:STRING, HISTORY:STRING, PRIOR_VAX:STRING'

    # write output PCollection to new BQ table
    output_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
import apache_beam as beam
import re
from apache_beam.pipeline import PipelineOptions

pipeline_args = [
    # 1. DataflowRunner runs the pipeline on Google Cloud Dataflow
    '--runner=DataflowRunner',
    # 2. Google Cloud Project ID
    '--project=your-project-ID',
    # 3. Google Cloud Storage path is required for staging local files
    '--staging_location=gs://your-bucket-name/staging',
    # 4. Google Cloud Storage path is required for temporary files
    '--temp_location=gs://your-bucket-name/temp',
    # 5. (Optional) Job name to be displayed in the logs
    '--job_name=word-count-job'
]

pipeline_options = PipelineOptions(pipeline_args)
pipeline = beam.Pipeline(options=pipeline_options)

# Data Transforms
(pipeline
 | 'read file' >> beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
 | 'get words' >> beam.FlatMap(lambda x: re.findall(r'\w+', x))
 | 'count words' >> beam.combiners.Count.PerElement()
 | 'save' >> beam.io.WriteToText('gs://your-bucket-name/wordcount-output.txt'))

pipeline.run()
import apache_beam as beam
from apache_beam.pipeline import PipelineOptions

options = PipelineOptions()


class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--input', help='Input for the pipeline', default='./data/')
        parser.add_argument('--output', help='Output for the pipeline', default='./output/')


class Split(beam.DoFn):
    def process(self, element):
        Date, Open, High, Low, Close, Volume = element.split(',')
        try:
            return [{'Open': float(Open), 'Close': float(Close)}]
        except:
            return [{'Open': 0.0, 'Close': 0.0}]


class CollectOpen(beam.DoFn):
    def process(self, element):
        result = [(1, element['Open'])]
        return result
def __init__(self):
    super(EventsPipeline, self).__init__(options=PipelineOptions())
def run(in_args=None):
    """Runs the pre-processing pipeline."""
    pipeline_options = PipelineOptions.from_dictionary(vars(in_args))
    with beam.Pipeline(options=pipeline_options) as p:
        configure_pipeline(p, in_args)