def run():
    external_table_name = 'deb.missing_routes'
    table_name = 'deb.routes'
    bq = BigQueryUtils()

    # create external table
    logger.info(f"creating external table: {external_table_name}")
    # todo: <<ADD YOUR CODE>>. Use the BigQueryUtils class above.

    logger.info("inserting missing routes...")
    # todo: <<ADD YOUR CODE>>. Use the BigQueryUtils class with an "INSERT INTO ... SELECT" statement

    # (optional) delete external table
    logger.info(f"deleting external table: {external_table_name}")
    bq.delete_table(external_table_name)
def run():
    external_table_name = 'deb.missing_routes'
    table_name = 'deb.routes'
    bq = BigQueryUtils()

    # create external table
    logger.info(f"creating external table: {external_table_name}")
    schema = [
        SchemaField('airline', 'string'),
        SchemaField('src', 'string'),
        SchemaField('dest', 'string'),
    ]
    bq.create_external_table(
        external_table_name,
        source_uris='gs://deb.gcs.turalabs.com/beam/ch2ep2/output/rejects/missing-routes*.csv',
        schema=schema,
        source_format='CSV',
        delete_if_exists=True,
        skip_leading_rows=1)

    # delete previously inserted rows
    logger.info("deleting previously inserted missing routes...")
    sql = f"DELETE FROM {table_name} WHERE equipment = '-'"
    bq.execute(sql)

    logger.info("inserting missing routes...")
    sql = f"""INSERT INTO {table_name}
              SELECT airline, src, dest,
                     NULL as codeshare,
                     1 as stops,
                     '-' as equipment
              FROM {external_table_name}"""
    bq.execute(sql)

    # (optional) delete external table
    logger.info(f"deleting external table: {external_table_name}")
    bq.delete_table(external_table_name)
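# The BigQueryUtils helper used throughout these snippets is not shown in this excerpt.
# Below is a minimal, hypothetical sketch of the methods the examples rely on
# (execute, create_external_table, delete_table), assuming the google-cloud-bigquery
# client library; the repo's actual class may differ.
from google.cloud import bigquery


class BigQueryUtils:
    def __init__(self):
        self.client = bigquery.Client()

    def execute(self, sql, query_params=None):
        # run a sql statement and wait for the job to finish
        job_config = bigquery.QueryJobConfig(query_parameters=query_params or [])
        return self.client.query(sql, job_config=job_config).result()

    def create_external_table(self, table_name, source_uris, source_format='CSV',
                              schema=None, delete_if_exists=False, skip_leading_rows=0):
        # define an external table backed by files on gcs
        table_ref = bigquery.TableReference.from_string(table_name,
                                                        default_project=self.client.project)
        if delete_if_exists:
            self.client.delete_table(table_ref, not_found_ok=True)
        config = bigquery.ExternalConfig(source_format)
        config.source_uris = [source_uris] if isinstance(source_uris, str) else source_uris
        if schema:
            config.schema = schema
        if source_format == 'CSV':
            config.options.skip_leading_rows = skip_leading_rows
        table = bigquery.Table(table_ref)
        table.external_data_configuration = config
        return self.client.create_table(table)

    def delete_table(self, table_name):
        self.client.delete_table(table_name, not_found_ok=True)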
def run_simple():
    t0 = time()

    # parse command line arguments
    known_args, beam_args = runtime_args()

    # BigQuery Utility
    bq_utils = BigQueryUtils()

    # pass in the pipeline options
    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        # todo: <<ADD YOUR CODE HERE>>
        # todo: call beam transforms to call the REST API, transform records, and output them into files
        pass

    # todo: create an external table using the output files and insert records into BigQuery

    logger.info(f"process completed in {(time() - t0):,.3f} seconds")
def run():
    t0 = now()

    # parse command line options
    known_args, beam_args = runtime_args()

    # BigQuery utility
    bq_utils = BigQueryUtils()

    # setup apache beam args
    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        # todo: finish writing this code. you can cheat and look at deb.ch2.ep3.answers
        rows = (p
                | beam.io.ReadFromText(known_args.input, skip_header_lines=1)
                | beam.ParDo(BeamReadCSV(header_cols=CSV_COLS))
                | beam.ParDo(BeamTransformRecords(), date_fmt='%Y-%m-%d', time_fmt='%H%M')
                )

        # write parquet output files
        output = (rows
                  | beam.io.WriteToParquet(os.path.join(known_args.output, 'flights'),
                                           schema=get_schema_parquet(),
                                           file_name_suffix='.parquet')
                  )

        # write newline delimited json output files
        json_output = (rows
                       | beam.Map(lambda e: {k: v if k != 'flight_date' else v.strftime('%Y-%m-%d')
                                             for k, v in e.items()})
                       | beam.Map(lambda e: json.dumps(e))
                       | beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
                                             file_name_suffix='.json')
                       )

    logger.info(f"total time: {(now() - t0):,.6f} seconds")
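# BeamReadCSV is not defined in this excerpt. The hypothetical sketch below shows one
# way such a DoFn could parse each raw CSV line into a dict keyed by the column names
# passed as header_cols; the repo's actual implementation may handle malformed rows
# differently.
import csv

import apache_beam as beam


class BeamReadCSV(beam.DoFn):
    def __init__(self, header_cols):
        self.header_cols = header_cols

    def process(self, element):
        # parse a single csv line (csv.reader handles quoted fields)
        values = next(csv.reader([element]))
        # emit only rows whose field count matches the expected header
        if len(values) == len(self.header_cols):
            yield dict(zip(self.header_cols, values))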
def run():
    t0 = now()

    # parse command line options
    known_args, beam_args = runtime_args()

    # BigQuery utility
    bq_utils = BigQueryUtils()

    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        rows = (p
                | beam.io.ReadFromText(known_args.input, skip_header_lines=1)
                | beam.ParDo(BeamReadCSV(header_cols=FLIGHTS_CSV_COLUMNS))
                | beam.ParDo(BeamTransformRecords(), date_fmt='%Y-%m-%d', time_fmt='%H%M')
                )

        # load the routes table into a lookup dict
        sql = f"""select airline, src, dest from {known_args.routes_table}"""
        routes = bq_utils.execute_as_dict(sql, keycols=['airline', 'src', 'dest'])

        # lookup routes
        rows, routes_rejects, missing_routes = (
            rows
            | beam.ParDo(BeamLookupRoute(), routes=routes).with_outputs('rejects',
                                                                        'missing_routes',
                                                                        main='main')
        )

        # write parquet output files
        output = (rows
                  | beam.io.WriteToParquet(os.path.join(known_args.output, 'flights'),
                                           schema=datamodel_flights_parquet_schema(),
                                           file_name_suffix='.parquet')
                  )

        # write missing routes to another output as CSV
        output_routes = (
            missing_routes
            | "gbr" >> beam.GroupByKey()  # calculate distinct missing routes
            # csv output of the key (e[0] of the key/value tuple), which is (airline, src, dest)
            | "missing_routes_csv" >> beam.Map(lambda e: ','.join(list(e[0])))
            | "missing_routes_out" >> beam.io.WriteToText(os.path.join(known_args.output,
                                                                       'rejects/missing-routes'),
                                                          file_name_suffix='.csv',
                                                          header='airline,src,dest')
        )

        # alternative: write (simple) newline delimited json output files.
        # a very flexible output file format for bigquery and other big data tools;
        # much slower to write and larger in size than binary formats such as Parquet, ORC, or Avro,
        # but provides flexibility over schema for smaller data files.
        # larger files should use Avro, Parquet, or ORC: Avro provides the fastest write speeds,
        # while Parquet and ORC provide faster read performance for analytical queries.
        # output = (rows
        #           | beam.Map(lambda e: {k: v if k != 'flight_date' else v.strftime('%Y-%m-%d')
        #                                 for k, v in e.items()})  # convert flight_date back to string type for json conversion
        #           | beam.Map(lambda e: json.dumps(e))  # json dump row
        #           | beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
        #                                 file_name_suffix='.json')
        #           )

    logger.info("beam pipeline completed.")

    # create bigquery external table and insert into bq flights table
    bq_utils.create_external_table(known_args.flights_ext_table,
                                   source_uris=os.path.join(known_args.output, "flights*.parquet"),
                                   source_format='PARQUET',
                                   delete_if_exists=True)

    # create and replace existing bigquery flights table
    bq_utils.create_table(known_args.flights_table,
                          schema=datamodel_flights_bigquery_schema(),
                          delete_if_exists=True)

    # insert into table as select (ITAS) statement
    sql = f"""
        INSERT INTO `{known_args.flights_table}`
        SELECT
            a.day_of_week, a.flight_date, a.airline, a.tailnumber, a.flight_number,
            a.src, a.src_city, a.src_state,
            a.dest, a.dest_city, a.dest_state,
            PARSE_TIME('%H:%M:%S', a.departure_time) as departure_time,
            PARSE_TIME('%H:%M:%S', a.actual_departure_time) as actual_departure_time,
            a.departure_delay, a.taxi_out,
            PARSE_TIME('%H:%M:%S', a.wheels_off) as wheels_off,
            PARSE_TIME('%H:%M:%S', a.wheels_on) as wheels_on,
            a.taxi_in,
            PARSE_TIME('%H:%M:%S', a.arrival_time) as arrival_time,
            PARSE_TIME('%H:%M:%S', a.actual_arrival_time) as actual_arrival_time,
            a.arrival_delay,
            a.cancelled, a.cancellation_code,
            a.flight_time, a.actual_flight_time, a.air_time, a.flights, a.distance,
            a.airline_delay, a.weather_delay, a.nas_delay, a.security_delay, a.late_aircraft_delay
            -- , CONCAT(a.flight_date, '_', a.airline, '_', a.flight_number) AS flightDate_airline_flightNumber
        FROM `{known_args.flights_ext_table}` a
        """

    # insert records from the parquet external table into the final bq managed flights table
    r = bq_utils.execute(sql)

    logger.info(f"total time: {(now() - t0):,.6f} secs")
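# BeamLookupRoute is not shown in this excerpt. The hypothetical sketch below
# illustrates how a DoFn with tagged outputs could produce the 'rejects' and
# 'missing_routes' streams consumed by with_outputs() above; the repo's actual DoFn
# may differ.
import apache_beam as beam
from apache_beam import pvalue


class BeamLookupRoute(beam.DoFn):
    def process(self, element, routes):
        key = (element['airline'], element['src'], element['dest'])
        if key in routes:
            # known route: emit on the main output
            yield element
        else:
            # unknown route: reject the record and emit the missing route key
            # so it can be grouped, de-duplicated, and written out as csv
            yield pvalue.TaggedOutput('rejects', element)
            yield pvalue.TaggedOutput('missing_routes', (key, 1))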
def run_with_lookups():
    t0 = time()

    # parse command line arguments
    known_args, pipeline_args = runtime_args()

    # BigQuery Utility
    bq_utils = BigQueryUtils()

    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        # pre-process: create a list of dates to process and get other side-inputs

        # create a list of flight dates to retrieve from the api
        days = list_dates(start_date=known_args.start_date, end_date=known_args.end_date)

        # get airline iata codes from the api
        airlines = api_get_airlines(api_url=known_args.api_url, api_token=known_args.api_token)

        # read airports into a lookup dict with iata code as dict key
        sql = f"""SELECT iata, city, state, lat, long, tz, utc_offset
                  FROM {known_args.airports_table}"""
        airports = bq_utils.execute_as_dict(sql, keycols='iata')

        # read routes into a lookup dict with (airline, src, dest) as dict key
        # more info about bq parameterized queries: https://cloud.google.com/bigquery/docs/parameterized-queries
        sql = f"""SELECT airline, src, dest
                  FROM {known_args.routes_table}
                  WHERE airline in UNNEST(@airlines)"""
        routes = bq_utils.execute_as_dict(sql,
                                          keycols=['airline', 'src', 'dest'],
                                          query_params=[bigquery.ArrayQueryParameter("airlines", "STRING", airlines)])

        # create a beam collection with all days and airlines to get flights for
        input_rows = (p
                      | beam.Create(days)
                      | beam.ParDo(BeamExpandDaysByAirlines(), airlines=airlines)
                      )

        # call the flights api to get flights for each record above and
        # call the beam transforms to process the input flights
        flights = (input_rows
                   | beam.ParDo(BeamGetFlights(), api_url=known_args.api_url, api_token=known_args.api_token)
                   | beam.ParDo(BeamTransformFlights())
                   )

        # lookup src/dest airports
        flights, airport_rejects, missing_airports = (
            flights
            | beam.ParDo(BeamLookupAirport(), airports=airports).with_outputs('rejects',
                                                                              'missing_airport',
                                                                              main='main')
        )

        # lookup routes
        flights, routes_rejects, missing_routes = (
            flights
            | beam.ParDo(BeamLookupRoute(), routes=routes).with_outputs('rejects',
                                                                        'missing_route',
                                                                        main='main')
        )

        # write main flight output: records transformed and lookup checks completed
        (flights
         | "flights_json" >> beam.Map(lambda e: json.dumps(e))
         | "flights_output" >> beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
                                                   file_name_suffix='.json')
         )

        # write out rejects
        # airport rejects
        (airport_rejects
         | "airport_rejects_json" >> beam.Map(lambda e: json.dumps(e))
         | "airport_rejects_output" >> beam.io.WriteToText(os.path.join(known_args.output, 'rejects/airport_rejects'),
                                                           file_name_suffix='.json')
         )

        # routes rejects
        (routes_rejects
         | "route_rejects_json" >> beam.Map(lambda e: json.dumps(e))
         | "route_rejects_output" >> beam.io.WriteToText(os.path.join(known_args.output, 'rejects/routes-rejects'),
                                                         file_name_suffix='.json')
         )

        # missing airports
        (missing_airports
         | "gba" >> beam.GroupByKey()
         | "missing_airports_csv" >> beam.Map(lambda e: str(e[0]))
         | "missing_airport_out" >> beam.io.WriteToText(os.path.join(known_args.output, 'rejects/missing_airports'),
                                                        file_name_suffix='.csv',
                                                        header='iata')
         )

        # missing routes
        (missing_routes
         | "gbr" >> beam.GroupByKey()
         | "missing_routes_csv" >> beam.Map(lambda e: ','.join(list(e[0])))
         | "missing_routes_out" >> beam.io.WriteToText(os.path.join(known_args.output, 'rejects/missing_routes'),
                                                       file_name_suffix='.csv',
                                                       header='airline,src,dest')
         )

    logger.info("apache beam pipeline done")

    # create bigquery external table
    logger.info("dropping and creating bigquery external table...")
    bq_utils.create_external_table(known_args.flights_ext_table,
                                   source_uris=os.path.join(known_args.output, "flights*.json"),
                                   schema=FUTURE_FLIGHTS_BIGQUERY_SCHEMA,
                                   delete_if_exists=True)

    # delete existing flights on or after start_date
    sql = f"""DELETE FROM {known_args.flights_table}
              WHERE flight_date >= '{datetime.strftime(known_args.start_date, '%Y-%m-%d')}'"""
    bq_utils.execute(sql)

    # insert flight records into the final table
    sql = f"""INSERT INTO {known_args.flights_table}
              (SELECT
                  a.day_of_week
                , a.flight_date
                , a.airline
                , a.tailnumber
                , a.flight_number
                , a.src
                , b.city as src_city
                , b.state as src_state
                , a.dest
                , c.city as dest_city
                , c.state as dest_state
                , a.departure_time
                , NULL as actual_departure_time
                , NULL as departure_delay
                , NULL as taxi_out
                , NULL as wheels_off
                , NULL as wheels_on
                , NULL as taxi_in
                , a.arrival_time
                , NULL as actual_arrival_time
                , NULL as arrival_delay
                , False as cancelled
                , NULL as cancellation_code
                , a.flight_time
                , NULL as actual_flight_time
                , NULL as air_time
                , 1 as flights
                , a.distance
                , NULL as airline_delay
                , NULL as weather_delay
                , NULL as nas_delay
                , NULL as security_delay
                , NULL as late_aircraft_delay
                -- , CONCAT(a.flight_date, '_', a.airline, '_', a.flight_number) as flightDate_airline_flightNumber
              FROM {known_args.flights_ext_table} a
              LEFT JOIN {known_args.airports_table} b on a.src = b.iata
              LEFT JOIN {known_args.airports_table} c on a.dest = c.iata
              )"""
    bq_utils.execute(sql)

    logger.info(f"process completed in {(time() - t0):,.3f} seconds")
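# BigQueryUtils.execute_as_dict is also not shown in this excerpt. The hypothetical
# free function below shows how the lookup dicts keyed by 'iata' or
# (airline, src, dest) could be built with the google-cloud-bigquery client; the
# repo's method may differ.
from google.cloud import bigquery


def execute_as_dict(client, sql, keycols, query_params=None):
    # run the query and index the result rows by one column (str) or several (list)
    job_config = bigquery.QueryJobConfig(query_parameters=query_params or [])
    result = {}
    for row in client.query(sql, job_config=job_config).result():
        record = dict(row.items())
        key = record[keycols] if isinstance(keycols, str) else tuple(record[c] for c in keycols)
        result[key] = record
    return result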
def run_simple():
    t0 = time()

    # parse command line arguments
    known_args, beam_args = runtime_args()

    # BigQuery Utility
    bq_utils = BigQueryUtils()

    # pass in the pipeline options
    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        # pre-process: create a list of dates to process and get other side-inputs

        # create a list of flight dates to retrieve from the api
        days = list_dates(start_date=known_args.start_date, end_date=known_args.end_date)

        # get airline iata codes from the api
        airlines = api_get_airlines(api_url=known_args.api_url, api_token=known_args.api_token)

        # create a beam collection with all days and airlines to get flights for
        input_rows = (p
                      | beam.Create(days)
                      | beam.ParDo(BeamExpandDaysByAirlines(), airlines=airlines)
                      )

        # call the flights api to get flights for each record above and
        # call the beam transforms to process the input flights
        flights = (input_rows
                   | beam.ParDo(BeamGetFlights(), api_url=known_args.api_url, api_token=known_args.api_token)
                   | beam.ParDo(BeamTransformFlights())
                   )

        # prepare & write output files
        json_output = (flights
                       | beam.Map(lambda e: json.dumps(e))
                       | beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
                                             file_name_suffix='.json')
                       )

    logger.info("apache beam pipeline done")

    # create bigquery external table
    logger.info("dropping and creating bigquery external table...")
    bq_utils.create_external_table(known_args.flights_ext_table,
                                   source_uris=os.path.join(known_args.output, "flights*.json"),
                                   schema=FUTURE_FLIGHTS_BIGQUERY_SCHEMA,
                                   delete_if_exists=True)

    # delete existing flights on or after start_date
    sql = f"""DELETE FROM {known_args.flights_table}
              WHERE flight_date >= '{datetime.strftime(known_args.start_date, '%Y-%m-%d')}'"""
    bq_utils.execute(sql)

    # insert flight records into the final table
    sql = f"""INSERT INTO {known_args.flights_table}
              (SELECT
                  a.day_of_week
                , a.flight_date
                , a.airline
                , a.tailnumber
                , a.flight_number
                , a.src
                , b.city as src_city
                , b.state as src_state
                , a.dest
                , c.city as dest_city
                , c.state as dest_state
                , a.departure_time
                , NULL as actual_departure_time
                , NULL as departure_delay
                , NULL as taxi_out
                , NULL as wheels_off
                , NULL as wheels_on
                , NULL as taxi_in
                , a.arrival_time
                , NULL as actual_arrival_time
                , NULL as arrival_delay
                , False as cancelled
                , NULL as cancellation_code
                , a.flight_time
                , NULL as actual_flight_time
                , NULL as air_time
                , 1 as flights
                , a.distance
                , NULL as airline_delay
                , NULL as weather_delay
                , NULL as nas_delay
                , NULL as security_delay
                , NULL as late_aircraft_delay
                -- , CONCAT(a.flight_date, '_', a.airline, '_', a.flight_number) as flightDate_airline_flightNumber
              FROM {known_args.flights_ext_table} a
              LEFT JOIN {known_args.airports_table} b on a.src = b.iata
              LEFT JOIN {known_args.airports_table} c on a.dest = c.iata
              )"""
    bq_utils.execute(sql)

    logger.info(f"process completed in {(time() - t0):,.3f} seconds")
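# list_dates and BeamExpandDaysByAirlines are assumed by run_simple() and
# run_with_lookups() but are not shown in this excerpt. Hypothetical sketches follow;
# the repo's versions may differ.
from datetime import timedelta

import apache_beam as beam


def list_dates(start_date, end_date):
    # inclusive list of dates between start_date and end_date
    days = (end_date - start_date).days
    return [start_date + timedelta(days=i) for i in range(days + 1)]


class BeamExpandDaysByAirlines(beam.DoFn):
    def process(self, element, airlines):
        # fan out each date into one record per airline, forming the
        # (flight_date, airline) requests later sent to the flights api
        for airline in airlines:
            yield {'flight_date': element, 'airline': airline}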
def run():
    t0 = now()

    # parse command line options
    known_args, beam_args = runtime_args()

    # BigQuery utility
    bq_utils = BigQueryUtils()

    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        rows = (p
                | beam.io.ReadFromText(known_args.input, skip_header_lines=1)
                | beam.ParDo(BeamReadCSV(header_cols=FLIGHTS_CSV_COLUMNS))
                | beam.ParDo(BeamTransformRecords(), date_fmt='%Y-%m-%d', time_fmt='%H%M')
                )

        # write parquet output files
        output = (rows
                  | beam.io.WriteToParquet(os.path.join(known_args.output, 'flights'),
                                           schema=datamodel_flights_parquet_schema(),
                                           file_name_suffix='.parquet')
                  )

    logger.info("beam pipeline completed.")

    # create bigquery external table and insert into bq flights table
    bq_utils.create_external_table(known_args.flights_ext_table,
                                   source_uris=os.path.join(known_args.output, "flights*.parquet"),
                                   source_format='PARQUET',
                                   delete_if_exists=True)

    # create and replace existing bigquery flights table
    bq_utils.create_table(known_args.flights_table,
                          schema=datamodel_flights_bigquery_schema(),
                          delete_if_exists=True)

    # insert into table as select (ITAS) statement
    sql = f"""
        INSERT INTO `{known_args.flights_table}`
        SELECT
            a.day_of_week, a.flight_date, a.airline, a.tailnumber, a.flight_number,
            a.src, a.src_city, a.src_state,
            a.dest, a.dest_city, a.dest_state,
            PARSE_TIME('%H:%M:%S', a.departure_time) as departure_time,
            PARSE_TIME('%H:%M:%S', a.actual_departure_time) as actual_departure_time,
            a.departure_delay, a.taxi_out,
            PARSE_TIME('%H:%M:%S', a.wheels_off) as wheels_off,
            PARSE_TIME('%H:%M:%S', a.wheels_on) as wheels_on,
            a.taxi_in,
            PARSE_TIME('%H:%M:%S', a.arrival_time) as arrival_time,
            PARSE_TIME('%H:%M:%S', a.actual_arrival_time) as actual_arrival_time,
            a.arrival_delay,
            a.cancelled, a.cancellation_code,
            a.flight_time, a.actual_flight_time, a.air_time, a.flights, a.distance,
            a.airline_delay, a.weather_delay, a.nas_delay, a.security_delay, a.late_aircraft_delay
            -- , CONCAT(a.flight_date, '_', a.airline, '_', a.flight_number) AS flightDate_airline_flightNumber
        FROM `{known_args.flights_ext_table}` a
        """

    # insert records from the parquet external table into the final bq managed flights table
    r = bq_utils.execute(sql)

    logger.info(f"total time: {(now() - t0):,.6f} secs")