def run_flight_delay_pipeline(pipeline, start_date=None, end_date=None, output=None):
    """Compute mean arrival/departure delays per airline over daily windows.

    Reads on-time flight records from a public BigQuery table, assigns each
    record an event timestamp from its flight date, windows the data into
    fixed one-day windows, converts the windowed PCollection to a deferred
    DataFrame, and writes per-airline results of
    ``get_mean_delay_at_top_airports`` to ``output`` as CSV shards.

    Args:
        pipeline: a ``beam.Pipeline``; used as a context manager, so the
            pipeline actually runs on exiting the ``with`` block.
        start_date: inclusive lower bound for ``FlightDate``, 'YYYY-MM-DD'.
        end_date: inclusive upper bound for ``FlightDate``, 'YYYY-MM-DD'.
        output: path prefix for the CSV output shards.

    Raises:
        ValueError: if ``start_date``, ``end_date`` or ``output`` is None.
            Previously a None date was silently interpolated into the SQL
            as the literal string 'None', so the query matched no rows.
    """
    if start_date is None or end_date is None:
        raise ValueError('start_date and end_date must both be provided')
    if output is None:
        raise ValueError('output must be provided')

    # NOTE: the dates are interpolated directly into the SQL text; they are
    # expected to come from trusted pipeline options, not untrusted input.
    query = f"""
    SELECT
      FlightDate AS date,
      IATA_CODE_Reporting_Airline AS airline,
      Origin AS departure_airport,
      Dest AS arrival_airport,
      DepDelay AS departure_delay,
      ArrDelay AS arrival_delay
    FROM `apache-beam-testing.airline_ontime_data.flights`
    WHERE
      FlightDate >= '{start_date}' AND FlightDate <= '{end_date}' AND
      DepDelay IS NOT NULL AND ArrDelay IS NOT NULL
    """

    # Import this here to avoid pickling the main session.
    import time
    from apache_beam import window

    def to_unixtime(s):
        # 'date' arrives as a date-like object; mktime gives seconds since
        # the epoch for use as a Beam event timestamp.
        return time.mktime(s.timetuple())

    # The pipeline will be run on exiting the with block.
    with pipeline as p:
        tbl = (
            p
            | 'read table' >> beam.io.ReadFromBigQuery(
                query=query, use_standard_sql=True)
            | 'assign timestamp' >> beam.Map(
                lambda x: window.TimestampedValue(x, to_unixtime(x['date'])))
            # Use beam.Select to make sure data has a schema.
            # The casts in the lambdas ensure data types are properly inferred.
            | 'set schema' >> beam.Select(
                date=lambda x: str(x['date']),
                airline=lambda x: str(x['airline']),
                departure_airport=lambda x: str(x['departure_airport']),
                arrival_airport=lambda x: str(x['arrival_airport']),
                departure_delay=lambda x: float(x['departure_delay']),
                arrival_delay=lambda x: float(x['arrival_delay'])))

        daily = tbl | 'daily windows' >> beam.WindowInto(
            beam.window.FixedWindows(60 * 60 * 24))

        # Group the flights data by carrier and aggregate via the deferred
        # DataFrame API; to_csv triggers the write to the output location.
        df = to_dataframe(daily)
        result = df.groupby('airline').apply(get_mean_delay_at_top_airports)
        result.to_csv(output)
def test_simple_df_with_beam_row(self):
    """Schema'd rows batched via BatchRowsAsDataFrame match a pandas frame."""
    rows = [(str(i), i, float(i)) for i in range(5)]
    expected = pd.DataFrame(
        {
            'name': [str(i) for i in range(5)],
            'id': list(range(5)),
            'height': [float(i) for i in range(5)],
        },
        columns=['name', 'id', 'height'])

    with TestPipeline() as p:
        result = (
            p
            | beam.Create(rows)
            # beam.Select attaches a schema; the casts fix the field types.
            | beam.Select(
                name=lambda r: str(r[0]),
                id=lambda r: int(r[1]),
                height=lambda r: float(r[2]))
            | schemas.BatchRowsAsDataFrame(min_batch_size=10, max_batch_size=10))
        assert_that(result, matches_df(expected))