Example no. 1
0
def run_flight_delay_pipeline(pipeline,
                              start_date=None,
                              end_date=None,
                              output=None):
    """Compute per-airline mean flight delays and write them to CSV.

    Reads on-time flight records from a public BigQuery table between
    ``start_date`` and ``end_date`` (inclusive), timestamps and schemas
    each row, windows the data into daily buckets, then groups by airline
    and writes the aggregated result to ``output``.
    """
    query = f"""
  SELECT
    FlightDate AS date,
    IATA_CODE_Reporting_Airline AS airline,
    Origin AS departure_airport,
    Dest AS arrival_airport,
    DepDelay AS departure_delay,
    ArrDelay AS arrival_delay
  FROM `apache-beam-testing.airline_ontime_data.flights`
  WHERE
    FlightDate >= '{start_date}' AND FlightDate <= '{end_date}' AND
    DepDelay IS NOT NULL AND ArrDelay IS NOT NULL
  """

    # Import this here to avoid pickling the main session.
    import time
    from apache_beam import window

    def to_unixtime(s):
        # Convert a date-like object to a Unix timestamp (float seconds).
        return time.mktime(s.timetuple())

    # The pipeline is executed when the with block exits.
    with pipeline as p:
        raw_rows = p | 'read table' >> beam.io.ReadFromBigQuery(
            query=query, use_standard_sql=True)

        stamped = raw_rows | 'assign timestamp' >> beam.Map(
            lambda x: window.TimestampedValue(x, to_unixtime(x['date'])))

        # beam.Select guarantees the PCollection carries a schema;
        # the explicit casts in the lambdas pin down the inferred types.
        flights = stamped | 'set schema' >> beam.Select(
            date=lambda x: str(x['date']),
            airline=lambda x: str(x['airline']),
            departure_airport=lambda x: str(x['departure_airport']),
            arrival_airport=lambda x: str(x['arrival_airport']),
            departure_delay=lambda x: float(x['departure_delay']),
            arrival_delay=lambda x: float(x['arrival_delay']))

        seconds_per_day = 24 * 60 * 60
        windowed = flights | 'daily windows' >> beam.WindowInto(
            beam.window.FixedWindows(seconds_per_day))

        # Aggregate per carrier via the Beam DataFrame API.
        frame = to_dataframe(windowed)
        per_airline = frame.groupby('airline').apply(
            get_mean_delay_at_top_airports)
        per_airline.to_csv(output)
Example no. 2
0
    def test_simple_df_with_beam_row(self):
        """Schema'd rows batched as a DataFrame should match the expected frame."""
        n = 5
        expected = pd.DataFrame(
            {
                'name': [str(i) for i in range(n)],
                'id': list(range(n)),
                'height': [float(i) for i in range(n)]
            },
            columns=['name', 'id', 'height'])

        with TestPipeline() as p:
            rows = p | beam.Create([(str(i), i, float(i)) for i in range(n)])
            # Attach a schema via beam.Select; the casts fix the field types.
            schemed = rows | beam.Select(name=lambda r: str(r[0]),
                                         id=lambda r: int(r[1]),
                                         height=lambda r: float(r[2]))
            res = schemed | schemas.BatchRowsAsDataFrame(min_batch_size=10,
                                                         max_batch_size=10)
            assert_that(res, matches_df(expected))