示例#1
0
def prepare_count(df, direction):
    """
    Build flight-count tables for one travel direction, per airport and per state.

    Rows whose CANCELLED value equals 1 are dropped first. The direction picks the airport column that is
    counted ("ORIGIN" for departures, "DEST" for arrivals) and the name of the resulting count column
    ("ORIGIN_COUNT" / "DEST_COUNT"). The counting, joining and aggregation themselves are done by the
    `count`, `merge` and `aggregate` helpers defined elsewhere in the project.
    @param df: input flight dataFrame; must provide a CANCELLED column
    @type df: pd.DataFrame
    @param direction: either constants.DIRECTION_DEPARTURE or constants.DIRECTION_ARRIVAL
    @type direction: str
    @return: two dataFrames: the airport data joined with the per-airport counts (rows containing missing
             values removed), and that table aggregated over 'iso_region'
    @rtype: tuple
    """
    assert isinstance(df, pd.DataFrame)
    assert isinstance(direction, str)
    assert direction in (constants.DIRECTION_ARRIVAL, constants.DIRECTION_DEPARTURE)

    if direction == constants.DIRECTION_DEPARTURE:
        airport_col, count_col = 'ORIGIN', "ORIGIN_COUNT"
    elif direction == constants.DIRECTION_ARRIVAL:
        airport_col, count_col = 'DEST', "DEST_COUNT"

    # Only flights that were not cancelled take part in the count.
    flown = df[df['CANCELLED'] != 1]
    airports = read_csv_file(constants.CLEANED_AIRPORT_DATA_PATH)

    per_airport = count(flown, airport_col, count_col)
    airport_counts = merge(airports, per_airport, 'iata_code', airport_col)
    airport_counts = airport_counts.dropna()

    state_counts = aggregate(airport_counts, 'iso_region', count_col)
    return airport_counts, state_counts
示例#2
0
def get_airline_route_by_state(df_origin, df_dest, airline):
    """
    Compute how the flights of one airline are distributed geographically.

    For the given carrier code (for example "AA") the flights are counted per airport, once over the ORIGIN
    column of df_origin and once over the DEST column of df_dest. The two counts are added into
    'route_counts', joined with the airport data to obtain 'iso_region', aggregated per 'iso_region' and
    finally, after joining the US region division table on 'State Code', aggregated per 'Region'.
    The joins and aggregations rely on the `merge` and `aggregate` helpers defined elsewhere in the project.
    @param df_origin: origin flight dataFrame; needs the OP_CARRIER and ORIGIN columns
    @type df_origin: pd.DataFrame
    @param df_dest: destination flight dataFrame; needs the OP_CARRIER and DEST columns
    @type df_dest: pd.DataFrame
    @param airline: the airline code matched against OP_CARRIER
    @type airline: str
    @return: 'route_counts' of this airline aggregated over 'Region'
    @rtype: pd.DataFrame
    """
    assert isinstance(df_origin, pd.DataFrame)
    assert isinstance(df_dest, pd.DataFrame)
    assert isinstance(airline, str)

    regions = read_csv_file(constants.US_REGION_DIVISION_DATA_PATH)
    airports = read_csv_file(constants.CLEANED_AIRPORT_DATA_PATH)

    # Per-airport number of flights of this airline, seen as departure airport ...
    departures = df_origin[df_origin['OP_CARRIER'] == airline]['ORIGIN']
    origin_cnts = departures.value_counts().rename_axis('iata_code')
    origin_cnts = origin_cnts.reset_index(name='origin_counts')

    # ... and seen as arrival airport.
    arrivals = df_dest[df_dest['OP_CARRIER'] == airline]['DEST']
    dest_cnts = arrivals.value_counts().rename_axis('iata_code')
    dest_cnts = dest_cnts.reset_index(name='dest_counts')

    route_cnts = merge(origin_cnts, dest_cnts, 'iata_code', 'iata_code')
    route_cnts['route_counts'] = (route_cnts['origin_counts']
                                  + route_cnts['dest_counts'])

    # Attach the state of every airport and keep only the columns needed below.
    with_state = merge(route_cnts, airports, 'iata_code', 'iata_code')
    with_state = with_state[['iata_code', 'route_counts', 'iso_region']]
    by_state = aggregate(with_state, 'iso_region', 'route_counts')

    by_region = pd.merge(by_state,
                         regions,
                         left_on='iso_region',
                         right_on='State Code')
    return aggregate(by_region, 'Region', 'route_counts')
示例#3
0
def get_flight_data_by_year(year, used_cols=None):
    """
    Load the flight data of one year and optionally restrict it to the given columns.

    The data is read with `read_csv_file` from the path constants.ROOT + "<year>.csv".
    @param year: input year
    @type year: int
    @param used_cols: the columns to keep; None or an empty list returns every column
    @type used_cols: list
    @return: flight dataframe
    @rtype: pd.DataFrame
    """
    assert isinstance(year, int)
    # None replaces the former mutable `[]` default; an explicit list is still required otherwise.
    assert used_cols is None or isinstance(used_cols, list)

    df_year = read_csv_file(constants.ROOT + str(year) + '.csv')
    if not used_cols:
        return df_year
    return df_year[used_cols]
示例#4
0
def count_cancellation_by_airport():
    """
    Collect flight and cancellation statistics for every year in constants.YEAR_LIST.

    For each year the flight data is joined with the airport data on the ORIGIN airport code and then:
      - all flights are counted per ('iso_region', 'month'), where month is the second '-'-separated field
        of FL_DATE;
      - the cancelled flights (CANCELLED != 0) are kept with the columns FL_DATE, iso_region and
        CANCELLATION_CODE, FL_DATE being cut at its last '-' (presumably leaving "YYYY-MM" -- assumes
        FL_DATE is formatted "YYYY-MM-DD", TODO confirm);
      - the number of those cancelled rows carrying code 'A', 'B', 'C' and 'D' is recorded.
    @return: (all_records, cancel_records, code_a, code_b, code_c, code_d): the concatenated per-year count
             dataFrames, the concatenated per-year cancellation dataFrames, and four lists holding one
             count per year for the cancellation codes 'A' to 'D'. Both dataFrames are empty when
             constants.YEAR_LIST is empty.
    @rtype: tuple
    """
    code_a, code_b, code_c, code_d = [], [], [], []
    # Per-year frames are gathered in lists and concatenated once at the end: DataFrame.append no longer
    # exists in pandas 2.x and re-copied the accumulated rows on every iteration.
    yearly_counts = []
    yearly_cancels = []

    df_us_airport = read_csv_file(constants.CLEANED_AIRPORT_DATA_PATH)

    for year in constants.YEAR_LIST:
        df_cur = get_flight_data_by_year(year, [])

        # assign() builds a new frame, so the month column is never written into a slice of df_cur.
        df_all = df_cur[['FL_DATE', 'ORIGIN']].assign(
            month=df_cur['FL_DATE'].str.split('-').str[1])
        df_all = merge(df_us_airport, df_all, 'iata_code', 'ORIGIN')
        df_all = df_all[['iso_region', 'month']].dropna()
        yearly_counts.append(
            df_all.groupby(['iso_region',
                            'month']).size().reset_index(name='counts'))
        del df_all

        df_cancel = df_cur[df_cur['CANCELLED'] != 0]
        df_cancel = merge(df_us_airport, df_cancel, 'iata_code', 'ORIGIN')
        df_cancel = df_cancel[['FL_DATE', 'iso_region',
                               'CANCELLATION_CODE']].dropna()
        df_cancel = df_cancel.assign(
            FL_DATE=df_cancel['FL_DATE'].str.rsplit(pat='-', n=1).str[0])

        # One pass over the code column instead of four boolean-mask scans; codes that do not occur
        # count as 0 and the values are converted to plain ints as before.
        per_code = df_cancel['CANCELLATION_CODE'].value_counts()
        code_a.append(int(per_code.get('A', 0)))
        code_b.append(int(per_code.get('B', 0)))
        code_c.append(int(per_code.get('C', 0)))
        code_d.append(int(per_code.get('D', 0)))
        yearly_cancels.append(df_cancel.reset_index(drop=True))

    # pd.concat raises on an empty list, so fall back to the empty frame the function used to return.
    all_records = pd.concat(yearly_counts) if yearly_counts else pd.DataFrame()
    cancel_records = pd.concat(yearly_cancels) if yearly_cancels else pd.DataFrame()
    return all_records, cancel_records, code_a, code_b, code_c, code_d
示例#5
0
def prepare_delay(df, direction, min_flights=50):
    """
    Prepare average-delay tables for one travel direction, per airport and per state.

    Rows whose CANCELLED value equals 1 are dropped first. The direction picks the airport column
    ("ORIGIN" / "DEST"), the delay column ("DEP_DELAY" / "ARR_DELAY") and the count column
    ("ORIGIN_COUNT" / "DEST_COUNT"). The per-airport counting, aggregation, joining and averaging are
    done by the `count`, `aggregate`, `merge` and `average` helpers defined elsewhere in the project.
    @param df: input flight dataFrame; must provide a CANCELLED column
    @type df: pd.DataFrame
    @param direction: either constants.DIRECTION_DEPARTURE or constants.DIRECTION_ARRIVAL
    @type direction: str
    @param min_flights: airports are kept only when their flight count is strictly greater than this value
    @type min_flights: int
    @return: two dataFrames: the result of `average` for the airport-level table and the result of
             `average` for the table summed per 'iso_region'
    @rtype: tuple
    """
    assert isinstance(df, pd.DataFrame)
    assert isinstance(direction, str)
    assert direction == constants.DIRECTION_ARRIVAL or direction == constants.DIRECTION_DEPARTURE

    if direction == constants.DIRECTION_DEPARTURE:
        airport_type = 'ORIGIN'
        delay_type = 'DEP_DELAY'
        count_type = "ORIGIN_COUNT"
    elif direction == constants.DIRECTION_ARRIVAL:
        airport_type = 'DEST'
        delay_type = 'ARR_DELAY'
        count_type = "DEST_COUNT"

    df_delay = df[df['CANCELLED'] != 1]
    df_us_airport = read_csv_file(constants.CLEANED_AIRPORT_DATA_PATH)

    # airport -> flight count
    df_cnts = count(df_delay, airport_type, count_type)

    # airport -> aggregated delay
    df_delay_cnts = aggregate(df_delay, airport_type, delay_type)

    # airport -> flight count + aggregated delay
    df_delay_cnts = merge(df_cnts, df_delay_cnts, airport_type, airport_type)
    # Drop airports with too few flights.
    df_delay_cnts = df_delay_cnts[df_delay_cnts[count_type] > min_flights]

    # airport info + flight count + aggregated delay
    df_delay_by_airport = merge(df_us_airport, df_delay_cnts, 'iata_code',
                                airport_type).dropna()

    # state -> summed delay and summed flight count, in a single group-by. The method form of sum
    # avoids handing the builtin `sum` to agg(), which newer pandas versions warn about.
    df_delay_cnts_by_state = df_delay_by_airport \
        .groupby('iso_region')[[delay_type, count_type]] \
        .sum() \
        .reset_index()

    # state -> average delay
    df_delay_by_state = average(df_delay_cnts_by_state, delay_type, count_type)

    # airport info -> average delay
    df_delay_by_airport = average(df_delay_by_airport, delay_type, count_type)

    return df_delay_by_airport, df_delay_by_state