def taxi_regex_patterns(taxi_type='all'):
    """Creates a regex pattern for specified taxi type.

    Parameters
    ----------
    taxi_type : str
        Taxi type to create regex for (fhv, green, yellow, or all).

    Returns
    -------
    pattern : regex or None
        Compiled regex pattern for the specified taxi type, or None if
        taxi_type is not recognized.

    Notes
    -----
    """

    # define taxi type regex pattern
    if taxi_type == 'fhv':
        pattern = re.compile(r'fhv_tripdata_.+\.csv')
    elif taxi_type == 'green':
        pattern = re.compile(r'green_tripdata_.+\.csv')
    elif taxi_type == 'yellow':
        pattern = re.compile(r'yellow_tripdata_.+\.csv')
    elif taxi_type == 'all':
        pattern = re.compile(r'(fhv|green|yellow)_tripdata_.+\.csv')
    else:
        output('Unknown taxi_type.', fn_str='taxi_regex_patterns')
        return None

    return pattern
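
# Illustrative usage (a sketch, not part of the original module): the patterns
# above are meant to match monthly NYC TLC trip record filenames, e.g.
#   pattern = taxi_regex_patterns('yellow')
#   pattern.match('yellow_tripdata_2012-01.csv')   # -> match object
#   pattern.match('fhv_tripdata_2015-01.csv')      # -> None
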
def load_loaddate(date, load_type, dl_dir, verbose=0):
    """Loads a nyiso load data file (one day of data) into a dataframe.
    Assumes the file is zipped with other files for that month.

    Parameters
    ----------
    date : str
        Date to load data for. Assumes 'yearmonthday' format (e.g. '20121030').

    load_type : str
        Defines type of load data. Current valid arguments: 'palIntegrated' (
        integrated real-time) and 'isolf' (load forecast).

    dl_dir : str
        Path to the directory containing downloaded zip files. Assumes each
        zip file is of the following format:
        'yearmonth01{load_type}_csv.zip' (e.g. '20121001palIntegrated_csv.zip').

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe of one day of load data.

    Notes
    -----
    """

    if verbose >= 2:
        output('Started loading {load_type} file for {date} from '
               '\"{dl_dir}\".'.format(load_type=load_type, date=date,
                                  dl_dir=dl_dir))

    if load_type not in ['palIntegrated', 'isolf']:
        raise ValueError('Unknown type argument: {load_type}. See docs for '
                         'valid types'.format(load_type=load_type))
    elif len(date) != 8:
        raise ValueError('Incorrect format for date argument: {date}. Must '
                         'be yearmonthday with 8 characters.'.format(date=date))

    # read file into dataframe
    zip_path = dl_dir + date[0:6] + '01{load_type}_csv.zip'.format(
        load_type=load_type)
    file_path = date + load_type + '.csv'
    with zipfile.ZipFile(zip_path) as zip_file:
        with zip_file.open(file_path) as csv_file:
            df = pd.read_csv(csv_file)

    if verbose >= 2:
        output('Finished loading {load_type} file for {date} from '
               '\"{dl_dir}\".'.format(load_type=load_type, date=date,
                                  dl_dir=dl_dir))

    return df
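
# Illustrative usage (a sketch; the directory path is hypothetical): load one
# day of integrated real-time load data from its monthly zip archive, e.g.
#   df = load_loaddate('20121030', load_type='palIntegrated',
#                      dl_dir='data/raw/nyiso/', verbose=2)
# which reads '20121030palIntegrated.csv' out of
# 'data/raw/nyiso/20121001palIntegrated_csv.zip'.
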
def clean_payment_type(df, verbose=0):
    """Cleans the payment_type column.

    Parameters
    ----------
    df : dataframe
        Dataframe to clean.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe with cleaned column.

    Notes
    -----
    """

    col_names = list(pd.Series(df.columns.values))
    if 'payment_type' in col_names:

        # replace payment_type values with IDs
        payment_str = 'payment_type'
        df[payment_str] = df[payment_str].replace(['Credit', 'CREDIT', 'CRE',
                                                   'Cre', 'CRD'], '1')
        df[payment_str] = df[payment_str].replace(['CASH', 'Cash', 'CAS',
                                                   'Cas', 'CSH'], '2')
        df[payment_str] = df[payment_str].replace(['No', 'No ', 'No Charge',
                                                   'NOC'], '3')
        df[payment_str] = df[payment_str].replace(['Dis', 'DIS', 'Dispute'],
                                                  '4')
        df[payment_str] = df[payment_str].replace(['UNK', 'C', 'NA', 'NA '],
                                                  '5')
        df[payment_str] = df[payment_str].replace(['Voided trip'], '6')
        df[payment_str] = df[payment_str].astype('int')
        if verbose >= 2:
            output('Finished replacing ' + payment_str + ' with IDs.')

        # check that values match expected
        expected_values = [1, 2, 3, 4, 5, 6]
        match = check_expected_list(df, payment_str, expected_values,
                                    verbose=verbose)
        if not match:
            raise ValueError('Unexpected ' + payment_str + ' value(s).')

    elif verbose >= 2:
        output('Unable to clean payment_type column due to missing column.')

    return df
def add_trip_columns(df, verbose=0):
    """Adds calculated trip columns to the dataframe. Assumes the dataframe
    has already been cleaned. Also removes any trips with unreasonable
    values. Can only calculate distance-related trip data for records with
    pickup/dropoff lat/lon data.

    Parameters
    ----------
    df : dataframe
        Dataframe to add trip calculation columns to.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe with added columns.

    Notes
    -----
    """

    col_names = list(pd.Series(df.columns.values))

    # add trip_duration column
    if ('dropoff_datetime' in col_names) and ('pickup_datetime' in col_names):
        df['trip_duration'] = (df['dropoff_datetime'] - df['pickup_datetime']) \
                              / np.timedelta64(1, 's')
        if verbose >= 2:
            output('Finished adding trip duration column.')
    elif verbose >= 2:
        output('Unable to add trip_duration column due to missing columns.')

    # add calculated trip columns
    if ('pickup_longitude' in col_names) and \
       ('pickup_latitude' in col_names) and \
       ('dropoff_longitude' in col_names) and \
       ('dropoff_latitude' in col_names):

        # add trip_pace column
        df['trip_distance'].replace(0, np.nan, inplace=True)
        df['trip_pace'] = df['trip_duration'] / df['trip_distance']

        # add trip_straightline_distance column
        df['trip_straightline'] = haversine(df['pickup_latitude'],
                                            df['pickup_longitude'],
                                            df['dropoff_latitude'],
                                            df['dropoff_longitude'])

        # add trip_windingfactor column
        df['trip_windingfactor'] = df['trip_distance'] / df['trip_straightline']

        if verbose >= 2:
            output('Finished adding calculated trip columns.')
    elif verbose >= 2:
        output('Unable to add calculated trip columns due to missing columns.')

    return df
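
# haversine() above is assumed to be a vectorized great-circle distance helper
# defined elsewhere in this module. A minimal sketch of such a helper follows
# (hypothetical name to avoid clashing with the real one; the Earth radius in
# miles is an assumption chosen to match the units of trip_distance; numpy is
# assumed to be imported as np, as elsewhere in this file).
def _haversine_sketch(lat1, lon1, lat2, lon2, radius=3958.8):
    """Great-circle distance between (lat, lon) pairs, in miles."""
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * radius * np.arcsin(np.sqrt(a))
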
def clean_store_and_fwd_flag(df, verbose=0):
    """Cleans the store_and_fwd_flag column.

    Parameters
    ----------
    df : dataframe
        Dataframe to clean.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe with cleaned column.

    Notes
    -----
    """

    col_names = list(pd.Series(df.columns.values))
    if 'store_and_fwd_flag' in col_names:

        # replace store_and_fwd_flag values with IDs
        store_str = 'store_and_fwd_flag'
        df[store_str] = df[store_str].replace(r'\s+', np.nan, regex=True)
        df[store_str] = df[store_str].replace(['*', '2', 2], np.nan)
        df[store_str] = df[store_str].replace(['N', '0'], 0)
        df[store_str] = df[store_str].replace(['Y', '1'], 1)
        df[store_str] = df[store_str].astype('float')
        df[store_str] = df[store_str].round()
        if verbose >= 2:
            output('Finished replacing ' + store_str + ' with IDs.')

        # check that values match expected
        expected_values = [0, 1, np.nan]
        match = check_expected_list(df, store_str, expected_values, verbose=verbose)
        if not match:
            raise ValueError('Unexpected ' + store_str + ' value(s).')

    elif verbose >= 2:
        output('Unable to clean store_and_fwd_flag column due to missing '
               'column.')

    return df
def clean_vendor_id(df, verbose=0):
    """Cleans the vendor_id column.

    Parameters
    ----------
    df : dataframe
        Dataframe to clean.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe with cleaned column.

    Notes
    -----
    """

    col_names = list(pd.Series(df.columns.values))
    if 'vendor_id' in col_names:

        # replace vendor_id values with IDs
        vendor_str = 'vendor_id'
        df[vendor_str] = df[vendor_str].replace('CMT', '1')
        df[vendor_str] = df[vendor_str].replace('DDS', '3')
        df[vendor_str] = df[vendor_str].replace('VTS', '4')
        df[vendor_str] = df[vendor_str].astype('int')
        if verbose >= 2:
            output('Finished replacing ' + vendor_str + ' with IDs.')

        # check that values match expected
        expected_values = [1, 2, 3, 4]
        match = check_expected_list(df, vendor_str, expected_values,
                                    verbose=verbose)
        if not match:
            raise ValueError('Unexpected ' + vendor_str + ' value(s).')

    elif verbose >= 2:
        output('Unable to clean vendor_id column due to missing column.')

    return df
def clean_lat_lon(df, verbose=0):
    """Cleans the latitude and longitude columns.

    Parameters
    ----------
    df : dataframe
        Dataframe to clean.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe with cleaned column.

    Notes
    -----
    """

    col_names = list(pd.Series(df.columns.values))
    if ('pickup_longitude' in col_names) and \
       ('pickup_latitude' in col_names) and \
       ('dropoff_longitude' in col_names) and \
       ('dropoff_latitude' in col_names):

        # replace lat/lon outside of possible ranges with nan
        df.loc[abs(df['pickup_latitude']) > 90, 'pickup_latitude'] = np.nan
        df.loc[abs(df['dropoff_latitude']) > 90, 'dropoff_latitude'] = np.nan
        df.loc[abs(df['pickup_longitude']) > 180, 'pickup_longitude'] = np.nan
        df.loc[abs(df['dropoff_longitude']) > 180, 'dropoff_longitude'] = np.nan
        if verbose >= 2:
            output('Finished replacing lat/lon outside of possible ranges with '
                   'nan.')

    elif verbose >= 1:
        output('Unable to clean lat/lon columns due to missing columns.')

    return df
def clean_column_names(df, year, verbose=0):
    """Cleans the dataframe column names. Column names are loosely based on
    "data_dictionary_trip_records_yellow.pdf".

    Parameters
    ----------
    df : dataframe
        Dataframe to clean.

    year : int
        Year data comes from.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe with cleaned column.

    Notes
    -----
    """

    # update column names
    df = df.rename(index=str, columns=col_names_dict(year))
    if verbose >= 2:
        output('Finished re-naming columns.')

    # add taxi_type column (2 for yellow)
    df.insert(0, 'taxi_type', 2)

    # check that column names match expected
    expected_names = ['taxi_type', 'vendor_id', 'pickup_datetime',
                      'dropoff_datetime', 'passenger_count', 'trip_distance',
                      'pickup_longitude', 'pickup_latitude',
                      'pickup_location_id', 'rate_code_id',
                      'store_and_fwd_flag', 'dropoff_longitude',
                      'dropoff_latitude', 'dropoff_location_id',
                      'payment_type', 'fare_amount', 'extra', 'mta_tax',
                      'improvement_surcharge', 'tip_amount', 'tolls_amount',
                      'total_amount']
    col_names = pd.Series(df.columns.values)
    col_names_in = col_names.isin(expected_names)
    if verbose >= 3:
        output('Column names: ')
        print(col_names)
        print('')
    if not all(col_names_in):
        col_names_not_in = [not i for i in col_names_in]
        output('Error : Unexpected column name(s).', 'clean_column_names')
        print(col_names[col_names_not_in])
        raise ValueError('Unexpected column name(s).')

    return df
def create_forecast_err(db_path, load_table, forecast_table, overwrite=False,
                        verbose=0):
    """Creates a table and dataframe of load forecast error. Error is
    calculated as percent error relative to the actual load.

    I.e. error = (forecast - actual) / actual

    Parameters
    ----------
    db_path : str
        Path to sqlite database to create or connect to.

    load_table : str
        Name of the db table containing actual load data (i.e.
        based on palIntegrated data).

    forecast_table : str
        Name of the db table containing load forecast data (i.e. based on
        isolf).

    overwrite : bool
        Defines whether or not to overwrite existing table.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe written to db table.

    Notes
    -----
    """

    if verbose >= 1:
        output('Started creating or updating forecast_error table.')

    # query actual loads
    sql = """
            SELECT datetimeUTC, zone_id, integrated_load
            FROM {load_table}
          ;""".format(load_table=load_table)
    df_load = query(db_path, sql)
    df_load['datetimeUTC'] = pd.to_datetime(df_load['datetimeUTC'])
    df_load = df_load.set_index(['datetimeUTC', 'zone_id'])

    # query forecast loads
    sql = """
            SELECT datetimeUTC, zone_id, load_forecast_p0, load_forecast_p1,
                load_forecast_p2, load_forecast_p3, load_forecast_p4, 
                load_forecast_p5, load_forecast_p6
            FROM {forecast_table}
          ;""".format(forecast_table=forecast_table)
    df_forecast = query(db_path, sql)
    df_forecast['datetimeUTC'] = pd.to_datetime(df_forecast['datetimeUTC'])
    df_forecast = df_forecast.set_index(['datetimeUTC', 'zone_id'])

    # calculate relative forecast errors
    df = pd.merge(df_load, df_forecast, how='inner', left_index=True,
                  right_index=True)
    del df_load, df_forecast
    for p in range(7):
        load_col = 'load_forecast_p{p}'.format(p=p)
        err_col = 'forecast_error_p{p}'.format(p=p)
        df[err_col] = ((df[load_col] - df['integrated_load'])
                       / df['integrated_load'])
    df = df.drop(['load_forecast_p{p}'.format(p=p) for p in range(7)], axis=1)

    # create table
    sql = """
            CREATE TABLE IF NOT EXISTS forecast_error (
                rowid INTEGER PRIMARY KEY,
                datetimeUTC TEXT,
                zone_id INTEGER,
                integrated_load REAL,
                forecast_error_p0 REAL,
                forecast_error_p1 REAL,
                forecast_error_p2 REAL,
                forecast_error_p3 REAL,
                forecast_error_p4 REAL,
                forecast_error_p5 REAL,
                forecast_error_p6 REAL
          ); """
    indexes = ['CREATE UNIQUE INDEX IF NOT EXISTS '
               'forecast_error_datetimeUTC_zone_id ON forecast_error '
               '(datetimeUTC, zone_id);'
               ]
    create_table(db_path=db_path, table='forecast_error', create_sql=sql,
                 indexes=indexes,
                 overwrite=overwrite, verbose=verbose)

    # write data to table
    df_write = df.reset_index()
    df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize(
        None)
    df_to_table(db_path, df_write, table='forecast_error', overwrite=False,
                verbose=verbose)

    if verbose >= 1:
        output('Finished creating or updating forecast_error table. Dataframe '
               'shape is ' + str(df.shape) + '.')
    return df
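
# Illustrative usage (a sketch; the database path and actual-load table name
# are hypothetical, while 'load_forecast' is the table created by
# import_load_forecast elsewhere in this file):
#   df_err = create_forecast_err('data/processed/nyiso.db',
#                                load_table='load',
#                                forecast_table='load_forecast',
#                                overwrite=True, verbose=1)
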
def create_expected_load(db_path, summary_table, zones_path,
                         datetimeUTC_range_ref, datetimeUTC_range_excl=None,
                         title=None, overwrite=False, verbose=0):
    """Creates a table and dataframe of expected data from the summary_table
    table. Expectation includes mean and variance of integrated_load for the
    specified reference datetime range. Expectation is calculated for every
    possible dayofweek-hour-zone combination, with NaNs for those missing data.

    Parameters
    ----------
    db_path : str
        Path to sqlite database to create or connect to.

    summary_table : str
        Name of the db summary table containing data to calculate
        expected integrated_load from.

    zones_path : str
        Path to csv containing all zone_id values (maps zone_id to zone_name).

    datetimeUTC_range_ref : tuple
        Specifies the start and end of the reference time period to use when
        calculating expected values (inclusive). Specify as a 2-element
        tuple of UTC datetime strings with year-month-day and
        hour:minutes:seconds.

    datetimeUTC_range_excl : tuple
        Specifies the start and end of time period to exclude from reference
        time period. Specify as a 2-element tuple of UTC datetime strings with
        year-month-day and hour:minutes:seconds.

    title : str
        Defines the suffix of the expected_load_[title] table to be created.

    overwrite : bool
        Defines whether or not to overwrite existing table.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_exp : dataframe
        Dataframe written to db table.

    Notes
    -----
    datetimeUTC_range_ref items should be UTC, but timezone-naive (since
    sqlite does not handle time zones). For example, use the following to
    select reference data for Jan. 1 - Dec. 31 2012 (Eastern):
    start = pd.Timestamp('2012-01-01 00:00:00', tz='America/New_York')
    end = pd.Timestamp('2012-12-31 23:59:59', tz='America/New_York')
    datetimeUTC_range_ref = (start.tz_convert(tz='UTC').tz_localize(None),
                            end.tz_convert(tz='UTC').tz_localize(None))
    """

    table = 'expected_load_{title}'.format(title=title)
    if verbose >= 1:
        output('Started creating or updating {table} table.'.format(
            table=table))

    # query range of zone_id values to consider
    df_zones = pd.read_csv(zones_path)
    zones = df_zones['zone_id'].unique()
    del df_zones

    # query reference data
    if datetimeUTC_range_excl:
        sql = """
            SELECT datetimeUTC, zone_id, integrated_load
            FROM {summary_table}
            WHERE
                (datetimeUTC BETWEEN "{start_datetime}" AND "{end_datetime}")
                AND (datetimeUTC NOT BETWEEN "{start_datetime_excl}" AND 
                    "{end_datetime_excl}")
        ;""".format(summary_table=summary_table,
                    start_datetime=datetimeUTC_range_ref[0],
                    end_datetime=datetimeUTC_range_ref[1],
                    start_datetime_excl=datetimeUTC_range_excl[0],
                    end_datetime_excl=datetimeUTC_range_excl[1])
    else:
        sql = """
            SELECT datetimeUTC, zone_id, integrated_load
            FROM {summary_table}
            WHERE
                (datetimeUTC BETWEEN "{start_datetime}" AND "{end_datetime}")
        ;""".format(summary_table=summary_table,
                    start_datetime=datetimeUTC_range_ref[0],
                    end_datetime=datetimeUTC_range_ref[1])
    df = query(db_path, sql)

    # add dayofweek (0 = Monday) and hour (0-23)
    df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC'])
    df['datetimeUTC'] = [dtUTC.tz_localize(tz='UTC') for dtUTC in
                         df['datetimeUTC']]
    df['datetime'] = [dtUTC.tz_convert(tz='America/New_York') for dtUTC in
                      df['datetimeUTC']]

    df['dayofweek'] = df['datetime'].dt.dayofweek
    df['hour'] = df['datetime'].dt.hour

    # calculate mean and variance for each dayofweek-hour-zone combination
    expected = []
    for dayofweek in range(7):
        for hour in range(24):
            for zone in zones:
                # filter to current dayofweek, hour, and zone
                df_filter = df[(df['dayofweek'] == dayofweek) &
                               (df['hour'] == hour) &
                               (df['zone_id'] == zone)]

                # calculate mean and variance
                if not df_filter.empty:
                    mean_integrated_load = np.mean(
                        df_filter['integrated_load'].values)
                    var_integrated_load = np.var(
                        df_filter['integrated_load'].values)
                    num_rows = df_filter.shape[0]
                    expected.append([dayofweek, hour, zone,
                                     mean_integrated_load, var_integrated_load,
                                     num_rows])
                else:
                    expected.append([dayofweek, hour, zone,
                                     np.nan, np.nan, np.nan])
    df_exp = pd.DataFrame(expected,
                          columns=['dayofweek', 'hour', 'zone_id',
                                   'mean_integrated_load',
                                   'var_integrated_load', 'num_rows'])

    # create table
    sql = """
            CREATE TABLE IF NOT EXISTS {table} (
                rowid INTEGER PRIMARY KEY,
                dayofweek INTEGER,
                hour INTEGER,
                zone_id INTEGER,
                mean_integrated_load FLOAT,
                var_integrated_load FLOAT,
                num_rows INTEGER
            ); """.format(table=table)
    create_table(db_path=db_path, table=table, create_sql=sql, indexes=[],
                 overwrite=overwrite, verbose=verbose)

    # write data to table
    df_to_table(db_path, df_exp, table=table, overwrite=False,
                verbose=verbose)

    if verbose >= 1:
        output('Finished creating or updating {table} table. Dataframe shape '
               'is '.format(table=table) + str(df_exp.shape) + '.')

    return df_exp
def clean_isolf(df, to_zoneid=False, zones_path=None, verbose=0):
    """Cleans a dataframe of nyiso load forecast data. Cleaning involves:
    renaming columns, converting datetimes, setting indexes, removing
    columns, converting to zone_id, and reshaping.

    Parameters
    ----------
    df : dataframe
        Dataframe to clean.

    to_zoneid : bool
        If True, converts zone names to zone ids, based on zones_path csv
        (zones_path must be defined if True). If False, leaves the zone_name
        column.

    zones_path : str or None
        Path to csv mapping zone_id to zone_name. Required if to_zoneid is True.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Cleaned dataframe.

    Notes
    -----
    """

    if verbose >= 2:
        output('Started cleaning dataframe.')

    # clean column names
    df = df.rename(columns={'Time Stamp': 'datetimeNY',
                            'Capitl': 'CAPITL',
                            'Centrl': 'CENTRL',
                            'Dunwod': 'DUNWOD',
                            'Genese': 'GENESE',
                            'Hud Vl': 'HUD VL',
                            'Longil': 'LONGIL',
                            'Mhk Vl': 'MHK VL',
                            'Millwd': 'MILLWD',
                            'N.Y.C.': 'N.Y.C.',
                            'North': 'NORTH',
                            'West': 'WEST'})

    # clean datetime
    df['datetimeNY'] = pd.to_datetime(df['datetimeNY'], format='%m/%d/%Y %H:%M')

    if any(df.duplicated('datetimeNY')):
        # deal with ambiguous time zone due to end of DST (two 01:00 entries)
        transition_idx = next(
            i for i, val in enumerate(df.duplicated('datetimeNY'))
            if val)
        datetimes = []
        for i, val in enumerate(df['datetimeNY']):
            if i < transition_idx:
                datetimes.append(val.tz_localize(tz='America/New_York',
                                                 ambiguous=True))
            else:
                datetimes.append(val.tz_localize(tz='America/New_York',
                                                 ambiguous=False))
        df['datetimeNY'] = datetimes
    else:
        df['datetimeNY'] = [datetime.tz_localize(tz='America/New_York') for
                            datetime in df['datetimeNY']]

    # set index
    df = df.set_index('datetimeNY')

    # remove columns
    df = df.drop(['NYISO'], axis=1)

    # clean zone_id
    if to_zoneid:
        zone_col = 'zone_id'
        if zones_path:
            df_zones = pd.read_csv(zones_path)
            zones = dict(zip(df_zones['name'], df_zones['zone_id']))
            df = df.rename(columns=zones)
        else:
            raise ValueError('Must provide zones_path argument if to_zoneid is '
                             'True.')
    else:
        zone_col = 'zone_name'

    # reshape dataframe
    s = df.stack()
    s.index.names = ['datetimeNY', zone_col]
    df = pd.DataFrame(s.rename('load_forecast'))
    df = df.sort_index(level=0)
    dates = df.index.get_level_values(0).date
    for i, date in enumerate(pd.unique(dates)):
        col_name = 'load_forecast_p' + str(i)
        df[col_name] = np.nan
        s = df[dates == date]['load_forecast'].copy()
        s = s.rename(col_name)
        df.update(s)
    df = df.drop('load_forecast', axis=1)

    # add utc column
    datetimeUTC = [datetime.tz_convert('UTC') for datetime in
                   df.index.get_level_values(0)]
    df.insert(0, 'datetimeUTC', datetimeUTC)

    if verbose >= 2:
        output('Finished cleaning dataframe.')

    return df
def clean_palint(df, to_zoneid=False, zones_path=None, verbose=0):
    """Cleans a dataframe of nyiso integrated real-time actual load data.
    Cleaning involves: renaming columns, converting datetimes (assumes ny
    timezone), converting to zone_id, removing columns, and setting indexes.

    Parameters
    ----------
    df : dataframe
        Dataframe to clean.

    to_zoneid : bool
        If True, converts zone names to zone ids, based on zones_path csv
        (zones_path must be defined if True). If False, leaves the zone_name
        column.

    zones_path : str or None
        Path to csv mapping zone_id to zone_name. Required if to_zoneid is True.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Cleaned dataframe.

    Notes
    -----
    """

    if verbose >= 2:
        output('Started cleaning dataframe.')

    # clean column names
    df = df.rename(columns={'Time Stamp': 'datetime',
                            'Time Zone': 'timezone',
                            'Name': 'name',
                            'Integrated Load': 'integrated_load'})

    # clean datetime
    df['datetime'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M:%S')
    offset = df['timezone'].replace({'EDT': pd.Timedelta('4 hours'),
                                     'EST': pd.Timedelta('5 hours')})
    df['datetimeUTC'] = offset + pd.to_datetime(df['datetime'],
                                                format='%m/%d/%Y %H:%M:%S')
    df['datetimeUTC'] = [dtUTC.tz_localize(tz='UTC') for dtUTC in
                         df['datetimeUTC']]

    # clean zone_id
    if to_zoneid:
        zone_col = 'zone_id'
        if zones_path:
            df_zones = pd.read_csv(zones_path)
            zones = dict(zip(df_zones['name'], df_zones['zone_id']))
            df['zone_id'] = df['name'].replace(zones)
        else:
            raise ValueError('Must provide zones_path argument if to_zoneid is '
                             'True.')
    else:
        zone_col = 'zone_name'

    # remove columns
    df = df[['datetimeUTC', zone_col, 'integrated_load']]

    # set index
    df = df.set_index(['datetimeUTC', zone_col])
    df = df.sort_index(level=0)

    if verbose >= 2:
        output('Finished cleaning dataframe.')

    return df
def max_cross_corr(df, col1, col2, zone_col, shifts, min_overlap, verbose=0):
    """Creates a dataframe containing the time shift that maximizes
    cross-correlation between two time series, the max cross-correlation value,
    and the number of overlapping data points in those series.

    Parameters
    ----------
    df : dataframe
        Dataframe containing time series data (e.g. from
        create_timeseries). Assumes dataframe is multi-indexed by zone_col and
        timedelta (in hours).

    col1 : str
        Name of column containing first time series.

    col2 : str
        Name of column containing second time series. This is the shifted
        time series, where col2_shifted = col2 + shift.

    zone_col : str
        Name of spatial zone index.

    shifts : list
        List of time shifts to apply to 2nd time series (in hours).

    min_overlap : int
        Minimum number of overlapping data points (after the 2nd series is time
        shifted) needed to calculate cross-correlation.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_max_rho : dataframe
        Dataframe of max cross-correlations and associated shifts and counts.

    df_rho : dataframe
        Dataframe of cross-correlations and associated shifts and counts for
        all shifts.

    Notes
    -----
    """

    df_rho = pd.DataFrame(columns=['shift', zone_col, 'rho'])
    df_count = pd.DataFrame(columns=['shift', zone_col, 'count'])
    skipped = []
    zones = pd.unique(df.index.get_level_values(zone_col))
    for shift in shifts:
        for zone in zones:
            s_y1 = df[col1].xs(zone, level=0).dropna()
            s_y2 = df[col2].xs(zone, level=0).dropna()
            s_y1.index = pd.to_timedelta(s_y1.index.values, unit='h')
            s_y2.index = pd.to_timedelta(s_y2.index.values, unit='h')

            # shift 2nd time series
            s_y2_shift = s_y2.shift(1, freq=pd.Timedelta(shift, unit='h'))

            # skip zone if not enough overlapping data points (after shift)
            df_zone = pd.concat([s_y1, s_y2_shift], axis=1).dropna()
            num_overlap = df_zone.shape[0]
            if num_overlap < min_overlap:
                df_rho = df_rho.append(
                    {
                        'shift': shift,
                        zone_col: zone,
                        'rho': np.nan
                    },
                    ignore_index=True)
                skipped.append((shift, zone))
                continue

            # normalized cross-correlation
            rho = cross_corr(df_zone[col1].values, df_zone[col2].values, True)
            df_rho = df_rho.append({
                'shift': shift,
                zone_col: zone,
                'rho': rho
            },
                                   ignore_index=True)
            df_count = df_count.append(
                {
                    'shift': shift,
                    zone_col: zone,
                    'count': num_overlap
                },
                ignore_index=True)

    # reshape and get max rhos and associated shifts and counts
    df_rho = df_rho.set_index(['shift', zone_col])
    df_rho_reshape = df_rho.reset_index()
    df_rho_reshape = df_rho_reshape.pivot(index='shift',
                                          columns=zone_col,
                                          values='rho')
    s_max_shifts = df_rho_reshape.idxmax(axis=0)
    s_max_shifts.name = 'max-shift'
    s_max_rhos = df_rho_reshape.max(axis=0)
    s_max_rhos.name = 'max-rho'
    df_count = df_count.set_index(['shift', zone_col])
    max_counts = []
    for zone in zones:
        max_shift = s_max_shifts.loc[zone]
        if np.isnan(max_shift):
            max_counts.append(np.nan)
        else:
            max_counts.append(df_count.loc[max_shift, zone].item())
    s_max_counts = pd.Series(max_counts, index=zones)
    s_max_counts.name = 'max-count'
    df_max_rho = pd.concat([s_max_rhos, s_max_shifts, s_max_counts], axis=1)

    if verbose >= 2:
        output(
            'Skipped {num_skipped} (shift, {zone}) combos: {skipped}'.format(
                num_skipped=len(skipped), zone=zone_col, skipped=skipped))

    return df_max_rho, df_rho
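
# cross_corr() above is assumed to compute the (optionally normalized)
# cross-correlation of two equal-length, already-aligned series; when
# normalized this reduces to their Pearson correlation. A minimal sketch under
# that assumption (hypothetical name to avoid clashing with the real helper):
def _cross_corr_sketch(y1, y2, normalize=True):
    y1 = np.asarray(y1, dtype=float)
    y2 = np.asarray(y2, dtype=float)
    if normalize:
        y1 = (y1 - y1.mean()) / (y1.std() * len(y1))
        y2 = (y2 - y2.mean()) / y2.std()
    return np.correlate(y1, y2)[0]
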
def load_nyiso(startdate, enddate, db_path, verbose=0):
    """Query and clean nyiso load forecast error data for the specified date
    range from a sqlite database. Assumes the database contains a
    forecast_error table created using create_forecast_err.

    Parameters
    ----------
    startdate : Timestamp
        Start date to include data from (inclusive), specified as a
        timezone-aware Pandas Timestamp object.
        E.g. startdate = pd.Timestamp('2012-10-28 00:00:00',
        tz='America/New_York')

    enddate : Timestamp
        End date to include data from (exclusive), specified as a
        timezone-aware Pandas Timestamp object.
        e.g. enddate = pd.Timestamp('2012-11-03 00:00:00',
        tz='America/New_York')

    db_path : str
        Path to sqlite database containing table.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe of forecast error data indexed by nyiso_zone and datetimeNY.

    Notes
    -----
    Sqlite date queries are inclusive for start and end, forecast_error
    datetimes are UTC.
    """

    if verbose >= 1:
        output('Started query.')

    # convert datetimes
    startdateUTC = startdate.tz_convert('UTC')
    enddateUTC = enddate.tz_convert('UTC') - pd.Timedelta('1 second')
    startdate_sql = startdateUTC.strftime("%Y-%m-%d %H:%M:%S")
    enddate_sql = enddateUTC.strftime("%Y-%m-%d %H:%M:%S")

    # load nyiso load data
    sql = """
            SELECT datetimeUTC, zone_id AS nyiso_zone,
                forecast_error_p0 AS err0
            FROM forecast_error
            WHERE
                datetimeUTC BETWEEN "{startdate_sql}" AND "{enddate_sql}"
          ;""".format(startdate_sql=startdate_sql, enddate_sql=enddate_sql)
    df = query(db_path, sql)

    # convert datetimes
    df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC'])
    df['datetimeUTC'] = [
        datetime.tz_localize(tz='UTC') for datetime in df['datetimeUTC']
    ]
    df['datetimeNY'] = [
        datetime.tz_convert('America/New_York')
        for datetime in df['datetimeUTC']
    ]

    # add and drop columns
    df['percent-err0'] = df['err0'] * 100
    df = df.drop(['datetimeUTC'], axis=1)

    # index and sort
    df = df.set_index(['nyiso_zone', 'datetimeNY'])
    df = df.sort_index(level=0)

    if verbose >= 1:
        output('[min, max] forecast error datetimeNY: [' +
               str(min(df.index.get_level_values(level=1))) + ', ' +
               str(max(df.index.get_level_values(level=1))) + '].')
        output('[min, max] forecast error: [' + str(np.nanmin(df['err0'])) +
               ', ' + str(np.nanmax(df['err0'])) + '].')
        output('Finished query.')

    return df
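
# Illustrative usage (a sketch; db_path is hypothetical): query forecast error
# (err0) for a date range from a database built with create_forecast_err, e.g.
#   start = pd.Timestamp('2012-10-28 00:00:00', tz='America/New_York')
#   end = pd.Timestamp('2012-11-03 00:00:00', tz='America/New_York')
#   df_err = load_nyiso(start, end, db_path='data/processed/nyiso.db',
#                       verbose=1)
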
def dl_urls(url_path, dl_dir, taxi_type='all', verbose=0):
    """Downloads NYC TLC taxi record files for the specified taxi type into the
    specified directory, based on a text file containing urls.

    Parameters
    ----------
    url_path : str or None
        Path to text file containing NYC TLC taxi record file urls to
        download from. Does nothing if None.

    dl_dir : str
        Path of directory to download files to.

    taxi_type : str
        Taxi type to download ('fhv', 'green', 'yellow', or 'all').
        Defaults to 'all'.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    dl_num : int
        Number of files downloaded.

    Notes
    -----
    url_path = '/Users/httran/Documents/projects/twitterinfrastructure/data
        /raw/nyctlc-triprecorddata/raw_data_urls.txt'
    dl_dir = '/Users/httran/Documents/projects/twitterinfrastructure/data/raw
        /nyctlc-triprecorddata/data/'
    """

    if not url_path:
        return

    if verbose >= 1:
        output('Started downloading taxi record files from ' +
               url_path + ' to ' + dl_dir)

    # get existing files in directory
    files = get_regex_files(dl_dir,
                            pattern=taxi_regex_patterns(taxi_type='all'))

    # get urls
    df_urls = pd.read_table(url_path, header=None, names=['url'])
    urls = df_urls.values

    # download files for specified taxi type (skip already existing ones)
    dl_num = 0
    pattern = taxi_regex_patterns(taxi_type)
    for url in urls:
        parts = url[0].split('/')
        fname = parts[-1]
        if pattern.match(fname) and (fname not in files):
            urlretrieve(url[0], dl_dir + fname)
            output('downloaded: ' + fname)
            dl_num += 1

    if verbose >= 1:
        output('Downloaded ' + str(dl_num) + ' taxi record files from ' +
               url_path + ' to ' + dl_dir)

    return dl_num
def load_nyctlc_zone_date(startdate,
                          enddate,
                          trip_type,
                          trip_count_filter,
                          db_path,
                          verbose=0):
    """Query and clean nyctlc dropoff or pickup data for the specified date
    range from a sqlite database, grouped by zone and date. Assumes the database
    contains a standard_zonedropoff_hour_sandy or
    standard_zonepickup_hour_sandy table created using
    create_standard_zone_hour.

    Parameters
    ----------
    startdate : Timestamp
        Start date to include trips from (inclusive), specified as a
        timezone-aware Pandas Timestamp object.
        E.g. startdate = pd.Timestamp('2012-10-28 00:00:00',
        tz='America/New_York')

    enddate : Timestamp
        End date to include trips from (exclusive), specified as a
        timezone-aware Pandas Timestamp object.
        e.g. enddate = pd.Timestamp('2012-11-03 00:00:00',
        tz='America/New_York')

    trip_type : str
        Trip type: 'dropoff' or 'pickup'.

    trip_count_filter : int
        Minimum number of trips required to load a data point.

    db_path : str
        Path to sqlite database containing table.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_taxi : dataframe
        Dataframe of mean pace and trip count z-scores, grouped by zone
        (location_id) and date.

    Notes
    -----
    Sqlite date queries are inclusive for start and end, datetimes in nyctlc
    database are local (i.e. NY timezone).
    """

    df_taxi = load_nyctlc_zone_hour(startdate,
                                    enddate,
                                    trip_type,
                                    trip_count_filter,
                                    db_path,
                                    verbose=verbose)

    # remove index, adjust datetime to date, and group by zone and date
    df_taxi = df_taxi.reset_index()
    df_taxi['datetimeNY'] = pd.to_datetime(df_taxi['datetimeNY']).dt.date
    df_taxi = df_taxi.groupby(['location_id', 'datetimeNY']).mean()

    if verbose >= 1:
        if trip_type == 'dropoff':
            output('[min, max] taxi pace and trips mean z-score: [' +
                   str(np.nanmin(df_taxi['zpace-drop'])) + ', ' +
                   str(np.nanmax(df_taxi['zpace-drop'])) + '], [' +
                   str(np.nanmin(df_taxi['ztrips-drop'])) + ', ' +
                   str(np.nanmax(df_taxi['ztrips-drop'])) + '].')
        elif trip_type == 'pickup':
            output('[min, max] taxi pace and trips mean z-score: [' +
                   str(np.nanmin(df_taxi['zpace-pick'])) + ', ' +
                   str(np.nanmax(df_taxi['zpace-pick'])) + '], [' +
                   str(np.nanmin(df_taxi['ztrips-pick'])) + ', ' +
                   str(np.nanmax(df_taxi['ztrips-pick'])) + '].')

    return df_taxi
def clean_yellow(df, year, month, verbose=0):
    """Cleans a dataframe of NYC TLC yellow taxi record data. Assumes all
    data is from the same year.

    Cleaning involves:
        - updating column names and adding taxi_type column
        - replacing vendor_id values with IDs
        - replacing store_and_fwd_flag values with IDs
        - replacing payment_type values with IDs
        - replacing lat/lon values outside of possible ranges with nans

    Parameters
    ----------
    df : dataframe
        Dataframe to clean.

    year : int
        Year data comes from.

    month : int
        Month data comes from.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Cleaned dataframe.

    Notes
    -----
    vendor_id = {1: ['CMT', 'Creative Mobile Technologies, LLC'], 2: 'VeriFone
    Inc.', 3: 'DDS', 4: 'VTS'}
    """

    if verbose >= 1:
        output('Started cleaning dataframe for ' + str(year) + '-' +
               str(month) + '. ')
    nrows_removed = 0

    # clean column names
    df = clean_column_names(df, year, verbose)

    # clean datetime columns
    df, nrows_removed_datetime = clean_datetime(df, year, month, verbose)
    nrows_removed += nrows_removed_datetime

    # clean vendor_id column
    df = clean_vendor_id(df, verbose)

    # clean store_and_fwd_flag column
    df = clean_store_and_fwd_flag(df, verbose)

    # clean payment_type column
    df = clean_payment_type(df, verbose)

    # clean lat/lon columns and add calculated trip columns
    df = clean_lat_lon(df, verbose)
    df = add_trip_columns(df, verbose)

    if verbose >= 1:
        output('Cleaned dataframe for ' + str(year) + '-' + str(month) + '. ' +
               str(nrows_removed) + ' rows removed due to errors during clean.')

    return df
def import_load_forecast(dl_dir, db_path, zones_path=None,
                         overwrite=False, verbose=0):
    """Loads, cleans, and imports nyiso load forecast data into a sqlite
    database.

    load_forecast_px column represents the forecast for the current
    row (i.e. datetime and zone) x days prior. E.g. the _p2 column for a row
    with datetime of 10/5/2012 01:00:00 contains the forecast for 10/5/2012
    01:00:00 from two days before (i.e. 10/3/2012).

    Parameters
    ----------
    dl_dir : str
        Path to the directory containing downloaded zip files. Imports
        all files in directory. Assumes each zip file is of the following
        format: 'yearmonth01isolf_csv.zip' (e.g. '20121001isolf_csv.zip').

    db_path : str
        Path to sqlite database.

    zones_path : str or None
        Path to csv mapping zone_id to zone_name. Required if to_zoneid is True.

    overwrite : bool
        Defines whether or not to overwrite existing database tables.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    import_num : int
        Number of files imported into database.

    Notes
    -----
    """

    # get files
    pattern = re.compile(r'\d{8}isolf_csv\.zip')
    files = get_regex_files(dl_dir, pattern=pattern, verbose=verbose)

    # create load table (if needed)
    create_sql = """
                    CREATE TABLE IF NOT EXISTS load_forecast (
                        rowid INTEGER PRIMARY KEY,
                        datetimeNY TEXT,
                        datetimeUTC TEXT,
                        zone_id INTEGER,
                        load_forecast_p0 REAL,
                        load_forecast_p1 REAL,
                        load_forecast_p2 REAL,
                        load_forecast_p3 REAL,
                        load_forecast_p4 REAL,
                        load_forecast_p5 REAL,
                        load_forecast_p6 REAL
                    ); """
    indexes = ['CREATE INDEX IF NOT EXISTS load_forecast_datetimeNY_zone_id '
               'ON load_forecast (datetimeNY, zone_id);',
               'CREATE UNIQUE INDEX IF NOT EXISTS '
               'load_forecast_datetimeUTC_zone_id ON load_forecast '
               '(datetimeUTC, zone_id);'
               ]
    create_table(db_path, 'load_forecast', create_sql, indexes=indexes,
                 overwrite=overwrite,
                 verbose=verbose)

    # load, clean, and import load data into table
    import_num = 0
    for file in files:
        if verbose >= 1:
            output('Started importing \"' + file + '\".')
        date = pd.Timestamp(file[0:8]).date()
        last_day = calendar.monthrange(date.year, date.month)[1]
        start_date = pd.Timestamp(year=date.year, month=date.month, day=1)
        end_date = pd.Timestamp(year=date.year, month=date.month, day=last_day)
        dates = pd.date_range(start_date, end_date)
        for date in dates:
            date_str = date.strftime('%Y%m%d')

            # load and clean data for current date
            df = load_loaddate(date_str, load_type='isolf',
                               dl_dir=dl_dir, verbose=verbose)
            df = clean_isolf(df, to_zoneid=True, zones_path=zones_path,
                             verbose=verbose)

            # write to database
            conn = connect_db(db_path)
            c = conn.cursor()
            df_write = df.reset_index()
            df_write['datetimeNY'] = df_write['datetimeNY'].dt.tz_localize(None)
            df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize(
                None)
            for index, row in df_write.iterrows():
                dtNY = row['datetimeNY']
                dtUTC = row['datetimeUTC']
                zone = row['zone_id']
                val = row.drop(
                    ['datetimeNY', 'zone_id', 'datetimeUTC']).dropna()
                col_name = val.index.values[0]
                sql = """
                    INSERT INTO load_forecast (datetimeNY, datetimeUTC, zone_id, 
                        {col_name})
                    VALUES ("{dtNY}", "{dtUTC}", {zone}, {val})
                    ON CONFLICT(datetimeUTC, zone_id) DO
                    UPDATE SET {col_name} = excluded.{col_name}
                ;""".format(col_name=col_name, val=val[0], dtNY=dtNY,
                            dtUTC=dtUTC, zone=zone)
                c.execute(sql)
                conn.commit()
            conn.close()

            import_num += 1
        if verbose >= 1:
            output('Finished importing \"' + file + '\".')
    output('Finished importing ' + str(import_num) +
           ' files from \"{dl_dir}\".'.format(dl_dir=dl_dir))

    return import_num
def load_yellow(path, nrows=None, usecols=None, verbose=0):
    """Loads an NYC TLC yellow taxi record file (one month of data) into a
    dataframe.

    Parameters
    ----------
    path : str
        Path to NYC TLC taxi record file to load.

    nrows : int or None
        Number of rows to read. Set to None to read all rows.

    usecols : list
        List of column names to include. Specify columns names as strings.
        Column names can be entered based on names found in original tables
        for the year specified or names found in the trips table. Set to None to
        read all columns.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe of one month of cleaned yellow taxi data.

    year : int
        Year data is from.

    month : int
        Month data is from.

    Notes
    -----
    path = '/Users/httran/Documents/projects/twitterinfrastructure/data/raw
    /nyctlc-triprecorddata/data/yellow_tripdata_2012-01.csv'
    """

    if verbose >= 1:
        output('Started loading to dataframe: ' + path + '.')

    parts = re.split('[/_-]', path)
    year = int(parts[-2])

    parts2 = parts[-1].split('.')
    month = int(parts2[0])

    # adjusts usecols to correctly map to column names for the year data is from
    if usecols:
        col_dict = col_names_dict(year)
        usecols_year = []
        for col in usecols:
            if col in col_dict:
                usecols_year.append(col)
            else:
                col_name_year = [key for key, val in col_dict.items()
                                 if val == col]
                if col_name_year:
                    usecols_year.append(col_name_year[0])
                elif verbose > 1:
                    output('No matching usecols column name "' + col + '" for '
                           'year ' + str(year) + '.', 'load_yellow')
                else:
                    pass
    else:
        usecols_year = usecols

    # read file into dataframe
    df = pd.read_csv(path, nrows=nrows, usecols=usecols_year,
                     error_bad_lines=False,
                     warn_bad_lines=False)
    if verbose >= 1:
        output('Finished loading to dataframe: ' + path + '.')

    return df, year, month
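
# Illustrative usage (a sketch; the path is hypothetical): load a subset of
# columns from one monthly yellow taxi file, e.g.
#   df, year, month = load_yellow(
#       'data/raw/nyctlc/yellow_tripdata_2012-01.csv',
#       usecols=['pickup_datetime', 'dropoff_datetime', 'trip_distance'],
#       verbose=1)
# year (2012) and month (1) are parsed from the filename itself.
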
def import_trips(url_path, dl_dir, db_path, taxi_type, nrows=None, usecols=None,
                 overwrite=False, verbose=0):
    """Downloads, cleans, and imports nyc tlc taxi record files for the
    specified taxi type into a sqlite database.

    Parameters
    ----------
    url_path : str or None
        Path to text file containing nyc tlc taxi record file urls to
        download from. Set to None to skip download.

    dl_dir : str
        Path of directory to download files to or load files from.

    db_path : str
        Path to sqlite database.

    taxi_type : str
        Taxi type to create regex for ('fhv', 'green', 'yellow', or 'all').

    nrows : int or None
        Number of rows to read. Set to None to read all rows.

    usecols : list
        List of column names to include. Specify columns names as strings.
        Column names can be entered based on names found in original tables
        for the year specified or names found in the trips table. Set to None to
        read all columns.

    overwrite : bool
        Defines whether or not to overwrite existing database tables.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    import_num : int
        Number of files imported into database.

    Notes
    -----
    """

    # download taxi record files
    if url_path:
        dl_num = dl_urls(url_path, dl_dir, taxi_type, verbose=verbose)
    else:
        dl_num = 0

    # get taxi record files
    files = get_regex_files(dl_dir, taxi_regex_patterns(taxi_type),
                            verbose=verbose)

    # create trips table (if needed)
    create_sql = """
                CREATE TABLE IF NOT EXISTS trips (
                    trip_id INTEGER PRIMARY KEY,
                    taxi_type INTEGER,
                    vendor_id INTEGER,
                    pickup_datetime TEXT,
                    dropoff_datetime TEXT,
                    passenger_count INTEGER,
                    trip_distance REAL,
                    pickup_longitude REAL,
                    pickup_latitude REAL,
                    pickup_location_id INTEGER,
                    dropoff_longitude REAL,
                    dropoff_latitude REAL,
                    dropoff_location_id INTEGER,
                    trip_duration REAL,
                    trip_pace REAL,
                    trip_straightline REAL,
                    trip_windingfactor REAL
                ); """
    indexes = ['CREATE INDEX IF NOT EXISTS trips_pickup_datetime ON trips '
               '(pickup_datetime);']
    create_table(db_path, 'trips', create_sql, indexes=indexes,
                 overwrite=overwrite, verbose=verbose)

    # load, clean, and import taxi files into table
    import_num = 0
    for file in files:
        if verbose >= 1:
            output('Started importing ' + file + '.')
        if taxi_type == 'fhv':
            df = pd.DataFrame({'taxi_type': []})
        elif taxi_type == 'green':
            df = pd.DataFrame({'taxi_type': []})
        elif taxi_type == 'yellow':
            df, year, month = load_yellow(dl_dir + file, nrows=nrows,
                                          usecols=usecols, verbose=verbose)
            df = clean_yellow(df, year, month, verbose=verbose)
            import_num += 1
        else:
            output('Unknown taxi_type.', fn_str='import_trips')
            df = pd.DataFrame({'taxi_type': []})

        df_to_table(db_path, df, table='trips', overwrite=False,
                    verbose=verbose)
        if verbose >= 1:
            output('Imported ' + file + '.')
    output('Finished importing ' + str(import_num) + ' files.')

    return dl_num, import_num
def create_standard_load(db_path, summary_table, expected_table,
                         datetimeUTC_range, min_num_rows=5, title=None,
                         overwrite=False, verbose=0):
    """Creates a table and dataframe of standardized data from the
    summary_table table. Standardization is relative to the mean and variance of
    corresponding data from the specified reference datetime range (saved as
    an expected_load_[] table in the database).

    Parameters
    ----------
    db_path : str
        Path to sqlite database to create or connect to.

    summary_table : str
        Name of the db table containing summary data to calculate
        standardized integrated_load for.

    expected_table : str
        Name of the db table containing expected data (i.e. mean and
        variance) to calculate standardized integrated_load from.

    datetimeUTC_range : tuple
        Specifies the start and end of the time period to calculate
        standardized integrated_load for (inclusive). Specify as a 2-element
        tuple of UTC datetime strings with year-month-day and
        hour:minutes:seconds. E.g. ('2012-10-29 00:00:00', '2012-11-03
        23:59:59') to calculate standardized integrated_load for times between
        10/29/2012 and 11/03/2012.

    min_num_rows : int
        Defines the minimum number of rows needed in the reference set to
        standardize data.

    title : str
        Defines the suffix of the standard_load_[title] table to be created.

    overwrite : bool
        Defines whether or not to overwrite existing table.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_std : dataframe
        Dataframe written to db table.

    Notes
    -----
    """

    table = 'standard_load_{title}'.format(title=title)
    if verbose >= 1:
        output('Started creating or updating {table} table.'.format(
            table=table))

    # query expected values calculated from at least min_num_rows data points
    sql = """
            SELECT * FROM {expected_table} 
            WHERE num_rows >= {min_num_rows};""".format(
        expected_table=expected_table, min_num_rows=min_num_rows)
    df_exp = query(db_path, sql)
    df_exp = df_exp[['dayofweek', 'hour', 'zone_id', 'mean_integrated_load',
                     'var_integrated_load']]

    # query data to standardize
    sql = """
            SELECT datetimeUTC, zone_id, integrated_load
            FROM {summary_table}
            WHERE
                datetimeUTC BETWEEN "{start_datetime}" AND "{end_datetime}";
            """.format(summary_table=summary_table,
                       start_datetime=datetimeUTC_range[0],
                       end_datetime=datetimeUTC_range[1])
    df = query(db_path, sql)

    # add dayofweek (0 = Monday) and hour (0-23)
    df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC'])
    df['datetimeUTC'] = [dtUTC.tz_localize(tz='UTC') for dtUTC in
                         df['datetimeUTC']]
    df['datetime'] = [dtUTC.tz_convert(tz='America/New_York') for dtUTC in
                      df['datetimeUTC']]
    df['dayofweek'] = df['datetime'].dt.dayofweek
    df['hour'] = df['datetime'].dt.hour

    # calculate z-scores
    df = pd.merge(df, df_exp, how='left',
                  on=['dayofweek', 'hour', 'zone_id'])
    del df_exp
    df_std = df[['datetimeUTC', 'zone_id']].copy()
    # note: this divides by the variance stored in the expected table; a
    # conventional z-score would divide by the standard deviation
    # (np.sqrt(df['var_integrated_load']))
    df_std['z_integrated_load'] = \
        (df['integrated_load'] - df['mean_integrated_load']) \
        / df['var_integrated_load']
    df_std = df_std.set_index(['datetimeUTC', 'zone_id'])
    del df

    # create table
    sql = """
                CREATE TABLE IF NOT EXISTS {table} (
                    rowid INTEGER PRIMARY KEY,
                    datetimeUTC TEXT,
                    zone_id INTEGER,
                    z_integrated_load FLOAT
                ); """.format(table=table)
    create_table(db_path=db_path, table=table, create_sql=sql, indexes=[],
                 overwrite=overwrite, verbose=verbose)

    # write data to table
    df_write = df_std.reset_index()
    df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize(
        None)
    df_to_table(db_path, df_write, table=table, overwrite=False,
                verbose=verbose)

    if verbose >= 1:
        output('Finished creating or updating {table} table. Dataframe shape '
               'is '.format(table=table) + str(df_std.shape) + '.')

    return df_std
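
# Hedged usage sketch (not part of the original source): the database path and
# table names below are assumptions for illustration; pandas is assumed to be
# imported as pd and the helper functions (query, create_table, df_to_table,
# output) to be available, as elsewhere in this listing.
if __name__ == '__main__':
    df_std_sandy = create_standard_load(
        db_path='data/nyiso.db',                   # assumed db path
        summary_table='load_summary',              # assumed summary table name
        expected_table='expected_load_reference',  # assumed expected_load_[] table
        datetimeUTC_range=('2012-10-29 00:00:00', '2012-11-03 23:59:59'),
        min_num_rows=5,
        title='sandy',
        overwrite=False,
        verbose=1)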
Example No. 22
def load_nyctlc_zone_hour(startdate,
                          enddate,
                          trip_type,
                          trip_count_filter,
                          db_path,
                          verbose=0):
    """Query and clean nyctlc dropoff or pickup data for the specified date
    range from a sqlite database, grouped by zone and hour. Assumes the
    database contains a standard_zonedropoff_hour_sandy or
    standard_zonepickup_hour_sandy table created using
    create_standard_zone_hour.

    Parameters
    ----------
    startdate : Timestamp
        Start date to include trips from (inclusive), specified as a
        timezone-aware Pandas Timestamp object.
        E.g. startdate = pd.Timestamp('2012-10-28 00:00:00',
        tz='America/New_York')

    enddate : Timestamp
        End date to include trips from (exclusive), specified as a
        timezone-aware Pandas Timestamp object.
        E.g. enddate = pd.Timestamp('2012-11-03 00:00:00',
        tz='America/New_York')

    trip_type : str
        Trip type: 'dropoff' or 'pickup'.

    trip_count_filter : int
        Minimum number of trips required to load a data point.

    db_path : str
        Path to sqlite database containing table.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_taxi : dataframe
        Dataframe of z-scored pace and trip count data, indexed by
        location_id and datetimeNY.

    Notes
    -----
    Sqlite date queries are inclusive for start and end, datetimes in nyctlc
    database are local (i.e. NY timezone).
    """

    if verbose >= 1:
        output('Started query.')

    # define trip type
    if trip_type not in ['dropoff', 'pickup']:
        raise ValueError(
            'Invalid trip_type argument: {arg}.'.format(arg=trip_type))

    # convert datetimes
    enddate_exclusive = enddate - pd.Timedelta('1 second')
    startdate_sql = startdate.strftime("%Y-%m-%d %H:%M:%S")
    enddate_sql = enddate_exclusive.strftime("%Y-%m-%d %H:%M:%S")

    # load dropoff/pickup data
    sql = """
            SELECT {trip_type}_datetime AS datetimeNY,
                {trip_type}_location_id AS location_id,
                z_mean_pace AS zpace, z_trip_count AS ztrips
            FROM standard_zone{trip_type}_hour_sandy
            WHERE
                trip_count > {trip_count_filter} AND
                {trip_type}_datetime BETWEEN
                "{startdate_sql}" AND "{enddate_sql}"
          ;""".format(trip_count_filter=trip_count_filter,
                      startdate_sql=startdate_sql,
                      enddate_sql=enddate_sql,
                      trip_type=trip_type)
    df_taxi = query(db_path, sql)

    # add columns
    df_taxi['abs-zpace'] = abs(df_taxi['zpace'])
    df_taxi['abs-ztrips'] = abs(df_taxi['ztrips'])

    # convert datetimes
    df_taxi['datetimeNY'] = pd.to_datetime(df_taxi['datetimeNY'])
    df_taxi['datetimeNY'] = [
        dt.tz_localize(tz='America/New_York') for dt in df_taxi['datetimeNY']
    ]

    # index and sort
    df_taxi = df_taxi.set_index(['location_id', 'datetimeNY'])
    df_taxi = df_taxi.sort_index(level=0)

    if verbose >= 1:
        output('[min, max] taxi datetimeNY (hourly): [' +
               str(min(df_taxi.index.get_level_values(level=1))) + ', ' +
               str(max(df_taxi.index.get_level_values(level=1))) + '].')
        output('[min, max] taxi pace and trips mean z-score (hourly): [' +
               str(np.nanmin(df_taxi['zpace'])) + ', ' +
               str(np.nanmax(df_taxi['zpace'])) + '], [' +
               str(np.nanmin(df_taxi['ztrips'])) + ', ' +
               str(np.nanmax(df_taxi['ztrips'])) + '].')

    # add -drop or -pick suffix to column names (trip_type was validated
    # above, so only 'dropoff' and 'pickup' are possible here)
    val = '-drop' if trip_type == 'dropoff' else '-pick'
    col_dict = {col: col + val for col in df_taxi.columns.values}
    df_taxi = df_taxi.rename(col_dict, axis='columns')

    return df_taxi
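
# Hedged usage sketch (not part of the original source): the db path and the
# trip_count_filter value are assumptions for illustration; the timestamps
# mirror the docstring example, and the result is indexed by
# (location_id, datetimeNY) with '-drop' suffixed columns.
if __name__ == '__main__':
    df_drop = load_nyctlc_zone_hour(
        startdate=pd.Timestamp('2012-10-28 00:00:00', tz='America/New_York'),
        enddate=pd.Timestamp('2012-11-03 00:00:00', tz='America/New_York'),
        trip_type='dropoff',
        trip_count_filter=5,       # assumed minimum trips per data point
        db_path='data/nyctlc.db',  # assumed db path
        verbose=1)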
Example No. 23
def import_load(dl_dir, db_path, to_zoneid=False, zones_path=None,
                overwrite=False, verbose=0):
    """Loads, cleans, and imports nyiso load data into a sqlite database.
    Currently only imports palIntegrated files (i.e. integrated real-time
    load data).

    Parameters
    ----------
    dl_dir : str
        Path to the directory containing downloaded zip files. Imports
        all files in directory. Assumes each zip file is of the following
        format: 'yearmonth01palIntegrated_csv.zip' (e.g.
        '20121001palIntegrated_csv.zip').

    db_path : str
        Path to sqlite database.

    to_zoneid : bool
        If True, converts zone names to zone ids, based on zones_path csv
        (zones_path must be defined if True). If False, leaves zones_name
        column.

    zones_path : str or None
        Path to csv mapping zone_id to zone_name. Required if to_zoneid is True.

    overwrite : bool
        Defines whether or not to overwrite existing database tables.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    import_num : int
        Number of files imported into database.

    Notes
    -----
    """

    if to_zoneid:
        zone_str = 'zone_id'
        zone_field = 'zone_id INTEGER'
    else:
        zone_str = 'zone_name'
        zone_field = 'zone_name TEXT'

    # get files
    pattern = re.compile(r'\d{8}palIntegrated_csv\.zip')
    files = get_regex_files(dl_dir, pattern=pattern, verbose=verbose)

    # create load table (if needed)
    create_sql = """
                CREATE TABLE IF NOT EXISTS load (
                    rowid INTEGER PRIMARY KEY,
                    datetimeUTC TEXT,
                    {zone_field},
                    integrated_load REAL
                ); """.format(zone_field=zone_field)
    indexes = ['CREATE INDEX IF NOT EXISTS datetimeUTC_{zone_str} '
               'ON load (datetimeUTC, {zone_str});'.format(zone_str=zone_str)]
    create_table(db_path, 'load', create_sql, indexes=indexes,
                 overwrite=overwrite,
                 verbose=verbose)

    # load, clean, and import load data into table
    import_num = 0
    for file in files:
        if verbose >= 1:
            output('Started importing \"' + file + '\".')
        date = pd.Timestamp(file[0:8]).date()
        last_day = calendar.monthrange(date.year, date.month)[1]
        start_date = pd.Timestamp(year=date.year, month=date.month, day=1)
        end_date = pd.Timestamp(year=date.year, month=date.month, day=last_day)
        dates = pd.date_range(start_date, end_date)
        for date in dates:
            date_str = date.strftime('%Y%m%d')

            # load and clean data for current date
            df = load_loaddate(date_str, load_type='palIntegrated',
                               dl_dir=dl_dir, verbose=verbose)
            df = clean_palint(df, to_zoneid=to_zoneid, zones_path=zones_path,
                              verbose=verbose)

            # write to database
            df_write = df.reset_index()
            df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize(
                None)
            df_to_table(db_path, df_write, table='load', overwrite=False)
            del df_write

            import_num += 1
        if verbose >= 1:
            output('Finished importing \"' + file + '\".')
    output('Finished importing ' + str(import_num) +
           ' files from \"{dl_dir}\".'.format(dl_dir=dl_dir))

    return import_num
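
# Hedged usage sketch (not part of the original source): the download
# directory, database path, and zones csv are assumptions for illustration;
# they only need to follow the layout described in the docstring.
if __name__ == '__main__':
    n_imported = import_load(
        dl_dir='data/nyiso_dl/',            # assumed dir of *palIntegrated_csv.zip files
        db_path='data/nyiso.db',            # assumed db path
        to_zoneid=True,
        zones_path='data/nyiso_zones.csv',  # assumed zone_id to zone_name csv
        overwrite=False,
        verbose=1)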
Example No. 24
def col_names_dict(year):
    """Returns a dictionary mapping column names for specified year to
    expected column names (i.e. those used in trips table).

    Parameters
    ----------
    year : int
        Year to define keys in column names dictionary.

    Returns
    -------
    col_dict : dict
        Dictionary mapping column names for specified year to expected.

    Notes
    -----
    """

    if year == 2009:
        col_dict = {
            'vendor_name': 'vendor_id',
            'Trip_Pickup_DateTime': 'pickup_datetime',
            'Trip_Dropoff_DateTime': 'dropoff_datetime',
            'Passenger_Count': 'passenger_count',
            'Trip_Distance': 'trip_distance',
            'Start_Lon': 'pickup_longitude',
            'Start_Lat': 'pickup_latitude',
            'Rate_Code': 'rate_code_id',
            'store_and_forward': 'store_and_fwd_flag',
            'End_Lon': 'dropoff_longitude',
            'End_Lat': 'dropoff_latitude',
            'Payment_Type': 'payment_type',
            'Fare_Amt': 'fare_amount',
            'surcharge': 'extra',
            'mta_tax': 'mta_tax',
            'Tip_Amt': 'tip_amount',
            'Tolls_Amt': 'tolls_amount',
            'Total_Amt': 'total_amount'
        }
    elif 2010 <= year <= 2013:
        col_dict = {
            'vendor_id': 'vendor_id',
            'pickup_datetime': 'pickup_datetime',
            'dropoff_datetime': 'dropoff_datetime',
            'passenger_count': 'passenger_count',
            'trip_distance': 'trip_distance',
            'pickup_longitude': 'pickup_longitude',
            'pickup_latitude': 'pickup_latitude',
            'rate_code': 'rate_code_id',
            'store_and_fwd_flag': 'store_and_fwd_flag',
            'dropoff_longitude': 'dropoff_longitude',
            'dropoff_latitude': 'dropoff_latitude',
            'payment_type': 'payment_type',
            'fare_amount': 'fare_amount',
            'surcharge': 'extra',
            'mta_tax': 'mta_tax',
            'tip_amount': 'tip_amount',
            'tolls_amount': 'tolls_amount',
            'total_amount': 'total_amount'
        }
    elif year == 2014:
        col_dict = {
            'vendor_id': 'vendor_id',
            ' pickup_datetime': 'pickup_datetime',
            ' dropoff_datetime': 'dropoff_datetime',
            ' passenger_count': 'passenger_count',
            ' trip_distance': 'trip_distance',
            ' pickup_longitude': 'pickup_longitude',
            ' pickup_latitude': 'pickup_latitude',
            ' rate_code': 'rate_code_id',
            ' store_and_fwd_flag': 'store_and_fwd_flag',
            ' dropoff_longitude': 'dropoff_longitude',
            ' dropoff_latitude': 'dropoff_latitude',
            ' payment_type': 'payment_type',
            ' fare_amount': 'fare_amount',
            ' surcharge': 'extra',
            ' mta_tax': 'mta_tax',
            ' tip_amount': 'tip_amount',
            ' tolls_amount': 'tolls_amount',
            ' total_amount': 'total_amount'
        }
    elif 2015 <= year <= 2016:
        col_dict = {
            'VendorID': 'vendor_id',
            'tpep_pickup_datetime': 'pickup_datetime',
            'tpep_dropoff_datetime': 'dropoff_datetime',
            'passenger_count': 'passenger_count',
            'trip_distance': 'trip_distance',
            'pickup_longitude': 'pickup_longitude',
            'pickup_latitude': 'pickup_latitude',
            'RatecodeID': 'rate_code_id',
            'store_and_fwd_flag': 'store_and_fwd_flag',
            'dropoff_longitude': 'dropoff_longitude',
            'dropoff_latitude': 'dropoff_latitude',
            'payment_type': 'payment_type',
            'fare_amount': 'fare_amount',
            'extra': 'extra',
            'mta_tax': 'mta_tax',
            'tip_amount': 'tip_amount',
            'tolls_amount': 'tolls_amount',
            'improvement_surcharge': 'improvement_surcharge',
            'total_amount': 'total_amount'
        }
    elif year == 2017:
        col_dict = {
            'VendorID': 'vendor_id',
            'tpep_pickup_datetime': 'pickup_datetime',
            'tpep_dropoff_datetime': 'dropoff_datetime',
            'passenger_count': 'passenger_count',
            'trip_distance': 'trip_distance',
            'RatecodeID': 'rate_code_id',
            'store_and_forward': 'store_and_fwd_flag',
            'PULocationID': 'pickup_location_id',
            'DOLocationID': 'dropoff_location_id',
            'payment_type': 'payment_type',
            'fare_amount': 'fare_amount',
            'extra': 'extra',
            'mta_tax': 'mta_tax',
            'tip_amount': 'tip_amount',
            'tolls_amount': 'tolls_amount',
            'improvement_surcharge': 'improvement_surcharge',
            'total_amount': 'total_amount'
        }
    else:
        output('Error: Unexpected year (' + str(year) + ').',
               fn_str='col_names_dict')
        raise ValueError('Unexpected year.')

    return col_dict
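
# Hedged usage sketch (not part of the original source): shows how the
# returned mapping might be applied to rename a raw trips dataframe; the toy
# dataframe below is an assumption and only includes a few of the 2009
# columns (rename ignores mapping keys that are not present).
if __name__ == '__main__':
    df_raw = pd.DataFrame({'Trip_Pickup_DateTime': ['2009-01-01 00:05:00'],
                           'Trip_Dropoff_DateTime': ['2009-01-01 00:20:00'],
                           'Fare_Amt': [9.5]})
    df_renamed = df_raw.rename(columns=col_names_dict(2009))
    # df_renamed columns: pickup_datetime, dropoff_datetime, fare_amount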
Example No. 25
def create_timeseries(df, zone_col, min_count, write_path=None, verbose=0):
    """Creates a time series dataframe where each column of df is
    independently linearly interpolated over the total range of timedeltas of
    each zone. Only time series with at least min_count data points are
    included. Assumes the dataframe is indexed by a zone column (zone_col)
    and a timedelta column (e.g. using index_timedelta).

    Parameters
    ----------
    df : Dataframe
        Dataframe to calculate time series from.

    zone_col : str
        Name of zone column: 'zone_id' (nyiso zone), 'location_id' (taxi
        zone), or 'borough' (taxi borough).

    min_count : int
        Minimum number of data points needed to convert to a time series.

    write_path : str or None
        If str, then write a csv of the time series dataframe to the
        specified path. Else, do not write.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_ts : dataframe
        Dataframe of linearly interpolated time series, indexed by the zone
        column and timedelta.

    Notes
    -----
    """

    # loop through zones
    df_ts = pd.DataFrame()
    skipped = []
    zones = pd.unique(df.index.get_level_values(zone_col))
    for zone in zones:
        df_zone = df.xs(zone, level=0)

        # loop through columns (i.e. data to convert to time series)
        y_interps = []
        cols = df_zone.columns.values
        for col in cols:
            s = df_zone[col].dropna()
            if s.count() < min_count:
                skipped.append((zone, col))
            else:
                timedeltas = range(
                    s.index.astype('timedelta64[h]').min(),
                    s.index.astype('timedelta64[h]').max() + 1)
                y_interp = pd.Series(data=np.interp(
                    timedeltas, s.index.astype('timedelta64[h]'), s.values),
                                     index=timedeltas,
                                     name=col)
                y_interps.append(y_interp)

        # add interpolated data to dataframe
        if y_interps:
            df_temp = pd.concat(objs=y_interps, axis=1, join='outer')
            df_temp = df_temp.set_index(
                pd.to_timedelta(df_temp.index.values, unit='h'))
            df_temp[zone_col] = zone
            df_temp.set_index(zone_col, append=True, inplace=True)
            df_temp.index.names = ['timedelta', zone_col]
            df_temp = df_temp.reorder_levels([1, 0])
            df_ts = df_ts.append(df_temp, sort=False)

    # save to csv
    if write_path:
        df_csv = df_ts.reset_index()
        df_csv['timedelta'] = df_csv['timedelta'].astype('timedelta64[h]')
        df_csv.to_csv(write_path, index=False)

    if verbose >= 1:
        output('Skipped (zone, column) pairs with fewer than {min_count} data '
               'points in the original column data: {skipped}'.format(
                   skipped=skipped, min_count=min_count))

    return df_ts
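
# Hedged usage sketch (not part of the original source): create_timeseries
# expects a dataframe indexed by (zone column, timedelta); the toy values
# below are assumptions for illustration. Note the function relies on legacy
# pandas behavior (DataFrame.append, TimedeltaIndex.astype to integer hours),
# so the guarded call assumes the pandas version the original code targets.
if __name__ == '__main__':
    idx = pd.MultiIndex.from_tuples(
        [(1, pd.Timedelta(hours=0)), (1, pd.Timedelta(hours=2)),
         (1, pd.Timedelta(hours=5))],
        names=['zone_id', 'timedelta'])
    df_toy = pd.DataFrame({'integrated_load': [100.0, 140.0, 110.0]}, index=idx)
    df_ts_toy = create_timeseries(df_toy, zone_col='zone_id', min_count=3,
                                  write_path=None, verbose=0)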
Example No. 26
def clean_datetime(df, year, month, verbose=0):
    """Cleans the datetime columns. Cleaning involves adjusting data type to
    datetime and removing records outside of expected year and month.

    Parameters
    ----------
    df : dataframe
        Dataframe to clean.

    year : int
        Year data comes from.

    month : int
        Month data comes from.

    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe with cleaned column.

    nrows_removed : int
        Number of removed rows.

    Notes
    -----
    """

    col_names = list(pd.Series(df.columns.values))
    nrows_removed = 0
    if ('pickup_datetime' in col_names) and ('dropoff_datetime' in col_names):

        # change datetime columns datetime data type and sort
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
        df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
        df.sort_values(['pickup_datetime', 'dropoff_datetime'], inplace=True)
        if verbose >= 2:
            output('Finished converting datetime columns to datetime dtype '
                   'and sorting by pickup_datetime and dropoff_datetime.')

        # remove rows outside of expected year-month (based on pickup_datetime)
        start_datetime = pd.Timestamp(year=year, month=month, day=1,
                                      hour=0, minute=0, second=0, microsecond=0)
        if month < 12:
            end_datetime = pd.Timestamp(year=year, month=(month + 1), day=1,
                                        hour=0, minute=0, second=0,
                                        microsecond=0)
        else:
            end_datetime = pd.Timestamp(year=(year + 1), month=1, day=1,
                                        hour=0, minute=0, second=0,
                                        microsecond=0)
        correct_month = (df['pickup_datetime'] >= start_datetime) & \
                        (df['pickup_datetime'] < end_datetime)
        if not all(correct_month):
            nrows = df.shape[0]
            df = df[correct_month]
            nrows_removed = nrows - df.shape[0]
        if verbose >= 2:
            output('Finished removing records with pickup_datetime outside of '
                   'expected year-month date range (' + str(nrows_removed) +
                   ' rows removed).')
    elif verbose >= 1:
        output('Unable to clean datetime columns due to missing columns.')

    return df, nrows_removed
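
# Hedged usage sketch (not part of the original source): the toy dataframe
# below is an assumption; the second record falls outside 2013-01 and should
# be dropped, so n_removed is expected to be 1.
if __name__ == '__main__':
    df_raw = pd.DataFrame({
        'pickup_datetime': ['2013-01-15 08:00:00', '2013-02-01 00:10:00'],
        'dropoff_datetime': ['2013-01-15 08:20:00', '2013-02-01 00:30:00']})
    df_clean, n_removed = clean_datetime(df_raw, year=2013, month=1, verbose=0)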