def test_create_trips_analysis(self):
    """Verify create_trips_analysis builds the trips, zones, and boroughs
    tables with the expected rows."""
    analyze.create_trips_analysis(
        self.db_path, self.start_datetime, self.end_datetime,
        self.shapefile_path, self.taxizones_path, title='test',
        overwrite=True, create_zones_tables=True, verbose=0)
    df_test = query(self.db_path, 'SELECT * FROM trips_analysis_test;')
    df_zones = query(self.db_path, 'SELECT * FROM taxi_zones_test;')
    df_boroughs = query(self.db_path, 'SELECT * FROM taxi_boroughs_test;')
    # Separate asserts (instead of one and-chain) so a failure report
    # pinpoints the offending column.
    assert list(df_test['trip_id']) == [4, 5, 6, 7, 8, 9]
    assert list(df_test['pickup_location_id']) == [249, 90, 162, 162, 162,
                                                   162]
    assert list(df_test['dropoff_location_id']) == [90, 161, 229, 229, 229,
                                                    229]
    assert list(df_test['pickup_hour']) == [7, 15, 15, 15, 16, 16]
    assert df_zones.shape == (265, 4)
    assert list(df_boroughs['borough_id']) == [1, 2, 3, 4, 5, 6]
    assert list(df_boroughs['borough_name']) == [
        'Bronx', 'Brooklyn', 'EWR', 'Manhattan', 'Queens', 'Staten Island']
def test_create_expected_load(self):
    """Verify create_expected_load over a reference range with an
    exclusion window."""
    summary_table = 'load'
    # reference period, converted to naive UTC as the sqlite queries expect
    start_ref = pd.Timestamp('2012-10-01 00:00:00', tz='America/New_York')
    end_ref = pd.Timestamp('2012-10-20 23:59:59', tz='America/New_York')
    datetimeUTC_range_ref = (
        start_ref.tz_convert(tz='UTC').tz_localize(None),
        end_ref.tz_convert(tz='UTC').tz_localize(None))
    # window excluded from the reference period
    start = pd.Timestamp('2012-10-01 00:00:00', tz='America/New_York')
    end = pd.Timestamp('2012-10-05 23:59:59', tz='America/New_York')
    datetimeUTC_range = (start.tz_convert(tz='UTC').tz_localize(None),
                         end.tz_convert(tz='UTC').tz_localize(None))
    _ = ny.create_expected_load(
        self.db_path, summary_table, self.zones_path, datetimeUTC_range_ref,
        datetimeUTC_range_excl=datetimeUTC_range, title='test',
        overwrite=True, verbose=0)
    df_test = query(self.db_path, 'SELECT * FROM expected_load_test;')
    # Separate asserts so a failure pinpoints the offending check.
    assert df_test.loc[2, ['mean_integrated_load']].values[0] == 1385
    assert df_test.loc[2, ['num_rows']].values[0] == 2
    assert df_test.shape == (1848, 7)
def test_import_load(self):
    """Verify import_load imports the raw files and maps zones to ids."""
    import_num = ny.import_load(self.dl_dir, self.db_path, to_zoneid=True,
                                zones_path=self.zones_path, overwrite=True,
                                verbose=0)
    df_test = query(self.db_path, 'SELECT * FROM load;')
    # Separate asserts so a failure pinpoints the offending field.
    assert import_num == 31
    assert df_test.loc[5, 'rowid'] == 6
    assert df_test.loc[5, 'datetimeUTC'] == '2012-10-01 04:00:00'
    assert df_test.loc[5, 'zone_id'] == 6
    assert df_test.loc[5, 'integrated_load'] == 1015.9
    assert df_test.shape == (8184, 4)
def test_grouping_zone_hour(self):
    """Test grouping by zone and hour."""
    analyze.create_summary_route_time(
        self.db_path, byborough=False, byday=False, title=self.title,
        trips_analysis_table=self.trips_analysis_table,
        taxi_zones_table=self.taxi_zones_table, overwrite=True, verbose=0)
    sql = 'SELECT * FROM summary_routezone_hour_{title};'.format(
        title=self.title)
    df_test = query(self.db_path, sql, parse_dates=False, verbose=0)
    # Separate asserts so a failure pinpoints the offending column.
    assert list(df_test['pickup_location_id']) == [249, 90, 162, 162]
    assert list(df_test['dropoff_location_id']) == [90, 161, 229, 229]
    assert list(df_test['trip_count']) == [1, 1, 2, 2]
def test_import_trips():
    """Verify import_trips loads the raw yellow-cab files into the trips
    table with the expected derived columns."""
    url_path = None
    dl_dir = 'tests/nyctlc/raw/'
    db_path = 'tests/nyctlc/test.db'
    taxi_type = 'yellow'
    usecols = ['vendor_id', 'pickup_datetime', 'dropoff_datetime',
               'passenger_count', 'trip_distance', 'pickup_longitude',
               'pickup_latitude', 'pickup_location_id', 'dropoff_longitude',
               'dropoff_latitude', 'dropoff_location_id']
    dl_num, import_num = clean.import_trips(url_path, dl_dir, db_path,
                                            taxi_type, nrows=None,
                                            usecols=usecols, overwrite=True,
                                            verbose=0)
    df_test = query(db_path, 'SELECT * FROM trips;')
    assert dl_num == 0
    assert import_num == 2
    # Check the first imported trip field by field; one assert per field
    # (instead of one and-chain) so failures identify the bad column.
    row = df_test.loc[0]
    assert row['trip_id'] == 1
    assert row['taxi_type'] == 2
    assert row['vendor_id'] == 1
    assert row['pickup_datetime'] == '2012-10-01 07:57:00'
    assert row['dropoff_datetime'] == '2012-10-01 07:57:00'
    assert row['passenger_count'] == 1
    assert row['trip_distance'] == 0.2
    assert round(row['pickup_longitude'], 2) == -73.98
    assert round(row['pickup_latitude'], 2) == 40.79
    assert pd.isnull(row['pickup_location_id'])
    assert round(row['dropoff_longitude'], 2) == -73.98
    assert round(row['dropoff_latitude'], 2) == 40.79
    assert pd.isnull(row['dropoff_location_id'])
    assert row['trip_duration'] == 0
    assert row['trip_pace'] == 0
    assert round(row['trip_straightline'], 2) == 0.28
    assert round(row['trip_windingfactor'], 2) == 0.73
def test_grouping_zone_day(self):
    """Test grouping by zone and day."""
    analyze.create_summary_route_time(
        self.db_path, byborough=False, byday=True, title=self.title,
        trips_analysis_table=self.trips_analysis_table,
        taxi_zones_table=self.taxi_zones_table, overwrite=True, verbose=0)
    sql = 'SELECT * FROM summary_routezone_day_{title};'.format(
        title=self.title)
    df_test = query(self.db_path, sql, parse_dates=False, verbose=0)
    # Separate asserts so a failure pinpoints the offending column.
    assert list(df_test['pickup_location_id']) == [249, 90, 162]
    assert list(df_test['dropoff_location_id']) == [90, 161, 229]
    assert list(df_test['trip_count']) == [1, 1, 4]
    assert [round(pace, 2) for pace in df_test['mean_pace']] == \
        [183.21, 303.37, 334.26]
def test_gouping_borough_hour(self):
    """Test grouping by borough and hour.

    NOTE(review): the method name is misspelled ('gouping'); kept as-is so
    any external references to the test id keep working.
    """
    analyze.create_summary_route_time(
        self.db_path, byborough=True, byday=False, title=self.title,
        trips_analysis_table=self.trips_analysis_table,
        taxi_zones_table=self.taxi_zones_table, overwrite=True, verbose=0)
    sql = 'SELECT * FROM summary_routeborough_hour_{title};'.format(
        title=self.title)
    df_test = query(self.db_path, sql, parse_dates=False, verbose=0)
    # Separate asserts so a failure pinpoints the offending column.
    assert list(df_test['pickup_borough_id']) == [4, 4, 4]
    assert list(df_test['dropoff_borough_id']) == [4, 4, 4]
    assert list(df_test['trip_count']) == [1, 3, 2]
    assert [round(pace, 2) for pace in df_test['mean_pace']] == \
        [183.21, 285.71, 400.00]
def test_create_standard_load(self):
    """Verify create_standard_load standardizes load against the expected
    table for the requested window."""
    summary_table = 'load'
    expected_table = 'expected_load_test'
    # window to standardize, converted to naive UTC for the sqlite query
    start = pd.Timestamp('2012-10-01 00:00:00', tz='America/New_York')
    end = pd.Timestamp('2012-10-05 23:59:59', tz='America/New_York')
    datetimeUTC_range = (start.tz_convert(tz='UTC').tz_localize(None),
                         end.tz_convert(tz='UTC').tz_localize(None))
    _ = ny.create_standard_load(self.db_path, summary_table, expected_table,
                                datetimeUTC_range, min_num_rows=1,
                                title='test', overwrite=True, verbose=0)
    df_test = query(self.db_path, 'SELECT * FROM standard_load_test;')
    # Separate asserts so a failure pinpoints the offending check.
    assert round(df_test.loc[2, ['z_integrated_load']].values[0], 2) == -0.04
    assert df_test.shape == (1320, 4)
def create_standard_load(db_path, summary_table, expected_table,
                         datetimeUTC_range, min_num_rows=5, title=None,
                         overwrite=False, verbose=0):
    """Creates a table and dataframe of standardized data from the
    summary_table table.

    Standardization is relative to the mean and variance of corresponding
    data from the specified reference datetime range (saved as an
    expected_load_[] table in the database).

    Parameters
    ----------
    db_path : str
        Path to sqlite database to create or connect to.
    summary_table : str
        Name of the db table containing summary data to calculate
        standardized integrated_load for.
    expected_table : str
        Name of the db table containing expected data (i.e. mean and
        variance) to calculate standardized integrated_load from.
    datetimeUTC_range : tuple
        Specifies the start and end of the time period to calculate
        standardized integrated_load for (inclusive). Specify as a 2-element
        tuple of UTC datetime strings with year-month-day and
        hour:minutes:seconds. E.g. ('2012-10-29 00:00:00',
        '2012-11-03 23:59:59') to calculate standardized integrated_load for
        times between 10/29/2012 and 11/03/2012.
    min_num_rows : int
        Defines the minimum number of rows needed in the reference set to
        standardize data.
    title : str
        Defines the suffix of the standard_load_[title] table to be created.
    overwrite : bool
        Defines whether or not to overwrite existing table.
    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_std : dataframe
        Dataframe written to db table.

    Notes
    -----
    """
    table = 'standard_load_{title}'.format(title=title)
    if verbose >= 1:
        output('Started creating or updating {table} table.'.format(
            table=table))

    # query expected values calculated from at least min_num_rows data points
    sql = """
        SELECT * FROM {expected_table}
        WHERE num_rows >= {min_num_rows};""".format(
        expected_table=expected_table, min_num_rows=min_num_rows)
    df_exp = query(db_path, sql)
    df_exp = df_exp[['dayofweek', 'hour', 'zone_id', 'mean_integrated_load',
                     'var_integrated_load']]

    # query data to standardize
    sql = """
        SELECT datetimeUTC, zone_id, integrated_load FROM {summary_table}
        WHERE datetimeUTC BETWEEN "{start_datetime}" AND "{end_datetime}";
        """.format(summary_table=summary_table,
                   start_datetime=datetimeUTC_range[0],
                   end_datetime=datetimeUTC_range[1])
    df = query(db_path, sql)

    # add dayofweek (0 = Monday) and hour (0-23); vectorized .dt accessors
    # replace the per-row list comprehensions
    df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC']).dt.tz_localize(
        'UTC')
    df['datetime'] = df['datetimeUTC'].dt.tz_convert('America/New_York')
    df['dayofweek'] = df['datetime'].dt.dayofweek
    df['hour'] = df['datetime'].dt.hour

    # calculate z-scores against the expected mean/variance
    df = pd.merge(df, df_exp, how='left', on=['dayofweek', 'hour', 'zone_id'])
    del df_exp
    # .copy() avoids pandas chained-assignment (SettingWithCopy) when the
    # z column is added below
    df_std = df[['datetimeUTC', 'zone_id']].copy()
    # NOTE(review): this divides by the variance, not sqrt(variance); a true
    # z-score would use the standard deviation. Kept as-is because existing
    # tables and tests depend on the current values -- confirm before
    # changing.
    df_std['z_integrated_load'] = \
        (df['integrated_load'] - df['mean_integrated_load']) \
        / df['var_integrated_load']
    df_std = df_std.set_index(['datetimeUTC', 'zone_id'])
    del df

    # create table
    sql = """
        CREATE TABLE IF NOT EXISTS {table} (
            rowid INTEGER PRIMARY KEY,
            datetimeUTC TEXT,
            zone_id INTEGER,
            z_integrated_load FLOAT
        ); """.format(table=table)
    create_table(db_path=db_path, table=table, create_sql=sql, indexes=[],
                 overwrite=overwrite, verbose=verbose)

    # write data to table, storing naive UTC (sqlite has no tz support)
    df_write = df_std.reset_index()
    df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize(None)
    df_to_table(db_path, df_write, table=table, overwrite=False,
                verbose=verbose)

    if verbose >= 1:
        output('Finished creating or updating {table} table. Dataframe shape '
               'is '.format(table=table) + str(df_std.shape) + '.')
    return df_std
def create_forecast_err(db_path, load_table, forecast_table, overwrite=False,
                        verbose=0):
    """Creates a table and dataframe of load forecast error.

    Error is calculated as percent error relative to the actual load. I.e.
    error = (forecast - actual) / actual

    Parameters
    ----------
    db_path : str
        Path to sqlite database to create or connect to.
    load_table : str
        Name of the db table containing actual load data (i.e. based on
        palIntegrated data).
    forecast_table : str
        Name of the db table containing load forecast data (i.e. based on
        isolf).
    overwrite : bool
        Defines whether or not to overwrite existing table.
    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe
        Dataframe written to db table.

    Notes
    -----
    """
    if verbose >= 1:
        output('Started creating or updating forecast_error table.')

    # forecasts exist for horizons p0 through p6
    forecast_cols = ['load_forecast_p{p}'.format(p=p) for p in range(7)]

    # query actual loads
    sql = """
        SELECT datetimeUTC, zone_id, integrated_load FROM {load_table}
        ;""".format(load_table=load_table)
    df_load = query(db_path, sql)
    df_load['datetimeUTC'] = pd.to_datetime(df_load['datetimeUTC'])
    df_load = df_load.set_index(['datetimeUTC', 'zone_id'])

    # query forecast loads
    sql = """
        SELECT datetimeUTC, zone_id, {cols}
        FROM {forecast_table}
        ;""".format(cols=', '.join(forecast_cols),
                    forecast_table=forecast_table)
    df_forecast = query(db_path, sql)
    df_forecast['datetimeUTC'] = pd.to_datetime(df_forecast['datetimeUTC'])
    df_forecast = df_forecast.set_index(['datetimeUTC', 'zone_id'])

    # calculate relative forecast errors; a single loop replaces seven
    # copy-pasted per-horizon statements
    df = pd.merge(df_load, df_forecast, how='inner', left_index=True,
                  right_index=True)
    del df_load, df_forecast
    for p in range(7):
        df['forecast_error_p{p}'.format(p=p)] = \
            (df['load_forecast_p{p}'.format(p=p)] - df['integrated_load']) \
            / df['integrated_load']
    df = df.drop(forecast_cols, axis=1)

    # create table
    sql = """
        CREATE TABLE IF NOT EXISTS forecast_error (
            rowid INTEGER PRIMARY KEY,
            datetimeUTC TEXT,
            zone_id INTEGER,
            integrated_load REAL,
            forecast_error_p0 REAL,
            forecast_error_p1 REAL,
            forecast_error_p2 REAL,
            forecast_error_p3 REAL,
            forecast_error_p4 REAL,
            forecast_error_p5 REAL,
            forecast_error_p6 REAL
        ); """
    indexes = ['CREATE UNIQUE INDEX IF NOT EXISTS '
               'forecast_error_datetimeUTC_zone_id ON forecast_error '
               '(datetimeUTC, zone_id);'
               ]
    create_table(db_path=db_path, table='forecast_error', create_sql=sql,
                 indexes=indexes, overwrite=overwrite, verbose=verbose)

    # write data to table, storing naive UTC (sqlite has no tz support)
    df_write = df.reset_index()
    df_write['datetimeUTC'] = df_write['datetimeUTC'].dt.tz_localize(None)
    df_to_table(db_path, df_write, table='forecast_error', overwrite=False,
                verbose=verbose)

    if verbose >= 1:
        output('Finished creating or updating forecast_error table. Dataframe '
               'shape is ' + str(df.shape) + '.')
    return df
def create_expected_load(db_path, summary_table, zones_path,
                         datetimeUTC_range_ref, datetimeUTC_range_excl=None,
                         title=None, overwrite=False, verbose=0):
    """Creates a table and dataframe of expected data from the summary_table
    table.

    Expectation includes mean and variance of integrated_load for the
    specified reference datetime range. Expectation is calculated for every
    possible dayofweek-hour-zone combination, with NaNs for those missing
    data.

    Parameters
    ----------
    db_path : str
        Path to sqlite database to create or connect to.
    summary_table : str
        Name of the db summary table containing data to calculate expected
        integrated_load from.
    zones_path : str
        Path to csv containing all zone_id values (maps zone_id to
        zone_name).
    datetimeUTC_range_ref : tuple
        Specifies the start and end of the reference time period to use when
        calculating expected values (inclusive). Specify as a 2-element tuple
        of UTC datetime strings with year-month-day and hour:minutes:seconds.
    datetimeUTC_range_excl : tuple
        Specifies the start and end of time period to exclude from reference
        time period. Specify as a 2-element tuple of UTC datetime strings
        with year-month-day and hour:minutes:seconds.
    title : str
        Defines the suffix of the expected_load_[title] table to be created.
    overwrite : bool
        Defines whether or not to overwrite existing table.
    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_exp : dataframe
        Dataframe written to db table.

    Notes
    -----
    datetimeUTC_range_ref items should be UTC, but with naive format (since
    sqlite does not handle time zones). For example, use the following to
    select reference data for Jan. 1 - Dec. 31 2012 (Eastern):
    start = pd.Timestamp('2012-01-01 00:00:00', tz='America/New_York')
    end = pd.Timestamp('2012-12-31 23:59:59', tz='America/New_York')
    datetimeUTC_range_ref = (start.tz_convert(tz='UTC').tz_localize(None),
                             end.tz_convert(tz='UTC').tz_localize(None))
    """
    table = 'expected_load_{title}'.format(title=title)
    if verbose >= 1:
        output('Started creating or updating {table} table.'.format(
            table=table))

    # query range of zone_id values to consider
    df_zones = pd.read_csv(zones_path)
    zones = df_zones['zone_id'].unique()
    del df_zones

    # query reference data, optionally excluding a sub-range
    if datetimeUTC_range_excl:
        sql = """
            SELECT datetimeUTC, zone_id, integrated_load FROM {summary_table}
            WHERE (datetimeUTC BETWEEN "{start_datetime}" AND
            "{end_datetime}") AND (datetimeUTC NOT BETWEEN
            "{start_datetime_excl}" AND "{end_datetime_excl}")
            ;""".format(summary_table=summary_table,
                        start_datetime=datetimeUTC_range_ref[0],
                        end_datetime=datetimeUTC_range_ref[1],
                        start_datetime_excl=datetimeUTC_range_excl[0],
                        end_datetime_excl=datetimeUTC_range_excl[1])
    else:
        sql = """
            SELECT datetimeUTC, zone_id, integrated_load FROM {summary_table}
            WHERE (datetimeUTC BETWEEN "{start_datetime}" AND
            "{end_datetime}")
            ;""".format(summary_table=summary_table,
                        start_datetime=datetimeUTC_range_ref[0],
                        end_datetime=datetimeUTC_range_ref[1])
    df = query(db_path, sql)

    # add dayofweek (0 = Monday) and hour (0-23); vectorized .dt accessors
    # replace the per-row list comprehensions
    df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC']).dt.tz_localize(
        'UTC')
    df['datetime'] = df['datetimeUTC'].dt.tz_convert('America/New_York')
    df['dayofweek'] = df['datetime'].dt.dayofweek
    df['hour'] = df['datetime'].dt.hour

    # calculate mean and variance for each dayofweek-hour-zone combination;
    # a single groupby replaces re-filtering the full dataframe for every
    # combination (previously O(combinations * rows))
    groups = {key: grp for key, grp in
              df.groupby(['dayofweek', 'hour', 'zone_id'])}
    expected = []
    for dayofweek in range(7):
        for hour in range(24):
            for zone in zones:
                grp = groups.get((dayofweek, hour, zone))
                if grp is not None:
                    vals = grp['integrated_load'].values
                    # np.var is the population variance (ddof=0), matching
                    # the original per-filter calculation
                    expected.append([dayofweek, hour, zone, np.mean(vals),
                                     np.var(vals), grp.shape[0]])
                else:
                    expected.append([dayofweek, hour, zone, np.nan, np.nan,
                                     np.nan])
    df_exp = pd.DataFrame(expected,
                          columns=['dayofweek', 'hour', 'zone_id',
                                   'mean_integrated_load',
                                   'var_integrated_load', 'num_rows'])
    # (a no-op `df_exp.set_index(...)` whose result was discarded has been
    # removed; df_exp is written with a default index as before)

    # create table
    sql = """
        CREATE TABLE IF NOT EXISTS {table} (
            rowid INTEGER PRIMARY KEY,
            dayofweek INTEGER,
            hour INTEGER,
            zone_id INTEGER,
            mean_integrated_load FLOAT,
            var_integrated_load FLOAT,
            num_rows INTEGER
        ); """.format(table=table)
    create_table(db_path=db_path, table=table, create_sql=sql, indexes=[],
                 overwrite=overwrite, verbose=verbose)

    # write data to table
    df_to_table(db_path, df_exp, table=table, overwrite=False,
                verbose=verbose)

    if verbose >= 1:
        output('Finished creating or updating {table} table. Dataframe shape '
               'is '.format(table=table) + str(df_exp.shape) + '.')
    return df_exp
def load_nyiso(startdate, enddate, db_path, verbose=0):
    """Query and clean nyiso load forecast error data for the specified date
    range from a sqlite database.

    Assumes the database contains a forecast_error table created using
    create_forecast_err.

    Parameters
    ----------
    startdate : Timestamp
        Start date to include tweets from (inclusive), specified as a
        timezone-aware Pandas Timestamp object.
        E.g. startdate = pd.Timestamp('2012-10-28 00:00:00',
        tz='America/New_York')
    enddate : Timestamp
        End date to include tweets from (exclusive), specified as a
        timezone-aware Pandas Timestamp object.
        e.g. enddate = pd.Timestamp('2012-11-03 00:00:00',
        tz='America/New_York')
    db_path : str
        Path to sqlite database containing table.
    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df : dataframe

    Notes
    -----
    Sqlite date queries are inclusive for start and end, forecast_error
    datetimes are UTC.
    """
    if verbose >= 1:
        output('Started query.')

    # convert datetimes; subtract one second because sqlite BETWEEN is
    # inclusive while enddate is meant to be exclusive
    startdateUTC = startdate.tz_convert('UTC')
    enddateUTC = enddate.tz_convert('UTC') - pd.Timedelta('1 second')
    startdate_sql = startdateUTC.strftime("%Y-%m-%d %H:%M:%S")
    enddate_sql = enddateUTC.strftime("%Y-%m-%d %H:%M:%S")

    # load nyiso load data
    sql = """
        SELECT datetimeUTC, zone_id AS nyiso_zone, forecast_error_p0 AS err0
        FROM forecast_error
        WHERE datetimeUTC BETWEEN "{startdate_sql}" AND "{enddate_sql}"
        ;""".format(startdate_sql=startdate_sql, enddate_sql=enddate_sql)
    df = query(db_path, sql)

    # convert datetimes; vectorized .dt accessors replace the per-row list
    # comprehensions (which also shadowed the name `datetime`)
    df['datetimeUTC'] = pd.to_datetime(df['datetimeUTC']).dt.tz_localize(
        'UTC')
    df['datetimeNY'] = df['datetimeUTC'].dt.tz_convert('America/New_York')

    # add and drop columns
    df['percent-err0'] = df['err0'] * 100
    df = df.drop(['datetimeUTC'], axis=1)

    # index and sort
    df = df.set_index(['nyiso_zone', 'datetimeNY'])
    df = df.sort_index(level=0)

    if verbose >= 1:
        datetimes = df.index.get_level_values(level=1)
        output('[min, max] forecast error datetimeNY: [' +
               str(min(datetimes)) + ', ' + str(max(datetimes)) + '].')
        output('[min, max] forecast error: [' + str(np.nanmin(df['err0'])) +
               ', ' + str(np.nanmax(df['err0'])) + '].')
        output('Finished query.')
    return df
def load_nyctlc_zone_hour(startdate, enddate, trip_type, trip_count_filter,
                          db_path, verbose=0):
    """Query and clean nyctlc dropoff or pickup data for the specified date
    range from a sqlite database, grouped by zone and hour.

    Assumes the database contains a standard_zonedropoff_hour_sandy or
    standard_zonepickup_hour_sandy table created using
    create_standard_zone_hour.

    Parameters
    ----------
    startdate : Timestamp
        Start date to include tweets from (inclusive), specified as a
        timezone-aware Pandas Timestamp object.
        E.g. startdate = pd.Timestamp('2012-10-28 00:00:00',
        tz='America/New_York')
    enddate : Timestamp
        End date to include tweets from (exclusive), specified as a
        timezone-aware Pandas Timestamp object.
        e.g. enddate = pd.Timestamp('2012-11-03 00:00:00',
        tz='America/New_York')
    trip_type : str
        Trip type: 'dropoff' or 'pickup'.
    trip_count_filter : int
        Minimum number of trips required to load a data point.
    db_path : str
        Path to sqlite database containing table.
    verbose : int
        Defines verbosity for output statements.

    Returns
    -------
    df_taxi : dataframe

    Notes
    -----
    Sqlite date queries are inclusive for start and end, datetimes in nyctlc
    database are local (i.e. NY timezone).
    """
    if verbose >= 1:
        output('Started query.')

    # validate trip type up front; every later use can rely on it
    if trip_type not in ['dropoff', 'pickup']:
        raise ValueError(
            'Invalid trip_type argument: {arg}.'.format(arg=trip_type))

    # convert datetimes; subtract one second because sqlite BETWEEN is
    # inclusive while enddate is meant to be exclusive
    enddate_exclusive = enddate - pd.Timedelta('1 second')
    startdate_sql = startdate.strftime("%Y-%m-%d %H:%M:%S")
    enddate_sql = enddate_exclusive.strftime("%Y-%m-%d %H:%M:%S")

    # load dropoff/pickup data
    sql = """
        SELECT {trip_type}_datetime AS datetimeNY,
        {trip_type}_location_id AS location_id,
        z_mean_pace AS zpace, z_trip_count AS ztrips
        FROM standard_zone{trip_type}_hour_sandy
        WHERE trip_count > {trip_count_filter} AND
        {trip_type}_datetime BETWEEN "{startdate_sql}" AND "{enddate_sql}"
        ;""".format(trip_count_filter=trip_count_filter,
                    startdate_sql=startdate_sql, enddate_sql=enddate_sql,
                    trip_type=trip_type)
    df_taxi = query(db_path, sql)

    # add columns
    df_taxi['abs-zpace'] = abs(df_taxi['zpace'])
    df_taxi['abs-ztrips'] = abs(df_taxi['ztrips'])

    # convert datetimes; stored values are already NY-local, so only localize
    df_taxi['datetimeNY'] = pd.to_datetime(
        df_taxi['datetimeNY']).dt.tz_localize('America/New_York')

    # index and sort
    df_taxi = df_taxi.set_index(['location_id', 'datetimeNY'])
    df_taxi = df_taxi.sort_index(level=0)

    if verbose >= 1:
        output('[min, max] taxi datetimeNY (hourly): [' +
               str(min(df_taxi.index.get_level_values(level=1))) + ', ' +
               str(max(df_taxi.index.get_level_values(level=1))) + '].')
        output('[min, max] taxi pace and trips mean z-score (hourly): [' +
               str(np.nanmin(df_taxi['zpace'])) + ', ' +
               str(np.nanmax(df_taxi['zpace'])) + '], [' +
               str(np.nanmin(df_taxi['ztrips'])) + ', ' +
               str(np.nanmax(df_taxi['ztrips'])) + '].')

    # add drop or pick to column names; trip_type was validated above, so a
    # direct mapping replaces the old if/elif with an unreachable else branch
    suffix = {'dropoff': '-drop', 'pickup': '-pick'}[trip_type]
    df_taxi = df_taxi.rename(
        {col: col + suffix for col in df_taxi.columns.values},
        axis='columns')
    return df_taxi