def get_baseline(start, location, direction, lane):
    """Return baseline wait times (date, waittime) for the chart window.

    Baselines are read from the predictions table; which baseline model
    version is used depends on the year the window starts in.
    """
    xid, start, end = get_chart_params(start, location, direction, lane)
    # NOTE(review): query is built with str.format — fine for internal
    # constants, but not safe for untrusted input.
    query = '''
        select d.date, waittime
        from datefeatures d
        left join predictions p
            on d.date = p.date
            and crossing_id = {1}
            and model_version = '{0}'
        where d.date >= '{2}'
            and d.date < '{3}'
            and (minute = 0 or minute=30)
        order by date
        '''
    # Before 2016, baseline is computed from a rolling average.
    # After 2016, baseline is same as last week of 2015.
    model = BMODEL_POST2016 if start.year >= 2016 else BMODEL_PRE2016
    return pd_query(query.format(model, xid, start, end))
def setUp(self):
    """Load a fixed slice of crossing data and define feature/label names."""
    # One noon reading on the 5th of each month for crossing 1 keeps the
    # fixture small and deterministic.
    sql = '''
        select c.date, waittime, year, month, dayofmonth,
               week, dayofweek, minofday
        from crossingdata c
        join datefeatures d on c.date = d.date
        where valid=1
          and waittime is not null
          and crossing_id = 1
          and dayofmonth = 5
          and time = '12:00:00'
        order by c.date
        '''
    self.df = pd_query(sql)
    self.label = 'waittime'
    self.feature = ['year', 'month', 'dayofmonth', 'week', 'dayofweek',
                    'minofday']
def select_mungedata_simple(munger_id, crossing_id, start_date, end_date):
    '''Select munged wait times with date features only, indexed by date.'''
    sql = '''
        select m.date, metric as waittime,
               year, month, week, dayofweek, minofday
        from mungedata m
        join datefeatures d on m.date = d.date
        where crossing_id = {0}
          and munger_id = {1}
          and (minute = 0 or minute = 30)
          and is_waittime = true
          and m.date >= '{2}'
          and m.date < '{3}'
        order by m.date;
        '''
    result = pd_query(sql.format(crossing_id, munger_id,
                                 start_date, end_date))
    return result.set_index('date')
def setUp(self):
    """Build the test fixture: dataframe, feature columns, and label."""
    query = '''
        select c.date, waittime, year, month, dayofmonth,
               week, dayofweek, minofday
        from crossingdata c
        join datefeatures d on c.date = d.date
        where valid=1
          and waittime is not null
          and crossing_id = 1
          and dayofmonth = 5
          and time = '12:00:00'
        order by c.date
        '''
    self.df = pd_query(query)
    # Feature columns mirror the non-target columns selected above.
    self.feature = [
        'year',
        'month',
        'dayofmonth',
        'week',
        'dayofweek',
        'minofday',
    ]
    self.label = 'waittime'
def smooth(munger_id, crossing_id, field, limit=None, path='../data', df=None):
    '''
    Smooth data with LOWESS and write the output to CSV.

    IN
        munger_id
        crossing_id
        field: name of target field
        limit: string for limiting query; used for testing
        path: path to data directory
        df: dataframe to override default behaviour of querying raw data;
            dataframe must have a datetime index and data ordered by date
    OUT
        the input dataframe with an added 'smooth' column
    '''
    if df is None:
        query = '''
            select c.date, {0}
            from crossingdata c
            join datefeatures d on c.date = d.date
            where valid=1 and {0} is not null and crossing_id = {1}
            order by c.date {2};
            '''
        # Optional LIMIT clause keeps test runs fast.
        limitstring = "limit %s" % (limit) if limit is not None else ""
        df = pd_query(query.format(field, crossing_id, limitstring))
    lowess = sm.nonparametric.lowess
    # Window fraction shrinks as the series grows (~12-point window).
    z = lowess(df[field], df.index, frac=12. / len(df), it=1)
    df['smooth'] = z[:, 1]
    # FIX: Series.clip_lower was deprecated in pandas 0.24 and removed in
    # 1.0; clip(lower=0) is the equivalent supported on all versions.
    # Item assignment avoids the fragile `df.smooth = ...` attribute form.
    df['smooth'] = df['smooth'].clip(lower=0)
    dfcsv = df.reset_index()[['date', 'smooth']]
    dfcsv['munger_id'] = munger_id
    dfcsv['crossing_id'] = crossing_id
    dfcsv['is_waittime'] = field == 'waittime'
    filepath = '{0}/munge{1}_{2}_{3}.csv'.format(path, munger_id,
                                                 crossing_id, field)
    dfcsv.to_csv(
        filepath,
        columns=['munger_id', 'crossing_id', 'date', 'smooth', 'is_waittime'],
        index=False,
        header=False)
    return df
def smooth(munger_id, crossing_id, field, limit=None, path='../data', df=None):
    '''
    Smooth data with LOWESS and write the output to CSV.

    IN
        munger_id
        crossing_id
        field: name of target field
        limit: string for limiting query; used for testing
        path: path to data directory
        df: dataframe to override default behaviour of querying raw data;
            dataframe must have a datetime index and data ordered by date
    OUT
        the input dataframe with an added 'smooth' column
    '''
    if df is None:
        query = '''
            select c.date, {0}
            from crossingdata c
            join datefeatures d on c.date = d.date
            where valid=1 and {0} is not null and crossing_id = {1}
            order by c.date {2};
            '''
        # Optional LIMIT clause keeps test runs fast.
        if limit is not None:
            limitstring = "limit %s" % (limit)
        else:
            limitstring = ""
        df = pd_query(query.format(field, crossing_id, limitstring))
    lowess = sm.nonparametric.lowess
    # Window fraction shrinks as the series grows (~12-point window).
    z = lowess(df[field], df.index, frac=12. / len(df), it=1)
    df['smooth'] = z[:, 1]
    # FIX: Series.clip_lower was deprecated in pandas 0.24 and removed in
    # 1.0; clip(lower=0) is the equivalent supported on all versions.
    # Item assignment avoids the fragile `df.smooth = ...` attribute form.
    df['smooth'] = df['smooth'].clip(lower=0)
    dfcsv = df.reset_index()[['date', 'smooth']]
    dfcsv['munger_id'] = munger_id
    dfcsv['crossing_id'] = crossing_id
    dfcsv['is_waittime'] = field == 'waittime'
    filepath = '{0}/munge{1}_{2}_{3}.csv'.format(path, munger_id,
                                                 crossing_id, field)
    dfcsv.to_csv(filepath,
                 columns=['munger_id', 'crossing_id', 'date', 'smooth',
                          'is_waittime'],
                 index=False,
                 header=False)
    return df
def get_prediction(start, location, direction, lane):
    """Return predicted wait times (date, waittime) for the chart window.

    Chooses the pre- or post-2016 prediction model based on the year the
    window starts in.
    """
    xid, start, end = get_chart_params(start, location, direction, lane)
    query = '''
        select d.date, waittime
        from datefeatures d
        left join predictions p
            on d.date = p.date
            and crossing_id = {1}
            and model_version = '{0}'
        where d.date >= '{2}'
            and d.date < '{3}'
            and (minute = 0 or minute=30)
        order by date
        '''
    model = PMODEL_POST2016 if start.year >= 2016 else PMODEL_PRE2016
    return pd_query(query.format(model, xid, start, end))
def get_actual(start, location, direction, lane):
    """Return measured wait times (date, waittime) for the chart window.

    Left-joins against crossingdata so every half-hour slot appears even
    when no measurement exists (waittime is NULL there).
    """
    xid, start, end = get_chart_params(start, location, direction, lane)
    sql = '''
        select d.date, waittime
        from datefeatures d
        left join crossingdata c
            on d.date = c.date
            and crossing_id = {0}
        where d.date >= '{1}'
            and d.date < '{2}'
            and (minute = 0 or minute=30)
        order by date
        '''
    return pd_query(sql.format(xid, start, end))
def select_features_simple(start_date, end_date):
    '''Select date features only, indexed by date (half-hour slots).'''
    sql = '''
        select d.date, year, month, week, dayofweek, minofday
        from datefeatures d
        where d.date >= '{0}'
          and d.date < '{1}'
          and (minute = 0 or minute = 30)
        order by d.date;
        '''
    frame = pd_query(sql.format(start_date, end_date))
    return frame.set_index('date')
def select_predictions(munger_id, model_version, crossing_id, start_date,
                       end_date):
    '''Select predicted wait times for one model run, indexed by date.'''
    sql = '''
        select date, waittime
        from predictions
        where munger_id = {0}
          and model_version = '{1}'
          and crossing_id = {2}
          and date >= '{3}'
          and date < '{4}'
        order by date;
        '''
    frame = pd_query(sql.format(munger_id, model_version, crossing_id,
                                start_date, end_date))
    return frame.set_index('date')
def select_crossingdata(crossing_id, start_date):
    """Select raw crossing measurements plus date features, indexed by date.

    Only half-hour slots (minute 0 or 30) on/after start_date are returned.
    """
    sql = '''
        select c.date, waittime, volume,
               year, month, week, dayofweek, minofday
        from crossingdata c
        join datefeatures d on c.date = d.date
        where crossing_id = {0}
          and (minute = 0 or minute = 30)
          and c.date >= '{1}'
        order by c.date;
        '''
    frame = pd_query(sql.format(crossing_id, start_date))
    return frame.set_index('date')
def select_predictions(munger_id, model_version, crossing_id, start_date,
                       end_date):
    '''Select predicted wait times for one model run, indexed by date.'''
    query = '''
        select date, waittime
        from predictions
        where munger_id = {0}
          and model_version = '{1}'
          and crossing_id = {2}
          and date >= '{3}'
          and date < '{4}'
        order by date;
        '''
    filled = query.format(munger_id, model_version, crossing_id,
                          start_date, end_date)
    return pd_query(filled).set_index('date')
def rolling_volume_aggregate(days, percent=.5):
    '''
    Returns a dataframe with date and multiple rolling means of the average
    aggregate volume imbalance between north and south crossings.

    IN
        days: list of window sizes (in days) for the rolling means
        percent: fraction of a window that must be present for the
            rolling mean to produce a value instead of NA
    OUT
        dataframe of form: date, vol_mean_1, vol_mean_2, etc. for each
        day in days; rows containing nulls are excluded from output
    '''
    series = pd_query('select date, volume from dailyvolume order by date')\
        .set_index('date').volume
    df = pd.DataFrame()
    for day in days:
        # FIX: pd.rolling_mean was removed in pandas 0.23; Series.rolling
        # (available since 0.18) is the supported equivalent.
        # min_periods must be an integer in modern pandas, so the
        # day*percent float is truncated explicitly.
        # shift(1) so each row only sees volumes strictly before its date.
        df['vol_mean_{0}'.format(day)] = series.rolling(
            window=day, min_periods=int(day * percent)).mean().shift(1)
    df.index = pd.to_datetime(df.index)
    return df.dropna()
def select_mungedata(munger_id, crossing_id, start_date, end_date):
    """Select munged wait times with the full feature set, indexed by date.

    Joins date features, current-day weather, weather leads (+1..+3 days)
    and lags (-1..-2 days), and special-date events with leads/lags of up
    to 4 days. Returns one row per half-hour slot in [start_date, end_date).
    """
    # NOTE(review): the publicholiday join contributes no selected columns;
    # preserved as-is to keep behaviour identical.
    sql = '''
        select d.date, metric as waittime,
               year, month, week, dayofweek, minofday,
               w.temp_max, w.temp_mean, w.temp_min, w.viz_max, w.wind_max,
               w.precip, w.rain, w.snow, w.fog, w.thunderstorm,
               wp1.temp_max as temp_max_p1, wp1.temp_mean as temp_mean_p1,
               wp1.temp_min as temp_min_p1, wp1.precip as precip_p1,
               wp1.rain as rain_p1, wp1.snow as snow_p1,
               wp1.thunderstorm as thunderstorm_p1,
               wp2.temp_max as temp_max_p2, wp2.temp_mean as temp_mean_p2,
               wp2.temp_min as temp_min_p2, wp2.precip as precip_p2,
               wp2.rain as rain_p2, wp2.snow as snow_p2,
               wp2.thunderstorm as thunderstorm_p2,
               wp3.temp_max as temp_max_p3, wp3.temp_mean as temp_mean_p3,
               wp3.temp_min as temp_min_p3, wp3.precip as precip_p3,
               wp3.rain as rain_p3, wp3.snow as snow_p3,
               wp3.thunderstorm as thunderstorm_p3,
               wm1.temp_max as temp_max_m1, wm1.temp_mean as temp_mean_m1,
               wm1.temp_min as temp_min_m1, wm1.precip as precip_m1,
               wm1.rain as rain_m1, wm1.snow as snow_m1,
               wm1.thunderstorm as thunderstorm_m1,
               wm2.temp_max as temp_max_m2, wm2.temp_mean as temp_mean_m2,
               wm2.temp_min as temp_min_m2, wm2.precip as precip_m2,
               wm2.rain as rain_m2, wm2.snow as snow_m2,
               wm2.thunderstorm as thunderstorm_m2,
               s.event,
               s_lead1.event as event_lead1, s_lag1.event as event_lag1,
               s_lead2.event as event_lead2, s_lag2.event as event_lag2,
               s_lead3.event as event_lead3, s_lag3.event as event_lag3,
               s_lead4.event as event_lead4, s_lag4.event as event_lag4
        from datefeatures d
        left join mungedata m
            on m.date = d.date
            and crossing_id = {0}
            and munger_id = {1}
            and is_waittime = true
        left join publicholiday h
            on m.date::timestamp::date = h.date
        left join weather w
            on m.date::timestamp::date = w.date
        left join weather wp1
            on m.date::timestamp::date = wp1.date - interval '1 day'
        left join weather wp2
            on m.date::timestamp::date = wp2.date - interval '2 day'
        left join weather wp3
            on m.date::timestamp::date = wp3.date - interval '3 day'
        left join weather wm1
            on m.date::timestamp::date = wm1.date + interval '1 day'
        left join weather wm2
            on m.date::timestamp::date = wm2.date + interval '2 day'
        left join specialdates s
            on m.date::timestamp::date = s.date
        left join specialdates s_lead1
            on m.date::timestamp::date = s_lead1.date - interval '1 day'
        left join specialdates s_lag1
            on m.date::timestamp::date = s_lag1.date + interval '1 day'
        left join specialdates s_lead2
            on m.date::timestamp::date = s_lead2.date - interval '2 day'
        left join specialdates s_lag2
            on m.date::timestamp::date = s_lag2.date + interval '2 day'
        left join specialdates s_lead3
            on m.date::timestamp::date = s_lead3.date - interval '3 day'
        left join specialdates s_lag3
            on m.date::timestamp::date = s_lag3.date + interval '3 day'
        left join specialdates s_lead4
            on m.date::timestamp::date = s_lead4.date - interval '4 day'
        left join specialdates s_lag4
            on m.date::timestamp::date = s_lag4.date + interval '4 day'
        where (minute = 0 or minute = 30)
            and d.date >= '{2}'
            and d.date < '{3}'
        order by d.date;
        '''
    frame = pd_query(sql.format(crossing_id, munger_id,
                                start_date, end_date))
    return frame.set_index('date')