def get_baseline(start, location, direction, lane):
    """Return baseline wait times (date, waittime) for the chart window.

    Baselines are read from the predictions table; which baseline model
    version is used depends on the year the window starts in.
    """
    xid, start, end = get_chart_params(start, location, direction, lane)
    # NOTE(review): query is built with str.format — fine for internal
    # constants, but not safe for untrusted input.
    query = '''
        select d.date, waittime
        from datefeatures d
        left join predictions p
            on d.date = p.date
            and crossing_id = {1}
            and model_version = '{0}'
        where d.date >= '{2}'
            and d.date < '{3}'
            and (minute = 0 or minute=30)
        order by date
        '''
    # Before 2016, baseline is computed from a rolling average.
    # After 2016, baseline is same as last week of 2015.
    model = BMODEL_POST2016 if start.year >= 2016 else BMODEL_PRE2016
    return pd_query(query.format(model, xid, start, end))
def setUp(self):
    """Load a fixed slice of crossing data and define feature/label names."""
    # One noon reading on the 5th of each month for crossing 1 keeps the
    # fixture small and deterministic.
    sql = '''
        select c.date, waittime, year, month, dayofmonth,
               week, dayofweek, minofday
        from crossingdata c
        join datefeatures d on c.date = d.date
        where valid=1
          and waittime is not null
          and crossing_id = 1
          and dayofmonth = 5
          and time = '12:00:00'
        order by c.date
        '''
    self.df = pd_query(sql)
    self.label = 'waittime'
    self.feature = ['year', 'month', 'dayofmonth', 'week', 'dayofweek',
                    'minofday']
def select_mungedata_simple(munger_id, crossing_id, start_date, end_date):
    '''Select munged wait times with date features only, indexed by date.'''
    sql = '''
        select m.date, metric as waittime,
               year, month, week, dayofweek, minofday
        from mungedata m
        join datefeatures d on m.date = d.date
        where crossing_id = {0}
          and munger_id = {1}
          and (minute = 0 or minute = 30)
          and is_waittime = true
          and m.date >= '{2}'
          and m.date < '{3}'
        order by m.date;
        '''
    result = pd_query(sql.format(crossing_id, munger_id,
                                 start_date, end_date))
    return result.set_index('date')
def setUp(self):
    """Build the test fixture: dataframe, feature columns, and label."""
    query = '''
        select c.date, waittime, year, month, dayofmonth,
               week, dayofweek, minofday
        from crossingdata c
        join datefeatures d on c.date = d.date
        where valid=1
          and waittime is not null
          and crossing_id = 1
          and dayofmonth = 5
          and time = '12:00:00'
        order by c.date
        '''
    self.df = pd_query(query)
    # Feature columns mirror the non-target columns selected above.
    self.feature = [
        'year',
        'month',
        'dayofmonth',
        'week',
        'dayofweek',
        'minofday',
    ]
    self.label = 'waittime'
def smooth(munger_id, crossing_id, field, limit=None, path='../data', df=None):
    '''
    Smooth data with LOWESS and write the output to CSV.

    IN
        munger_id
        crossing_id
        field: name of target field
        limit: string for limiting query; used for testing
        path: path to data directory
        df: dataframe to override default behaviour of querying raw data;
            dataframe must have a datetime index and data ordered by date
    OUT
        the input dataframe with an added 'smooth' column
    '''
    if df is None:
        query = '''
            select c.date, {0}
            from crossingdata c
            join datefeatures d on c.date = d.date
            where valid=1 and {0} is not null and crossing_id = {1}
            order by c.date {2};
            '''
        # Optional LIMIT clause keeps test runs fast.
        limitstring = "limit %s" % (limit) if limit is not None else ""
        df = pd_query(query.format(field, crossing_id, limitstring))
    lowess = sm.nonparametric.lowess
    # Window fraction shrinks as the series grows (~12-point window).
    z = lowess(df[field], df.index, frac=12. / len(df), it=1)
    df['smooth'] = z[:, 1]
    # FIX: Series.clip_lower was deprecated in pandas 0.24 and removed in
    # 1.0; clip(lower=0) is the equivalent supported on all versions.
    # Item assignment avoids the fragile `df.smooth = ...` attribute form.
    df['smooth'] = df['smooth'].clip(lower=0)
    dfcsv = df.reset_index()[['date', 'smooth']]
    dfcsv['munger_id'] = munger_id
    dfcsv['crossing_id'] = crossing_id
    dfcsv['is_waittime'] = field == 'waittime'
    filepath = '{0}/munge{1}_{2}_{3}.csv'.format(path, munger_id,
                                                 crossing_id, field)
    dfcsv.to_csv(
        filepath,
        columns=['munger_id', 'crossing_id', 'date', 'smooth', 'is_waittime'],
        index=False,
        header=False)
    return df
def smooth(munger_id, crossing_id, field, limit=None, path='../data', df=None):
    '''
    Smooth data with LOWESS and write the output to CSV.

    IN
        munger_id
        crossing_id
        field: name of target field
        limit: string for limiting query; used for testing
        path: path to data directory
        df: dataframe to override default behaviour of querying raw data;
            dataframe must have a datetime index and data ordered by date
    OUT
        the input dataframe with an added 'smooth' column
    '''
    if df is None:
        query = '''
            select c.date, {0}
            from crossingdata c
            join datefeatures d on c.date = d.date
            where valid=1 and {0} is not null and crossing_id = {1}
            order by c.date {2};
            '''
        # Optional LIMIT clause keeps test runs fast.
        if limit is not None:
            limitstring = "limit %s" % (limit)
        else:
            limitstring = ""
        df = pd_query(query.format(field, crossing_id, limitstring))
    lowess = sm.nonparametric.lowess
    # Window fraction shrinks as the series grows (~12-point window).
    z = lowess(df[field], df.index, frac=12. / len(df), it=1)
    df['smooth'] = z[:, 1]
    # FIX: Series.clip_lower was deprecated in pandas 0.24 and removed in
    # 1.0; clip(lower=0) is the equivalent supported on all versions.
    # Item assignment avoids the fragile `df.smooth = ...` attribute form.
    df['smooth'] = df['smooth'].clip(lower=0)
    dfcsv = df.reset_index()[['date', 'smooth']]
    dfcsv['munger_id'] = munger_id
    dfcsv['crossing_id'] = crossing_id
    dfcsv['is_waittime'] = field == 'waittime'
    filepath = '{0}/munge{1}_{2}_{3}.csv'.format(path, munger_id,
                                                 crossing_id, field)
    dfcsv.to_csv(filepath,
                 columns=['munger_id', 'crossing_id', 'date', 'smooth',
                          'is_waittime'],
                 index=False,
                 header=False)
    return df
def get_prediction(start, location, direction, lane):
    """Return predicted wait times (date, waittime) for the chart window.

    Chooses the pre- or post-2016 prediction model based on the year the
    window starts in.
    """
    xid, start, end = get_chart_params(start, location, direction, lane)
    query = '''
        select d.date, waittime
        from datefeatures d
        left join predictions p
            on d.date = p.date
            and crossing_id = {1}
            and model_version = '{0}'
        where d.date >= '{2}'
            and d.date < '{3}'
            and (minute = 0 or minute=30)
        order by date
        '''
    model = PMODEL_POST2016 if start.year >= 2016 else PMODEL_PRE2016
    return pd_query(query.format(model, xid, start, end))
def get_actual(start, location, direction, lane):
    """Return measured wait times (date, waittime) for the chart window.

    Left-joins against crossingdata so every half-hour slot appears even
    when no measurement exists (waittime is NULL there).
    """
    xid, start, end = get_chart_params(start, location, direction, lane)
    sql = '''
        select d.date, waittime
        from datefeatures d
        left join crossingdata c
            on d.date = c.date
            and crossing_id = {0}
        where d.date >= '{1}'
            and d.date < '{2}'
            and (minute = 0 or minute=30)
        order by date
        '''
    return pd_query(sql.format(xid, start, end))
def select_features_simple(start_date, end_date):
    '''Select date features only, indexed by date (half-hour slots).'''
    sql = '''
        select d.date, year, month, week, dayofweek, minofday
        from datefeatures d
        where d.date >= '{0}'
          and d.date < '{1}'
          and (minute = 0 or minute = 30)
        order by d.date;
        '''
    frame = pd_query(sql.format(start_date, end_date))
    return frame.set_index('date')
def select_predictions(munger_id, model_version, crossing_id, start_date,
                       end_date):
    '''Select predicted wait times for one model run, indexed by date.'''
    sql = '''
        select date, waittime
        from predictions
        where munger_id = {0}
          and model_version = '{1}'
          and crossing_id = {2}
          and date >= '{3}'
          and date < '{4}'
        order by date;
        '''
    frame = pd_query(sql.format(munger_id, model_version, crossing_id,
                                start_date, end_date))
    return frame.set_index('date')
def select_crossingdata(crossing_id, start_date):
    """Select raw crossing measurements plus date features, indexed by date.

    Only half-hour slots (minute 0 or 30) on/after start_date are returned.
    """
    sql = '''
        select c.date, waittime, volume,
               year, month, week, dayofweek, minofday
        from crossingdata c
        join datefeatures d on c.date = d.date
        where crossing_id = {0}
          and (minute = 0 or minute = 30)
          and c.date >= '{1}'
        order by c.date;
        '''
    frame = pd_query(sql.format(crossing_id, start_date))
    return frame.set_index('date')
def select_predictions(munger_id, model_version, crossing_id, start_date,
                       end_date):
    '''Select predicted wait times for one model run, indexed by date.'''
    query = '''
        select date, waittime
        from predictions
        where munger_id = {0}
          and model_version = '{1}'
          and crossing_id = {2}
          and date >= '{3}'
          and date < '{4}'
        order by date;
        '''
    filled = query.format(munger_id, model_version, crossing_id,
                          start_date, end_date)
    return pd_query(filled).set_index('date')
def rolling_volume_aggregate(days, percent=.5):
    '''
    Returns a dataframe with date and multiple rolling means of the average
    aggregate volume imbalance between north and south crossings.

    IN
        days: list of window sizes (in days) for the rolling means
        percent: fraction of a window that must be present for the
            rolling mean to produce a value instead of NA
    OUT
        dataframe of form: date, vol_mean_1, vol_mean_2, etc. for each
        day in days; rows containing nulls are excluded from output
    '''
    series = pd_query('select date, volume from dailyvolume order by date')\
        .set_index('date').volume
    df = pd.DataFrame()
    for day in days:
        # FIX: pd.rolling_mean was removed in pandas 0.23; Series.rolling
        # (available since 0.18) is the supported equivalent.
        # min_periods must be an integer in modern pandas, so the
        # day*percent float is truncated explicitly.
        # shift(1) so each row only sees volumes strictly before its date.
        df['vol_mean_{0}'.format(day)] = series.rolling(
            window=day, min_periods=int(day * percent)).mean().shift(1)
    df.index = pd.to_datetime(df.index)
    return df.dropna()
def select_mungedata(munger_id, crossing_id, start_date, end_date):
    """Select munged wait times with the full feature set, indexed by date.

    Joins date features, current-day weather, weather leads (+1..+3 days)
    and lags (-1..-2 days), and special-date events with leads/lags of up
    to 4 days. Returns one row per half-hour slot in [start_date, end_date).
    """
    # NOTE(review): the publicholiday join contributes no selected columns;
    # preserved as-is to keep behaviour identical.
    sql = '''
        select d.date, metric as waittime,
               year, month, week, dayofweek, minofday,
               w.temp_max, w.temp_mean, w.temp_min, w.viz_max, w.wind_max,
               w.precip, w.rain, w.snow, w.fog, w.thunderstorm,
               wp1.temp_max as temp_max_p1, wp1.temp_mean as temp_mean_p1,
               wp1.temp_min as temp_min_p1, wp1.precip as precip_p1,
               wp1.rain as rain_p1, wp1.snow as snow_p1,
               wp1.thunderstorm as thunderstorm_p1,
               wp2.temp_max as temp_max_p2, wp2.temp_mean as temp_mean_p2,
               wp2.temp_min as temp_min_p2, wp2.precip as precip_p2,
               wp2.rain as rain_p2, wp2.snow as snow_p2,
               wp2.thunderstorm as thunderstorm_p2,
               wp3.temp_max as temp_max_p3, wp3.temp_mean as temp_mean_p3,
               wp3.temp_min as temp_min_p3, wp3.precip as precip_p3,
               wp3.rain as rain_p3, wp3.snow as snow_p3,
               wp3.thunderstorm as thunderstorm_p3,
               wm1.temp_max as temp_max_m1, wm1.temp_mean as temp_mean_m1,
               wm1.temp_min as temp_min_m1, wm1.precip as precip_m1,
               wm1.rain as rain_m1, wm1.snow as snow_m1,
               wm1.thunderstorm as thunderstorm_m1,
               wm2.temp_max as temp_max_m2, wm2.temp_mean as temp_mean_m2,
               wm2.temp_min as temp_min_m2, wm2.precip as precip_m2,
               wm2.rain as rain_m2, wm2.snow as snow_m2,
               wm2.thunderstorm as thunderstorm_m2,
               s.event,
               s_lead1.event as event_lead1, s_lag1.event as event_lag1,
               s_lead2.event as event_lead2, s_lag2.event as event_lag2,
               s_lead3.event as event_lead3, s_lag3.event as event_lag3,
               s_lead4.event as event_lead4, s_lag4.event as event_lag4
        from datefeatures d
        left join mungedata m
            on m.date = d.date
            and crossing_id = {0}
            and munger_id = {1}
            and is_waittime = true
        left join publicholiday h
            on m.date::timestamp::date = h.date
        left join weather w
            on m.date::timestamp::date = w.date
        left join weather wp1
            on m.date::timestamp::date = wp1.date - interval '1 day'
        left join weather wp2
            on m.date::timestamp::date = wp2.date - interval '2 day'
        left join weather wp3
            on m.date::timestamp::date = wp3.date - interval '3 day'
        left join weather wm1
            on m.date::timestamp::date = wm1.date + interval '1 day'
        left join weather wm2
            on m.date::timestamp::date = wm2.date + interval '2 day'
        left join specialdates s
            on m.date::timestamp::date = s.date
        left join specialdates s_lead1
            on m.date::timestamp::date = s_lead1.date - interval '1 day'
        left join specialdates s_lag1
            on m.date::timestamp::date = s_lag1.date + interval '1 day'
        left join specialdates s_lead2
            on m.date::timestamp::date = s_lead2.date - interval '2 day'
        left join specialdates s_lag2
            on m.date::timestamp::date = s_lag2.date + interval '2 day'
        left join specialdates s_lead3
            on m.date::timestamp::date = s_lead3.date - interval '3 day'
        left join specialdates s_lag3
            on m.date::timestamp::date = s_lag3.date + interval '3 day'
        left join specialdates s_lead4
            on m.date::timestamp::date = s_lead4.date - interval '4 day'
        left join specialdates s_lag4
            on m.date::timestamp::date = s_lag4.date + interval '4 day'
        where (minute = 0 or minute = 30)
            and d.date >= '{2}'
            and d.date < '{3}'
        order by d.date;
        '''
    frame = pd_query(sql.format(crossing_id, munger_id,
                                start_date, end_date))
    return frame.set_index('date')