Example #1
def create_tmp_train(year: int):
    print(f'starting tmp_train for year {year}')

    execute_sql(f'drop table if exists tmp_train_{year}')

    sql = f"""
    create unlogged table tmp_train_{year} as
    select
    se.numeric_id
    , se.date
    , se.wday
    , se.store_id
    , se.dept_id
    , se.state_id
    , si.snap_status
    , si.days_since_snap
    , wa.relative_median
    , pc.price_change_w1
    , pc.price_change_w3
    , gp.state_average_last_7
    , gp.state_average_last_21
    from sales_ext as se
    inner join price_changes as pc 
        on se.store_id = pc.store_id and se.item_id = pc.item_id and se.wm_yr_wk = pc.wm_yr_wk
    inner join snap_info as si on se.state_id = si.state_id and si.date = se.date
    inner join weekday_average as wa
        on se.wday = wa.wday and wa.dept_id = se.dept_id and wa.store_id = se.store_id
    inner join snap_influence as sinf
        on se.dept_id = sinf.dept_id and se.store_id = sinf.store_id
            and sinf.snap_status = si.snap_status
    inner join grouped_lags as gp on se.state_id = gp.state_id and se.date = gp.date and se.item_id = gp.item_id
    where se.date between '{year}-01-01' and '{year}-12-31'
    """

    execute_sql(sql)
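
A minimal driver for this per-year staging step might look like the sketch below; the 2011-2016 range is an assumption about the span of sales_ext, not something the snippet itself pins down.

# Hypothetical driver: build one staging table per year covered by sales_ext.
for year in range(2011, 2017):
    create_tmp_train(year)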
Example #2
def import_item_info():
    start = time.time()

    df = pd.read_csv(raw_data_folder() / 'sales_train_validation.csv')
    # Keep only the item metadata columns and drop any duplicated rows.
    df = df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']]
    df.drop_duplicates(inplace=True)

    execute_sql('drop table if exists item_info')
    execute_sql_from_file('item_info')

    copy_from_options = {'columns': ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']}
    dataframe_to_table_bulk(df,
                            'item_info',
                            copy_from_options=copy_from_options)

    end = time.time()
    print(f'import_item_info took {round(end - start)} seconds')
Example #3
def import_sell_prices():
    start = time.time()

    execute_sql('drop table if exists prices')
    execute_sql_from_file('prices')

    conn = get_connection()
    cur = conn.cursor()

    with open(raw_data_folder() / 'sell_prices.csv', 'r') as f:
        f.readline()  # don't read the column names as a row in copy_from
        cur.copy_from(file=f, table='prices', sep=',')

    conn.commit()  # commit the COPY in case the connection is not in autocommit mode
    cur.close()
    conn.close()

    end = time.time()
    print(f'import_sell_prices took {round(end - start)} seconds')
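
One caveat: copy_from with sep=',' breaks on quoted fields that contain commas. A more robust variant, sketched below on the assumption that get_connection returns a plain psycopg2 connection, lets the server parse the file as real CSV via copy_expert; the header option also replaces the manual readline skip.

def import_sell_prices_csv():
    # Hypothetical variant of import_sell_prices with server-side CSV parsing.
    conn = get_connection()
    cur = conn.cursor()
    with open(raw_data_folder() / 'sell_prices.csv', 'r') as f:
        # format csv handles quoted fields; header true skips the column row.
        cur.copy_expert('copy prices from stdin with (format csv, header true)', f)
    conn.commit()
    cur.close()
    conn.close()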
Example #4
def write_validation_results_to_db(model,
                                   model_name: str,
                                   params: str,
                                   train_size=np.nan,
                                   validation_size=VALIDATION_SIZE,
                                   numeric_only=False):

    execute_sql_from_file('validation')
    sql = f"DELETE FROM validation where model_name = '{model_name}'"
    execute_sql(sql)

    val_x, val_y, ids = collect_features(data_set='validation',
                                         size=validation_size,
                                         numeric_only=numeric_only)
    pred_values = model.predict(val_x)
    predictions = pd.DataFrame(
        data={
            'numeric_id': ids['numeric_id'],
            'predicted': pred_values,
            'date': ids['date'],
            'target': val_y,
            'cum_mse': ids['cum_mse']
        })
    predictions['model_name'] = model_name

    dataframe_to_table_bulk(df=predictions[[
        'model_name', 'date', 'numeric_id', 'predicted', 'cum_mse', 'target'
    ]],
                            table='validation')

    execute_sql_from_file('model_info')
    model_info = pd.DataFrame(
        data={
            'model_name': [model_name],
            'rmse': [get_rmse(model_name)],
            'created_at': [f"{datetime.now():%Y-%m-%d %H:%M}"],
            'params': [str(params)],
            'train_size': [train_size],
            'features': [", ".join(list(val_x))],
            'git_commit': [get_git_commit()]
        })
    dataframe_to_table(model_info, table='model_info')
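
A usage sketch for this helper; the LightGBM model and its parameters here are illustrative placeholders, not taken from the snippet (numeric_only=True so the placeholder model can consume the features directly).

from lightgbm import LGBMRegressor

# Hypothetical usage: train on a sample, then persist validation results.
x, y, _ = collect_features(data_set='train', size=100000, numeric_only=True)
params = {'n_estimators': 1200, 'learning_rate': 0.05}
model = LGBMRegressor(**params).fit(x, y)
write_validation_results_to_db(model, 'LGBM_1200',
                               params=str(params),
                               train_size=len(x))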
Example #5
def create_train_year(year: int):
    print(f'starting train for year {year}')
    execute_sql(f'drop table if exists train_{year}')

    execute_sql(f"""
    create unlogged table train_{year} partition of train
    for values from ('{year}-01-01') to ('{year+1}-01-01') 
    """)

    sql = f"""
    insert into train_{year} 
    select
        tmp_train.numeric_id
        ,tmp_train.date
        ,tmp_train.wday
        ,tmp_train.store_id
        ,tmp_train.dept_id
        ,tmp_train.state_id
        ,tmp_train.snap_status
        ,tmp_train.days_since_snap
        ,lags.avg_last_1
        ,lags.avg_last_3
        ,lags.avg_last_7
        ,lags.avg_last_21
        ,lags.max_last_21
        ,lags.std_last_21
        ,lags.max_last_42
        ,lags.avg_last_42
        ,tmp_train.relative_median
        ,avg_last_7 * tmp_train.relative_median as wa_adjusted_quantity_last_7
        ,tmp_train.relative_median as sinf_relative_median
        ,avg_last_7 * tmp_train.relative_median as sinf_adjusted_quantity_last_7
        ,tmp_train.price_change_w1
        ,tmp_train.price_change_w3
        ,state_average_last_7
        ,state_average_last_21
        ,weekday_last_1
        ,weekday_last_2
        ,weekday_last_8
        ,lags.target
        ,cum_errors.cum_mse
    from tmp_train_{year} as tmp_train
    inner join lags on tmp_train.date = lags.date and tmp_train.numeric_id = lags.numeric_id
    inner join weekday_lag on tmp_train.date = weekday_lag.date and tmp_train.numeric_id = weekday_lag.numeric_id
    inner join cum_errors on tmp_train.date = cum_errors.date and tmp_train.numeric_id = cum_errors.numeric_id
    """

    execute_sql(sql)

    execute_sql(f"create INDEX date_{year}_idx ON train_{year} (date)")
Example #6
def define_train():

    execute_sql('drop table if exists train')

    sql = """
    CREATE UNLOGGED TABLE train (
        numeric_id int
        ,date date
        ,wday smallint
        ,store_id text
        ,dept_id text
        ,state_id text
        ,snap_status boolean
        ,days_since_snap real
        ,avg_last_1 real
        ,avg_last_3 real
        ,avg_last_7 real
        ,avg_last_21 real
        ,max_last_21 real
        ,std_last_21 real
        ,max_last_42 real
        ,avg_last_42 real
        ,relative_median real
        ,wa_adjusted_quantity_last_7 real
        ,sinf_relative_median real
        ,sinf_adjusted_quantity_last_7 real
        ,price_change_w1 numeric
        ,price_change_w3 numeric
        ,state_average_last_7 real
        ,state_average_last_21 real
        ,weekday_last_1 real
        ,weekday_last_2 real
        ,weekday_last_8 real
        ,target smallint
        ,cum_mse real
    ) PARTITION BY RANGE (date);
    """

    execute_sql(sql)
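
Taken together with create_tmp_train and create_train_year, this gives a full rebuild pipeline: define the parent table once, then stage, fill, and clean up one partition per year. A sketch, with the year range again an assumption:

def rebuild_train(years=range(2011, 2017)):
    # Hypothetical orchestration of the functions shown above.
    define_train()                 # parent table, partitioned by date
    for year in years:
        create_tmp_train(year)     # stage the joined features for the year
        create_train_year(year)    # create and fill the year's partition
        drop_tmp_train(year)       # staging table is no longer needed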
Example #7
def import_sales():
    start = time.time()

    df = pd.read_csv(raw_data_folder() / 'sales_train_validation.csv')
    df.drop(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], inplace=True, axis='columns')
    # The file is wide (one column per day); melt it to long format with
    # one (id, day, quantity) row per observation.
    df2 = pd.melt(df, id_vars=['id'])

    execute_sql('drop view if exists sales_ext')
    execute_sql('drop table if exists sales_raw')
    execute_sql('drop table if exists sales')
    execute_sql_from_file('sales_raw')
    dataframe_to_table_bulk(df2, 'sales_raw')

    execute_sql_from_file('sales')
    execute_sql('drop table sales_raw')

    end = time.time()
    print(f'import_sales took {round(end - start)} seconds')
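
For reference, the pd.melt call is what turns the wide day columns into rows; a toy frame makes the reshape concrete:

import pandas as pd

wide = pd.DataFrame({'id': ['A', 'B'], 'd_1': [3, 0], 'd_2': [1, 2]})
long = pd.melt(wide, id_vars=['id'])
# long now has one row per (id, day):
#   id variable  value
#    A      d_1      3
#    B      d_1      0
#    A      d_2      1
#    B      d_2      2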
Example #8
,train.wday
,SQRT(AVG(POWER((train.target - validation.predicted),2))) as rmse
,avg(train.target - validation.predicted) as Error
from
train
inner join validation on train.date = validation.date and validation.numeric_id = train.numeric_id
where validation.model_name = 'LGBM_1200'
group by 1,2,3,4,5
order by 4,2 desc"""

df_long = dataframe_from_sql(sql)

#%%

from generics.postgres import execute_sql
execute_sql('delete from model_info')

#%%
from generics.postgres import dataframe_from_sql

sql = """
select 
validation.model_name
,calendar.weekday
,SQRT(AVG(POWER((target - validation.predicted),2))) as rmse
,avg(target - predicted) as Error
from
validation
inner join calendar on validation.date = calendar.date
group by 1,2
order by 1 desc"""
Example #9
def drop_tmp_train(year):
    print(f'dropping tmp_train for year {year}')
    execute_sql(f'drop table tmp_train_{year}')
Example #10
def import_calendar():
    df = pd.read_csv(raw_data_folder() / 'calendar.csv', parse_dates=['date'])
    execute_sql('drop table if exists calendar')
    execute_sql_from_file('calendar')
    dataframe_to_table_bulk(df, 'calendar')
Example #11
def init_db():
    execute_sql("create database optuna")
Example #12
import optuna
from catboost import CatBoostRegressor

from generics.postgres import dataframe_from_sql, execute_sql, execute_sql_from_file, create_sa_string

from datetime import datetime
from model_utilities import collect_features, write_validation_results_to_db, eval_lgbm_model

execute_sql_from_file('validation')
sql = "DELETE FROM validation where model_name = 'CatBoost'"
execute_sql(sql)

train_size = 800000

test_x, test_y, test_ids = collect_features(data_set='test',
                                            size=100000,
                                            numeric_only=False)
x, y, ids = collect_features(data_set='train',
                             size=train_size,
                             numeric_only=False)

params = dict(
    cat_features=['wday', 'dept_id', 'state_id', 'store_id', 'snap_status'],
    loss_function='RMSE',
    learning_rate=0.002,
    iterations=5000,
    random_strength=2,
    min_data_in_leaf=20)

model = CatBoostRegressor(verbose=True, **params)
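
The fragment stops before training; a plausible continuation, using the standard CatBoost fit API and the helpers imported above:

# Fit with the held-out split as an eval set, then persist the
# validation predictions via the helper defined earlier.
model.fit(x, y, eval_set=(test_x, test_y))
write_validation_results_to_db(model,
                               model_name='CatBoost',
                               params=str(params),
                               train_size=train_size)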