def create_tmp_train(year: int):
    print(f'starting tmp_train for year {year}')
    execute_sql(f'drop table if exists tmp_train_{year}')
    sql = f"""
    create unlogged table tmp_train_{year} as
    select se.numeric_id
        ,se.date
        ,se.wday
        ,se.store_id
        ,se.dept_id
        ,se.state_id
        ,si.snap_status
        ,si.days_since_snap
        ,wa.relative_median
        ,pc.price_change_w1
        ,pc.price_change_w3
        ,gp.state_average_last_7
        ,gp.state_average_last_21
    from sales_ext as se
    inner join price_changes as pc
        on se.store_id = pc.store_id
        and se.item_id = pc.item_id
        and se.wm_yr_wk = pc.wm_yr_wk
    inner join snap_info as si
        on se.state_id = si.state_id
        and si.date = se.date
    inner join weekday_average as wa
        on se.wday = wa.wday
        and wa.dept_id = se.dept_id
        and wa.store_id = se.store_id
    inner join snap_influence as sinf
        on se.dept_id = sinf.dept_id
        and se.store_id = sinf.store_id
        and sinf.snap_status = si.snap_status
    inner join grouped_lags as gp
        on se.state_id = gp.state_id
        and se.date = gp.date
        and se.item_id = gp.item_id
    where se.date between '{year}-01-01' and '{year}-12-31'
    """
    execute_sql(sql)
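# The database helpers used throughout (execute_sql, get_connection, ...) live
# in generics.postgres and are not shown in this section. A minimal sketch of
# what they might look like, assuming psycopg2 and an autocommit connection
# (the callers never commit explicitly); the real module and its connection
# settings may differ, and host/dbname below are placeholders:
import psycopg2

def get_connection():
    conn = psycopg2.connect(host='localhost', dbname='m5', user='postgres')
    conn.autocommit = True  # so DDL and COPY take effect without commit()
    return conn

def execute_sql(sql: str):
    conn = get_connection()
    with conn.cursor() as cur:
        cur.execute(sql)
    conn.close()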
def import_item_info():
    start = time.time()
    df = pd.read_csv(raw_data_folder() / 'sales_train_validation.csv')
    df = df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']]
    df.drop_duplicates(inplace=True)
    execute_sql('drop table if exists item_info')
    execute_sql_from_file('item_info')
    copy_from_options = {'columns': ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']}
    dataframe_to_table_bulk(df, 'item_info', copy_from_options=copy_from_options)
    end = time.time()
    print(f'import_item_info took {round(end - start)} seconds')
def import_sell_prices():
    start = time.time()
    execute_sql('drop table if exists prices')
    execute_sql_from_file('prices')
    conn = get_connection()
    cur = conn.cursor()
    with open(raw_data_folder() / 'sell_prices.csv', 'r') as f:
        f.readline()  # don't read the column names as a row in copy_from
        cur.copy_from(file=f, table='prices', sep=',')
    cur.close()
    conn.close()
    end = time.time()
    print(f'import_sell_prices took {round(end - start)} seconds')
def write_validation_results_to_db(model, model_name: str, params: str, train_size=np.nan,
                                   validation_size=VALIDATION_SIZE, numeric_only=False):
    execute_sql_from_file('validation')
    sql = f"DELETE FROM validation where model_name = '{model_name}'"
    execute_sql(sql)
    [val_x, val_y, ids] = collect_features(data_set='validation', size=validation_size,
                                           numeric_only=numeric_only)
    pred_values = model.predict(val_x)
    predictions = pd.DataFrame(data={
        'numeric_id': ids['numeric_id'],
        'predicted': pred_values,
        'date': ids['date'],
        'target': val_y,
        'cum_mse': ids['cum_mse']
    })
    predictions['model_name'] = model_name
    dataframe_to_table_bulk(
        df=predictions[['model_name', 'date', 'numeric_id', 'predicted', 'cum_mse', 'target']],
        table='validation')
    execute_sql_from_file('model_info')
    model_info = pd.DataFrame(data={
        'model_name': [model_name],
        'rmse': [get_rmse(model_name)],
        'created_at': [f"{datetime.now():%Y-%m-%d %H:%M}"],
        'params': [str(params)],
        'train_size': [train_size],
        'features': [", ".join(list(val_x))],
        'git_commit': [get_git_commit()]
    })
    dataframe_to_table(model_info, table='model_info')
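# get_rmse is called above but not defined in this section. A plausible sketch,
# assuming it reads back the rows just written to the validation table with the
# same SQRT(AVG(POWER(...))) formula used in the analysis queries below; the
# real helper may differ:
def get_rmse(model_name: str) -> float:
    sql = f"""
    select sqrt(avg(power(target - predicted, 2))) as rmse
    from validation
    where model_name = '{model_name}'
    """
    return dataframe_from_sql(sql)['rmse'].iloc[0]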
def create_train_year(year: int):
    print(f'starting train for year {year}')
    execute_sql(f'drop table if exists train_{year}')
    execute_sql(f"""
    create unlogged table train_{year} partition of train
    for values from ('{year}-01-01') to ('{year + 1}-01-01')
    """)
    sql = f"""
    insert into train_{year}
    select tmp_train.numeric_id
        ,tmp_train.date
        ,tmp_train.wday
        ,tmp_train.store_id
        ,tmp_train.dept_id
        ,tmp_train.state_id
        ,tmp_train.snap_status
        ,tmp_train.days_since_snap
        ,lags.avg_last_1
        ,lags.avg_last_3
        ,lags.avg_last_7
        ,lags.avg_last_21
        ,lags.max_last_21
        ,lags.std_last_21
        ,lags.max_last_42
        ,lags.avg_last_42
        ,tmp_train.relative_median
        ,avg_last_7 * tmp_train.relative_median as wa_adjusted_quantity_last_7
        ,tmp_train.relative_median as sinf_relative_median
        ,avg_last_7 * tmp_train.relative_median as sinf_adjusted_quantity_last_7
        ,tmp_train.price_change_w1
        ,tmp_train.price_change_w3
        ,state_average_last_7
        ,state_average_last_21
        ,weekday_last_1
        ,weekday_last_2
        ,weekday_last_8
        ,lags.target
        ,cum_errors.cum_mse
    from tmp_train_{year} as tmp_train
    inner join lags
        on tmp_train.date = lags.date
        and tmp_train.numeric_id = lags.numeric_id
    inner join weekday_lag
        on tmp_train.date = weekday_lag.date
        and tmp_train.numeric_id = weekday_lag.numeric_id
    inner join cum_errors
        on tmp_train.date = cum_errors.date
        and tmp_train.numeric_id = cum_errors.numeric_id
    """
    execute_sql(sql)
    execute_sql(f'create index date_{year}_idx on train_{year} (date)')
def define_train():
    execute_sql('drop table if exists train')
    sql = """
    CREATE UNLOGGED TABLE train (
        numeric_id int
        ,date date
        ,wday smallint
        ,store_id text
        ,dept_id text
        ,state_id text
        ,snap_status boolean
        ,days_since_snap real
        ,avg_last_1 real
        ,avg_last_3 real
        ,avg_last_7 real
        ,avg_last_21 real
        ,max_last_21 real
        ,std_last_21 smallint
        ,max_last_42 real
        ,avg_last_42 smallint
        ,relative_median real
        ,wa_adjusted_quantity_last_7 real
        ,sinf_relative_median real
        ,sinf_adjusted_quantity_last_7 real
        ,price_change_w1 numeric
        ,price_change_w3 numeric
        ,state_average_last_7 real
        ,state_average_last_21 real
        ,weekday_last_1 real
        ,weekday_last_2 real
        ,weekday_last_8 real
        ,target smallint
        ,cum_mse real
    ) PARTITION BY RANGE (date);
    """
    execute_sql(sql)
def import_sales():
    start = time.time()
    df = pd.read_csv(raw_data_folder() / 'sales_train_validation.csv')
    df.drop(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
            inplace=True, axis='columns')
    # reshape from wide (one column per day) to long (one row per id and day)
    df2 = pd.melt(df, id_vars=['id'])
    execute_sql('drop view if exists sales_ext')
    execute_sql('drop table if exists sales_raw')
    execute_sql('drop table if exists sales')
    execute_sql_from_file('sales_raw')
    dataframe_to_table_bulk(df2, 'sales_raw')
    execute_sql_from_file('sales')
    execute_sql('drop table sales_raw')
    end = time.time()
    print(f'import_sales took {round(end - start)} seconds')
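# A toy illustration of the pd.melt step above: the M5 sales file has one
# column per day (d_1, d_2, ...), and melt stacks those columns into a
# 'variable'/'value' pair, one row per (id, day):
import pandas as pd

wide = pd.DataFrame({'id': ['A', 'B'], 'd_1': [3, 0], 'd_2': [1, 2]})
long = pd.melt(wide, id_vars=['id'])
# long:
#   id variable  value
# 0  A      d_1      3
# 1  B      d_1      0
# 2  A      d_2      1
# 3  B      d_2      2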
    ,train.weekday
    ,SQRT(AVG(POWER((train.target - validation.predicted), 2))) as rmse
    ,avg(train.target - validation.predicted) as error
from train
inner join validation
    on train.date = validation.date
    and validation.id = train.id
where validation.model_name = 'LGBM_1200'
group by 1, 2, 3, 4, 5
order by 4, 2 desc"""
df_long = dataframe_from_sql(sql)

#%%
from generics.postgres import execute_sql
execute_sql('delete from model_info')

#%%
from generics.postgres import dataframe_from_sql
sql = """
select validation.model_name
    ,calendar.weekday
    ,SQRT(AVG(POWER((target - validation.predicted), 2))) as rmse
    ,avg(target - predicted) as error
from validation
inner join calendar
    on validation.date = calendar.date
group by 1, 2
order by 1 desc"""
def drop_tmp_train(year):
    print(f'dropping tmp_train for year {year}')
    execute_sql(f'drop table tmp_train_{year}')
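# How the pieces above compose: a hypothetical driver that rebuilds the
# partitioned train table one year at a time. The year range is an assumption
# (the M5 sales history runs roughly 2011-2016); adjust to the data at hand.
def build_train(first_year: int = 2011, last_year: int = 2016):
    define_train()
    for year in range(first_year, last_year + 1):
        create_tmp_train(year)   # denormalized staging table for one year
        create_train_year(year)  # join in lag features, attach as partition
        drop_tmp_train(year)     # staging table is no longer needed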
def import_calendar():
    df = pd.read_csv(raw_data_folder() / 'calendar.csv', parse_dates=['date'])
    execute_sql('drop table if exists calendar')
    execute_sql_from_file('calendar')
    dataframe_to_table_bulk(df, 'calendar')
def init_db():
    execute_sql("create database optuna")
import optuna
from catboost import CatBoostRegressor
from generics.postgres import dataframe_from_sql, execute_sql, execute_sql_from_file, create_sa_string
from datetime import datetime
from model_utilities import collect_features, write_validation_results_to_db, eval_lgbm_model

execute_sql_from_file('validation')
sql = "DELETE FROM validation where model_name = 'CatBoost'"
execute_sql(sql)

train_size = 800000
[test_x, test_y, ids] = collect_features(data_set='test', size=100000, numeric_only=False)
[x, y, ids] = collect_features(data_set='train', size=train_size, numeric_only=False)

params = dict(
    cat_features=['wday', 'dept_id', 'state_id', 'store_id', 'snap_status'],
    loss_function='RMSE',
    learning_rate=0.002,
    iterations=5000,
    random_strength=2,
    min_data_in_leaf=20)

model = CatBoostRegressor(verbose=True, **params)
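# The script ends after constructing the model. A likely continuation, fitting
# on the train split and persisting results with write_validation_results_to_db
# defined above; using the test split as a CatBoost eval_set here is an
# assumption, not something the source shows:
model.fit(x, y, eval_set=(test_x, test_y))
write_validation_results_to_db(model, model_name='CatBoost', params=params,
                               train_size=train_size)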