def __init__(self, sales):
    """Build flat (per-row) model arrays from the raw ``sales`` frame.

    Exactly one of two paths runs, selected by the module-level flag
    ``PROC_CSV_EXIST``:

    * ``not PROC_CSV_EXIST`` — one-shot preprocessing run: derive the
      scaled target and lag features, re-read an intermediate csv
      (``sales_processed_f9.csv``), add a monthly-median feature, write
      ``sales_processed_f10.csv`` and then TERMINATE the whole process
      via ``exit()`` below.  Nothing after the ``if`` runs on this path.
    * ``PROC_CSV_EXIST`` — load ``sales_processed_f10.csv``, rebuild the
      lag features, and populate ``self`` with masks and arrays.

    Relies on names defined elsewhere in the file (not visible here):
    ``LAGS``, ``PROC_CSV_EXIST``, ``START``, ``tqdm``, ``np``, ``pd``,
    ``gc``, ``LoadData``.

    NOTE(review): the day boundaries 1913 / 1941 / 1969 look like the
    M5 Forecasting competition splits — confirm against the data source.
    """
    global LAGS
    print("start process_data...")
    df_head = sales.head()  # kept only for debugger inspection; unused afterwards
    if not PROC_CSV_EXIST:
        # Scaled target: x = demand / scale1 (scale1 is computed upstream).
        sales["x"] = sales["demand"] / sales["scale1"]
        # Lagged copies of x, shifted within each 'id' group, are the features.
        self.FEATS = []
        for lag in tqdm(LAGS):
            sales[f"x_{lag}"] = sales.groupby("id")["x"].shift(lag)
            self.FEATS.append(f"x_{lag}")
        print(sales.shape)
        # Drop rows before each series' first active day (nb <= start).
        sales = sales.loc[sales.nb > sales.start]
        print(sales.shape)
        nb = sales['nb'].values
        MAX_LAG = max(LAGS)
        # Original author's note kept verbatim:
        # SORRY THIS IS FAKE VALIDATION. I DIDN'T THINK IT WOULD HAVE HAD
        # LIFTED UP MY SCORE LIKE THAT
        # (train and validation both end at day 1941, so they overlap)
        self.tr_mask = np.logical_and(nb > START + MAX_LAG, nb <= 1941)
        self.val_mask = np.logical_and(nb > 1913, nb <= 1941)
        self.te_mask = np.logical_and(nb > 1941, nb <= 1969)
        print('processing csv file...')
        # --- earlier feature builders, kept commented out by the author ---
        # def preprocess_sales_2(sales):
        #     months_unq = sales['month'].unique().tolist()
        #     years_unq = sales['year'].unique().tolist()
        #     weeks_unq = sales['wday'].unique().tolist()
        #
        #     # sales = sales.dropna(axis=0, subset=['x_28'])
        #     for i in years_unq:
        #         for y in months_unq:
        #             sales.loc[(sales['year'] == i) & (sales['month'] == y), 'x_28' + '_month_mean'] = \
        #                 sales.loc[(sales['month'] == y) & (sales['year'] == i)].groupby(['id'])['x_28'].transform(
        #                     lambda x: x.mean()).astype("float32")
        #             sales.loc[(sales['year'] == i) & (sales['month'] == y), 'x_28' + '_month_max'] = \
        #                 sales.loc[(sales['month'] == y) & (sales['year'] == i)].groupby(['id'])['x_28'].transform(
        #                     lambda x: x.max()).astype("float32")
        #             sales.loc[(sales['year'] == i) & (sales['month'] == y), 'x_28' + '_month_min'] = \
        #                 sales.loc[(sales['month'] == y) & (sales['year'] == i)].groupby(['id'])['x_28'].transform(
        #                     lambda x: x.min()).astype("float32")
        #             sales['x_28' + '_month_max_to_min_diff'] = (
        #                 sales['x_28' + '_month_max'] - sales['x_28' + '_month_min']).astype("float32")
        #
        #             for z in weeks_unq:
        #                 sales.loc[(sales['year'] == i) & (sales['month'] == y) & (
        #                     sales['wday'] == z), 'x_28' + '_wk_mean'] = \
        #                     sales.loc[(sales['month'] == y) & (sales['year'] == i) & (sales['wday'] == z)].groupby(
        #                         ['id'])['x_28'].transform(lambda x: x.mean()).astype("float32")
        #                 sales.loc[(sales['year'] == i) & (sales['month'] == y) & (
        #                     sales['wday'] == z), 'x_28' + '_wk_median'] = \
        #                     sales.loc[(sales['month'] == y) & (sales['year'] == i) & (sales['wday'] == z)].groupby(
        #                         ['id'])['x_28'].transform(lambda x: x.median()).astype("float32")
        #                 sales.loc[(sales['year'] == i) & (sales['month'] == y) & (
        #                     sales['wday'] == z), 'x_28' + '_wk_max'] = \
        #                     sales.loc[(sales['month'] == y) & (sales['year'] == i) & (sales['wday'] == z)].groupby(
        #                         ['id'])['x_28'].transform(lambda x: x.max()).astype("float32")
        #                 sales.loc[(sales['year'] == i) & (sales['month'] == y) & (
        #                     sales['wday'] == z), 'x_28' + '_wk_min'] = \
        #                     sales.loc[(sales['month'] == y) & (sales['year'] == i) & (sales['wday'] == z)].groupby(
        #                         ['id'])['x_28'].transform(lambda x: x.min()).astype("float32")
        #             sales['x_28' + '_wk_max_to_min_diff'] = (
        #                 sales['x_28' + '_wk_max'] - sales['x_28' + '_wk_min']).astype("float32")
        #     return sales
        #
        # sales = preprocess_sales_2(sales)
        # sales_f9 = pd.read_csv("../input/sales_processed_f9.csv", index_col=0)
        #
        # def preprocess_sales_3(df):
        #     months_unq = df['month'].unique().tolist()
        #     years_unq = df['year'].unique().tolist()
        #     weeks_unq = df['wday'].unique().tolist()
        #
        #     for i in years_unq:
        #         for y in months_unq:
        #             df.loc[(df['year'] == i) & (df['month'] == y), 'x_28' + '_month_var'] = \
        #                 df.loc[(df['month'] == y) & (df['year'] == i)].groupby(['id'])[
        #                     'x_28'].transform(lambda x: x.var()).astype("float32")
        #             for z in weeks_unq:
        #                 df.loc[(df['year'] == i) & (df['month'] == y) & (
        #                     df['wday'] == z), 'x_28' + '_wk_var'] = df.loc[
        #                     (df['month'] == y) & (df['year'] == i) & (df['wday'] == z)].groupby(
        #                     ['id'])['x_28'].transform(lambda x: x.var()).astype("float32")
        #     return df
        #
        # sales_f9 = preprocess_sales_3(sales_f9)
        # --- end of commented-out builders ---

        # NOTE(review): despite the name, this reads the *_f9* csv — the
        # frame prepared above is not what gets re-processed here.
        sales_f11 = pd.read_csv("../input/sales_processed_f9.csv", index_col=0)

        def preprocess_sales_3(df):
            # Adds per-(year, month, id) median of x_28 as 'x_28_month_median'.
            # Quadratic loop over (year, month) pairs with a groupby-transform
            # per pair; slow but only runs in this one-shot preprocessing path.
            print('In process 3')
            months_unq = df['month'].unique().tolist()
            years_unq = df['year'].unique().tolist()
            for i in years_unq:
                for y in months_unq:
                    df.loc[(df['year'] == i) & (df['month'] == y), 'x_28' + '_month_median'] = \
                        df.loc[(df['month'] == y) & (df['year'] == i)].groupby(['id'])[
                            'x_28'].transform(lambda x: x.median()).astype("float32")
            return df

        sales_f11 = preprocess_sales_3(sales_f11)
        # sales.to_csv("../input/sales_processed_f9.csv", index=True)
        # sales[LIST_OF_FEATURE].to_csv("../input/sales_processed_only_f9.csv", index=True)
        # sales_f9.to_csv("../input/sales_processed_f11.csv", index=True)
        sales_f11.to_csv("../input/sales_processed_f10.csv", index=True)
        print('csv file processed.')
        # Preprocessing-only run: kill the whole process here on purpose.
        exit()
        # sales_f15 = pd.read_csv("../input/sales_processed_f15.csv", index_col=0)
        # sales_f15 = sales_f15.drop('x_28_month_var', axis=1)
        # sales_f15.to_csv("../input/sales_processed_f15.csv", index=True)
    else:
        print('Reading csv file...')
        sales = pd.read_csv("../input/sales_processed_f10.csv", index_col=0)
        # Drop the raw lag columns that the aggregated x_28_* features replace.
        sales = sales.loc[:, ~sales.columns.isin(
            ['x_28', 'x_30', 'x_35', 'x_42', 'x_49', 'x_56', 'x_63'])]
        # sales = pd.read_csv("../input/sales_processed.csv", index_col=0)
        sales = LoadData.reduce_mem_usage(sales)
        gc.collect()
        print('Csv file opened.')
        df_head_proc = sales.head()  # debugger aid; unused afterwards
        # sales['CA_w'] = sales.loc[:]['snap_CA'] * sales.loc[:]['x_28_wk_mean']
        # sales['TX_w'] = sales.loc[:]['snap_TX'] * sales.loc[:]['x_28_wk_mean']
        # sales['WI_w'] = sales.loc[:]['snap_WI'] * sales.loc[:]['x_28_wk_mean']
        # Rebuild the lag features from the csv's 'x' column — duplicated from
        # the branch above because that branch terminates the process.
        self.FEATS = []
        for lag in tqdm(LAGS):
            sales[f"x_{lag}"] = sales.groupby("id")["x"].shift(lag)
            self.FEATS.append(f"x_{lag}")
        print(sales.shape)
        sales = sales.loc[sales.nb > sales.start]
        print(sales.shape)
        nb = sales['nb'].values
        MAX_LAG = max(LAGS)
        # SORRY THIS IS FAKE VALIDATION. I DIDN'T THINK IT WOULD HAVE HAD
        # LIFTED UP MY SCORE LIKE THAT
        # (same overlapping train/validation masks as the other branch)
        self.tr_mask = np.logical_and(nb > START + MAX_LAG, nb <= 1941)
        self.val_mask = np.logical_and(nb > 1913, nb <= 1941)
        self.te_mask = np.logical_and(nb > 1941, nb <= 1969)
    # print('#' * 40)
    # print("SALES:", sales.isnull().any())
    # self.scale2 = sales['scale1'].values
    # Flat per-row arrays (this version does NOT reshape per item).
    self.scale = sales['scale1'].values
    self.ids = sales['id'].values
    # y = sales['demand'].values
    # ys = y / scale
    # self.ys = sales[['x', 'sales1']].values
    self.ys = sales['x'].values  # target: scaled demand
    # feats_list = self.FEATS + LIST_OF_FEATURE
    self.feats_list = self.FEATS
    self.Z = sales[self.feats_list].values  # (rows, len(LAGS)) lag-feature matrix
    # self.Z = sales[self.FEATS].values.reshape((NITEMS, -1, len(self.FEATS)))
    print(self.scale.shape, self.ids.shape, self.ys.shape, self.Z.shape)
    self.sv = self.scale[self.val_mask]  # scales of validation rows
    self.se = self.scale[self.te_mask]  # scales of test rows
    self.ids = self.ids[self.te_mask]
    self.ids = self.ids.reshape((-1, 28))  # one 28-day test horizon per series
    # Per-row auxiliary/categorical columns, each kept as a (rows, 1) array.
    self.ca = sales[['snap_CA']].values
    self.tx = sales[['snap_TX']].values
    self.wi = sales[['snap_WI']].values
    self.wday = sales[['wday']].values
    self.month = sales[['month']].values
    self.year = sales[['year']].values
    self.event = sales[['event_name']].values
    self.nday = sales[['nday']].values
    self.item = sales[['item_id']].values
    self.dept = sales[['dept_id']].values
    self.cat = sales[['cat_id']].values
    self.store = sales[['store_id']].values
    self.state = sales[['state_id']].values
    # Aggregated x_28 statistics produced by the preprocessing csv.
    self.x_28_month_mean = sales[['x_28_month_mean']].values
    self.x_28_month_median = sales[['x_28_month_median']].values
    self.x_28_month_max = sales[['x_28_month_max']].values
    self.x_28_month_min = sales[['x_28_month_min']].values
    self.x_28_month_max_to_min_diff = sales[['x_28_month_max_to_min_diff']].values
    self.x_28_wk_mean = sales[['x_28_wk_mean']].values
    self.x_28_wk_median = sales[['x_28_wk_median']].values
    self.x_28_wk_max = sales[['x_28_wk_max']].values
    self.x_28_wk_min = sales[['x_28_wk_min']].values
    self.x_28_wk_max_to_min_diff = sales[['x_28_wk_max_to_min_diff']].values
def __init__(self, sales):
    """Build per-item (NITEMS, time, …) model arrays from ``sales``.

    Second variant of the data-processing constructor ("f9" pipeline).
    Path is selected by the module-level flag ``PROC_CSV_EXIST``:

    * ``not PROC_CSV_EXIST`` — compute the scaled target, lag features
      and the x_28 monthly/weekday aggregate statistics
      (``preprocess_sales_2``), then write ``sales_processed_f9.csv``
      (full) and ``sales_processed_only_f9.csv`` (feature subset).
      Unlike the other variant, this path does NOT exit afterwards.
    * ``PROC_CSV_EXIST`` — read ``sales_processed_f9.csv``, drop raw lag
      columns, shrink memory, and rebuild the lag features.

    Afterwards every array is reshaped to a per-item layout using the
    module-level constants ``NITEMS`` / ``LEN`` / ``CATCOLS``; this
    assumes every item contributes the same number of rows — TODO
    confirm, since no row filtering happens on the csv path here.

    Relies on names defined elsewhere in the file: ``LAGS``,
    ``PROC_CSV_EXIST``, ``LIST_OF_FEATURE``, ``NITEMS``, ``LEN``,
    ``CATCOLS``, ``tqdm``, ``np``, ``pd``, ``LoadData``.
    """
    global LAGS
    print("start process_data...")
    df_head = sales.head()  # kept only for debugger inspection; unused afterwards
    if not PROC_CSV_EXIST:
        # Scaled target: x = demand / scale1 (scale1 is computed upstream).
        sales["x"] = sales["demand"] / sales["scale1"]
        # Lagged copies of x, shifted within each 'id' group, are the features.
        self.FEATS = []
        for lag in tqdm(LAGS):
            sales[f"x_{lag}"] = sales.groupby("id")["x"].shift(lag)
            self.FEATS.append(f"x_{lag}")
        # Row filtering and mask construction disabled in this variant:
        # print(sales.shape)
        # sales = sales.loc[sales.nb > sales.start]
        # print(sales.shape)
        # nb = sales['nb'].values
        # MAX_LAG = max(LAGS)
        # tr_mask = np.logical_and(nb>START + MAX_LAG, nb<=1913)
        # SORRY THIS IS FAKE VALIDATION. I DIDN'T THINK IT WOULD HAVE HAD
        # LIFTED UP MY SCORE LIKE THAT
        # self.tr_mask = np.logical_and(nb > START + MAX_LAG, nb <= 1941)
        # self.val_mask = np.logical_and(nb > 1913, nb <= 1941)
        # self.te_mask = np.logical_and(nb > 1941, nb <= 1969)

        def preprocess_sales_2(sales):
            # For every (year, month) pair, add per-id statistics of the
            # 28-day lag feature x_28: monthly mean/max/min (+ max-min
            # spread), and per-weekday mean/median/max/min (+ spread).
            # O(years * months * wdays) groupby-transforms — slow, but this
            # runs only once to materialize the csv.
            months_unq = sales['month'].unique().tolist()
            years_unq = sales['year'].unique().tolist()
            weeks_unq = sales['wday'].unique().tolist()

            # sales = sales.dropna(axis=0, subset=['x_28'])
            for i in years_unq:
                for y in months_unq:
                    sales.loc[(sales['year'] == i) & (sales['month'] == y), 'x_28' + '_month_mean'] = \
                        sales.loc[(sales['month'] == y) & (sales['year'] == i)].groupby(['id'])['x_28'].transform(
                            lambda x: x.mean()).astype("float32")
                    sales.loc[(sales['year'] == i) & (sales['month'] == y), 'x_28' + '_month_max'] = \
                        sales.loc[(sales['month'] == y) & (sales['year'] == i)].groupby(['id'])['x_28'].transform(
                            lambda x: x.max()).astype("float32")
                    sales.loc[(sales['year'] == i) & (sales['month'] == y), 'x_28' + '_month_min'] = \
                        sales.loc[(sales['month'] == y) & (sales['year'] == i)].groupby(['id'])['x_28'].transform(
                            lambda x: x.min()).astype("float32")
                    # Whole-column spread; recomputed each iteration but
                    # idempotent once max/min are filled in.
                    sales['x_28' + '_month_max_to_min_diff'] = (
                        sales['x_28' + '_month_max'] - sales['x_28' + '_month_min']).astype("float32")

                    for z in weeks_unq:
                        sales.loc[(sales['year'] == i) & (sales['month'] == y) & (
                            sales['wday'] == z), 'x_28' + '_wk_mean'] = \
                            sales.loc[(sales['month'] == y) & (sales['year'] == i) & (sales['wday'] == z)].groupby(
                                ['id'])[
                                'x_28'].transform(lambda x: x.mean()).astype("float32")
                        sales.loc[(sales['year'] == i) & (sales['month'] == y) & (
                            sales['wday'] == z), 'x_28' + '_wk_median'] = \
                            sales.loc[(sales['month'] == y) & (sales['year'] == i) & (sales['wday'] == z)].groupby(
                                ['id'])[
                                'x_28'].transform(lambda x: x.median()).astype("float32")
                        sales.loc[(sales['year'] == i) & (sales['month'] == y) & (
                            sales['wday'] == z), 'x_28' + '_wk_max'] = \
                            sales.loc[(sales['month'] == y) & (sales['year'] == i) & (sales['wday'] == z)].groupby(
                                ['id'])[
                                'x_28'].transform(lambda x: x.max()).astype("float32")
                        sales.loc[(sales['year'] == i) & (sales['month'] == y) & (
                            sales['wday'] == z), 'x_28' + '_wk_min'] = \
                            sales.loc[(sales['month'] == y) & (sales['year'] == i) & (sales['wday'] == z)].groupby(
                                ['id'])[
                                'x_28'].transform(lambda x: x.min()).astype("float32")
                    sales['x_28' + '_wk_max_to_min_diff'] = (
                        sales['x_28' + '_wk_max'] - sales['x_28' + '_wk_min']).astype("float32")
            return sales

        sales = preprocess_sales_2(sales)
        # Persist both the full frame and the feature-only subset.
        sales.to_csv("../input/sales_processed_f9.csv", index=True)
        sales[LIST_OF_FEATURE].to_csv(
            "../input/sales_processed_only_f9.csv", index=True)
    else:
        sales = pd.read_csv("../input/sales_processed_f9.csv", index_col=0)
        # Drop the raw lag columns that the aggregated x_28_* features replace.
        sales = sales.loc[:, ~sales.columns.isin(
            ['x_28', 'x_30', 'x_35', 'x_42', 'x_49', 'x_56', 'x_63'])]
        # sales = pd.read_csv("../input/sales_processed.csv", index_col=0)
        sales = LoadData.reduce_mem_usage(sales)
        # Rebuild the lag features from the csv's 'x' column — duplicated
        # from the branch above, which only runs when the csv is absent.
        self.FEATS = []
        for lag in tqdm(LAGS):
            sales[f"x_{lag}"] = sales.groupby("id")["x"].shift(lag)
            self.FEATS.append(f"x_{lag}")
    # print('#' * 40)
    # print("SALES:", sales.isnull().any())
    # self.scale2 = sales['scale1'].values
    # Per-item layouts: rows regrouped as (NITEMS, timesteps, ...).
    self.scale = sales['scale1'].values.reshape((NITEMS, -1))
    self.ids = sales['id'].values.reshape((NITEMS, -1))
    # y = sales['demand'].values
    # ys = y / scale
    # Two targets per step: scaled demand 'x' and 'sales1'.
    self.ys = sales[['x', 'sales1']].values.reshape((NITEMS, -1, 2))
    # self.ys = sales['x'].values
    # feats_list = self.FEATS + LIST_OF_FEATURE
    feats_list = self.FEATS
    # Debug scaffolding comparing concatenated vs fancy-indexed features:
    # arr_feat = sales[self.FEATS].values
    # arr_feature = sales[LIST_OF_FEATURE].values
    # arr_merge = np.concatenate([arr_feat, arr_feature], 1)
    # z_merge = sales[feats_list].values
    # print(np.array_equal(arr_merge, z_merge))
    # print(arr_merge == z_merge)
    # print(np.all((arr_merge == z_merge) | (np.isnan(arr_merge) & np.isnan(z_merge))))
    # print(type(arr_merge[0][0]), arr_merge[0][0])
    # print(type(z_merge[0][0]), z_merge[0][0])
    self.Z = sales[feats_list].values.reshape(
        (NITEMS, -1, len(feats_list)))
    # self.Z = sales[self.FEATS].values.reshape((NITEMS, -1, len(self.FEATS)))
    print(self.scale.shape, self.ids.shape, self.ys.shape, self.Z.shape)
    # Fixed 28-day windows off the end of each series:
    # sv = validation-window scales (days LEN-56 .. LEN-28),
    # se = evaluation-window scales (last 28 days).
    self.sv = self.scale[:, LEN - 56:LEN - 28]
    self.se = self.scale[:, LEN - 28:LEN]
    # Mask-based slicing from the other variant, disabled here:
    # self.sv = self.scale[self.val_mask]
    # self.se = self.scale[self.te_mask]
    # self.ids = self.ids[self.te_mask]
    # self.ids = self.ids.reshape((-1, 28))
    #
    # Individual categorical arrays superseded by the stacked self.C below:
    # self.ca = sales[['snap_CA']].values
    # self.tx = sales[['snap_TX']].values
    # self.wi = sales[['snap_WI']].values
    # self.wday = sales[['wday']].values
    # self.month = sales[['month']].values
    # self.year = sales[['year']].values
    # self.event = sales[['event_name']].values
    # self.nday = sales[['nday']].values
    #
    # self.item = sales[['item_id']].values
    # self.dept = sales[['dept_id']].values
    # self.cat = sales[['cat_id']].values
    # self.store = sales[['store_id']].values
    # self.state = sales[['state_id']].values
    # All categorical columns stacked into one (NITEMS, timesteps, n_cats) array.
    self.C = sales[CATCOLS].values.reshape((NITEMS, -1, len(CATCOLS)))