def run(self):
    d = RawData()
    with timer("load calendar.csv"):
        d.calendar = pd.read_csv("./m5-forecasting-accuracy/calendar.csv").pipe(
            reduce_mem_usage
        )
    with timer("load sales_train_evaluation.csv"):
        # The evaluation file (d_1 .. d_1941) supersedes sales_train_validation.csv;
        # it is kept in the sales_train_validation slot so downstream tasks stay unchanged.
        d.sales_train_validation = pd.read_csv(
            "./m5-forecasting-accuracy/sales_train_evaluation.csv"
        ).pipe(reduce_mem_usage)
    # with timer("convert christmas data to rmean"):
    #     for d_str in d.calendar[d.calendar["date"].isin(events.christmas_dates)]["d"]:
    #         d_int = int(d_str.replace("d_", ""))
    #         d.sales_train_validation[d_str] = d.sales_train_validation[
    #             [f"d_{i}" for i in range(d_int - 15, d_int + 15) if i != d_int]
    #         ].apply(lambda row: row.mean(), axis=1)
    with timer("load sample_submission.csv"):
        d.sample_submission = pd.read_csv(
            "./m5-forecasting-accuracy/sample_submission.csv"
        ).pipe(reduce_mem_usage)
    with timer("load sell_prices.csv"):
        d.sell_prices = pd.read_csv(
            "./m5-forecasting-accuracy/sell_prices.csv"
        ).pipe(reduce_mem_usage)
    self.dump(d)
def run(self): with timer("combine features"): with timer("concat features"): data: pd.DataFrame = pd.concat( [ self.load("data"), self.load("fe_price_rolling"), self.load("fe_price_change"), self.load("fe_price_basic"), self.load("fe_shift"), self.load("fe_rolling_mean"), self.load("fe_rolling_dw_mean"), self.load("fe_rolling_group_mean"), self.load("fe_rolling_group_std"), self.load("fe_rolling_std"), self.load("fe_rolling_skew"), self.load("fe_rolling_kurt"), self.load("te_data"), self.load("fe_catch22_pca"), self.load("fe_weather"), self.load("fe_unemployment"), self.load("fe_stock"), self.load("fe_event"), self.load("fe_event_strength"), ], axis=1, ) sp_idx: SplitIndex = self.load("sp_idx") sp: Split = Split() sp.train = data.iloc[sp_idx.train, :] sp.test = data.iloc[sp_idx.test, :] print(sp.train.info()) self.dump(sp)
def run(self):
    config = Config()
    data: pd.DataFrame = pd.concat([self.load("data"), self.load("fe_event")], axis=1)
    train_df: pd.DataFrame = data[(data.d > config.START_DAY) & (data.d <= 1913)]
    # train_df = train_df.sample(int(len(train_df) * 0.15))
    with timer("create grouped df"):
        grouped: List[Tuple[List[str], pd.DataFrame]] = target_encoding(train_df)
    with timer("merge into data"):
        for group_key, grouped_df in tqdm(grouped):
            data = data.merge(grouped_df, on=group_key, how="left")
    df = reduce_mem_usage(data.filter(like="fe_te_"))
    print(df.info())
    self.dump(df)
def run(self): raw: RawData = self.load("raw") data: pd.DataFrame = self.load("data") raw.calendar["d"] = raw.calendar["d"].map( lambda d: int(d.replace("d_", ""))) raw.calendar["date_time"] = raw.calendar["date"] weather = read_weather_data() weather = weather[[ "date_time", "state_id", "fe_weather_mintempC", "fe_weather_maxtempC", "fe_weather_humidity", "fe_weather_sunHour", "fe_weather_cloudcover", ]] with timer("merge data"): data = data.merge(raw.calendar[["d", "date_time"]], on="d", how="left") data = data.merge(weather, on=["date_time", "state_id"], how="left") df = data.filter(like="fe_weather_") print(df.info()) self.dump(df)
def read_weather_data(external_data_path: str = "./external_data") -> pd.DataFrame:
    files: Dict[str, int] = {
        "californiaw.csv": 0,
        "texasw.csv": 1,
        "wisconsinw.csv": 2,
    }
    weather = pd.DataFrame()
    with timer("load weather data"):
        if os.path.exists(f"{external_data_path}/weather"):
            for file_name, state_id in files.items():
                _tmp_weather = pd.read_csv(f"{external_data_path}/weather/{file_name}")
                _tmp_weather["state_id"] = state_id
                _tmp_weather["date_time"] = pd.to_datetime(
                    _tmp_weather["date_time"]
                ).dt.strftime("%Y-%m-%d")
                weather = pd.concat([weather, _tmp_weather], axis=0)
                del _tmp_weather
    weather.columns = [
        f"fe_weather_{col}" if col not in ["date_time", "state_id"] else col
        for col in weather.columns
    ]
    print(weather.columns)
    return weather
def run(self): data: pd.DataFrame = self.load() with timer("make rolling_price_std_t7"): data["fe_rolling_price_std_t7"] = (data.groupby([ "id" ])["sell_price"].transform(lambda x: x.rolling(7).std()).astype( np.float16)) with timer("make rolling_price_std_t30"): data["fe_rolling_price_std_t30"] = (data.groupby([ "id" ])["sell_price"].transform(lambda x: x.rolling(30).std()).astype( np.float16)) df = data.filter(like="fe_rolling_price") print(df.info()) self.dump(df)
def run(self): with timer("combine val features"): with timer("concat features"): data: pd.DataFrame = pd.concat( [ self.load("data"), self.load("fe_price_rolling"), self.load("fe_price_change"), self.load("fe_price_basic"), self.load("fe_shift"), self.load("fe_rolling_mean"), self.load("fe_rolling_dw_mean"), self.load("fe_rolling_group_mean"), self.load("fe_rolling_group_std"), self.load("fe_rolling_std"), self.load("fe_rolling_skew"), self.load("fe_rolling_kurt"), self.load("fe_weather"), self.load("fe_unemployment"), self.load("fe_stock"), self.load("fe_event"), self.load("fe_event_strength"), self.load("fe_catch22_pca"), ], axis=1, ) with timer("merge target features"): config = Config() te_val_data: List[pd.DataFrame] = self.load("te_val_data") splits: List[Split] = [] sp_idxs: List[SplitIndex] = self.load("sp_idxs") for i in tqdm(range(len(sp_idxs))): sp: Split = Split() data = pd.concat([data, te_val_data[i]], axis=1) sp.train = data.iloc[sp_idxs[i].train, :] sp.test = data.iloc[sp_idxs[i].test, :] if config.CV_SAMPLE_RATE != 1: sp.train = sp.train.sample( int(len(sp.train) * config.CV_SAMPLE_RATE)) splits.append(sp) print(sp.train.info()) data = data.drop(list(data.filter(like="fe_te_").columns), axis=1) self.dump(splits)
def run(self): data: pd.DataFrame = self.load() with timer("make shift features"): for days in tqdm(list(range(5, 9)) + list(range(28, 43))): data[f"shift_t{days}"] = ( data.groupby(["id"])["sales"] .transform(lambda x: x.shift(days)) .astype(np.float16) ) df = data.filter(like="shift_t") print(df.info()) self.dump(df)
def run(self):
    config = Config()
    data: pd.DataFrame = pd.concat([self.load("data"), self.load("fe_event")], axis=1)
    dfs: List[pd.DataFrame] = []
    for end_day in config.CV_START_DAYS:
        with timer("create grouped df"):
            # train_df: pd.DataFrame = data[
            #     (data.d > config.START_DAY) & (data.d < end_day)
            # ]
            train_df: pd.DataFrame = data[data.d < end_day]
            grouped: List[Tuple[List[str], pd.DataFrame]] = target_encoding(train_df)
        with timer("merge into data"):
            df = data.copy()
            for group_key, grouped_df in tqdm(grouped):
                df = df.merge(grouped_df, on=group_key, how="left")
            df = reduce_mem_usage(df.filter(like="fe_te_"))
            print(df.info())
            dfs.append(df)
    self.dump(dfs)
def target_encoding(train_df: pd.DataFrame) -> List[Tuple[List[str], pd.DataFrame]]:
    group_keys = [
        ["item_id"],
        ["item_id", "tm_w"],
        ["item_id", "tm_dw"],
        ["dept_id", "tm_w"],
        ["cat_id", "tm_w"],
        ["store_id", "dept_id"],
        ["store_id", "dept_id", "tm_w"],
        ["store_id", "dept_id", "tm_m"],
        ["store_id", "tm_w"],
        ["store_id", "tm_m"],
        ["store_id", "tm_d"],
        ["store_id", "snap"],
        ["store_id", "snap", "tm_dw"],
        ["state_id", "item_id"],
        ["state_id", "item_id", "tm_dw"],
        ["state_id", "item_id", "tm_w"],
        ["state_id", "item_id", "tm_m"],
        ["state_id", "item_id", "snap"],
        ["state_id", "item_id", "snap", "tm_dw"],
        ["state_id", "item_id", "fe_event"],
        ["state_id", "item_id", "fe_event_dw"],
        ["store_id", "item_id"],
        ["store_id", "item_id", "tm_dw"],
        ["store_id", "item_id", "tm_w"],
        ["store_id", "item_id", "tm_m"],
        ["store_id", "item_id", "tm_d"],
        ["store_id", "item_id", "snap"],
        ["store_id", "item_id", "snap", "tm_dw"],
        ["store_id", "item_id", "fe_event"],
        ["store_id", "item_id", "fe_event_dw"],
    ]
    result: List[Tuple[List[str], pd.DataFrame]] = []
    methods = ["mean", "std"]
    with timer("target encoding"):
        for group_key in tqdm(group_keys):
            columns = []
            columns += group_key
            columns += [
                "fe_te_{}_{}".format("_".join(group_key), method) for method in methods
            ]
            tmp_df = (
                train_df[group_key + ["sales"]]
                .groupby(group_key)
                .agg({"sales": methods})
                .reset_index()
            )
            tmp_df.columns = columns
            tmp_df.reset_index(inplace=True, drop=True)
            result.append((group_key, tmp_df))
    return result
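# Illustration (comment only, not part of the pipeline): for group_key
# ["store_id", "tm_w"] the frame returned above has the columns
#     ["store_id", "tm_w", "fe_te_store_id_tm_w_mean", "fe_te_store_id_tm_w_std"]
# with one row per (store_id, tm_w) combination, which the calling tasks merge back
# onto the full frame with how="left".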
def run(self): data: pd.DataFrame = self.load() with timer("make rolling features"): for lag in [28]: for w_size in tqdm([30]): data[f"fe_rolling_kurt_t{lag}_{w_size}"] = ( data.groupby(["id"])["sales"] .transform(lambda x: x.shift(lag).rolling(w_size).kurt()) .astype(np.float16) ) df = data.filter(like="fe_rolling_kurt") print(df.info()) self.dump(df)
def run(self): data: pd.DataFrame = self.load() with timer("make rolling mean"): lag_wsize = [] for lag in [1, 14, 7, 28]: for w_size in [7, 30, 60, 90, 180]: lag_wsize.append([data[["id", "d", "sales"]], lag, w_size, "mean"]) data = pd.concat( [data, df_parallelize_run(make_lag_roll, lag_wsize)], axis=1 ) df = data.filter(like="fe_rolling_mean") print(df.info()) self.dump(df)
def run(self): data: pd.DataFrame = self.load() with timer("make price lag_1 features"): data["lag_price_t1"] = data.groupby( ["id"])["sell_price"].transform(lambda x: x.shift(1)) data["fe_price_change_t1"] = (data["lag_price_t1"] - data["sell_price"]) / ( data["lag_price_t1"]) data.drop("lag_price_t1", axis=1, inplace=True) with timer("make price lag_365 features"): data["rolling_price_max_t365"] = data.groupby([ "id" ])["sell_price"].transform(lambda x: x.shift(1).rolling(365).max()) data["fe_price_change_t365"] = (data["rolling_price_max_t365"] - data["sell_price"]) / ( data["rolling_price_max_t365"]) data.drop("rolling_price_max_t365", axis=1, inplace=True) df = data.filter(like="fe_price_change") print(df.info()) self.dump(df)
def cls_postprocessing(cv_num: int, test_pred: pd.DataFrame) -> pd.DataFrame:
    with timer("cls_postprocessing"):
        config = Config()
        df_val: pd.DataFrame = pickle.load(
            open(f"./output/cv_cls/{config.CLS_TIMESTAMP}/0/df_val.pkl", "rb")
        )
        test_pred["tmp_id"] = (
            test_pred["id"].astype(str) + "_" + test_pred["d"].astype(str)
        )
        df_val = df_val[df_val["sales_is_zero_pred"] >= config.CLS_THRESHOLD]
        tmp_ids = df_val["id"].astype(str) + "_" + df_val["d"].astype(str)
        # zero out predictions for rows the classifier flags as zero-sales
        test_pred.loc[test_pred["tmp_id"].isin(tmp_ids), "sales"] = 0
        test_pred.drop(["tmp_id"], axis=1, inplace=True)
    return test_pred
def run(self):
    raw: RawData = self.load("raw")
    data: pd.DataFrame = self.load("data")
    raw.calendar["d"] = raw.calendar["d"].map(lambda d: int(d.replace("d_", "")))
    unemployment = read_unemployment_data(date_range=raw.calendar[["date"]])
    with timer("merge data"):
        data = data.merge(raw.calendar[["d", "date"]], on="d", how="left").merge(
            unemployment, on=["date", "state_id"], how="left"
        )
    df = data.filter(like="fe_unemployment")
    print(df.info())
    self.dump(df)
def make_lag_roll(LAG_WSIZE: List[Any]):
    df: pd.DataFrame = LAG_WSIZE[0]
    lag = LAG_WSIZE[1]
    w_size = LAG_WSIZE[2]
    method: str = LAG_WSIZE[3]
    # group_ids: List[str] = df.drop(["id", "d", "sales"]).columns.tolist()
    print(lag, w_size, method)
    col_name: str = ""
    if method == "group_mean":
        pass
        # col_name = "fe_rolling_{}_mean_{}_{}".format("_".join(group_ids), lag, w_size)
        # with timer("create {}".format(col_name)):
        #     _tmp = df.groupby(["d"] + group_ids)["sales"].mean().reset_index()
        #     _tmp[col_name] = _tmp.groupby(group_ids)["sales"].transform(
        #         lambda x: x.shift(lag).rolling(w_size).mean()
        #     )
        #     _tmp.drop("sales", axis=1, inplace=True)
        #     df = df.merge(_tmp, on=["d"] + group_ids, how="left")
    else:
        col_name = f"fe_rolling_{method}_t{lag}_{w_size}"
        with timer("create {}".format(col_name)):
            if method == "mean":
                df[col_name] = (
                    df.groupby("id")["sales"]
                    .transform(lambda x: x.shift(lag).rolling(w_size).mean())
                    .astype(np.float16)
                )
            if method == "std":
                df[col_name] = (
                    df.groupby("id")["sales"]
                    .transform(lambda x: x.shift(lag).rolling(w_size).std())
                    .astype(np.float16)
                )
            if method == "dw_mean":
                df[col_name] = (
                    df.groupby(["id", "tm_dw"])["sales"]
                    .transform(lambda x: x.shift(lag).rolling(w_size).mean())
                    .astype(np.float16)
                )
    return df[[col_name]]
def read_unemployment_data(
    date_range: pd.DataFrame, external_data_path: str = "./external_data"
) -> pd.DataFrame:
    files: Dict[str, int] = {
        "CA.csv": 0,
        "TX.csv": 1,
        "WI.csv": 2,
    }
    unemployment: pd.DataFrame = pd.DataFrame()
    with timer("load unemployment data"):
        if os.path.exists(f"{external_data_path}/unemployment"):
            for file_name, state_id in files.items():
                _tmp_unemployment = pd.read_csv(
                    f"{external_data_path}/unemployment/{file_name}"
                )
                _tmp_unemployment["date"] = pd.to_datetime(
                    _tmp_unemployment["DATE"]
                ).dt.strftime("%Y-%m-%d")
                _tmp_unemployment.drop("DATE", axis=1, inplace=True)
                _tmp_unemployment.rename(
                    {"{}UR".format(file_name.replace(".csv", "")): "fe_unemployment"},
                    axis=1,
                    inplace=True,
                )
                # align the series to the daily calendar, then interpolate the gaps
                # between observations and back-fill the leading NaNs
                _tmp_unemployment = date_range.merge(
                    _tmp_unemployment, on="date", how="left"
                )
                _tmp_unemployment["fe_unemployment"] = _tmp_unemployment[
                    "fe_unemployment"
                ].interpolate()
                _tmp_unemployment["fe_unemployment"] = _tmp_unemployment[
                    "fe_unemployment"
                ].fillna(method="bfill")
                _tmp_unemployment["state_id"] = state_id
                unemployment = pd.concat([unemployment, _tmp_unemployment], axis=0)
                del _tmp_unemployment
    return unemployment
def train(
    cv_num: int,
    params: Dict[str, Any],
    train_set: lgb.Dataset,
    valid_sets: List[lgb.Dataset],
    verbose_eval: int,
    early_stopping_rounds: Optional[int] = None,
    model_number: Optional[int] = None,
) -> lgb.Booster:
    config = Config()
    timer_name: str = f"train CV_{cv_num}"
    if model_number:
        timer_name += f"_{model_number}"
    with timer(timer_name, mlflow_on=True):
        model = lgb.train(
            params,
            train_set,
            num_boost_round=config.num_boost_round,
            verbose_eval=verbose_eval,
            # early_stopping_rounds=early_stopping_rounds,
            valid_sets=valid_sets,
        )
    return model
def run(self):
    data: pd.DataFrame = self.load()
    groups = [
        ["item_id"],
        ["store_id"],
        ["state_id", "item_id"],
    ]
    with timer("make group std"):
        for group in tqdm(groups):
            for lag in tqdm([28]):
                for w_size in tqdm([7, 30, 180]):
                    col_name = "fe_rolling_{}_std_{}_{}".format(
                        "_".join(group), lag, w_size
                    )
                    _tmp = data.groupby(["d"] + group)["sales"].mean().reset_index()
                    _tmp[col_name] = _tmp.groupby(group)["sales"].transform(
                        lambda x: x.shift(lag).rolling(w_size).std()
                    )
                    _tmp.drop("sales", axis=1, inplace=True)
                    data = data.merge(_tmp, on=["d"] + group, how="left")
    df = data.filter(like="fe_rolling_")
    print(df.info())
    self.dump(df)
def run(self):
    raw: RawData = self.load("raw")
    data: pd.DataFrame = self.load("data")
    raw.calendar["d"] = raw.calendar["d"].map(lambda d: int(d.replace("d_", "")))
    df = pd.DataFrame()
    if os.path.isfile("./external_data/stock.csv"):
        stock = pd.read_csv("./external_data/stock.csv")
        stock.columns = ["date", "close_last", "volume", "open", "high", "low"]
        stock["date"] = pd.to_datetime(stock["date"]).dt.strftime("%Y-%m-%d")
        for col in ["close_last", "open", "high", "low"]:
            stock[col] = stock[col].map(lambda x: float(x.replace("$", "")))
        stock = stock[["date", "close_last", "volume"]]
        stock.columns = ["date", "fe_stock_price", "fe_stock_volume"]
        stock = raw.calendar[["date"]].merge(stock, on="date", how="left")
        stock["fe_stock_price"] = (
            stock["fe_stock_price"].fillna(method="ffill").fillna(method="bfill")
        )
        stock["fe_stock_volume"] = (
            stock["fe_stock_volume"].fillna(method="ffill").fillna(method="bfill")
        )
        with timer("merge data"):
            data = data.merge(raw.calendar[["d", "date"]], on="d", how="left").merge(
                stock, on="date", how="left"
            )
            df = data.filter(like="fe_stock")
    print(df.info())
    self.dump(df)
def train_cls(
    cv_num: int,
    params: Dict[str, Any],
    train_set: lgb.Dataset,
    valid_sets: List[lgb.Dataset],
    verbose_eval: int,
    early_stopping_rounds: Optional[int] = None,
    model_number: Optional[int] = None,
) -> LGBMClassifier:
    config = Config()
    timer_name: str = f"train CV_{cv_num}"
    if model_number:
        timer_name += f"_{model_number}"
    with timer(timer_name, mlflow_on=True):
        model = LGBMClassifier(**config.lgbm_cls_params)
        model.fit(
            train_set.data,
            train_set.label,
            categorical_feature=config.lgbm_cat_features,
            eval_set=[(dataset.data, dataset.label) for dataset in valid_sets],
            eval_metric="logloss,auc,cross_entropy",
            verbose=10,
        )
    return model
import sklearn.preprocessing
import sklearn.cluster
import sys
import os
import pandas as pd
from thunderbolt import Thunderbolt

sys.path.append(os.getcwd() + "/../..")
from kaggle_m5_forecasting.utils import timer

tb = Thunderbolt("./../../resource")
data: pd.DataFrame = tb.get_data("MakeData")
data = data[data.d < 1942]

# %%
with timer("calc grouped aggregates"):
    grouped = data.groupby(["id"])["sales"].agg({
        "mean": lambda x: x.dropna().values.mean(),
        "percentile25": lambda x: x.dropna().sort_values()[: int(len(x) * 0.25)].mean(),
        "percentile50": lambda x: x.dropna().sort_values()[int(len(x) * 0.25): int(len(x) * 0.5)].mean(),
        "percentile75": lambda x: x.dropna().sort_values()[int(len(x) * 0.5): int(len(x) * 0.75)].mean(),
        "percentile100": lambda x: x.dropna().sort_values()[int(len(x) * 0.75):].mean(),
        "std": lambda x: x.dropna().values.std(),
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from thunderbolt import Thunderbolt
import sys
import os

sys.path.append(os.getcwd() + "/../..")
from kaggle_m5_forecasting.utils import timer

tb = Thunderbolt("./../../resource")
data: pd.DataFrame = tb.get_data("MakeData")

# %%
with timer("calc rolling_store_id_cat_id_mean"):
    lag = 28
    w_size = 30
    data["fe_rolling_store_id_cat_id_mean"] = data.groupby(
        ["store_id", "cat_id"]
    )["sales"].transform(lambda x: x.shift(lag).rolling(w_size).mean())

# %%
tb = Thunderbolt("./../../resource")
tb.get_data("FERollingGroupMean")
def run(self):
    raw: RawData = self.load()
    id_vars = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
    with timer("melt sales_train_validation"):
        data: pd.DataFrame = pd.melt(
            raw.sales_train_validation,
            id_vars=id_vars,
            var_name="d",
            value_name="sales",
        )
        print_mem_usage(data)
    with timer("add test data"):
        add_df = pd.DataFrame()
        for i in tqdm(range(1, 29)):
            tmp_df = raw.sales_train_validation[id_vars].drop_duplicates()
            tmp_df["d"] = f"d_{1941 + i}"
            tmp_df["sales"] = np.nan
            add_df = pd.concat([add_df, tmp_df])
        data = pd.concat([data, add_df]).reset_index(drop=True)
        del add_df
        print_mem_usage(data)
    with timer("str to category"):
        for col in tqdm(id_vars):
            data[col] = data[col].astype("category")
        print_mem_usage(data)
    with timer("merge release"):
        data = merge_by_concat(
            data,
            raw.sell_prices.groupby(["store_id", "item_id"])["wm_yr_wk"]
            .agg(release=np.min)
            .reset_index(),
            ["store_id", "item_id"],
        )
        print_mem_usage(data)
    with timer("merge wm_yr_wk"):
        data = merge_by_concat(data, raw.calendar[["wm_yr_wk", "d"]], ["d"])
        print_mem_usage(data)
    with timer("cutoff data before release"):
        data = data[data["wm_yr_wk"] >= data["release"]].reset_index(drop=True)
        print_mem_usage(data)
    reduce_mem_usage(data)
    with timer("make calendar events"):
        # compare against the full date string so each year's Black Friday matches
        raw.calendar["cal_blackfriday"] = (
            raw.calendar["date"].isin(
                [
                    "2011-11-25",
                    "2012-11-23",
                    "2013-11-29",
                    "2014-11-28",
                    "2015-11-27",
                ]
            )
        ).astype(np.int8)
        raw.calendar.loc[
            raw.calendar["cal_blackfriday"] == 1, "event_name_1"
        ] = "BlackFriday"
        raw.calendar.loc[
            raw.calendar["cal_blackfriday"] == 1, "event_type_1"
        ] = "other"
    with timer("merge calendar"):
        icols = [
            "event_name_1",
            "event_type_1",
            "event_name_2",
            "event_type_2",
            "snap_CA",
            "snap_TX",
            "snap_WI",
        ]
        data = data.merge(
            raw.calendar.drop(
                ["wm_yr_wk", "weekday", "wday", "month", "year", "cal_blackfriday"],
                axis=1,
            ),
            on=["d"],
            how="left",
        )
        for col in tqdm(icols):
            data[col].fillna("unknown", inplace=True)
            data[col] = data[col].astype("category")
        data["date"] = pd.to_datetime(data["date"])
        print_mem_usage(data)
    with timer("make snap"):
        data["snap"] = 0
        data.loc[(data.snap_CA == 1) & (data.state_id == "CA"), "snap"] = 1
        data.loc[(data.snap_TX == 1) & (data.state_id == "TX"), "snap"] = 1
        data.loc[(data.snap_WI == 1) & (data.state_id == "WI"), "snap"] = 1
    with timer("make some features from date"):
        data["tm_d"] = data["date"].dt.day.astype(np.int8)
        data["tm_w"] = data["date"].dt.week.astype(np.int8)
        data["tm_m"] = data["date"].dt.month.astype(np.int8)
        data["tm_y"] = data["date"].dt.year
        data["tm_quarter"] = data["date"].dt.quarter.astype(np.int8)
        data["tm_y"] = (data["tm_y"] - data["tm_y"].min()).astype(np.int8)
        data["tm_wm"] = data["tm_d"].apply(lambda x: np.ceil(x / 7)).astype(np.int8)
        data["tm_dw"] = data["date"].dt.dayofweek.astype(np.int8)
        data["tm_w_end"] = (data["tm_dw"] >= 5).astype(np.int8)
        # data["tm_moon_phase"] = (
        #     data["date"].map(lambda d: get_moon_phase(d)).astype(np.int8)
        # )
        data.loc[data["event_type_1"] == "National", "tm_w_end"] = 1
        del data["date"]
        print_mem_usage(data)
    with timer("merge sell_prices"):
        data = data.merge(
            raw.sell_prices, on=["store_id", "item_id", "wm_yr_wk"], how="left"
        )
    with timer("convert 'd' to int"):
        data["d"] = data["d"].apply(lambda x: x[2:]).astype(np.int16)
    data["sales_is_zero"] = (data["sales"] == 0).astype(np.int8)
    print_mem_usage(data)
    with timer("label encoding"):
        cat_encoders: Dict[str, sklearn.preprocessing.LabelEncoder] = {}
        cat_features: List[str] = [
            "item_id",
            "dept_id",
            "cat_id",
            "store_id",
            "state_id",
            "event_name_1",
            "event_type_1",
            "event_name_2",
            "event_type_2",
        ]
        for feature in tqdm(cat_features):
            encoder = sklearn.preprocessing.LabelEncoder()
            encoder.fit(data[feature])
            data[feature] = encoder.transform(data[feature])
            cat_encoders[feature] = encoder
        pickle.dump(cat_encoders, open("./cat_encoders.pkl", "wb"))
    print(data.info())
    self.dump(data)
def run(self):
    data: pd.DataFrame = self.load()
    data = data[data.d < 1942]
    with timer("calc grouped aggregates"):
        catch22_df = data.groupby(["id"])["sales"].agg(
            mean=lambda x: x.dropna().values.mean(),
            percentile25=lambda x: x.dropna().sort_values()[: int(len(x) * 0.25)].mean(),
            percentile50=lambda x: x.dropna().sort_values()[int(len(x) * 0.25): int(len(x) * 0.5)].mean(),
            percentile75=lambda x: x.dropna().sort_values()[int(len(x) * 0.5): int(len(x) * 0.75)].mean(),
            percentile100=lambda x: x.dropna().sort_values()[int(len(x) * 0.75):].mean(),
            std=lambda x: x.dropna().values.std(),
            CO_Embed2_Dist_tau_d_expfit_meandiff=lambda x: catch22.CO_Embed2_Dist_tau_d_expfit_meandiff(x.dropna().tolist()),
            CO_f1ecac=lambda x: catch22.CO_f1ecac(x.dropna().tolist()),
            CO_FirstMin_ac=lambda x: catch22.CO_FirstMin_ac(x.dropna().tolist()),
            CO_HistogramAMI_even_2_5=lambda x: catch22.CO_HistogramAMI_even_2_5(x.dropna().tolist()),
            CO_trev_1_num=lambda x: catch22.CO_trev_1_num(x.dropna().tolist()),
            DN_HistogramMode_10=lambda x: catch22.DN_HistogramMode_10(x.dropna().tolist()),
            DN_HistogramMode_5=lambda x: catch22.DN_HistogramMode_5(x.dropna().tolist()),
            DN_OutlierInclude_n_001_mdrmd=lambda x: catch22.DN_OutlierInclude_n_001_mdrmd(x.dropna().tolist()),
            DN_OutlierInclude_p_001_mdrmd=lambda x: catch22.DN_OutlierInclude_p_001_mdrmd(x.dropna().tolist()),
            FC_LocalSimple_mean1_tauresrat=lambda x: catch22.FC_LocalSimple_mean1_tauresrat(x.dropna().tolist()),
            FC_LocalSimple_mean3_stderr=lambda x: catch22.FC_LocalSimple_mean3_stderr(x.dropna().tolist()),
            IN_AutoMutualInfoStats_40_gaussian_fmmi=lambda x: catch22.IN_AutoMutualInfoStats_40_gaussian_fmmi(x.dropna().tolist()),
            MD_hrv_classic_pnn40=lambda x: catch22.MD_hrv_classic_pnn40(x.dropna().tolist()),
            PD_PeriodicityWang_th0_01=lambda x: catch22.PD_PeriodicityWang_th0_01(x.dropna().tolist()),
            SB_BinaryStats_diff_longstretch0=lambda x: catch22.SB_BinaryStats_diff_longstretch0(x.dropna().tolist()),
            SB_BinaryStats_mean_longstretch1=lambda x: catch22.SB_BinaryStats_mean_longstretch1(x.dropna().tolist()),
            SB_MotifThree_quantile_hh=lambda x: catch22.SB_MotifThree_quantile_hh(x.dropna().tolist()),
            SB_TransitionMatrix_3ac_sumdiagcov=lambda x: catch22.SB_TransitionMatrix_3ac_sumdiagcov(x.dropna().tolist()),
            SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1=lambda x: catch22.SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1(x.dropna().tolist()),
            SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1=lambda x: catch22.SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1(x.dropna().tolist()),
            SP_Summaries_welch_rect_area_5_1=lambda x: catch22.SP_Summaries_welch_rect_area_5_1(x.dropna().tolist()),
            SP_Summaries_welch_rect_centroid=lambda x: catch22.SP_Summaries_welch_rect_centroid(x.dropna().tolist()),
        )
    print(catch22_df.info())
    self.dump(catch22_df)
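# NOTE: the "fe_catch22_pca" features loaded by the combine tasks are presumably a
# dimensionality reduction of the per-id frame produced above. The function below is a
# hypothetical sketch, assuming standard scaling followed by sklearn PCA; the component
# count default of 5 is illustrative, not the project's setting.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def _catch22_pca_sketch(catch22_df: pd.DataFrame, n_components: int = 5) -> pd.DataFrame:
    scaled = StandardScaler().fit_transform(catch22_df.fillna(0))
    components = PCA(n_components=n_components).fit_transform(scaled)
    return pd.DataFrame(
        components,
        index=catch22_df.index,
        columns=[f"fe_catch22_pca_{i}" for i in range(n_components)],
    )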
def run(self): data: pd.DataFrame = self.load("data") raw: RawData = self.load("raw") prices_df: pd.DataFrame = raw.sell_prices.copy() with timer("basic price aggregations"): prices_df["fe_price_max"] = prices_df.groupby( ["store_id", "item_id"])["sell_price"].transform(np.max) prices_df["fe_price_min"] = prices_df.groupby( ["store_id", "item_id"])["sell_price"].transform(np.min) prices_df["fe_price_std"] = prices_df.groupby( ["store_id", "item_id"])["sell_price"].transform(np.std) prices_df["fe_price_mean"] = prices_df.groupby( ["store_id", "item_id"])["sell_price"].transform(np.mean) prices_df["fe_price_discount"] = (prices_df["fe_price_mean"] - prices_df["sell_price"]) prices_df["fe_price_discount_rate"] = ( prices_df["fe_price_discount"] / prices_df["fe_price_mean"]) prices_df["fe_price_skew"] = prices_df.groupby( ["store_id", "item_id"])["sell_price"].transform(lambda x: x.skew()) prices_df["fe_price_kurt"] = prices_df.groupby( ["store_id", "item_id"])["sell_price"].transform(lambda x: x.kurt()) prices_df["fe_price_norm"] = (prices_df["sell_price"] / prices_df["fe_price_max"]) prices_df["fe_price_nunique"] = prices_df.groupby( ["store_id", "item_id"])["sell_price"].transform("nunique") prices_df["fe_price_item_nunique"] = prices_df.groupby( ["store_id", "sell_price"])["item_id"].transform("nunique") prices_df = prices_df.merge( raw.calendar[["wm_yr_wk", "month", "year"]].drop_duplicates(subset=["wm_yr_wk"]), on=["wm_yr_wk"], how="left", ) with timer("calc price momentum"): prices_df["fe_price_momentum"] = prices_df[ "sell_price"] / prices_df.groupby([ "store_id", "item_id" ])["sell_price"].transform(lambda x: x.shift(1)) prices_df["fe_price_momentum_m"] = prices_df[ "sell_price"] / prices_df.groupby([ "store_id", "item_id", "month" ])["sell_price"].transform("mean") prices_df["fe_price_momentum_y"] = prices_df[ "sell_price"] / prices_df.groupby([ "store_id", "item_id", "year" ])["sell_price"].transform("mean") del prices_df["month"], prices_df["year"] with timer("merge prices_df"): cat_encoders: Dict[ str, sklearn.preprocessing.LabelEncoder] = pickle.load( open("./cat_encoders.pkl", "rb")) for col in ["store_id", "item_id"]: prices_df[col] = cat_encoders[col].transform(prices_df[col]) data = data.merge(prices_df, on=["store_id", "item_id", "wm_yr_wk"], how="left") df = data.filter(like="fe_price") df = reduce_mem_usage(df) print(df.info()) self.dump(df)
def target_encoding_catch22(
    train_df: pd.DataFrame,
) -> List[Tuple[List[str], pd.DataFrame]]:
    group_keys = [
        ["store_id", "item_id"],
    ]
    result: List[Tuple[List[str], pd.DataFrame]] = []
    with timer("target encoding"):
        for group_key in tqdm(group_keys):
            with timer("{} te".format(str(group_key))):
                tmp_df = train_df.groupby(group_key)["sales"].agg(
                    fe_te_CO_Embed2_Dist_tau_d_expfit_meandiff=lambda x: catch22.CO_Embed2_Dist_tau_d_expfit_meandiff(x.tolist()),
                    fe_te_CO_f1ecac=lambda x: catch22.CO_f1ecac(x.tolist()),
                    fe_te_CO_FirstMin_ac=lambda x: catch22.CO_FirstMin_ac(x.tolist()),
                    fe_te_CO_HistogramAMI_even_2_5=lambda x: catch22.CO_HistogramAMI_even_2_5(x.tolist()),
                    fe_te_CO_trev_1_num=lambda x: catch22.CO_trev_1_num(x.tolist()),
                    fe_te_DN_HistogramMode_10=lambda x: catch22.DN_HistogramMode_10(x.tolist()),
                    fe_te_DN_HistogramMode_5=lambda x: catch22.DN_HistogramMode_5(x.tolist()),
                    fe_te_DN_OutlierInclude_n_001_mdrmd=lambda x: catch22.DN_OutlierInclude_n_001_mdrmd(x.tolist()),
                    fe_te_DN_OutlierInclude_p_001_mdrmd=lambda x: catch22.DN_OutlierInclude_p_001_mdrmd(x.tolist()),
                    fe_te_FC_LocalSimple_mean1_tauresrat=lambda x: catch22.FC_LocalSimple_mean1_tauresrat(x.tolist()),
                    fe_te_FC_LocalSimple_mean3_stderr=lambda x: catch22.FC_LocalSimple_mean3_stderr(x.tolist()),
                    fe_te_IN_AutoMutualInfoStats_40_gaussian_fmmi=lambda x: catch22.IN_AutoMutualInfoStats_40_gaussian_fmmi(x.tolist()),
                    fe_te_MD_hrv_classic_pnn40=lambda x: catch22.MD_hrv_classic_pnn40(x.tolist()),
                    fe_te_PD_PeriodicityWang_th0_01=lambda x: catch22.PD_PeriodicityWang_th0_01(x.tolist()),
                    fe_te_SB_BinaryStats_diff_longstretch0=lambda x: catch22.SB_BinaryStats_diff_longstretch0(x.tolist()),
                    fe_te_SB_BinaryStats_mean_longstretch1=lambda x: catch22.SB_BinaryStats_mean_longstretch1(x.tolist()),
                    fe_te_SB_MotifThree_quantile_hh=lambda x: catch22.SB_MotifThree_quantile_hh(x.tolist()),
                    fe_te_SB_TransitionMatrix_3ac_sumdiagcov=lambda x: catch22.SB_TransitionMatrix_3ac_sumdiagcov(x.tolist()),
                    fe_te_SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1=lambda x: catch22.SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1(x.tolist()),
                    fe_te_SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1=lambda x: catch22.SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1(x.tolist()),
                    fe_te_SP_Summaries_welch_rect_area_5_1=lambda x: catch22.SP_Summaries_welch_rect_area_5_1(x.tolist()),
                    fe_te_SP_Summaries_welch_rect_centroid=lambda x: catch22.SP_Summaries_welch_rect_centroid(x.tolist()),
                )
                # keep the group-key columns so the frame can be merged back on group_key
                tmp_df.reset_index(inplace=True)
                result.append((group_key, tmp_df))
    return result