def run(self):
    required_columns = {self.index_columns, self.target_column}
    dataset = self.load_data_frame(required_columns=required_columns,
                                   drop_columns=True)
    dataset = dataset.set_index(self.index_columns)
    dataset = reduce_mem_usage(dataset)
    self.dump(dataset)
def run(self):
    required_columns = {self.index_columns, self.target_column, "target"}
    dataset = self.load_data_frame(required_columns=required_columns,
                                   drop_columns=True)
    dataset = dataset.set_index(self.index_columns)
    train = dataset.loc[dataset[self.predict_column].notna(), self.target_column]
    test = dataset.loc[dataset[self.predict_column].isna(), self.target_column]
    categories = train.dropna().unique()
    train_dummied = pd.get_dummies(
        pd.Categorical(train, categories),
        prefix="OHE_" + self.target_column,
        dummy_na=True,
    )
    train_dummied.index = train.index
    test_dummied = pd.get_dummies(
        pd.Categorical(test, categories),
        prefix="OHE_" + self.target_column,
        dummy_na=True,
    )
    test_dummied.index = test.index
    result = reduce_mem_usage(
        pd.concat([train_dummied, test_dummied]).sort_index())
    self.dump(result)
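# A minimal standalone sketch (not part of the task code) of why the categories are
# fixed from the train split above: passing an explicit category list to pd.Categorical
# keeps the dummy columns of train and test aligned, and levels that appear only in the
# test split fall into the NaN dummy instead of creating new columns. The column and
# value names below are made up for illustration.
import pandas as pd

train_vals = pd.Series(["red", "blue", "red", None])
test_vals = pd.Series(["blue", "green", None])      # "green" never seen in train
categories = train_vals.dropna().unique()

train_ohe = pd.get_dummies(pd.Categorical(train_vals, categories),
                           prefix="OHE_color", dummy_na=True)
test_ohe = pd.get_dummies(pd.Categorical(test_vals, categories),
                          prefix="OHE_color", dummy_na=True)
# Both frames expose exactly the same dummy columns, in the same order.
assert list(train_ohe.columns) == list(test_ohe.columns)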
def run(self):
    required_columns = {self.index_columns, "month"}
    dataset = self.load_data_frame(required_columns=required_columns,
                                   drop_columns=True)
    dataset = dataset.set_index(self.index_columns)
    dataset["month_sin"] = np.sin(2 * np.pi * dataset["month"] / 12)
    dataset["month_cos"] = np.cos(2 * np.pi * dataset["month"] / 12)
    dataset = dataset[["month_sin", "month_cos"]].fillna(-10)
    dataset = reduce_mem_usage(dataset)
    self.dump(dataset)
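# A minimal sketch (not from the task code) of what the sin/cos transform above buys:
# months that are adjacent on the calendar stay adjacent in feature space, so
# December (12) and January (1) end up close together instead of 11 units apart.
import numpy as np

months = np.array([1, 6, 12])
month_sin = np.sin(2 * np.pi * months / 12)
month_cos = np.cos(2 * np.pi * months / 12)
# Distance between month 12 and month 1 in (sin, cos) space:
dist_12_1 = np.hypot(month_sin[2] - month_sin[0], month_cos[2] - month_cos[0])
print(round(float(dist_12_1), 3))  # ~0.518, much closer than the raw gap of 11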
def run(self):
    required_columns = {self.index_columns, "ord_0"}
    dataset = self.load_data_frame(required_columns=required_columns,
                                   drop_columns=True)
    dataset = dataset.set_index(self.index_columns)
    dataset = dataset.fillna(0)
    dataset["ord_0"] = dataset["ord_0"].astype(int)
    dataset = dataset.rename(columns={"ord_0": "Ordinary_ord_0"})
    dataset = reduce_mem_usage(dataset)
    self.dump(dataset)
def run(self):
    required_columns = {self.index_columns, self.target_column}
    dataset = self.load_data_frame(required_columns=required_columns,
                                   drop_columns=True)
    dataset = dataset.set_index(self.index_columns)
    dataset[self.target_column] = dataset[self.target_column].map(
        self.ordinary_map)
    dataset = dataset.fillna(0)
    dataset[self.target_column] = dataset[self.target_column].astype(int)
    dataset = dataset.rename(
        columns={self.target_column: "Ordinary_" + self.target_column})
    dataset = reduce_mem_usage(dataset)
    self.dump(dataset)
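# A standalone sketch of the kind of mapping self.ordinary_map is expected to hold; the
# dictionary below is hypothetical, chosen only to show that .map() turns an ordered
# string category into its rank and leaves unmapped or missing values as NaN, which the
# task code then fills with 0.
import pandas as pd

ordinary_map = {"Novice": 1, "Contributor": 2, "Expert": 3, "Master": 4, "Grandmaster": 5}
col = pd.Series(["Novice", "Master", "Wizard", None])   # "Wizard" is not in the map
encoded = col.map(ordinary_map).fillna(0).astype(int)
print(encoded.tolist())  # [1, 4, 0, 0]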
def set_index(self, data: pd.DataFrame) -> pd.DataFrame:
    """Set the index on rows after to_history_date and return a memory-reduced DataFrame.

    Args:
        data (pd.DataFrame): feature DataFrame

    Returns:
        pd.DataFrame:
    """
    data = data.query(f"d > {self.to_history_date}")
    data = data.set_index(self.index_columns)
    data = reduce_mem_usage(data)
    return data
def run(self):
    required_columns = {self.index_columns, "ord_5"}
    dataset = self.load_data_frame(required_columns=required_columns,
                                   drop_columns=True)
    dataset = dataset.set_index(self.index_columns)
    map_ord5 = {
        key: value + 1
        for value, key in enumerate(sorted(dataset["ord_5"].dropna().unique()))
    }
    dataset["Ordinary_ord_5"] = dataset["ord_5"].map(map_ord5)
    dataset = dataset[["Ordinary_ord_5"]]
    dataset = dataset.fillna(0).astype(int)
    dataset = reduce_mem_usage(dataset)
    self.dump(dataset)
def run(self):
    required_columns = {self.index_columns, self.target_column}
    dataset = self.load_data_frame(required_columns=required_columns,
                                   drop_columns=True)
    dataset = dataset.set_index(self.index_columns)
    # Fill missing values before casting to str so NaN becomes its own "nan" category.
    dataset[self.target_column] = (
        dataset[self.target_column].fillna("nan").astype(str))
    encoder = LabelEncoder()
    dataset[self.target_column] = encoder.fit_transform(
        dataset[self.target_column])
    dataset = dataset.rename(
        columns={
            self.target_column: "BinaryCategorical_" + self.target_column
        })
    dataset = reduce_mem_usage(dataset)
    self.dump(dataset)
def run(self):
    required_columns = {self.index_columns, self.target_column, "target"}
    dataset = self.load_data_frame(required_columns=required_columns,
                                   drop_columns=True)
    dataset = dataset.set_index(self.index_columns)
    train = dataset[dataset[self.predict_column].notna()]
    encoding = (
        train[self.target_column].value_counts().reset_index().rename(
            columns={
                self.target_column: "count_encode_" + self.target_column,
                "index": self.target_column,
            }))
    result = ((dataset[[self.target_column]].reset_index().merge(
        encoding, on=self.target_column, how="left")).fillna(-10).drop(
            columns=self.target_column).set_index(self.index_columns))
    result = reduce_mem_usage(result)
    self.dump(result)
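# A tiny standalone illustration of the count encoding above, using made-up values and
# Series.map instead of the reset_index/merge in the task code; the effect is the same:
# each category is replaced by how often it occurs in the train rows, and categories
# absent from train get the -10 fill value.
import pandas as pd

train_col = pd.Series(["a", "a", "b", "a", "c"])
full_col = pd.Series(["a", "b", "c", "d"])            # "d" never appears in train

counts = train_col.value_counts()                     # a: 3, b: 1, c: 1
count_encoded = full_col.map(counts).fillna(-10).astype(int)
print(count_encoded.tolist())                         # [3, 1, 1, -10]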
def run(self):
    self.target_columns = self.target_columns.split(",")
    encoder = self.get_encoder()
    encoder_for_test = deepcopy(encoder)
    dataset: pd.DataFrame = self.load_data_frame("dataset").set_index(
        self.index_columns)
    fold = self.load("fold")
    train = dataset[dataset[self.predict_column].notna()]
    train_y = train[self.predict_column]
    test = dataset[dataset[self.predict_column].isna()]
    encoded_train: pd.DataFrame = pd.DataFrame()
    for trn_idx, val_idx in fold.split(train, train_y):
        encoder.fit(train.iloc[trn_idx], train_y.iloc[trn_idx])
        encoded_train = pd.concat([
            encoded_train,
            encoder.transform(train.iloc[val_idx])[self.target_columns],
        ])
    encoder_for_test.fit(train, train_y)
    encoded_test = encoder_for_test.transform(test)
    encoded_dataset = pd.concat([encoded_train, encoded_test
                                 ])[self.target_columns].sort_index()
    rename_map = {
        col: encoder.__class__.__name__ + "_" + col
        for col in self.target_columns
    }
    encoded_dataset = encoded_dataset.rename(columns=rename_map)
    encoded_dataset = reduce_mem_usage(encoded_dataset)
    self.dump(encoded_dataset)
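# A self-contained sketch (not the task's encoder) of the out-of-fold pattern in the loop
# above: each validation chunk is encoded with statistics fit only on the remaining folds,
# so the encoded train feature never sees its own target values, while test rows are
# encoded with statistics from the full train set, as with encoder_for_test. The toy data
# and the simple per-category target mean below are made up for illustration.
import pandas as pd
from sklearn.model_selection import KFold

train = pd.DataFrame({"cat": ["a", "a", "b", "b", "a", "b"],
                      "target": [1, 0, 1, 1, 0, 0]})
test = pd.DataFrame({"cat": ["a", "b", "a"]})

encoded_train = pd.Series(index=train.index, dtype=float)
for trn_idx, val_idx in KFold(n_splits=3, shuffle=True, random_state=0).split(train):
    # Target mean per category, computed on the training folds only.
    means = train.iloc[trn_idx].groupby("cat")["target"].mean()
    encoded_train.iloc[val_idx] = train.iloc[val_idx]["cat"].map(means).values

# Test rows use statistics from all of train.
encoded_test = test["cat"].map(train.groupby("cat")["target"].mean())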
def __init__(self, load=False, debug=True):
    def encode_categorical(df, cols):
        for col in cols:
            # Leave NaN as it is.
            le = LabelEncoder()
            df[col] = df[col].fillna("nan")
            df[col] = pd.Series(le.fit_transform(df[col]), index=df.index)
        return df

    def weight_calc(weight_mat_csr, data, product):
        # Calculate the denominator of RMSSE and the weight based on sales amount.
        sales_train_val = pd.read_csv(
            "../input/m5-forecasting-accuracy/sales_train_validation.csv")
        d_name = ["d_" + str(i + 1) for i in range(1913)]
        sales_train_val = weight_mat_csr * sales_train_val[d_name].values

        # Calculate the start position (first non-zero demand observed date) for each item.
        # Over the day sequence 1-1913, set days with no sales to 0, replace 0 with 9999,
        # then take the minimum to get the first sales day.
        df_tmp = (sales_train_val > 0) * np.tile(np.arange(1, 1914),
                                                 (weight_mat_csr.shape[0], 1))
        start_no = np.min(np.where(df_tmp == 0, 9999, df_tmp), axis=1) - 1
        flag = (np.dot(
            np.diag(1 / (start_no + 1)),
            np.tile(np.arange(1, 1914), (weight_mat_csr.shape[0], 1)),
        ) < 1)
        sales_train_val = np.where(flag, np.nan, sales_train_val)

        # Denominator of RMSSE.
        weight1 = np.nansum(np.diff(sales_train_val, axis=1)**2,
                            axis=1) / (1913 - start_no)

        # Calculate the sales amount for each item/level.
        df_tmp = data[(data["date"] > "2016-03-27")
                      & (data["date"] <= "2016-04-24")]
        df_tmp["amount"] = df_tmp["demand"] * df_tmp["sell_price"]
        df_tmp = df_tmp.groupby(["id"])["amount"].apply(np.sum)
        df_tmp = df_tmp[product.id].values
        weight2 = weight_mat_csr * df_tmp
        weight2 = weight2 / np.sum(weight2)

        del sales_train_val
        gc.collect()

        return weight1, weight2

    if load:
        print("loading cached evaluator data")
        self.data = pd.read_pickle("./evaluator_data.pkl")
        self.weight_mat_csr = sparse.load_npz(
            "./evaluator_weight_mat_csr.npz")
        self.weight1 = np.load("./evaluator_weight1.npy", allow_pickle=True)
        self.weight2 = np.load("./evaluator_weight2.npy", allow_pickle=True)
    else:
        print("reading csv files")
        calendar = pd.read_csv(
            "../input/m5-forecasting-accuracy/calendar.csv")
        sell_prices = pd.read_csv(
            "../input/m5-forecasting-accuracy/sell_prices.csv")
        sales_train_val = pd.read_csv(
            "../input/m5-forecasting-accuracy/sales_train_validation.csv")
        submission = pd.read_csv(
            "../input/m5-forecasting-accuracy/sample_submission.csv")

        print("encoding categorical columns")
        # encode for memory
        calendar = encode_categorical(
            calendar,
            ["event_name_1", "event_type_1", "event_name_2", "event_type_2"],
        ).pipe(reduce_mem_usage)
        sales_train_val = encode_categorical(
            sales_train_val,
            ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
        ).pipe(reduce_mem_usage)
        sell_prices = encode_categorical(
            sell_prices, ["item_id", "store_id"]).pipe(reduce_mem_usage)

        product = sales_train_val[[
            "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
        ]].drop_duplicates()

        # To remove data before the first non-zero demand date, replace that demand with np.nan.
d_name = ["d_" + str(i + 1) for i in range(1913)] sales_train_val_values = sales_train_val[d_name].values # calculate the start position(first non-zero demand observed date) for each item / 商品の最初の売上日 # 1-1914のdayの数列のうち, 売上が存在しない日を一旦0にし、0を9999に置換。そのうえでminimum numberを計算 tmp = np.tile(np.arange(1, 1914), (sales_train_val_values.shape[0], 1)) df_tmp = (sales_train_val_values > 0) * tmp start_no = np.min(np.where(df_tmp == 0, 9999, df_tmp), axis=1) - 1 flag = np.dot(np.diag(1 / (start_no + 1)), tmp) < 1 sales_train_val_values = np.where(flag, np.nan, sales_train_val_values) sales_train_val[d_name] = sales_train_val_values del tmp, sales_train_val_values sales_train_val = pd.melt( sales_train_val, id_vars=[ "id", "item_id", "dept_id", "cat_id", "store_id", "state_id" ], var_name="day", value_name="demand", ) if debug: nrows = 365 * 2 * NUM_ITEMS sales_train_val = sales_train_val.iloc[-nrows:, :] print("data計算中") sales_train_val = sales_train_val[~sales_train_val["demand"]. isnull()] # submission fileのidのvalidation部分と, ealuation部分の名前を取得 test1_rows = [ row for row in submission["id"] if "validation" in row ] test2_rows = [ row for row in submission["id"] if "evaluation" in row ] # submission fileのvalidation部分をtest1, ealuation部分をtest2として取得 test1 = submission[submission["id"].isin(test1_rows)] test2 = submission[submission["id"].isin(test2_rows)] # test1, test2の列名の"F_X"の箇所をd_XXX"の形式に変更 test1.columns = ["id"] + [ f"d_{d}" for d in range(1914, 1914 + DAYS_PRED) ] test2.columns = ["id"] + [ f"d_{d}" for d in range(1942, 1942 + DAYS_PRED) ] # test2のidの'_evaluation'を置換 test2["id"] = test2["id"].str.replace("_evaluation", "_validation") # idをキーにして, idの詳細部分をtest1, test2に結合する. test1 = test1.merge(product, how="left", on="id") test2 = test2.merge(product, how="left", on="id") # test1, test2をともにmelt処理する.(売上数量:demandは0) test1 = pd.melt( test1, id_vars=[ "id", "item_id", "dept_id", "cat_id", "store_id", "state_id" ], var_name="day", value_name="demand", ) test2 = pd.melt( test2, id_vars=[ "id", "item_id", "dept_id", "cat_id", "store_id", "state_id" ], var_name="day", value_name="demand", ) # validation部分と, evaluation部分がわかるようにpartという列を作り、 test1,test2のラベルを付ける。 sales_train_val["part"] = "train" test1["part"] = "test1" test2["part"] = "test2" # sales_train_valとtest1, test2の縦結合. data = pd.concat([sales_train_val, test1, test2], axis=0) # memoryの開放 del sales_train_val, test1, test2 # delete test2 for now(6/1以前は, validation部分のみ提出のため.) 
data = data[data["part"] != "test2"] # drop some calendar features(不要な変数の削除:weekdayやwdayなどはdatetime変数から後ほど作成できる。) calendar.drop(["weekday", "wday", "month", "year"], inplace=True, axis=1) # notebook crash with the entire dataset (maybee use tensorflow, dask, pyspark xD)(dayとdをキーにdataに結合) data = pd.merge(data, calendar, how="left", left_on=["day"], right_on=["d"]) data.drop(["d", "day"], inplace=True, axis=1) # memoryの開放 del calendar # sell priceの結合 # get the sell price data (this feature should be very important) data = data.merge(sell_prices, on=["store_id", "item_id", "wm_yr_wk"], how="left") print( "Our final dataset to train has {} rows and {} columns".format( data.shape[0], data.shape[1])) # memoryの開放 del sell_prices self.data = reduce_mem_usage(data) self.data.to_pickle("evaluator_data.pkl") print("weight計算中") weight_mat = np.c_[ np.ones([NUM_ITEMS, 1]).astype(np.int8), # level 1 pd.get_dummies(product.state_id.astype(str), drop_first=False ).astype("int8").values, pd.get_dummies(product.store_id.astype(str), drop_first=False ).astype("int8").values, pd.get_dummies(product.cat_id.astype(str), drop_first=False ).astype("int8").values, pd.get_dummies(product.dept_id.astype(str), drop_first=False ).astype("int8").values, pd.get_dummies( product.state_id.astype(str) + product.cat_id.astype(str), drop_first=False, ).astype("int8").values, pd.get_dummies( product.state_id.astype(str) + product.dept_id.astype(str), drop_first=False, ).astype("int8").values, pd.get_dummies( product.store_id.astype(str) + product.cat_id.astype(str), drop_first=False, ).astype("int8").values, pd.get_dummies( product.store_id.astype(str) + product.dept_id.astype(str), drop_first=False, ).astype("int8").values, pd.get_dummies(product.item_id.astype(str), drop_first=False ).astype("int8").values, pd.get_dummies( product.state_id.astype(str) + product.item_id.astype(str), drop_first=False, ).astype("int8").values, np.identity(NUM_ITEMS).astype(np.int8), # item :level 12 ].T self.weight_mat_csr = sparse.csr_matrix(weight_mat) sparse.save_npz("evaluator_weight_mat_csr", self.weight_mat_csr) del weight_mat self.weight1, self.weight2 = weight_calc(self.weight_mat_csr, self.data, product) np.save("evaluator_weight1", self.weight1) np.save("evaluator_weight2", self.weight2)
# ===============
# Main
# ===============
s = time.time()

with timer('load data', logging):
    with open('./data/else/col2path.pkl', 'rb') as f:
        col2path = pickle.load(f)
    X_train_all, X_test = load_datasets(FEATURES, col2path)
    y_train_all = load_target(TARGET_NAME)
    logging.debug(f'feature num: {len(X_train_all.columns)}')

with timer('reduce_mem_usage', logging):
    if REDUCE:
        X_train_all = reduce_mem_usage(X_train_all)
        X_test = reduce_mem_usage(X_test)

with timer(f'load {FOLD_PATH.split("/")[-1]}', logging):
    folds = pd.read_feather(FOLD_PATH)
    n_splits = folds['fold_id'].max() + 1

# with timer('concat oof', logging):
#     path = './features/lgbm_{data}.feather'
#     lgbm_pred_train = pd.read_feather(path.format(data='train'))
#     lgbm_pred_test = pd.read_feather(path.format(data='test'))

# with timer('concat diff', logging):
#     path = './features/lgbm_diff_{data}.feather'
#     lgbm_pred_train = pd.read_feather(path.format(data='train'))
#     lgbm_pred_test = pd.read_feather(path.format(data='test'))
def run(self):
    if self.nrows == 0:
        self.nrows = None
    sales_train_validation = pd.read_csv(
        "../input/m5-forecasting-accuracy/sales_train_validation.csv",
        nrows=self.nrows,
    )

    # Christmas days are outliers, so drop them.
    print("dropping christmas days")
    calendar = pd.read_csv("../input/m5-forecasting-accuracy/calendar.csv",
                           usecols=["date", "d"])
    christmas_day = calendar.loc[calendar["date"].str.contains("12-25"),
                                 "d"].tolist()
    sales_train_validation = sales_train_validation.drop(
        columns=christmas_day)
    del calendar, christmas_day

    sales_train_validation = pd.melt(
        sales_train_validation,
        id_vars=[
            "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
        ],
        var_name="day",
        value_name="sales",
    )
    print("Melted sales train validation has {} rows and {} columns".format(
        sales_train_validation.shape[0], sales_train_validation.shape[1]))
    sales_train_validation = reduce_mem_usage(sales_train_validation)

    submission = pd.read_csv(
        "../input/m5-forecasting-accuracy/sample_submission.csv")

    # separate test dataframes
    test1_rows = [row for row in submission["id"] if "validation" in row]
    test2_rows = [row for row in submission["id"] if "evaluation" in row]
    test1 = submission[submission["id"].isin(test1_rows)]
    test2 = submission[submission["id"].isin(test2_rows)]

    # change column names: "F_X" -> "d_1914" ... "d_1941" and "d_1942" ... "d_1969"
    test1.columns = ["id"] + [f"d_{d}" for d in range(1914, 1942)]
    test2.columns = ["id"] + [f"d_{d}" for d in range(1942, 1970)]

    product = sales_train_validation[[
        "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
    ]].drop_duplicates()

    # merge with product table
    test2["id"] = test2["id"].str.replace("_evaluation", "_validation")
    test1 = test1.merge(product, how="left", on="id")
    test2 = test2.merge(product, how="left", on="id")
    test2["id"] = test2["id"].str.replace("_validation", "_evaluation")

    test1 = pd.melt(
        test1,
        id_vars=[
            "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
        ],
        var_name="day",
        value_name="sales",
    )
    test2 = pd.melt(
        test2,
        id_vars=[
            "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
        ],
        var_name="day",
        value_name="sales",
    )

    sales_train_validation["part"] = "train"
    test1["part"] = "test1"
    test2["part"] = "test2"

    data = pd.concat([sales_train_validation, test1, test2], axis=0)
    del sales_train_validation, test1, test2

    data = data.loc[40500000:]

    # drop some calendar features
    calendar = pd.read_csv("../input/m5-forecasting-accuracy/calendar.csv")
    calendar = reduce_mem_usage(calendar)
    calendar.drop(["weekday", "wday", "month", "year"],
                  inplace=True,
                  axis=1)

    # delete test2 for now
    data = data[data["part"] != "test2"]
    data = pd.merge(data,
                    calendar,
                    how="left",
                    left_on=["day"],
                    right_on=["d"])
    data.drop(["day"], inplace=True, axis=1)
    data["d"] = data["d"].map(lambda x: int(x.split("_")[1]))

    # get the sell price data (this feature should be very important)
    sell_prices = pd.read_csv(
        "../input/m5-forecasting-accuracy/sell_prices.csv")
    sell_prices = reduce_mem_usage(sell_prices)
    data = data.merge(sell_prices,
                      on=["store_id", "item_id", "wm_yr_wk"],
                      how="left")
    print("Our final dataset to train has {} rows and {} columns".format(
        data.shape[0], data.shape[1]))
    self.dump(data)
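# A toy illustration (made-up values) of the wide-to-long reshape that pd.melt performs
# above: each "d_*" day column of the wide sales table becomes a row with a "day" key and
# a "sales" value, which is the shape the downstream feature tasks expect.
import pandas as pd

wide = pd.DataFrame({
    "id": ["FOODS_1_001_CA_1_validation"],
    "item_id": ["FOODS_1_001"],
    "d_1": [3],
    "d_2": [0],
})
long = pd.melt(wide, id_vars=["id", "item_id"], var_name="day", value_name="sales")
print(long)
#                             id      item_id  day  sales
# 0  FOODS_1_001_CA_1_validation  FOODS_1_001  d_1      3
# 1  FOODS_1_001_CA_1_validation  FOODS_1_001  d_2      0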