def run(self):
        config = Config()

        run_name = get_run_name()

        splits: List[Split] = self.load("splits")

        splits = delete_unused_features(splits)
        if config.DROP_OUTLIERS:
            splits = drop_outliers(splits)

        experiment_id = start_mlflow("cv_cls")
        mlflow.start_run(experiment_id=experiment_id, run_name=run_name)
        timestamp = mlflow.active_run().info.start_time / 1000
        start_time = datetime.datetime.fromtimestamp(timestamp).strftime(
            "%Y-%m-%d_%H:%M:%S")

        log_params()

        for cv_num, sp in enumerate(splits):
            file_dir = f"./output/cv_cls/{start_time}/{cv_num}"
            Path(file_dir).mkdir(parents=True, exist_ok=True)

            train_set, val_set = convert_to_lgb_dataset(sp, cv_num)
            model = train_cls(cv_num, config.lgbm_cls_params, train_set,
                              [val_set], 10, 20)
            df_val = predict_cls(sp, cv_num, model, val_set)

        # use context managers so the pickle files are closed after writing
        with open(f"{file_dir}/model.pkl", "wb") as f:
            pickle.dump(model, f)
        with open(f"{file_dir}/df_val.pkl", "wb") as f:
            pickle.dump(df_val, f)
        mlflow.end_run()
Example #2
def log_metrics(
    cv_num: int,
    start_time: str,
    raw: RawData,
    test_pred: pd.DataFrame,
    test_true: pd.DataFrame,
) -> Tuple[float, float, float]:
    config = Config()
    d_start = config.CV_START_DAYS[cv_num]
    d_end = config.CV_START_DAYS[cv_num] + 28

    cv_result = CVResult(
        cv_num=cv_num,
        config=config,
        test_pred=test_pred[(test_pred.d >= d_start) & (test_pred.d < d_end)],
    )
    evaluator = cv_result.get_evaluator(raw)
    cv_result.create_dashboard(raw, f"./output/cv/{start_time}/{cv_num}")
    y_pred = test_pred[(test_pred.d >= d_start) & (test_pred.d < d_end)][config.TARGET]
    y_true = test_true[(test_true.d >= d_start) & (test_true.d < d_end)][config.TARGET]

    wrmsse = np.mean(evaluator.all_scores)
    rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_true, y_pred))
    mae = sklearn.metrics.mean_absolute_error(y_true, y_pred)

    print(f"==========CV No: {cv_num}=================")
    print("WRMSSE", wrmsse)
    print("RMSE", rmse)
    print("MAE", mae)
    print("=================================")
    mlflow.log_metric(f"WRMSSE_{cv_num}", wrmsse)
    mlflow.log_metric(f"RMSE_{cv_num}", rmse)
    mlflow.log_metric(f"MAE_{cv_num}", mae)
    return wrmsse, rmse, mae
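
A minimal, self-contained sketch of the RMSE/MAE computation above on made-up arrays (WRMSSE is omitted because it needs the competition evaluator):

import numpy as np
import sklearn.metrics

y_true = np.array([0.0, 2.0, 1.0, 3.0])
y_pred = np.array([0.5, 1.5, 1.0, 2.0])
rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_true, y_pred))  # ~0.612
mae = sklearn.metrics.mean_absolute_error(y_true, y_pred)           # 0.5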
Example #3
 def run(self):
     config = Config()
     data: pd.DataFrame = self.load()
     train_df: pd.DataFrame = data[(data.d > config.START_DAY)
                                   & (data.d <= 1913)]
     result: List[Tuple[List[str],
                        pd.DataFrame]] = target_encoding_catch22(train_df)
     self.dump(result)
Example #4
 def run(self):
     config = Config()
     data: pd.DataFrame = self.load("data")
     results: List[List[Tuple[List[str], pd.DataFrame]]] = []
     for end_day in config.CV_START_DAYS:
         train_df: pd.DataFrame = data[(data.d > config.START_DAY)
                                       & (data.d < end_day)]
         results.append(target_encoding_catch22(train_df))
     self.dump(results)
Example #5
def predict_cls(
    sp: Split, cv_num: int, model: LGBMClassifier, val_set: lgb.Dataset
) -> pd.DataFrame:
    config = Config()
    df_val = sp.test[
        (sp.test.d >= config.CV_START_DAYS[cv_num])
        & (sp.test.d < config.CV_START_DAYS[cv_num] + 28)
    ][["id", "d", "sales_is_zero"]]
    df_val["sales_is_zero_pred"] = model.predict_proba(val_set.data)[:, 1]
    return df_val
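
The [:, 1] slice above takes each row's predicted probability for the positive class ("sales is zero"). A minimal sketch of that pattern on synthetic data (all values invented):

import numpy as np
from lightgbm import LGBMClassifier

X = np.random.rand(200, 3)
y = (np.random.rand(200) > 0.5).astype(int)
clf = LGBMClassifier(n_estimators=10).fit(X, y)
p_zero = clf.predict_proba(X)[:, 1]  # probability of class 1, per row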
Example #6
def delete_unused_features(splits: List[Split]) -> List[Split]:
    config = Config()
    for i in range(len(splits)):
        splits[i].train = splits[i].train[["id", "d", config.TARGET] + config.features]
        splits[i].test = splits[i].test[["id", "d", config.TARGET] + config.features]
        splits[i].train = splits[i].train[splits[i].train["d"] >= config.START_DAY]
        print(f"CV{i} train shape:", splits[i].train.shape)
        if config.DROP_NA:
            splits[i].train = splits[i].train.dropna()
            print(f"CV{i} NA dropped train shape:", splits[i].train.shape)
    return splits
Example #7
def cls_postprocessing(cv_num: int, test_pred: pd.DataFrame) -> pd.DataFrame:
    with timer("cls_postprocessing"):
        config = Config()
        # note: the annotation was "pd.dataframe" (lowercase), which is not a real type
        with open(f"./output/cv_cls/{config.CLS_TIMESTAMP}/0/df_val.pkl", "rb") as f:
            df_val: pd.DataFrame = pickle.load(f)
        test_pred["tmp_id"] = (
            test_pred["id"].astype(str) + "_" + test_pred["d"].astype(str)
        )
        df_val = df_val[df_val["sales_is_zero_pred"] >= config.CLS_THRESHOLD]
        tmp_ids = df_val["id"].astype(str) + "_" + df_val["d"].astype(str)
        test_pred.loc[test_pred["tmp_id"].isin(tmp_ids), "sales"] = 0
        test_pred.drop(["tmp_id"], axis=1, inplace=True)
    return test_pred
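
A minimal, self-contained sketch of the zeroing logic on toy frames (column names follow the snippet; the data and the 0.8 threshold are invented):

import pandas as pd

test_pred = pd.DataFrame({"id": ["a", "a", "b"], "d": [1, 2, 1], "sales": [3.0, 1.0, 2.0]})
df_val = pd.DataFrame({"id": ["a"], "d": [2], "sales_is_zero_pred": [0.9]})

df_val = df_val[df_val["sales_is_zero_pred"] >= 0.8]  # stands in for config.CLS_THRESHOLD
tmp_ids = df_val["id"].astype(str) + "_" + df_val["d"].astype(str)
mask = (test_pred["id"].astype(str) + "_" + test_pred["d"].astype(str)).isin(tmp_ids)
test_pred.loc[mask, "sales"] = 0  # row ("a", 2) is forced to zero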
Example #8
def convert_to_lgb_dataset(sp: Split, cv_num: int) -> Tuple[lgb.Dataset, lgb.Dataset]:
    config = Config()
    train_set = lgb.Dataset(sp.train[config.features], sp.train[config.TARGET])
    val_set = lgb.Dataset(
        sp.test[
            (sp.test.d >= config.CV_START_DAYS[cv_num])
            & (sp.test.d < config.CV_START_DAYS[cv_num] + 28)
        ][config.features],
        sp.test[
            (sp.test.d >= config.CV_START_DAYS[cv_num])
            & (sp.test.d < config.CV_START_DAYS[cv_num] + 28)
        ][config.TARGET],
    )
    return train_set, val_set
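
A minimal sketch of the same train/validation Dataset construction on a synthetic frame, with hard-coded values standing in for Config (feature names and day numbers are invented):

import numpy as np
import pandas as pd
import lightgbm as lgb

df = pd.DataFrame({"d": np.arange(1, 101),
                   "feat": np.random.rand(100),
                   "sales": np.random.rand(100)})
cv_start, horizon = 71, 28
train_mask = df.d < cv_start
val_mask = (df.d >= cv_start) & (df.d < cv_start + horizon)
train_set = lgb.Dataset(df.loc[train_mask, ["feat"]], df.loc[train_mask, "sales"])
val_set = lgb.Dataset(df.loc[val_mask, ["feat"]], df.loc[val_mask, "sales"])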
Example #9
    def run(self):
        data: pd.DataFrame = self.load()

        sp_idx: SplitIndex = SplitIndex()

        config = Config()
        sp_idx.train = list(data[(data.d >= config.START_DAY)
                                 & (data.d <= 1913)].index)
        sp_idx.test = list(data[(data.d > 1913 - config.MAX_LAGS)].index)

        print("train shape:", data.iloc[sp_idx.train, :].shape)
        print("test shape:", data.iloc[sp_idx.test, :].shape)

        self.dump(sp_idx)
Example #10
def log_params():
    config = Config()
    mlflow.lightgbm.autolog()
    mlflow.log_param("MIN_SUM", config.MIN_SUM)
    mlflow.log_param("MAX_LAGS", config.MAX_LAGS)
    mlflow.log_param("start_day", config.START_DAY)
    mlflow.log_param("SEED", config.SEED)
    mlflow.log_param("DROP_NA", config.DROP_NA)
    mlflow.log_param("DROP_OUTLIERS", config.DROP_OUTLIERS)
    mlflow.log_param("CV_SAMPLE_RATE", config.CV_SAMPLE_RATE)
    mlflow.log_param("MODEL", config.MODEL)
    mlflow.log_param("CLS_POSTPROCESSING", config.CLS_POSTPROCESSING)
    mlflow.log_param("CLS_TIMESTAMP", config.CLS_TIMESTAMP)
    mlflow.log_param("CLS_THRESHOLD", config.CLS_THRESHOLD)
    mlflow.log_param("features", ",\n".join([f"'{f}'" for f in config.features]))
Example #11
 def run(self):
     config = Config()
     data: pd.DataFrame = pd.concat(
         [self.load("data"), self.load("fe_event")], axis=1)
     train_df: pd.DataFrame = data[(data.d > config.START_DAY)
                                   & (data.d <= 1913)]
     # train_df = train_df.sample(int(len(train_df) * 0.15))
     with timer("create grouped df"):
         grouped: List[Tuple[List[str],
                             pd.DataFrame]] = target_encoding(train_df)
     with timer("merge into data"):
         for group_key, grouped_df in tqdm(grouped):
             data = data.merge(grouped_df, on=group_key, how="left")
         df = reduce_mem_usage(data.filter(like="fe_te_"))
         print(df.info())
     self.dump(df)
Example #12
    def run(self):
        data: pd.DataFrame = self.load()
        sp_idxs: List[SplitIndex] = []
        config = Config()
        np.random.seed(config.SEED)
        random.seed(config.SEED)

        for cv_start_day in tqdm(config.CV_START_DAYS):
            sp_idx: SplitIndex = SplitIndex()
            train_df = data[data.d < cv_start_day]
            sp_idx.train = list(
                # train_df.sample(int(len(data) * config.CV_SAMPLE_RATE)).index
                train_df.index)
            sp_idx.test = list(data[(data.d >= cv_start_day - config.MAX_LAGS)
                                    & (data.d < cv_start_day + 28)].index)
            sp_idxs.append(sp_idx)

        self.dump(sp_idxs)
Example #13
 def run(self):
     with timer("combine val features"):
         with timer("concat features"):
             data: pd.DataFrame = pd.concat(
                 [
                     self.load("data"),
                     self.load("fe_price_rolling"),
                     self.load("fe_price_change"),
                     self.load("fe_price_basic"),
                     self.load("fe_shift"),
                     self.load("fe_rolling_mean"),
                     self.load("fe_rolling_dw_mean"),
                     self.load("fe_rolling_group_mean"),
                     self.load("fe_rolling_group_std"),
                     self.load("fe_rolling_std"),
                     self.load("fe_rolling_skew"),
                     self.load("fe_rolling_kurt"),
                     self.load("fe_weather"),
                     self.load("fe_unemployment"),
                     self.load("fe_stock"),
                     self.load("fe_event"),
                     self.load("fe_event_strength"),
                     self.load("fe_catch22_pca"),
                 ],
                 axis=1,
             )
         with timer("merge target features"):
             config = Config()
             te_val_data: List[pd.DataFrame] = self.load("te_val_data")
             splits: List[Split] = []
             sp_idxs: List[SplitIndex] = self.load("sp_idxs")
             for i in tqdm(range(len(sp_idxs))):
                 sp: Split = Split()
                 data = pd.concat([data, te_val_data[i]], axis=1)
                 sp.train = data.iloc[sp_idxs[i].train, :]
                 sp.test = data.iloc[sp_idxs[i].test, :]
                 if config.CV_SAMPLE_RATE != 1:
                     sp.train = sp.train.sample(
                         int(len(sp.train) * config.CV_SAMPLE_RATE))
                 splits.append(sp)
                 print(sp.train.info())
                 data = data.drop(list(data.filter(like="fe_te_").columns),
                                  axis=1)
     self.dump(splits)
Example #14
def predict(
    cv_num: int, sp: Split, model: lgb.Booster, model_number: Optional[int] = None
) -> pd.DataFrame:
    config = Config()
    d_start: int = config.CV_START_DAYS[cv_num]
    d_end: int = config.CV_START_DAYS[cv_num] + 28
    test_pred = sp.test.copy()
    test_pred[config.TARGET + "_true"] = test_pred[config.TARGET]

    test_pred.loc[test_pred.d >= d_start, config.TARGET] = np.nan
    for d in tqdm(range(d_start, d_end)):
        test_pred = make_rolling_for_test(test_pred, d, config.features)
        test_pred.loc[test_pred.d == d, config.TARGET] = model.predict(
            test_pred.loc[test_pred.d == d, config.features]
        )
        test_pred.loc[test_pred.d == d, "sales_is_zero"] = (
            test_pred.loc[test_pred.d == d, "sales"] == 0
        ).astype(np.int8)

    return test_pred
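
The loop above is a recursive forecast: each predicted day is written back into the frame so that make_rolling_for_test can build lag features for later days from earlier predictions. A minimal sketch of that write-back pattern with a stub model (all values invented):

import numpy as np
import pandas as pd

df = pd.DataFrame({"d": [1, 2, 3, 4], "sales": [5.0, 6.0, np.nan, np.nan]})
for d in (3, 4):
    lag1 = df.loc[df.d == d - 1, "sales"].values  # feature built from earlier predictions
    df.loc[df.d == d, "sales"] = lag1 * 0.9       # stub model: 90% of yesterday
# df["sales"] -> [5.0, 6.0, 5.4, 4.86]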
Example #15
 def run(self):
     config = Config()
     data: pd.DataFrame = pd.concat(
         [self.load("data"), self.load("fe_event")], axis=1)
     dfs: List[pd.DataFrame] = []
     for end_day in config.CV_START_DAYS:
         with timer("create grouped df"):
             # train_df: pd.DataFrame = data[
             #     (data.d > config.START_DAY) & (data.d < end_day)
             # ]
             train_df: pd.DataFrame = data[data.d < end_day]
             grouped: List[Tuple[List[str],
                                 pd.DataFrame]] = target_encoding(train_df)
         with timer("merge into data"):
             df = data.copy()
             for group_key, grouped_df in tqdm(grouped):
                 df = df.merge(grouped_df, on=group_key, how="left")
             df = reduce_mem_usage(df.filter(like="fe_te_"))
             print(df.info())
             dfs.append(df)
     self.dump(dfs)
Example #16
def train(
    cv_num: int,
    params: Dict[str, Any],
    train_set: lgb.Dataset,
    valid_sets: List[lgb.Dataset],
    verbose_eval: int,
    early_stopping_rounds: Optional[int] = None,
    model_number: Optional[int] = None,
) -> lgb.Booster:
    config = Config()
    timer_name: str = f"train CV_{cv_num}"
    if model_number is not None:  # a plain "if model_number:" would wrongly skip 0
        timer_name += f"_{model_number}"
    with timer(timer_name, mlflow_on=True):
        model = lgb.train(
            params,
            train_set,
            num_boost_round=config.num_boost_round,
            verbose_eval=verbose_eval,
            # early_stopping_rounds=early_stopping_rounds,
            valid_sets=valid_sets,
        )
    return model
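
early_stopping_rounds is commented out above; in recent LightGBM releases both it and verbose_eval moved into callbacks. A minimal sketch of the callback-based equivalent on synthetic data (assumes LightGBM >= 3.3):

import numpy as np
import lightgbm as lgb

X, y = np.random.rand(200, 3), np.random.rand(200)
train_set = lgb.Dataset(X[:150], y[:150])
val_set = lgb.Dataset(X[150:], y[150:], reference=train_set)
model = lgb.train(
    {"objective": "regression"},
    train_set,
    num_boost_round=100,
    valid_sets=[val_set],
    callbacks=[lgb.early_stopping(stopping_rounds=20), lgb.log_evaluation(period=10)],
)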
Example #17
def log_result(cv_num: int, start_time: str, test_pred: pd.DataFrame):
    config = Config()
    d_start = config.CV_START_DAYS[cv_num]
    d_end = config.CV_START_DAYS[cv_num] + 28
    save_cols: List[str] = [
        "id",
        "item_id",
        "dept_id",
        "cat_id",
        "store_id",
        "state_id",
        "d",
        config.TARGET,
        config.TARGET + "_true",
    ]
    # use context managers so the pickle files are closed; the original also left a
    # stray no-op "test_pred" expression at the end of the function
    with open(f"./output/cv/{start_time}/{cv_num}/test_pred.pkl", "wb") as f:
        pickle.dump(
            test_pred.loc[(test_pred.d >= d_start) & (test_pred.d < d_end), save_cols],
            f,
        )
    with open(f"./output/cv/{start_time}/{cv_num}/config.pkl", "wb") as f:
        pickle.dump(config, f)
Example #18
def train_cls(
    cv_num: int,
    params: Dict[str, Any],
    train_set: lgb.Dataset,
    valid_sets: List[lgb.Dataset],
    verbose_eval: int,
    early_stopping_rounds: Optional[int] = None,
    model_number: Optional[int] = None,
) -> LGBMClassifier:
    config = Config()
    timer_name: str = f"train CV_{cv_num}"
    if model_number is not None:  # a plain "if model_number:" would wrongly skip 0
        timer_name += f"_{model_number}"
    with timer(timer_name, mlflow_on=True):
        model = LGBMClassifier(**config.lgbm_cls_params)
        model.fit(
            train_set.data,
            train_set.label,
            categorical_feature=config.lgbm_cat_features,
            eval_set=[(dataset.data, dataset.label) for dataset in valid_sets],
            eval_metric="logloss,auc,cross_entropy",
            verbose=10,
        )
    return model
Example #19
def partial_train_and_predict(
    sp: Split,
    ids: pd.Series,
    cv_num: int,
    model_number: int,
    objective: Optional[str] = None,
    SEED: Optional[int] = None,
) -> pd.DataFrame:
    config = Config()
    sp_part: Split = Split()
    sp_part.train = sp.train[sp.train["id"].isin(ids)]
    sp_part.test = sp.test[sp.test["id"].isin(ids)]
    train_set, val_set = convert_to_lgb_dataset(sp_part, cv_num)
    params = config.lgbm_params
    if objective:
        params["objective"] = objective
        params.pop("tweedie_variance_power", None)
    if SEED is not None:  # a plain "if SEED:" would wrongly skip SEED == 0
        params["seed"] = SEED
    model = train(
        cv_num, params, train_set, [train_set], 10, 20, model_number=model_number,
    )
    test_pred = predict(cv_num, sp_part, model)
    return test_pred
Example #20
    def run(self):
        config = Config()

        run_name = get_run_name()

        splits: List[Split] = self.load("splits")
        raw: RawData = self.load("raw")

        splits = delete_unused_features(splits)
        if config.DROP_OUTLIERS:
            splits = drop_outliers(splits)

        print_nan_ratio(splits)

        for SEED in range(1, 1000, 10):

            run_name = "seed = {}".format(SEED)

            experiment_id = start_mlflow()
            mlflow.start_run(experiment_id=experiment_id, run_name=run_name)
            timestamp = mlflow.active_run().info.start_time / 1000
            start_time = datetime.datetime.fromtimestamp(timestamp).strftime(
                "%Y-%m-%d_%H:%M:%S")

            log_params()

            # wrmsses, rmses, maes = [], [], []
            for cv_num, sp in enumerate(splits):
                Path(f"./output/cv/{start_time}/{cv_num}").mkdir(parents=True,
                                                                 exist_ok=True)

                test_pred: pd.DataFrame = pd.DataFrame()
                if config.MODEL == "zero":
                    test_pred = train_by_zero(raw, sp, cv_num)
                elif config.MODEL == "store":
                    test_pred = train_by_store(raw, sp, cv_num, SEED)
                elif config.MODEL == "store":
                    test_pred = train_by_store(raw, sp, cv_num)
                elif config.MODEL == "cat":
                    test_pred = train_by_cat(raw, sp, cv_num)
                elif config.MODEL == "dept":
                    test_pred = train_by_dept(raw, sp, cv_num)
                elif config.MODEL == "normal":
                    train_set, val_set = convert_to_lgb_dataset(sp, cv_num)
                    model = train(
                        cv_num,
                        config.lgbm_params,
                        train_set,
                        [train_set],
                        verbose_eval=10,
                        early_stopping_rounds=20,
                    )
                    test_pred = predict(cv_num, sp, model)

                if config.CLS_POSTPROCESSING:
                    wrmsse, rmse, mae = log_metrics(cv_num, start_time, raw,
                                                    test_pred, sp.test)
                    test_pred = cls_postprocessing(cv_num, test_pred)
                log_result(cv_num, start_time, test_pred)
                # wrmsse, rmse, mae = log_metrics(cv_num, start_time, raw, test_pred, sp.test)
                # wrmsses.append(wrmsse)
                # rmses.append(rmse)
                # maes.append(mae)

            # log_avg_metrics(wrmsses, rmses, maes)
            mlflow.end_run()
            time.sleep(10)