Exemplo n.º 1
0
def main():
    """Evaluate a saved LightGBM model on the test split and print its nDCG.

    The data path and model path are read from the parsed test arguments
    (``args["data_path"]`` and ``args["model_path"]``). The model's
    predictions for ``X_test`` are scored against ``y_test`` with
    ``ndcg_score`` and the result is printed to stdout.
    """
    args = parse.get_test_args()

    # Only the test split is used for evaluation; the other splits and the
    # group arrays returned by get_data are not needed here.
    _, X_test, _, _, y_test, _, _, _ = get_data(args["data_path"])

    gbm = lgb.Booster(model_file=args["model_path"])
    test_pred = gbm.predict(X_test)

    # NOTE(review): assumes ndcg_score is sklearn.metrics.ndcg_score, whose
    # contract is ndcg_score(y_true, y_score) with both arrays aligned
    # item-by-item. The metric sorts by y_score internally and derives the
    # ideal ordering from y_true itself, so the true relevances and the raw
    # predictions are passed unsorted. Passing pre-sorted relevance arrays
    # (as was done before) pairs unrelated items and yields a wrong score.
    # NOTE(review): the whole test set is scored as a single query here;
    # confirm whether a per-query-group nDCG is intended.
    print(
        "nDCG score: ",
        ndcg_score([y_test.to_numpy()], [test_pred]),
    )
Exemplo n.º 2
0
def run(cfg: DictConfig, ):
    """Run the forecasting pipeline: load data, build features, train, score.

    Steps, in order:
      1. Log the configuration and the hydra config file through ``writer``.
      2. Build the base dataframe from the raw CSVs, or load a saved one when
         ``cfg.data.path_basic_df`` is set.
      3. Keep only rows after ``dev_firstdate`` and create the lag/window
         features given by ``cfg.feat.lag_win_pairs``.
      4. Load a pretrained LightGBM booster (``cfg.lgbm.pretrained``) or
         train a new one and save it.
      5. Predict on the public leaderboard period and log the score.

    Args:
        cfg: Hydra/OmegaConf configuration. The fields read here are
            ``feat.lag_win_pairs``, ``data.path_basic_df``,
            ``data.path_test_df``, ``lgbm.pretrained`` and
            ``lgbm.pretrained_model_path``.
    """
    # Hydra changes the working directory per run; input paths are resolved
    # against the directory the program was launched from.
    cwd = hydra.utils.get_original_cwd()

    # Sequence of (lag, window) pairs, e.g. [(28, 28), (28, 7), (7, 7)].
    lag_win_pairs = cfg.feat.lag_win_pairs

    writer.log_params_from_omegaconf_dict(cfg)
    print(os.getcwd())
    writer.log_artifact(os.path.join(os.getcwd(), '.hydra/config.yaml'))

    # ----------------------------------------------------------------------
    # load data
    # ----------------------------------------------------------------------
    PATH_PRICE_CSV = os.path.join(
        cwd, "../input/m5-forecasting-accuracy/sell_prices.csv")
    PATH_CALENDER_CSV = os.path.join(
        cwd, "../input/m5-forecasting-accuracy/calendar.csv")
    # The evaluation file is used; sales_train_validation.csv in the same
    # directory is the alternative that was used previously.
    PATH_SALES_CSV = os.path.join(
        cwd, "../input/m5-forecasting-accuracy/sales_train_evaluation.csv")
    PATH_SAMPLE_SUB_CSV = os.path.join(
        cwd, "../input/m5-forecasting-accuracy/sample_submission.csv")

    load_start_time = time.time()
    if cfg.data.path_basic_df == "":
        print("making basic df from scratch...")

        df = load_data.create_dt(
            PATH_PRICE_CSV,
            PATH_CALENDER_CSV,
            PATH_SALES_CSV,
            first_day=1500,
        )
        df = load_data.reduce_mem_usage(df)
    else:
        print("loading saved basic csv...")
        path_df = cfg.data.path_basic_df
        df = pd.read_csv(os.path.join(cwd, path_df), index_col=0)
        df = load_data.reduce_mem_usage(df)
        print(df.shape)

    print(f"data loading time:{(time.time() - load_start_time)//60} min.")

    # dev_firstdate is not defined in this function; it is resolved by
    # DataFrame.query from the enclosing (module) scope.
    df = df.query("date > @dev_firstdate")
    gc.collect()

    # ----------------------------------------------------------------------
    # feature engineering
    # ----------------------------------------------------------------------

    # Fixed look-back passed to the leaderboard prediction step.
    # NOTE(review): an earlier formula derived this from lag_win_pairs
    # (2 * max lag); confirm 56 + 29 still covers the configured pairs.
    max_lags = 56 + 29
    # The return value is not used; presumably df is modified in place.
    create_feature.create_fea(df, lag_win_pairs=lag_win_pairs)

    # ----------------------------------------------------------------------
    # Train model
    # ----------------------------------------------------------------------
    if cfg.lgbm.pretrained:
        # Bind the loaded booster to the same names the training branch
        # produces; otherwise the inference step below fails with NameError
        # on `model` / `train_cols`.
        model = lgb.Booster(
            model_file=os.path.join(cwd, cfg.lgbm.pretrained_model_path))
        # NOTE(review): assumes train_cols is the list of feature column
        # names the model was trained on, which is what the booster stores.
        train_cols = model.feature_name()
    else:
        model, _, train_cols = train_lgbm(df, cfg)
        model_savepath = os.path.join(
            cwd, f"../result/{writer.experiment_name}_{writer.run_id}.model")
        model.save_model(model_savepath)
        writer.log_param("model_path", model_savepath)

    # ----------------------------------------------------------------------
    # Inference on public lb
    # ----------------------------------------------------------------------
    public_pred_df = make_lb_predictions(model,
                                         public_firstdate,
                                         train_cols,
                                         max_lags,
                                         test_data_path=cfg.data.path_test_df,
                                         lag_win_pairs=lag_win_pairs)
    public_pred_wrmsse = eval_metrics.get_public_score(public_pred_df,
                                                       PATH_SALES_CSV,
                                                       PATH_PRICE_CSV,
                                                       PATH_CALENDER_CSV,
                                                       PATH_SAMPLE_SUB_CSV)
    print(f"Public score: {public_pred_wrmsse}")
    writer.log_metric("public_WRMSSE", public_pred_wrmsse)

    # ----------------------------------------------------------------------
    # Inference on private lb
    # ----------------------------------------------------------------------
    # Not implemented.
    pass
# Command-line prediction script: reads a TSV, applies the saved preprocessor,
# averages the LightGBM and XGBoost predictions and writes prediction.csv.
parser = argparse.ArgumentParser(description='Process input')
parser.add_argument('tsv_path', type=str, help='tsv file path')
args = parser.parse_args()

# Reading input TSV
data_raw = pd.read_csv(args.tsv_path, sep="\t")

# Load the fitted preprocessor. The handle is deliberately not named `input`
# so the builtin is not shadowed at module scope.
# NOTE: pickle.load executes arbitrary code from the file; only use a
# preprocessor.pkl that comes from a trusted source.
with open('preprocessor.pkl', 'rb') as preprocessor_file:
    prep = pickle.load(preprocessor_file)
data = prep.transform(data_raw)

# LightGBM model, loaded from its text dump.
bst_lgbm = lgb.Booster(model_file='lgbm_model.txt')
preds_lgbm = bst_lgbm.predict(data)

# XGBoost model; it predicts from a DMatrix rather than the raw frame.
bst_xgb = xgb.Booster()
bst_xgb.load_model("xgb_model.txt")
dtest = xgb.DMatrix(data)
preds_xgb = bst_xgb.predict(dtest)

# Simple average of the two models, then expm1 to undo the log1p transform
# (presumably the models were trained on log1p(revenue) — confirm against the
# training script).
preds = (preds_lgbm + preds_xgb) / 2
preds = np.expm1(preds)

# Output rows are (id, revenue), written without index or header line.
prediction_df = pd.DataFrame({'id': data_raw['id'], 'revenue': preds})

prediction_df.to_csv("prediction.csv", index=False, header=False)