def main():
    """Evaluate a trained LightGBM ranker on the test split and print nDCG."""
    args = parse.get_test_args()
    (X_train, X_test, X_val,
     y_train, y_test, y_val,
     group_vali, group_train) = get_data(args["data_path"])
    gbm = lgb.Booster(model_file=args["model_path"])

    # "Ideal" ordering: the test relevances sorted best-first.
    true_relevance = y_test.sort_values(ascending=False)

    # Actual ordering: the same relevances ranked by the model's predictions.
    test_pred = gbm.predict(X_test)
    results = pd.DataFrame({
        "relevance_score": y_test,
        "predicted_ranking": test_pred,
    })
    relevance_score = results.sort_values("predicted_ranking", ascending=False)

    # Compare ideal vs. model-induced ordering with the nDCG metric.
    print(
        "nDCG score: ",
        ndcg_score(
            [true_relevance.to_numpy()],
            [relevance_score["relevance_score"].to_numpy()],
        ),
    )
def run(cfg: DictConfig, ):
    """End-to-end M5 forecasting pipeline.

    Loads the raw competition data, engineers lag/rolling features, trains
    (or loads a pretrained) LightGBM model, and logs the public-leaderboard
    WRMSSE via the mlflow `writer`.
    """
    cwd = hydra.utils.get_original_cwd()
    # Lag / rolling-window pairs for feature engineering come from config.
    lag_win_pairs = cfg.feat.lag_win_pairs
    writer.log_params_from_omegaconf_dict(cfg)
    print(os.getcwd())
    writer.log_artifact(os.path.join(os.getcwd(), '.hydra/config.yaml'))

    # ----------------------------------------------------------------------
    # load data
    # ----------------------------------------------------------------------
    PATH_PRICE_CSV = os.path.join(
        cwd, "../input/m5-forecasting-accuracy/sell_prices.csv")
    PATH_CALENDER_CSV = os.path.join(
        cwd, "../input/m5-forecasting-accuracy/calendar.csv")
    PATH_SALES_CSV = os.path.join(
        cwd, "../input/m5-forecasting-accuracy/sales_train_evaluation.csv")
    PATH_SAMPLE_SUB_CSV = os.path.join(
        cwd, "../input/m5-forecasting-accuracy/sample_submission.csv")

    load_start_time = time.time()
    if cfg.data.path_basic_df == "":
        print("making basic df from scratch...")
        df = load_data.create_dt(
            PATH_PRICE_CSV,
            PATH_CALENDER_CSV,
            PATH_SALES_CSV,
            first_day=1500,
        )
        df = load_data.reduce_mem_usage(df)
    else:
        print("loading saved basic csv...")
        path_df = cfg.data.path_basic_df
        df = pd.read_csv(os.path.join(cwd, path_df), index_col=0)
        df = load_data.reduce_mem_usage(df)
    print(df.shape)
    print(f"data loading time:{(time.time() - load_start_time)//60} min.")
    # NOTE(review): `dev_firstdate` is expected to be defined at module level
    # — confirm it is in scope before running.
    df = df.query("date > @dev_firstdate")
    gc.collect()

    # ----------------------------------------------------------------------
    # feature engineering
    # ----------------------------------------------------------------------
    # Longest lag (56 days) plus a 28-day rolling-window margin (+1).
    max_lags = 56 + 29
    create_feature.create_fea(df, lag_win_pairs=lag_win_pairs)

    # ----------------------------------------------------------------------
    # Train model
    # ----------------------------------------------------------------------
    if cfg.lgbm.pretrained:
        # BUGFIX: the loaded booster was previously bound to `m_lgb`, leaving
        # `model` and `train_cols` undefined for the inference step below
        # (NameError). Bind to `model` and recover the feature columns from
        # the booster itself.
        model = lgb.Booster(
            model_file=os.path.join(cwd, cfg.lgbm.pretrained_model_path))
        train_cols = model.feature_name()
    else:
        model, _, train_cols = train_lgbm(df, cfg)
        model_savepath = os.path.join(
            cwd, f"../result/{writer.experiment_name}_{writer.run_id}.model")
        model.save_model(model_savepath)
        writer.log_param("model_path", model_savepath)

    # ----------------------------------------------------------------------
    # Inference on public lb
    # ----------------------------------------------------------------------
    public_pred_df = make_lb_predictions(model,
                                         public_firstdate,
                                         train_cols,
                                         max_lags,
                                         test_data_path=cfg.data.path_test_df,
                                         lag_win_pairs=lag_win_pairs)
    public_pred_wrmsse = eval_metrics.get_public_score(public_pred_df,
                                                       PATH_SALES_CSV,
                                                       PATH_PRICE_CSV,
                                                       PATH_CALENDER_CSV,
                                                       PATH_SAMPLE_SUB_CSV)
    print(f"Public score: {public_pred_wrmsse}")
    writer.log_metric("public_WRMSSE", public_pred_wrmsse)

    # ----------------------------------------------------------------------
    # Inference on private lb
    # ----------------------------------------------------------------------
    pass
# Ensemble inference script: read a TSV, preprocess it, average the log1p-space
# predictions of a LightGBM and an XGBoost model, and write prediction.csv.
parser = argparse.ArgumentParser(description='Process input')
parser.add_argument('tsv_path', type=str, help='tsv file path')
args = parser.parse_args()

# Reading input TSV
data_raw = pd.read_csv(args.tsv_path, sep="\t")

# BUGFIX: the file handle was previously bound to `input`, shadowing the
# builtin of the same name.
# NOTE(review): pickle.load is unsafe on untrusted files — only run this with
# a preprocessor.pkl you produced yourself.
with open('preprocessor.pkl', 'rb') as pkl_file:
    prep = pickle.load(pkl_file)
data = prep.transform(data_raw)

bst_lgbm = lgb.Booster(model_file='lgbm_model.txt')  # init model
preds_lgbm = bst_lgbm.predict(data)

bst_xgb = xgb.Booster()
bst_xgb.load_model("xgb_model.txt")
dtest = xgb.DMatrix(data)
preds_xgb = bst_xgb.predict(dtest)

# Average the two models' predictions in log1p space, then invert the
# transform back to the revenue scale.
preds = (preds_lgbm + preds_xgb) / 2
preds = np.expm1(preds)

# Build the submission frame directly instead of column-by-column assignment.
prediction_df = pd.DataFrame({'id': data_raw['id'], 'revenue': preds})
prediction_df.to_csv("prediction.csv", index=False, header=False)