def get_predictions(df: pd.DataFrame, model: "lgbm.LGBMRanker",
                    out_path: str = 'expedia_answer.csv') -> None:
    """Score *df* with *model* and write a Kaggle-style submission CSV.

    Side effect: mutates *df* in place — adds a ``relevance`` column and
    sorts rows by (``srch_id`` ascending, ``relevance`` descending).

    :param df: feature frame passed straight to ``model.predict``; must
        contain ``srch_id`` and ``prop_id`` columns.
    :param model: fitted ranker exposing a ``predict(df)`` method.
    :param out_path: destination CSV path (default ``'expedia_answer.csv'``,
        preserving the previous hard-coded behavior).
    :return: None; the ranked (srch_id, prop_id) pairs are written to
        ``out_path`` without an index column.
    """
    print(f'\tPredicting relevance')
    df['relevance'] = model.predict(df)
    # Within each search, best-scoring properties come first.
    df.sort_values(by=['srch_id', 'relevance'],
                   ascending=[True, False], inplace=True)
    kaggle_answer = pd.DataFrame({
        'srch_id': df['srch_id'],
        'prop_id': df['prop_id'],
    })
    print(f'\t Writing answers to csv')
    kaggle_answer.to_csv(out_path, index=False)
# Convert to numpy groups = np.array(groups) y = np.array(y) X = np.array(X) unique_groups = np.unique(groups) # Rank data ranked_y = np.zeros_like(y) for g in unique_groups: indices = groups == g ranks = rankdata(y[indices]) ranked_y[indices] = np.array(ranks / np.max(ranks) * 1000).astype(int) # Ranker ranker = LGBMRanker(n_estimators=500, learning_rate=0.05, num_leaves=16, label_gain=np.arange(0, 1001, 1)) logo = LeaveOneGroupOut() correlations = [] for train_index, test_index in tqdm(logo.split(X, y, groups)): unique, counts = np.unique(groups[train_index], return_counts=True) ranker.fit(X[train_index], ranked_y[train_index], group=counts) predictions = ranker.predict(X[test_index]) correlation, p_value = kendalltau(ranked_y[test_index], predictions) print(np.unique(groups[test_index]), correlation) correlations.append(correlation) print("Mean correlation: ", np.mean(correlations))
train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0][:split_idx] # val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0] val_ind = np.arange(split_idx, 4868466) print("train_ind: {} / val_ind: {}".format(train_ind, val_ind)) logger.info( f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}") meta_train = meta.iloc[train_ind] meta_val = meta.iloc[val_ind] X_train = mat[train_ind.min():(train_ind.max() + 1)] X_val = mat[val_ind.min():(val_ind.max() + 1)] del mat gc.collect() with timer("model fitting"): model = LGBMRanker(**BEST_PARAMS) model.fit(X_train, meta_train["was_clicked"].values, group=group_lengths(meta_train["clickout_id"].values)) val_pred = model.predict(X_val) train_pred = model.predict(X_train) logger.info("Train AUC {:.4f}".format( roc_auc_score(meta_train["was_clicked"].values, train_pred))) logger.info("Val AUC {:.4f}".format( roc_auc_score(meta_val["was_clicked"].values, val_pred))) meta_val["click_proba"] = val_pred logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba"))) githash = get_git_hash() meta_val.to_csv(f"predictions/model_val_{githash}.csv", index=False) joblib.dump(model, "model_val.joblib")
def mrr_metric(train_data, preds):
    """LightGBM eval callback reporting mean reciprocal rank.

    Returns the (name, value, is_higher_better) triple LightGBM expects.
    NOTE(review): the group ids come from the module-level `df_val` closure,
    so this metric is only meaningful for the eval_set below. The reported
    name "error" is misleading — the value is an MRR, higher is better
    (hence the trailing ``True``).
    """
    mrr = mrr_fast_v2(train_data, preds, df_val["clickout_id"].values)
    return "error", mrr, True


model = LGBMRanker(learning_rate=0.05, n_estimators=900, min_child_samples=5,
                   min_child_weight=0.00001, n_jobs=-2)
# Fit on the training split; `group` assumes rows are contiguous per
# clickout_id. Validation MRR is printed per boosting round via mrr_metric.
model.fit(
    mat_train,
    df_train["was_clicked"],
    group=group_lengths(df_train["clickout_id"]),
    # sample_weight=np.where(df_train["clickout_step_rev"]==1,2,1),
    verbose=True,
    eval_set=[(mat_val, df_val["was_clicked"])],
    eval_group=[group_lengths(df_val["clickout_id"])],
    eval_metric=mrr_metric,
)
# Attach scores to both splits; overall validation MRR first.
df_train["click_proba"] = model.predict(mat_train)
df_val["click_proba"] = model.predict(mat_val)
print(mrr_fast(df_val, "click_proba"))
print("By rank")
# MRR broken down by clickout_step_rev — presumably the clickout's reverse
# step position (1 = last step); verify against where the column is built.
for n in range(1, 10):
    print(n, mrr_fast(df_val[df_val["clickout_step_rev"] == n], "click_proba"))