Пример #1
0
def get_predictions(df: pd.DataFrame, model: lgbm.LGBMRanker,
                    output_path: str = 'expedia_answer.csv') -> None:
    """Score *df* with *model* and write a Kaggle submission CSV.

    The frame is modified in place: a ``'relevance'`` column is added and
    the rows are sorted by (``srch_id`` ascending, ``relevance`` descending)
    so the highest-scored property comes first within each search.

    :param df: feature frame containing at least ``srch_id`` and ``prop_id``
        columns (all columns are fed to ``model.predict``).
    :param model: fitted ranker exposing a ``predict(df)`` method.
    :param output_path: destination CSV; defaults to the original
        hard-coded ``'expedia_answer.csv'``.
    :return: ``None``; the (srch_id, prop_id) ranking is written to disk.
    """
    print('\tPredicting relevance')
    # Predict BEFORE adding the 'relevance' column so the model sees only
    # the original feature columns.
    df['relevance'] = model.predict(df)
    df.sort_values(by=['srch_id', 'relevance'],
                   ascending=[True, False],
                   inplace=True)
    kaggle_answer = pd.DataFrame({
        'srch_id': df['srch_id'],
        'prop_id': df['prop_id']
    })
    print('\t Writing answers to csv')
    kaggle_answer.to_csv(output_path, index=False)
Пример #2
0
    # Convert the incoming sequences to numpy arrays so boolean masking
    # and fancy indexing work below.
    groups = np.array(groups)
    y = np.array(y)
    X = np.array(X)
    unique_groups = np.unique(groups)

    # Rank the targets *within* each group and rescale to integers in
    # [.., 1000] so they match label_gain=np.arange(0, 1001) given to the
    # ranker below (LightGBM needs integer relevance labels).
    ranked_y = np.zeros_like(y)
    for g in unique_groups:
        indices = groups == g  # boolean mask selecting this group's rows
        ranks = rankdata(y[indices])
        ranked_y[indices] = np.array(ranks / np.max(ranks) * 1000).astype(int)

    # Ranker
    ranker = LGBMRanker(n_estimators=500,
                        learning_rate=0.05,
                        num_leaves=16,
                        label_gain=np.arange(0, 1001, 1))

    # Leave-one-group-out CV: each unique group is held out exactly once.
    logo = LeaveOneGroupOut()

    correlations = []
    for train_index, test_index in tqdm(logo.split(X, y, groups)):
        # Per-group row counts passed as LightGBM's `group=` sizes.
        # NOTE(review): np.unique returns counts in *sorted* group order;
        # this matches the rows only if the data is ordered by group —
        # verify upstream ordering.
        unique, counts = np.unique(groups[train_index], return_counts=True)
        ranker.fit(X[train_index], ranked_y[train_index], group=counts)
        predictions = ranker.predict(X[test_index])
        # Kendall's tau between true within-group ranks and predicted scores.
        correlation, p_value = kendalltau(ranked_y[test_index], predictions)
        print(np.unique(groups[test_index]), correlation)
        correlations.append(correlation)
    print("Mean correlation: ", np.mean(correlations))
    # Training rows: not validation, not test, capped at split_idx rows.
    train_ind = np.where((meta.is_val == 0)
                         & (meta.is_test == 0))[0][:split_idx]
    # val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
    # Hard-coded upper bound: rows [split_idx, 4868466) are validation —
    # presumably the total row count of `meta`; TODO confirm.
    val_ind = np.arange(split_idx, 4868466)
    print("train_ind: {} / val_ind: {}".format(train_ind, val_ind))
    logger.info(
        f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
    meta_train = meta.iloc[train_ind]
    meta_val = meta.iloc[val_ind]
    # Slice the feature matrix by min..max index range; this assumes
    # train_ind / val_ind are contiguous blocks of rows — TODO confirm.
    X_train = mat[train_ind.min():(train_ind.max() + 1)]
    X_val = mat[val_ind.min():(val_ind.max() + 1)]
    # Drop the full matrix before fitting to lower peak memory usage.
    del mat
    gc.collect()

with timer("model fitting"):
    # Fit a LightGBM ranker with the tuned hyper-parameters, then report
    # train/val AUC and val MRR and persist predictions + model.
    model = LGBMRanker(**BEST_PARAMS)
    model.fit(X_train,
              meta_train["was_clicked"].values,
              # per-query group sizes (rows grouped by clickout_id)
              group=group_lengths(meta_train["clickout_id"].values))
    val_pred = model.predict(X_val)
    train_pred = model.predict(X_train)
    logger.info("Train AUC {:.4f}".format(
        roc_auc_score(meta_train["was_clicked"].values, train_pred)))
    logger.info("Val AUC {:.4f}".format(
        roc_auc_score(meta_val["was_clicked"].values, val_pred)))
    meta_val["click_proba"] = val_pred
    logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
    # Tag the saved validation predictions with the current git commit.
    githash = get_git_hash()
    meta_val.to_csv(f"predictions/model_val_{githash}.csv", index=False)
    joblib.dump(model, "model_val.joblib")
Пример #4
0

def mrr_metric(train_data, preds):
    """Custom LightGBM eval metric: validation MRR grouped by clickout.

    Reads the global ``df_val`` for the query grouping. Returns the
    ``(name, value, is_higher_better)`` triple LightGBM expects.
    """
    score = mrr_fast_v2(train_data, preds, df_val["clickout_id"].values)
    return "error", score, True


# Build and fit the LightGBM ranker with per-query grouping and a custom
# MRR eval metric on the validation fold.
# NOTE(review): n_jobs=-2 usually means "all cores but one" (joblib
# convention) — verify LightGBM honors negative values the same way.
model = LGBMRanker(learning_rate=0.05,
                   n_estimators=900,
                   min_child_samples=5,
                   min_child_weight=0.00001,
                   n_jobs=-2)
model.fit(
    mat_train,
    df_train["was_clicked"],
    # rows must be grouped by query: per-clickout row counts
    group=group_lengths(df_train["clickout_id"]),
    # sample_weight=np.where(df_train["clickout_step_rev"]==1,2,1),
    verbose=True,
    eval_set=[(mat_val, df_val["was_clicked"])],
    eval_group=[group_lengths(df_val["clickout_id"])],
    # custom metric defined above; reports validation MRR during training
    eval_metric=mrr_metric,
)

# Attach predicted scores to both frames, then report validation MRR
# overall and broken down by clickout step (reverse rank 1..9).
df_train["click_proba"] = model.predict(mat_train)
df_val["click_proba"] = model.predict(mat_val)

print(mrr_fast(df_val, "click_proba"))
print("By rank")
for n in range(1, 10):
    print(n, mrr_fast(df_val[df_val["clickout_step_rev"] == n], "click_proba"))