Exemplo n.º 1
0
    def fit_and_predict(self, df_train, df_val, validate=False):
        """Vectorize both frames, fit the model and score the validation frame.

        Args:
            df_train: Training frame. Its ``was_clicked`` column is used as the
                target; ``clickout_id`` is additionally read to build the
                ``group`` argument when the model is an ``LGBMRanker``.
            df_val: Frame that is transformed with the fitted vectorizer and
                scored.
            validate: When true, the training matrix is scored too and both
                prediction arrays are handed to ``self.evaluate``.

        Returns:
            Predictions for ``df_val``: column 1 of ``predict_proba`` when
            ``self.is_prob`` is truthy, otherwise the output of ``predict``.
        """
        with timer("vectorizing train"):
            mat_train = self.vectorizer.fit_transform(df_train)
            print("Train shape", mat_train.shape)
        with timer("vectorinzg val"):
            mat_val = self.vectorizer.transform(df_val)
            print("Val shape", mat_val.shape)

        with timer("fitting model"):
            # Only the ranker needs to know how rows are grouped into queries.
            fit_kwargs = {}
            if isinstance(self.model, LGBMRanker):
                fit_kwargs["group"] = group_lengths(df_train["clickout_id"].values)
            self.model.fit(mat_train, df_train["was_clicked"].values, **fit_kwargs)

        wants_proba = self.is_prob

        def score(matrix):
            # One scoring path for both matrices; the flag picks the model API.
            if wants_proba:
                return self.model.predict_proba(matrix)[:, 1]
            return self.model.predict(matrix)

        # Progress messages are only emitted on the non-probabilistic path.
        if not wants_proba:
            print("Predicting validation")
        val_pred = score(mat_val)

        if validate:
            if not wants_proba:
                print("Predicting train")
            train_pred = score(mat_train)
            self.evaluate(df_train, df_val, train_pred, val_pred)

        self.save_predictions(df_val, val_pred, validate)
        return val_pred
Exemplo n.º 2
0
def _take_contiguous_rows(mat, row_ind, label):
    """Return the rows of ``mat`` spanned by ``row_ind`` as one slice.

    The matrix is read with a single ``min:max+1`` slice, which only lines up
    with ``meta.iloc[row_ind]`` when ``row_ind`` is a gap-free ascending run.
    That assumption is checked here so a bad split fails immediately with a
    readable message instead of a shape mismatch further down.

    Args:
        mat: Row-sliceable matrix.
        row_ind: Sorted, unique integer positions (as produced by ``np.where``).
        label: Name of the split, used only in error messages.

    Raises:
        ValueError: If ``row_ind`` is empty or contains gaps.
    """
    if row_ind.size == 0:
        raise ValueError(f"No {label} rows selected; check the is_val / is_test flags")
    first, last = row_ind.min(), row_ind.max()
    if last - first + 1 != row_ind.size:
        raise ValueError(
            f"{label} rows are not contiguous ({row_ind.size} rows spread over "
            f"positions {first}..{last}); a single slice would misalign features and metadata"
        )
    return mat[first : last + 1]


def run_model(mat_path, meta_path, model_instance, predictions_path, model_path, val, logger):
    """Train ``model_instance`` on stored features, then save the model and its predictions.

    Args:
        mat_path: Path of the h5sparse file holding the feature matrix under
            the ``"matrix"`` key.
        meta_path: Path of the HDF file holding the metadata frame under the
            ``"data"`` key. The frame must provide ``is_val``, ``is_test``,
            ``was_clicked`` and ``clickout_id``.
        model_instance: Estimator whose ``fit`` accepts a ``group`` keyword and
            which exposes ``predict``.
        predictions_path: CSV destination for the scored metadata rows.
        model_path: Destination of the ``joblib`` dump of the fitted model.
        val: If truthy, train on rows with ``is_val == 0`` and score rows with
            ``is_val == 1`` (test rows excluded from both) and log validation
            metrics. Otherwise train on every non-test row and score the test
            rows.
        logger: Logger used for shape and metric messages.

    Returns:
        None. Results are written to ``model_path`` and ``predictions_path``.

    Raises:
        ValueError: If either split is empty or not a contiguous row range.
    """
    with timer("read data"):
        meta = pd.read_hdf(meta_path, key="data")
        mat = h5sparse.File(mat_path, mode="r")["matrix"]

    with timer("split data"):
        if val:
            train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0]
            val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
        else:
            train_ind = np.where(meta.is_test == 0)[0]
            val_ind = np.where(meta.is_test == 1)[0]

        logger.info(f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
        meta_train = meta.iloc[train_ind]
        # Explicit copy: a prediction column is added below, and writing into a
        # bare ``iloc`` selection triggers pandas' SettingWithCopyWarning.
        meta_val = meta.iloc[val_ind].copy()
        X_train = _take_contiguous_rows(mat, train_ind, "train")
        X_val = _take_contiguous_rows(mat, val_ind, "val")
        # Drop the reference to the on-disk matrix before the memory-heavy fit.
        del mat
        gc.collect()

    with timer("fit model"):
        y_train = meta_train["was_clicked"].values
        model_instance.fit(X_train, y_train, group=group_lengths(meta_train["clickout_id"].values))
        # Persist before scoring so a later failure does not lose the fit.
        joblib.dump(model_instance, model_path)
        val_pred = model_instance.predict(X_val)
        train_pred = model_instance.predict(X_train)
        logger.info("Train AUC {:.4f}".format(roc_auc_score(y_train, train_pred)))
        meta_val["click_proba"] = val_pred
        if val:
            # Metrics on the scored split are only logged in validation mode.
            logger.info("Val AUC {:.4f}".format(roc_auc_score(meta_val["was_clicked"].values, val_pred)))
            logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
        meta_val.to_csv(predictions_path, index=False)
    # Fixed-position split: relies on `meta`, `mat`, `split_idx` and `logger`
    # being defined before this point (not visible in this fragment).
    # Training rows: the first `split_idx` positions among rows that are
    # neither validation nor test.
    train_ind = np.where((meta.is_val == 0)
                         & (meta.is_test == 0))[0][:split_idx]
    # Previous flag-based validation selection, kept for reference:
    # val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
    # Validation rows: every position from `split_idx` up to a hard-coded end.
    # NOTE(review): 4868466 is presumably the row count of `meta` -- confirm.
    # NOTE(review): `train_ind` holds positions of *filtered* rows, so its
    # largest value can exceed `split_idx` when flagged rows precede it, in
    # which case the two index sets overlap -- confirm this cannot happen.
    val_ind = np.arange(split_idx, 4868466)
    print("train_ind: {} / val_ind: {}".format(train_ind, val_ind))
    logger.info(
        f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
    meta_train = meta.iloc[train_ind]
    meta_val = meta.iloc[val_ind]
    # The matrix is read as one min..max slice per split, which only matches
    # the metadata selections above when each index set is gap-free.
    X_train = mat[train_ind.min():(train_ind.max() + 1)]
    X_val = mat[val_ind.min():(val_ind.max() + 1)]
    # Release the matrix handle before training.
    del mat
    gc.collect()

# Fit the ranker on the training slice, log train/validation metrics, and
# persist both the fitted model and the scored validation rows.
with timer("model fitting"):
    model = LGBMRanker(**BEST_PARAMS)
    model.fit(X_train,
              meta_train["was_clicked"].values,
              group=group_lengths(meta_train["clickout_id"].values))
    # Dump the model straight after fitting so that a failure while scoring
    # or writing the CSV cannot discard the training run.
    joblib.dump(model, "model_val.joblib")

    val_pred = model.predict(X_val)
    train_pred = model.predict(X_train)
    logger.info("Train AUC {:.4f}".format(
        roc_auc_score(meta_train["was_clicked"].values, train_pred)))
    logger.info("Val AUC {:.4f}".format(
        roc_auc_score(meta_val["was_clicked"].values, val_pred)))

    # `meta_val` is an `iloc` selection of `meta`; `assign` returns a new
    # frame instead of writing into that selection, which avoids pandas'
    # SettingWithCopyWarning.
    meta_val = meta_val.assign(click_proba=val_pred)
    logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))

    # The git hash in the file name ties the predictions to the code version.
    githash = get_git_hash()
    meta_val.to_csv(f"predictions/model_val_{githash}.csv", index=False)
Exemplo n.º 4
0
    # Load every validation prediction file in parallel; `preds_vals_all`
    # holds (hash, filename) pairs and only the filename is passed on.
    # NOTE(review): `Pool` is presumably multiprocessing.Pool -- confirm.
    with Pool(32) as pool:
        val_predictions_dfs = pool.map(read_prediction_val, [fn for _, fn in preds_vals_all])
    # Keep only runs with the expected row count and an MRR above 0.68, and
    # drop the two runs whose filenames contain the excluded identifiers.
    val_predictions = [
        (mrr, hsh, df, config)
        for ((hsh, fn), (mrr, df, config)) in zip(preds_vals_all, val_predictions_dfs)
        if (df.shape[0] == 3_077_674) and (mrr > 0.68) and ("160357" not in fn) and ("59629" not in fn)
    ]
    val_hashes = [p[1] for p in val_predictions]

    print("Debuging click probas")
    for mrr, hsh, df, _ in val_predictions:
        print(mrr, hsh, df["click_proba"].min(), df["click_proba"].max())

    # The last retained run supplies the frame the blend is written into.
    final = val_predictions[-1][2].copy()

    lengths = group_lengths(final["clickout_id"])
    # One column per retained run, one row per prediction row.
    # NOTE(review): assumes every frame is row-aligned with `final` -- confirm.
    preds_stack = np.vstack([df["click_proba"] for _, _, df, _ in val_predictions]).T

    def opt(v):
        """Return the negated MRR of the blend weighted by ``v``.

        The sign is flipped so that a minimizer maximizes MRR.
        """
        preds_ens = preds_stack.dot(v)
        mrr = mrr_fast_v3(final["was_clicked"].values, preds_ens, lengths)
        print(f"MRR {mrr}")
        return -mrr

    # Two-stage search: start from all-zero weights, then restart from the
    # result with a tighter function tolerance.
    # NOTE(review): `fmin` is presumably scipy.optimize.fmin -- confirm.
    coefs = fmin(opt, [0] * preds_stack.shape[1])
    coefs = fmin(opt, coefs, ftol=0.000_001)

    final["click_proba"] = preds_stack.dot(coefs)
    mrr = mrr_fast(final, "click_proba")
    # Four-decimal MRR with the first two characters (e.g. "0.") stripped;
    # not used within the visible part of this fragment.
    mrr_str = f"{mrr:.4f}"[2:]
    print(mrr)
Exemplo n.º 5
0

def mrr_metric(train_data, preds):
    """Score predictions with MRR, grouped by the validation clickout ids.

    Passed as ``eval_metric`` to the ranker's ``fit`` call in this script.
    The grouping always comes from the module-level ``df_val``, so the result
    is only meaningful when the evaluated rows are those of ``df_val``.

    Args:
        train_data: First argument supplied by the caller of the metric;
            forwarded unchanged to ``mrr_fast_v2``.
        preds: Predicted scores; forwarded unchanged to ``mrr_fast_v2``.

    Returns:
        The tuple ``("error", mrr, True)``.
        NOTE(review): the trailing ``True`` is presumably LightGBM's
        "higher is better" flag -- confirm.
    """
    clickout_ids = df_val["clickout_id"].values
    return "error", mrr_fast_v2(train_data, preds, clickout_ids), True


# Ranker configuration, collected in one place and unpacked into the
# constructor.
ranker_params = {
    "learning_rate": 0.05,
    "n_estimators": 900,
    "min_child_samples": 5,
    "min_child_weight": 0.00001,
    "n_jobs": -2,
}
model = LGBMRanker(**ranker_params)

# Group sizes for the training rows and for the evaluation rows.
train_groups = group_lengths(df_train["clickout_id"])
val_groups = group_lengths(df_val["clickout_id"])

# The validation split is monitored during training with the custom MRR
# metric.
model.fit(
    mat_train,
    df_train["was_clicked"],
    group=train_groups,
    # Disabled experiment:
    # sample_weight=np.where(df_train["clickout_step_rev"]==1,2,1),
    verbose=True,
    eval_set=[(mat_val, df_val["was_clicked"])],
    eval_group=[val_groups],
    eval_metric=mrr_metric,
)

# Attach the model scores to both frames, training frame first.
for frame, matrix in ((df_train, mat_train), (df_val, mat_val)):
    frame["click_proba"] = model.predict(matrix)

# Overall validation MRR, then the same metric restricted to each value 1..9
# of `clickout_step_rev`.
print(mrr_fast(df_val, "click_proba"))
print("By rank")
for n in range(1, 10):
    at_rank = df_val[df_val["clickout_step_rev"] == n]
    print(n, mrr_fast(at_rank, "click_proba"))