Пример #1
0
def train_model():
    """Fit an ``LGBMRanker`` on the pre-computed training features.

    Reads ``data/train_new_features.csv``, discards the columns that are not
    used as model inputs, and fits the ranker with one query group per
    session.

    Returns:
        The fitted ``LGBMRanker``. (The original implementation discarded the
        model; returning it is harmless for callers that ignore the result.)
    """
    train = pd.read_csv('data/train_new_features.csv')
    train = train.drop(
        columns=['city', 'reference', 'action_type', 'hotel_cat'])

    y_train = train['label']
    # Group sizes have to be derived before ``session_id`` is dropped.
    # NOTE(review): ``group_lengths`` is defined elsewhere; presumably it
    # returns the run length of each consecutive session id -- confirm.
    groups = group_lengths(train["session_id"].values)
    x_train = train.drop(columns=['label', 'user_id', 'session_id'])

    # PARAMS must be unpacked into keyword arguments: passed positionally it
    # would be bound to the first constructor parameter (``boosting_type``)
    # instead of configuring the model.
    # NOTE(review): assumes PARAMS is a mapping of LightGBM parameters.
    ranker = LGBMRanker(**PARAMS)
    ranker.fit(x_train, y_train.values, group=groups, verbose=1)
    return ranker
Пример #2
0
def parse_model_instance(model_config):
    """Build the ranker described by ``model_config``.

    Args:
        model_config: mapping with two keys:
            ``"model_class"`` -- one of ``"LGBMRanker"``, ``"LGBMRankerMRR"``,
            ``"LGBMRankerMRR2"`` or ``"LGBMRankerMRR3"``;
            ``"model_params"`` -- keyword arguments forwarded to that class.

    Returns:
        A new instance of the requested class.

    Raises:
        KeyError: if either key is missing from ``model_config``.
        AssertionError: if ``model_class`` is not one of the supported names.
    """
    model_class = model_config["model_class"]
    model_params = model_config["model_params"]

    if model_class == "LGBMRanker":
        return LGBMRanker(**model_params)
    if model_class == "LGBMRankerMRR":
        return LGBMRankerMRR(**model_params)
    if model_class == "LGBMRankerMRR2":
        return LGBMRankerMRR2(**model_params)
    if model_class == "LGBMRankerMRR3":
        return LGBMRankerMRR3(**model_params)

    # Raised explicitly rather than via ``assert False``: assert statements
    # are removed under ``python -O``, which would let an unknown name fall
    # through to an UnboundLocalError. The exception type is kept so existing
    # callers see the same error class.
    raise AssertionError(f"Unsupported model_class: {model_class!r}")
Пример #3
0
def get_predictions(df: pd.DataFrame, model: lgbm.LGBMRanker):
    """
    Score every row of ``df`` with ``model`` and write the ranking to CSV.

    ``df`` is modified in place: a 'relevance' column holding the model
    output is added and the rows are re-ordered by ascending 'srch_id' and,
    within each search, by descending 'relevance'.

    :param df: frame passed as-is to ``model.predict``; must contain the
        'srch_id' and 'prop_id' columns
    :param model: ranker used to score the rows
    :return: None; the ordered (srch_id, prop_id) pairs are written to
        'expedia_answer.csv' without the index
    """

    print(f'\tPredicting relevance')
    df['relevance'] = model.predict(df)

    # Best-scoring property first inside each search.
    df.sort_values(by=['srch_id', 'relevance'],
                   ascending=[True, False],
                   inplace=True)

    submission = df.loc[:, ['srch_id', 'prop_id']]
    print(f'\t Writing answers to csv')
    submission.to_csv('expedia_answer.csv', index=False)
Пример #4
0
def _query_sizes(frame):
    """Return the row count of each 'srch_id' in order of first appearance."""
    return list(Counter(np.asarray(frame['srch_id'])).values())


def train_lgbm(df: pd.DataFrame,
               gbm: lgbm.LGBMRanker = None,
               cv: bool = False):
    """
    Train an LGBM ranker on ``df``, either once on all rows or fold by fold.

    ``df`` is modified in place: the 'click_bool', 'booking_bool' and
    'position' columns are dropped before training.

    :param df: training frame; must contain 'relevance' (the target),
        'srch_id' (the query id) and the three columns dropped above
    :param gbm: overwritten by a newly constructed ranker before it is used;
        only when ``cv`` is true and ``split_data`` yields no folds is the
        passed value forwarded unchanged to ``save_model``
    :param cv: when true, fit one ranker per fold produced by ``split_data``,
        print the per-fold 'ndcg@5' scores and hand the last fold's ranker
        together with the scores to ``save_model``
    :return: the fitted ranker when ``cv`` is false, otherwise None
    """

    df.drop(['click_bool', 'booking_bool', 'position'], axis=1, inplace=True)

    candidate_categoricals = (
        'prop_country_id', 'srch_id', 'site_id',
        'visitor_location_country_id', 'prop_id', 'srch_destination_id',
    )
    # Only columns actually present are declared categorical.
    categorical_values = [
        name for name in candidate_categoricals if name in df.columns.values
    ]

    if not cv:
        # Single fit on the complete frame.
        X_train = df.drop(columns='relevance')
        y_train = df['relevance']
        train_queries = _query_sizes(X_train)

        gbm = lgbm.LGBMRanker(n_estimators=700)
        gbm.fit(X_train,
                y_train,
                group=train_queries,
                categorical_feature=categorical_values)
        return gbm

    print("\tSplitting data")
    train_ids, val_ids = split_data(df)
    cv_scores = []

    for fold, fold_train_ids in enumerate(train_ids):
        # Rows are assigned to a side of the fold by their search id.
        train_part = df.loc[df['srch_id'].isin(fold_train_ids)]
        val_part = df.loc[df['srch_id'].isin(val_ids[fold])]

        y_train = train_part['relevance']
        y_val = val_part['relevance']
        X_train = train_part.drop(columns='relevance')
        X_val = val_part.drop(columns='relevance')

        train_queries = _query_sizes(X_train)
        val_queries = _query_sizes(X_val)

        gbm = lgbm.LGBMRanker(n_estimators=700)
        print(f"\tTraining LGBM Ranker, fold {fold + 1}")

        gbm.fit(X_train,
                y_train,
                group=train_queries,
                eval_set=[(X_val, y_val)],
                eval_group=[val_queries],
                eval_at=[5, 38],
                early_stopping_rounds=50,
                categorical_feature=categorical_values)

        # Least important feature first.
        ranked_importance = sorted(
            zip(X_train.columns.values, gbm.feature_importances_),
            key=lambda pair: pair[1])
        print(f"Feature importance: {ranked_importance}")
        print('\n')
        cv_scores.append(gbm.best_score_['valid_0']['ndcg@5'])

    print(cv_scores)
    # Only the ranker fitted on the final fold is still bound to ``gbm``.
    save_model(gbm, cv_scores)
    return
    # NOTE(review): this span has no visible header and reads `meta`, `mat`
    # and `split_idx`, none of which are defined in view -- it looks like the
    # tail of a separate snippet; confirm where it belongs.
    #
    # Positions of rows flagged neither validation nor test, truncated to the
    # first `split_idx` of them.
    train_ind = np.where((meta.is_val == 0)
                         & (meta.is_test == 0))[0][:split_idx]
    # val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
    # Validation positions are instead every index from `split_idx` up to a
    # hard-coded bound (presumably the total row count -- TODO confirm).
    val_ind = np.arange(split_idx, 4868466)
    print("train_ind: {} / val_ind: {}".format(train_ind, val_ind))
    logger.info(
        f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
    meta_train = meta.iloc[train_ind]
    meta_val = meta.iloc[val_ind]
    # The feature matrix is sliced by the min..max range of each index array
    # rather than by the indices themselves; this lines up with
    # meta_train / meta_val only if the indices are contiguous -- TODO confirm.
    X_train = mat[train_ind.min():(train_ind.max() + 1)]
    X_val = mat[val_ind.min():(val_ind.max() + 1)]
    # Drop the reference to the full matrix and run a collection pass before
    # the code that follows.
    del mat
    gc.collect()

# Fit the ranker on the training split, log train/validation AUC and
# validation MRR, then persist the validation predictions and the model.
# `X_train`, `X_val`, `meta_train`, `meta_val`, `timer`, `logger` and the
# helper functions are expected to exist already.
with timer("model fitting"):
    model = LGBMRanker(**BEST_PARAMS)
    # One query group per run of `clickout_id`.
    # NOTE(review): `group_lengths` is defined elsewhere -- confirm it
    # returns consecutive run lengths.
    model.fit(X_train,
              meta_train["was_clicked"].values,
              group=group_lengths(meta_train["clickout_id"].values))
    val_pred = model.predict(X_val)
    train_pred = model.predict(X_train)
    logger.info("Train AUC {:.4f}".format(
        roc_auc_score(meta_train["was_clicked"].values, train_pred)))
    logger.info("Val AUC {:.4f}".format(
        roc_auc_score(meta_val["was_clicked"].values, val_pred)))
    # `assign` builds a new frame instead of writing a column into
    # `meta_val` in place; an in-place write on a frame obtained by
    # selecting rows from another frame triggers pandas'
    # SettingWithCopyWarning.
    meta_val = meta_val.assign(click_proba=val_pred)
    logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
    # The output file name carries the current git hash.
    githash = get_git_hash()
    meta_val.to_csv(f"predictions/model_val_{githash}.csv", index=False)
    joblib.dump(model, "model_val.joblib")
Пример #6
0
def build_pairwise_model():
    """Return a two-step pipeline: ``data_pipeline()`` then an LGBMRanker.

    The ranker is created with ``n_estimators=200``; every other setting is
    left at the library default. The pipeline is returned unfitted.
    """
    preprocessing = data_pipeline()
    ranker = LGBMRanker(n_estimators=200)
    return make_pipeline(preprocessing, ranker)
Пример #7
0
    
    # NOTE(review): this is the body of a function whose header is not
    # visible; `cat_ft_list`, `lgtplus`, `x_train` and `y_train` come from
    # outside this span.
    #
    ## Create Pairs, see comment in original code
    # (Disabled.) Note that the disabled code writes 'newx.csv' / 'newy.csv'
    # while the loader below reads 'x_pairs.csv' / 'y_pairs.csv'.
    # x_pairs, y_pairs = get_pairs(pd.DataFrame(y_train).join(x_train),granularity = 10,cutoff_ratio = 2,MAX_ITER = np.inf ,MAX_GROUP = 50000,verbose=True)
    # x_pairs.to_csv('newx.csv', index=False)
    # y_pairs.to_csv('newy.csv', index=False)
    # Load the pre-computed pairs from file. The features file is indexed by
    # its 'Unnamed: 0' column.
    x_pairs = pd.read_csv("x_pairs.csv")
    x_pairs.set_index('Unnamed: 0', inplace=True)
    # The target file is read without a header row; column 0 becomes the
    # index and the single remaining column is named 'CTR'.
    y_pairs = pd.read_csv("y_pairs.csv",header=None)
    y_pairs.set_index(0, inplace=True)
    y_pairs.columns = ['CTR']
    
    ## Train LGBMRanker
    # The ranker is configured with the "regression" objective.
    # NOTE(review): num_trees is 10 with 325 left commented out --
    # presumably a shortened setting for quick runs; confirm before relying
    # on the results.
    lgbr_multi = LGBMRanker(objective = "regression",
                      learning_rate = 0.248,
                       num_leaves = 300,
                       # num_trees = 325,
                       num_trees = 10,
                       max_depth=20)
    
    # Every query group has size 2, and there are floor(len(x_pairs) / 2)
    # groups. Assumes the two rows of a pair are adjacent and the row count
    # is even (otherwise the group sizes do not add up to the number of
    # rows) -- TODO confirm.
    lgbr_multi.fit(x_pairs,y_pairs, group = [2 for i in range(int(len(x_pairs)/2))],categorical_feature = cat_ft_list)    
    
    ## Getting All Paths
    # NOTE(review): `lgtplus.get_all_paths` is a project helper; presumably
    # it extracts the decision paths of the fitted trees -- confirm.
    adprofile_allwx_all_path = lgtplus.get_all_paths(lgbr_multi)    
    
    ## Getting High Frequency Patterns
    # Feature names are read from the model dump through the private
    # `_Booster` attribute.
    feature_names = lgbr_multi._Booster.dump_model()["feature_names"]
    # Prints the element at index 9 of the extracted paths (looks like a
    # debugging aid).
    print(adprofile_allwx_all_path[9])
    # Three calls that differ only in the second argument (2, 3, 4).
    # NOTE(review): that argument is presumably the pattern length and 50 a
    # frequency threshold -- confirm against `get_hi_freq_pattern`.
    L2 = lgtplus.get_hi_freq_pattern(adprofile_allwx_all_path,2,50,True,feature_names)
    L3 = lgtplus.get_hi_freq_pattern(adprofile_allwx_all_path,3,50,True,feature_names)
    L4 = lgtplus.get_hi_freq_pattern(adprofile_allwx_all_path,4,50,True,feature_names)
    
Пример #8
0
    # NOTE(review): this is the body of a function whose header is not
    # visible; `groups`, `y` and `X` are inputs defined outside this span.
    #
    # Convert to numpy
    groups = np.array(groups)
    y = np.array(y)
    X = np.array(X)
    unique_groups = np.unique(groups)

    # Rank data: within each group the targets are replaced by their rank,
    # rescaled so the largest rank maps to 1000 and truncated to an integer.
    # `ranked_y` takes its dtype from `y` (np.zeros_like).
    ranked_y = np.zeros_like(y)
    for g in unique_groups:
        indices = groups == g
        ranks = rankdata(y[indices])
        ranked_y[indices] = np.array(ranks / np.max(ranks) * 1000).astype(int)

    # Ranker: `label_gain` has 1001 entries (0..1000), one per possible label
    # value produced above.
    ranker = LGBMRanker(n_estimators=500,
                        learning_rate=0.05,
                        num_leaves=16,
                        label_gain=np.arange(0, 1001, 1))

    # Each split holds out all rows of exactly one group.
    logo = LeaveOneGroupOut()

    correlations = []
    for train_index, test_index in tqdm(logo.split(X, y, groups)):
        # `counts` is ordered by sorted unique group value (`unique` itself
        # is not used afterwards).
        # NOTE(review): `group=` presumably expects sizes in row order, so
        # this assumes the training rows are stored contiguously per group
        # in ascending group order -- TODO confirm.
        unique, counts = np.unique(groups[train_index], return_counts=True)
        # The same estimator object is refitted for every fold.
        ranker.fit(X[train_index], ranked_y[train_index], group=counts)
        predictions = ranker.predict(X[test_index])
        # Kendall's tau between the held-out group's rank labels and the
        # predicted scores; the p-value is not used.
        correlation, p_value = kendalltau(ranked_y[test_index], predictions)
        print(np.unique(groups[test_index]), correlation)
        correlations.append(correlation)
    print("Mean correlation: ", np.mean(correlations))
Пример #9
0
# Load the training and validation frames and turn them into feature
# matrices. `read_data` and `make_vectorizer_1` are defined elsewhere.
df_train, df_val = read_data()
vectorizer = make_vectorizer_1()
# The vectorizer is fitted on the training frame only, with the
# 'was_clicked' column passed as the target argument.
mat_train = vectorizer.fit_transform(df_train, df_train["was_clicked"])
print(mat_train.shape)
# The validation frame is only transformed with the already fitted vectorizer.
mat_val = vectorizer.transform(df_val)
print(mat_val.shape)


def mrr_metric(train_data, preds):
    """Custom evaluation metric computing MRR via ``mrr_fast_v2``.

    Grouping always uses the 'clickout_id' column of the module-level
    ``df_val`` frame, whatever data the arguments come from.

    NOTE(review): despite its name, ``train_data`` is whatever the caller
    passes as the first metric argument (presumably the true labels of the
    evaluation set) -- confirm.

    :param train_data: first argument forwarded to ``mrr_fast_v2``
    :param preds: predicted scores, forwarded to ``mrr_fast_v2``
    :return: the tuple ("error", <mrr value>, True); the final True marks
        the value as higher-is-better, although the metric is reported
        under the name "error"
    """
    clickout_ids = df_val["clickout_id"].values
    score = mrr_fast_v2(train_data, preds, clickout_ids)
    return ("error", score, True)


# Build the ranker, fit it on the training matrix while evaluating the
# custom MRR metric on the validation matrix, then score both frames.
model = LGBMRanker(learning_rate=0.05,
                   n_estimators=900,
                   min_child_samples=5,
                   min_child_weight=0.00001,
                   n_jobs=-2)
# Query groups for both training and evaluation are derived from
# 'clickout_id'.
# NOTE(review): `group_lengths` is defined elsewhere; presumably it returns
# consecutive run lengths, which requires rows sorted by clickout -- confirm.
model.fit(
    mat_train,
    df_train["was_clicked"],
    group=group_lengths(df_train["clickout_id"]),
    # sample_weight=np.where(df_train["clickout_step_rev"]==1,2,1),
    verbose=True,
    eval_set=[(mat_val, df_val["was_clicked"])],
    eval_group=[group_lengths(df_val["clickout_id"])],
    eval_metric=mrr_metric,
)

# Store the model output next to the source rows.
# NOTE(review): the column is called 'click_proba', but a ranker's predict
# presumably returns unbounded ranking scores rather than probabilities --
# confirm before interpreting the values as such.
df_train["click_proba"] = model.predict(mat_train)
df_val["click_proba"] = model.predict(mat_val)