示例#1
0
def test_deprecate_position_arg():
    """Assert that positional-argument usage raises ``FutureWarning``.

    For every sklearn-style estimator below, two calls are checked: the
    constructor invoked with a leading positional argument, and ``fit``
    invoked with a third positional argument.
    """
    from sklearn.datasets import load_digits

    features, labels = load_digits(return_X_y=True, n_class=2)
    # The labels double as the third positional argument handed to ``fit``.
    weights = labels

    # --- XGBRegressor ---
    with pytest.warns(FutureWarning):
        xgb.XGBRegressor(3, learning_rate=0.1)
    regressor = xgb.XGBRegressor(n_estimators=1)
    with pytest.warns(FutureWarning):
        regressor.fit(features, labels, weights)

    # --- XGBClassifier ---
    with pytest.warns(FutureWarning):
        xgb.XGBClassifier(1, use_label_encoder=False)
    classifier = xgb.XGBClassifier(n_estimators=1, use_label_encoder=False)
    with pytest.warns(FutureWarning):
        classifier.fit(features, labels, weights)

    # --- XGBRanker (third positional argument is the group array) ---
    with pytest.warns(FutureWarning):
        xgb.XGBRanker('rank:ndcg', learning_rate=0.1)
    ranker = xgb.XGBRanker(n_estimators=1)
    per_row_group = np.repeat(1, features.shape[0])
    with pytest.warns(FutureWarning):
        ranker.fit(features, labels, per_row_group)

    # --- XGBRFRegressor ---
    with pytest.warns(FutureWarning):
        xgb.XGBRFRegressor(1, learning_rate=0.1)
    rf_regressor = xgb.XGBRFRegressor(n_estimators=1)
    with pytest.warns(FutureWarning):
        rf_regressor.fit(features, labels, weights)

    # --- XGBRFClassifier ---
    with pytest.warns(FutureWarning):
        xgb.XGBRFClassifier(1, use_label_encoder=True)
    rf_classifier = xgb.XGBRFClassifier(n_estimators=1)
    with pytest.warns(FutureWarning):
        rf_classifier.fit(features, labels, weights)
示例#2
0
    def __init__(self,
                 param=None,
                 blend_type='multiply',
                 scale=True,
                 test_model=None,
                 control_model=None,
                 ranking=True):
        """Set up the test/control ranker pair and the blending options.

        Args:
            param: Keyword arguments used to build any ``xgboost.XGBRanker``
                that is not supplied explicitly. ``None`` selects a built-in
                default configuration. The mapping is copied before use, so
                the caller's dict is never modified; a ``'verbose': True``
                entry is added to the copy when the key is absent.
            blend_type: Stored as ``self.blend_type``.
            scale: Stored as ``self.scale``.
            test_model: Pre-built model for the test arm; when ``None`` a new
                ``XGBRanker(**param)`` is created.
            control_model: Pre-built model for the control arm; when ``None``
                a new ``XGBRanker(**param)`` is created.
            ranking: Stored as ``self.ranking``.
        """
        if param is None:
            param = {
                'n_jobs': -1,
                'n_estimators': 10,
                'eval_metric': ['ndcg', 'map'],
                'objective': 'rank:ndcg',
                'verbose': True
            }
        else:
            # Copy first: injecting the default 'verbose' flag must not leak
            # back into a dict the caller may reuse elsewhere.
            param = dict(param)
            param.setdefault('verbose', True)

        self.test_model = test_model
        self.control_model = control_model

        if self.test_model is None:
            self.test_model = xgboost.XGBRanker(**param)
        self.param = param
        if self.control_model is None:
            self.control_model = xgboost.XGBRanker(**param)
        self.blend_type = blend_type
        self.scale = scale
        self.ranking = ranking
示例#3
0
    def __init__(self, mode='local', learning_rate=0.3, min_child_weight=1, n_estimators=300, max_depth=3,
                 subsample=1, colsample_bytree=1, reg_lambda=1.0, reg_alpha=0):
        """Initialise the stacking model, its ranker and module-level state.

        ``mode`` is forwarded to the base class (together with the fixed
        cluster ``'no_cluster'`` and name ``'Stacking'``) and kept on the
        instance. The remaining arguments are handed to ``xgb.XGBRanker``;
        ``max_depth`` and ``n_estimators`` are rounded up with ``math.ceil``
        before being passed on.
        """
        name = 'Stacking'
        cluster = 'no_cluster'
        super(Stacking, self).__init__(mode, cluster, name)

        # Submissions live two levels above this file's directory.
        here = Path(__file__).absolute().parent
        self.current_directory = here
        self.data_directory = here.joinpath('..', '..', 'submissions')
        # self.gt_csv = self.data_directory.joinpath('ground_truth.csv')
        self.mode = mode

        ranker_kwargs = dict(
            learning_rate=learning_rate,
            min_child_weight=min_child_weight,
            max_depth=math.ceil(max_depth),
            n_estimators=math.ceil(n_estimators),
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            n_jobs=-1,
            objective='rank:ndcg',
        )
        self.xg = xgb.XGBRanker(**ranker_kwargs)

        # NOTE: these entries are constants; they do not follow the
        # min_child_weight / subsample / colsample_bytree arguments above.
        self.fixed_params_dict = {
            'mode': mode,
            'cluster': cluster,
            'ask_to_load': False,
            'min_child_weight': 1,
            'subsample': 1,
            'colsample_bytree': 1,
        }

        # Reset the module-level bookkeeping shared outside this method.
        global _best_so_far
        global _group_t
        global _kind
        _best_so_far = 0
        _group_t = []
def loadRankingModel(dataset_name, folder_path):
    """Create an ``XGBRanker`` and load a saved model file into it.

    The file is looked up as
    ``<folder_path>/<dataset_name>_xgboost_rank_model.bin``.

    Args:
        dataset_name: Prefix of the model file name.
        folder_path: Directory containing the model file.

    Returns:
        The ranker after ``load_model`` has been called on it.
    """
    hyper_params = {
        'objective': 'rank:map',
        'learning_rate': 0.1,
        'gamma': 1.0,
        'max_depth': 6,
        'n_estimators': 4,
    }
    ranker = xgb.XGBRanker(**hyper_params)

    model_file = '{}/{}_xgboost_rank_model.bin'.format(folder_path,
                                                       dataset_name)
    ranker.load_model(model_file)
    return ranker
示例#5
0
    def score_ranking(self):
        """Build the train/test frames, fit an ``XGBRanker`` and print scores.

        The train frame is produced by ``self.df_builder`` from
        ``self.df_user_id_col`` / ``self.df_item_id_col`` and stored in
        ``self.df_train``; the test frame is stored in ``self.df_test``.
        The first 100 predictions for the test frame are printed.

        :return: None
        """
        # CREATING DATAFRAMES FOR XGBOOST
        print(">>> Preparing the two DataFrames...")
        # Train dataframe
        self.df_train = self.df_builder.build_base_dataframe(
            users=self.df_user_id_col, items=self.df_item_id_col)
        self.df_builder.build_whole_dataframe(self.df_train)

        # Test dataframe
        self.df_test = self.df_builder.retrieve_test_dataframe()

        # BUILD TRAIN GROUPS
        # One group of size ``self.cutoff`` per distinct user.
        # assumes every user contributes exactly ``self.cutoff`` consecutive
        # rows to df_train - TODO confirm against the dataframe builder.
        train_user_ids = list(self.df_train.loc[:, 'user_id'].values)
        train_group = [self.cutoff] * len(set(train_user_ids))

        # DROP USELESS COLUMNS OF DF_TRAIN AND DF_TEST
        id_columns = ['user_id', 'item_id']
        train_dropped = self.df_train.drop(labels=id_columns, axis=1)
        test_dropped = self.df_test.drop(labels=id_columns, axis=1)

        print(">>> DataFrames well formed and ready to be used!")

        # LGBM TO TRAIN FASTER ON GPU
        # lgbm_group = self.xgb_dataframe.groupby('user_id').size().values

        # lightGBM_ranker = lgb.LGBMRanker(device='gpu')
        # lightGBM_ranker.fit(train_dropped, users, lgbm_group)

        # XGB RANKER AT WORK
        print(">>> Fitting of the XGB model...")
        xgb_ranker = xgb.XGBRanker()
        # NOTE(review): the user ids are used as the ranking target here;
        # confirm this is intended rather than a relevance label column.
        # ``group`` is passed by keyword: positional use is deprecated.
        xgb_ranker.fit(train_dropped, train_user_ids, group=train_group)
        print(">>> Fitting completed!")

        print(">>> Predicting the scores...")
        predictions = xgb_ranker.predict(test_dropped)
        print(">>> DONE")

        print(predictions[0:100])
示例#6
0
def train(data):
    """Fit an ``XGBRanker`` on the pickled training set and rank ``data``.

    The training frame is read from ``../Data/training_set_VU_DM.pkl``.
    For each ``srch_id`` group in ``data`` the rows are scored, ordered by
    descending score and appended to ``../Predictions/XGB_ndcg.csv``.

    Args:
        data: Frame with at least ``srch_id`` and ``prop_id`` columns; every
            column except ``srch_id`` is fed to the model for prediction.
    """
    frame = pd.read_pickle("../Data/training_set_VU_DM.pkl")

    # Rows per search, ordered by srch_id; used as the ranker's group sizes.
    query_sizes = frame.srch_id.value_counts(sort=False).sort_index()

    # Drop columns with too many NaN values ...
    frame = frame.dropna(thresh=len(frame) * .8, axis=1)
    # ... and zero-fill whatever is still missing.
    frame = frame.fillna(0)

    # Relevance target: a booking counts four times as much as a click.
    relevance = (frame.click_bool + 4 * frame.booking_bool).values

    # Columns that must not be used as features.
    excluded = [
        'srch_id', 'date_time', 'position', 'click_bool', 'booking_bool'
    ]
    features = frame.drop(excluded, axis=1).values

    ranker = xgb.XGBRanker(tree_method='gpu_hist',
                           booster='gbtree',
                           objective='rank:ndcg',
                           random_state=42,
                           learning_rate=0.05,
                           colsample_bytree=0.9,
                           eta=0.05,
                           max_depth=6,
                           n_estimators=110,
                           subsample=0.75)
    ranker.fit(features, relevance, group=query_sizes, verbose=True)

    total_queries = data.srch_id.nunique()
    print("Predicting...")
    for idx, (_, rows) in enumerate(data.groupby('srch_id')):
        scores = ranker.predict(rows.drop('srch_id', axis=1).values)

        # Positions sorted from highest to lowest score.
        best_first = scores.argsort()[::-1]

        # Reorder the (srch_id, prop_id) pairs by the learned ranking.
        id_pairs = rows.reset_index()[['srch_id', 'prop_id']]
        ranked = id_pairs.reindex(best_first)

        # Append to the output file; only the first group writes the header.
        ranked.to_csv("../Predictions/XGB_ndcg.csv",
                      mode='a',
                      header=(idx == 0),
                      index=False)

        if idx % 100 == 0:
            print(f"Predicted {idx}/{total_queries}")
示例#7
0
def save_model_to_local_file(booster, model_params, meta, filename):
    """Save a booster to ``filename`` and export it as ``<filename>.pmml``.

    A matching sklearn wrapper (classifier, regressor or ranker) is chosen
    from the ``objective`` in ``model_params``, the booster is saved with
    the sklearn metadata attached, reloaded through the wrapper, and the
    wrapper is exported with ``sklearn2pmml``. ``meta`` is written to
    ``model_meta.json`` via ``save_model_metadata``.

    Args:
        booster: Trained booster exposing ``set_attr`` and ``save_model``.
        model_params: Training parameters; ``objective`` is required and
            ``num_class`` is required for ``multi:`` objectives.
        meta: Metadata passed through to ``save_model_metadata``.
        filename: Path of the saved booster; the PMML file is written next
            to it with a ``.pmml`` suffix appended.

    Raises:
        ValueError: If the objective is not a ``binary:``, ``multi:``,
            ``reg:`` or ``rank:`` objective.
        AssertionError: If a ``multi:`` objective has no positive
            ``num_class``.
    """
    from sklearn2pmml import PMMLPipeline, sklearn2pmml
    try:
        from xgboost.compat import XGBoostLabelEncoder
    except ImportError:
        # xgboost==0.82.0 does not have XGBoostLabelEncoder
        # in xgboost.compat.py
        from xgboost.sklearn import XGBLabelEncoder as XGBoostLabelEncoder

    objective = model_params.get("objective")
    bst_meta = dict()

    if objective.startswith(("binary:", "multi:")):
        if objective.startswith("binary:"):
            num_class = 2
        else:
            num_class = model_params.get("num_class")
            assert num_class is not None and num_class > 0, \
                "num_class should not be None"

        # To fake a trained XGBClassifier, there must be "_le", "classes_",
        # inside XGBClassifier. See here:
        # https://github.com/dmlc/xgboost/blob/d19cec70f1b40ea1e1a35101ca22e46dd4e4eecd/python-package/xgboost/sklearn.py#L356
        model = xgb.XGBClassifier()
        label_encoder = XGBoostLabelEncoder()
        label_encoder.fit(list(range(num_class)))
        model._le = label_encoder
        model.classes_ = model._le.classes_

        bst_meta["_le"] = {"classes_": model.classes_.tolist()}
        bst_meta["classes_"] = model.classes_.tolist()
    elif objective.startswith("reg:"):
        model = xgb.XGBRegressor()
    elif objective.startswith("rank:"):
        model = xgb.XGBRanker()
    else:
        raise ValueError(
            "Not supported objective {} for saving PMML".format(objective))

    model_type = type(model).__name__
    bst_meta["type"] = model_type

    # Meta data is needed for saving sklearn pipeline. See here:
    # https://github.com/dmlc/xgboost/blob/d19cec70f1b40ea1e1a35101ca22e46dd4e4eecd/python-package/xgboost/sklearn.py#L356
    booster.set_attr(scikit_learn=json.dumps(bst_meta))
    try:
        booster.save_model(filename)
        save_model_metadata("model_meta.json", meta)
    finally:
        # Always clear the temporary attribute, even when saving fails, so
        # the caller's booster is left as it was handed in.
        booster.set_attr(scikit_learn=None)
    model.load_model(filename)

    pipeline = PMMLPipeline([(model_type, model)])
    sklearn2pmml(pipeline, "{}.pmml".format(filename))
示例#8
0
def run_scikit_model_check(name, path):
    """Load a saved model through its sklearn wrapper and sanity-check it.

    The wrapper type and the expected objective are chosen from substrings
    of ``name``, tested in this order: ``reg``, ``cls``, ``ltr``,
    ``logitraw``, ``logit``. ``logitraw`` must be tested before ``logit``
    because the latter is a prefix of the former. A name matching none of
    them fails an assertion.

    Args:
        name: Model identifier; may also contain ``0.90`` to mark a model
            produced by that release.
        path: Location of the saved model file.
    """
    expected_trees = None  # filled in per branch once ``gm`` is consulted

    if 'reg' in name:
        regressor = xgboost.XGBRegressor()
        regressor.load_model(path)
        config = json.loads(regressor.get_booster().save_config())
        objective = config['learner']['learner_train_param']['objective']
        if '0.90' in name:
            assert objective == 'reg:linear'
        else:
            assert objective == 'reg:squarederror'
        expected_trees = gm.kRounds * gm.kForests
        assert (len(regressor.get_booster().get_dump()) == expected_trees)
        run_model_param_check(config)
    elif 'cls' in name:
        classifier = xgboost.XGBClassifier()
        classifier.load_model(path)
        if '0.90' not in name:
            # Class metadata is only checked for models not marked 0.90.
            assert len(classifier.classes_) == gm.kClasses
            assert len(classifier._le.classes_) == gm.kClasses
            assert classifier.n_classes_ == gm.kClasses
        expected_trees = gm.kRounds * gm.kForests * gm.kClasses
        assert (len(classifier.get_booster().get_dump()) ==
                expected_trees), path
        config = json.loads(classifier.get_booster().save_config())
        objective = config['learner']['learner_train_param']['objective']
        assert objective == 'multi:softprob', path
        run_model_param_check(config)
    elif 'ltr' in name:
        ranker = xgboost.XGBRanker()
        ranker.load_model(path)
        expected_trees = gm.kRounds * gm.kForests
        assert (len(ranker.get_booster().get_dump()) == expected_trees)
        config = json.loads(ranker.get_booster().save_config())
        objective = config['learner']['learner_train_param']['objective']
        assert objective == 'rank:ndcg'
        run_model_param_check(config)
    elif 'logitraw' in name:
        raw_logit = xgboost.XGBClassifier()
        raw_logit.load_model(path)
        expected_trees = gm.kRounds * gm.kForests
        assert (len(raw_logit.get_booster().get_dump()) == expected_trees)
        config = json.loads(raw_logit.get_booster().save_config())
        objective = config['learner']['learner_train_param']['objective']
        assert objective == 'binary:logitraw'
    elif 'logit' in name:
        logistic = xgboost.XGBClassifier()
        logistic.load_model(path)
        expected_trees = gm.kRounds * gm.kForests
        assert (len(logistic.get_booster().get_dump()) == expected_trees)
        config = json.loads(logistic.get_booster().save_config())
        objective = config['learner']['learner_train_param']['objective']
        assert objective == 'binary:logistic'
    else:
        assert False
示例#9
0
 def run_pr_auc_ltr(self, tree_method):
     """Fit a ranker with the ``aucpr`` metric and check the final score.

     A 128-sample, 4-feature binary classification set is split into three
     groups (32, 32, 64) and used both for training and evaluation. The
     last recorded ``aucpr`` value must be at least 0.99.
     """
     from sklearn.datasets import make_classification

     features, labels = make_classification(128, 4, n_classes=2,
                                            random_state=1994)
     group_sizes = np.array([32, 32, 64])

     ranker = xgb.XGBRanker(tree_method=tree_method, n_estimators=16)
     ranker.fit(features,
                labels,
                group=group_sizes,
                eval_set=[(features, labels)],
                eval_group=[group_sizes],
                eval_metric="aucpr")

     history = ranker.evals_result()["validation_0"]["aucpr"]
     assert history[-1] >= 0.99
    def _run_xgb_ranker_converter(self, num_classes, extra_config=None):
        """Convert ``XGBRanker`` models to torch and compare predictions.

        For each tested ``max_depth`` a ranker is fitted on seeded random
        float32 data (100 rows, 200 features, a single group), converted
        with hummingbird, and its predictions are compared with the
        original model's.

        Args:
            num_classes: Exclusive upper bound of the random integer labels.
            extra_config: Optional dict forwarded to
                ``hummingbird.ml.convert``; a fresh empty dict is used when
                omitted.
        """
        # A ``None`` sentinel avoids sharing one default dict across calls.
        if extra_config is None:
            extra_config = {}

        warnings.filterwarnings("ignore")
        for max_depth in [1, 3, 8, 10, 12]:
            model = xgb.XGBRanker(n_estimators=10, max_depth=max_depth)
            # Re-seed on every iteration so each depth sees identical data.
            np.random.seed(0)
            X = np.random.rand(100, 200)
            X = np.array(X, dtype=np.float32)
            y = np.random.randint(num_classes, size=100)

            # All rows belong to one query group.
            model.fit(X, y, group=[X.shape[0]])

            torch_model = hummingbird.ml.convert(model, "torch", X, extra_config=extra_config)
            self.assertIsNotNone(torch_model)
            np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-06, atol=1e-06)
    def test_float64_xgb_ranker_converter(self):
        """Check torch conversion of rankers fitted on un-cast random data.

        The features are used exactly as returned by ``np.random.rand``
        (no float32 cast), and only the first row is supplied to the
        converter as sample input.
        """
        warnings.filterwarnings("ignore")
        label_bound = 3
        for depth in (1, 3, 8, 10, 12):
            ranker = xgb.XGBRanker(n_estimators=10, max_depth=depth)

            # Same seed each round, hence the same data for every depth.
            np.random.seed(0)
            features = np.random.rand(100, 200)
            targets = np.random.randint(label_bound, size=100)

            # A single group spanning every row.
            ranker.fit(features, targets, group=[features.shape[0]])

            converted = hummingbird.ml.convert(ranker, "torch", features[0:1])
            self.assertIsNotNone(converted)
            np.testing.assert_allclose(ranker.predict(features),
                                       converted.predict(features),
                                       rtol=1e-06,
                                       atol=1e-06)
示例#12
0
    def __init__(self,
                 mode='local',
                 learning_rate=0.3,
                 min_child_weight=1,
                 n_estimators=300,
                 max_depth=3,
                 subsample=1,
                 colsample_bytree=1,
                 reg_lambda=1.0,
                 reg_alpha=0):
        """Initialise the hybrid model: paths, data handles and the ranker.

        ``mode`` goes to the base class (with cluster ``'no_cluster'`` and
        name ``'gbdt_hybrid'``) and is stored on the instance. The other
        arguments configure ``xgb.XGBRanker``; ``max_depth`` and
        ``n_estimators`` are rounded up with ``math.ceil`` first.
        """
        name = 'gbdt_hybrid'
        cluster = 'no_cluster'
        super(Gbdt_Hybrid, self).__init__(mode, cluster, name)

        here = Path(__file__).absolute().parent
        self.current_directory = here
        self.data_directory = here.joinpath('..', '..', 'submissions/hybrid')
        #self.gt_csv = self.data_directory.joinpath('ground_truth.csv')
        self.mode = mode
        self.full = data.full_df()

        # Target indices for both modes are fetched regardless of ``mode``.
        self.local_target_indices = data.target_indices(
            mode='local', cluster='no_cluster')
        self.full_target_indices = data.target_indices(
            mode='full', cluster='no_cluster')

        # Per-mode sub-directories of the submissions folder.
        directory = self.data_directory.joinpath('local')

        full_dir = self.data_directory.joinpath('full')

        ranker_kwargs = dict(
            learning_rate=learning_rate,
            min_child_weight=min_child_weight,
            max_depth=math.ceil(max_depth),
            n_estimators=math.ceil(n_estimators),
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            n_jobs=-1,
            objective='rank:ndcg',
        )
        self.xg = xgb.XGBRanker(**ranker_kwargs)

        self.cv_path = self.data_directory.joinpath('cross_validation')
def rank_XGBoost(X, y, X_test, group_size, df_test):
    """Fit an ``XGBRanker`` and rank the test rows within each search.

    Args:
        X: Training features.
        y: Training relevance labels.
        X_test: Test features, one row per row of ``df_test`` and in the
            same order.
        group_size: Group sizes for the training rows, passed to ``fit``
            as ``group``.
        df_test: Frame with ``srch_id`` and ``prop_id`` columns.

    Returns:
        A frame with ``srch_id``, ``prop_id``, ``xgb_preds`` (raw score) and
        ``xgb_rank`` (integer rank inside each ``srch_id``, best score
        first).
    """
    xgb_rank = xgb.XGBRanker(objective='rank:ndcg')
    # ``group`` is passed by keyword: positional use is deprecated.
    xgb_rank.fit(X, y, group=group_size)
    preds = xgb_rank.predict(X_test)

    # xgb.plot_importance(xgb_rank)
    # plt.show()

    # make submission
    # Reuse df_test's index so the column-wise concat lines predictions up
    # with their rows even when df_test does not carry a default RangeIndex.
    xgb_dataset = pd.DataFrame(preds,
                               columns=['xgb_preds'],
                               index=df_test.index)

    xgb_merged = pd.concat([df_test[['srch_id', 'prop_id']], xgb_dataset],
                           axis=1,
                           sort=False)
    xgb_ranking = xgb_merged.join(
        xgb_merged.groupby('srch_id')['xgb_preds'].rank(
            ascending=False).astype(int).rename('xgb_rank'))

    return xgb_ranking
示例#14
0
def model_trainer(train_set, val_set, MODEL='LGB', SUFFIX='val', SAVE=False):
    """Train a model of the requested family and optionally persist it.

    Args:
        train_set: Mapping with ``'X'`` and ``'y'`` entries, plus ``'g'``
            (group information) when ``MODEL == 'XGBRank'``.
        val_set: Mapping with the same layout; only read when
            ``SUFFIX == 'val'``.
        MODEL: ``'LGB'``, ``'XGB'`` or ``'XGBRank'``.
        SUFFIX: ``'val'`` trains against the validation set with early
            stopping; any other value trains on ``train_set`` alone.
        SAVE: When true, a model whose ``MODEL`` name ends in ``'B'`` is
            dumped with joblib under ``OUTPUT_DIR``.

    Returns:
        The trained model.

    Raises:
        ValueError: If ``MODEL`` is not one of the supported names.
    """
    logger.info(f'Training {MODEL} in {SUFFIX} mode...')

    # get hyperparameters
    params = hypara_dispatcher(MODEL)

    # fit
    if MODEL == 'LGB':
        # train data
        dtrain_set = lgb.Dataset(
            train_set['X'].values
            , train_set['y'].values
            , feature_name=features
            )
        if SUFFIX == 'val':
            # val data
            dval_set = lgb.Dataset(
                val_set['X'].values
                , val_set['y'].values
                , feature_name=features
                )

            # train
            model = lgb.train(
                params
                , dtrain_set
                , valid_sets=[dtrain_set, dval_set]
                , early_stopping_rounds=100
                , verbose_eval=100
                )
        else:
            # train
            model = lgb.train(
                params
                , dtrain_set
                )

    elif MODEL == 'XGB':
        # model
        model = xgb.XGBRegressor(**params)
        if SUFFIX == 'val':
            # train
            model.fit(
                train_set['X'], train_set['y'],
                eval_set=[(val_set['X'], val_set['y'])],
                verbose=500,
                early_stopping_rounds=100,
            )
        else:
            # train
            model.fit(
                train_set['X'], train_set['y'],
                verbose=500,
            )

    elif MODEL == 'XGBRank':
        # model
        model = xgb.XGBRanker(**params)
        if SUFFIX == 'val':
            model.fit(
                train_set['X'], train_set['y'],
                eval_set=[(val_set['X'], val_set['y'])],
                group=train_set['g'],
                eval_group=[val_set['g']],
                verbose=100,
                early_stopping_rounds=100,
            )
        else:
            model.fit(
                train_set['X'], train_set['y'],
                group=train_set['g'],
                verbose=100,
            )

    else:
        # Without this branch ``model`` would be unbound further down and
        # the caller would only see an opaque UnboundLocalError.
        raise ValueError(f'Unsupported MODEL: {MODEL!r}')

    # save model
    if SAVE:
        # Only names ending in 'B' ('LGB', 'XGB') are persisted here.
        if MODEL[-1] == 'B':
            # save via joblib
            joblib.dump(model, f'{OUTPUT_DIR}/{target}_{MODEL}_model_{SUFFIX}.pkl')
            logger.info(f'{MODEL} {SUFFIX}_model for {target} saved!')

    return model
示例#15
0
def GridSearch(feature, label, group, metrics, scoring=0.5, cv=5, cv_num=3,
               metrics_min=True, speedy=True, speedy_param=(20000, 0.3), gpu=False):
    """XGBRanker model params search use GridSearch method.

    The search is staged: each stage grid-searches one small set of
    parameters while holding the rest at the best values found so far.

    Args:
        feature: pandas dataframe, model's feature.
        label: pandas series, model's label.
        group: group information, split/indexed alongside ``feature`` and
            ``label`` and handed to ``XGBRanker.fit`` as ``group``.
        metrics: model metrics function, called as ``metrics(y_true, y_pred)``.
        scoring: metrics error opt base line value; a candidate is only
            accepted when it beats this value.
        cv: cross validation fold.
        cv_num: minimum cross validation fold.
        metrics_min: metrics value whether the smaller the better.
        speedy: whether use speedy method.
        speedy_param: if use speedy method, test_size will be set,
                      test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2).
        gpu: whether use gpu.
    Returns:
        a best XGBRanker model params dict. When no candidate beats
        ``scoring`` the starting parameters are returned.
    Raises:
        NotImplementedError: if ``gpu`` is true.
    """
    start = time.time()
    if gpu:
        # A bare string cannot be raised in Python 3; use a real exception.
        raise NotImplementedError("XGBRanker is not supported currently.")
    if speedy:
        test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2)
    tree_method = 'gpu_hist' if gpu else 'auto'
    n_job = 1 if gpu else int(np.ceil(cpu_count()*0.9))
    weight_dict = Counter(label)
    if len(weight_dict)==2:
        # Ratio of the count of the smaller label to the count of the larger.
        weight = int(np.ceil(weight_dict[min(weight_dict)]/weight_dict[max(weight_dict)]))
    else:
        # NOTE(review): after inverting to {count: label} this divides label
        # values, not counts - confirm that this is the intended weight.
        weight_dict = {j:i for i,j in weight_dict.items()}
        weight = int(np.ceil(weight_dict[max(weight_dict)]/weight_dict[min(weight_dict)]))
    params = {'learning_rate': 0.1, 'n_estimators': 300, 'max_depth': 5, 'min_child_weight': 1,
              'reg_alpha': 0, 'reg_lambda': 1, 'gamma': 0,
              'subsample': 0.8, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.8,
              'max_delta_step': 0, 'scale_pos_weight': weight,
              'n_jobs':n_job, 'random_state': 27, 'objective': 'rank:pairwise', 'tree_method':tree_method}
    cv_params = {'param1':{'n_estimators': list(range(100, 850, 50))},
                 'param2':{'max_depth': [3, 4, 5, 6, 7],
                           'min_child_weight': [1, 2, 3, 4, 5]},
                 'param3':{'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]},
                 'param4':{'subsample': [0.6, 0.7, 0.8, 0.9],
                           'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
                           'colsample_bylevel': [0.6, 0.7, 0.8, 0.9]},
                 'param5':{'reg_alpha': [0.05, 0.1, 1, 2, 3],
                           'reg_lambda': [0.05, 0.1, 1, 2, 3]},
                 'param6':{'max_delta_step': list(np.linspace(0, 10, 11, dtype='int')),
                           'scale_pos_weight': list(np.linspace(1, weight, weight, dtype='int'))},
                 'param7':{'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.2]}}
    # Fall back to the starting point so the result is always defined, even
    # when no candidate improves on ``scoring``.
    best_params = params.copy()
    for cv_param in cv_params.values():
        cv_param_name = list(cv_param)
        cv_param_value = [cv_param[i] for i in cv_param_name]
        for value in itertools.product(*cv_param_value):
            params.update(dict(zip(cv_param_name, value)))
            model = xgb.XGBRanker(**params)
            score = []
            if speedy:
                for i in range(cv_num):
                    X_train, X_test, y_train, y_test, g_train, g_test = train_test_split(feature, label, group,
                                                                                         test_size=test_size, stratify=label,
                                                                                         random_state=np.random.choice(range(100), 1)[0])
                    model.fit(X_train, y_train, group=g_train)
                    cv_pred = model.predict(X_test)
                    score.append(metrics(y_test.values, cv_pred))
            else:
                skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=np.random.choice(range(100), 1)[0])
                for n, (train_index, test_index) in enumerate(skf.split(feature, label)):
                    # Evaluate at most ``cv_num`` of the ``cv`` folds.
                    if n == cv_num:
                        break
                    model.fit(feature.loc[train_index], label[train_index], group=group[train_index])
                    cv_pred = model.predict(feature.loc[test_index])
                    score.append(metrics(label[test_index].values, cv_pred))
            cv_score = round(np.mean(score), 4)
            improved = cv_score < scoring if metrics_min else cv_score > scoring
            if improved:
                scoring = cv_score
                best_params = params.copy()
        # Next stage starts from the best configuration found so far.
        params = best_params.copy()
        sys.stdout.write("XGBRanker grid search run time {} min, best score: {}, best param:{}\r".format(
            divmod((time.time()-start),60)[0], scoring, best_params))
        sys.stdout.flush()
    print("XGBRanker param finetuning with grid search run time: %d min %.2f s" % divmod((time.time() - start), 60))
    return best_params
示例#16
0
def Model_Search_XGboost_cv(X,
                            y,
                            model='binary',
                            folds=5,
                            sklearn_metric=None,
                            xgboost_metric=None,
                            step_wise_start_at=0,
                            final_learning_rate=0.01,
                            use_optuna=False,
                            direction='minimize',
                            n_trials=25,
                            load_study_from=None,
                            save_study_as=None,
                            n_jobs=4):

    # model
    if isinstance(model, str):
        if model == 'binary':
            model = xgboost.XGBClassifier(objective='binary:logistic',
                                          eval_metric='logloss',
                                          random_state=42,
                                          seed=42,
                                          feature_fraction_seed=42,
                                          use_label_encoder=False,
                                          nthread=n_jobs)
        elif model == 'multiclass':
            n_classes = len(np.unique(y))
            model = xgboost.XGBClassifier(objective='multi:softmax',
                                          eval_metric='mlogloss',
                                          num_class=n_classes,
                                          random_state=42,
                                          seed=42,
                                          feature_fraction_seed=42,
                                          use_label_encoder=False,
                                          nthread=n_jobs)
        elif model == 'regression':
            model = xgboost.XGBRegressor(objective='reg:squarederror',
                                         eval_metric='rmse',
                                         random_state=42,
                                         seed=42,
                                         feature_fraction_seed=42,
                                         use_label_encoder=False,
                                         nthread=n_jobs)
        elif model == 'ranking':
            model = xgboost.XGBRanker(objective='rank:map',
                                      eval_metric='map',
                                      random_state=42,
                                      seed=42,
                                      feature_fraction_seed=42,
                                      use_label_encoder=False,
                                      nthread=n_jobs)
        else:
            sys.exit('Error: Unkown model type.')

    # sklearn_metric
    if sklearn_metric is None:  # https://scikit-learn.org/stable/modules/model_evaluation.html
        if isinstance(model, xgboost.XGBClassifier):
            sklearn_metric = 'neg_log_loss'
        elif isinstance(model, xgboost.XGBRegressor):
            sklearn_metric = 'neg_root_mean_squared_error'
        elif isinstance(model, xgboost.XGBRanker):
            sklearn_metric = 'average_precision_score'
        else:
            sys.exit('Error: Sklearn score metric needs to be provided.')

    # xgboost_metric
    if xgboost_metric is None:  # https://xgboost.readthedocs.io/en/latest/parameter.html
        if isinstance(model, xgboost.XGBClassifier):
            n_classes = len(np.unique(y))
            if n_classes > 2:
                xgboost_metric = 'mlogloss'
            else:
                xgboost_metric = 'logloss'
        elif isinstance(model, xgboost.XGBRegressor):
            xgboost_metric = 'rmse'
        elif isinstance(model, xgboost.XGBRanker):
            xgboost_metric = 'map'
        else:
            sys.exit('Error: Xgboost score metric needs to be provided.')

    # folds
    if isinstance(folds, int):
        if isinstance(model, xgboost.XGBClassifier):
            folds = StratifiedKFold(n_splits=folds,
                                    shuffle=True,
                                    random_state=42)
        else:
            folds = KFold(n_splits=folds, shuffle=True, random_state=42)

    # ------------------------------------------------------------------------------------------------

    # Set fixed params
    fixed_params = {
        'verbosity': 0,
        'silent': 1,

        # 'num_iterations': 10000,
        'n_estimators': 10000,
    }
    model.set_params(**fixed_params)

    # dataset for .cv
    d_train = xgboost.DMatrix(X, label=y)

    # ------------------------------------------------------------------------------------------------

    if use_optuna:
        print("Searching for a Xgboost model with optuna \n")

        params = model.get_params()

        if load_study_from is not None:
            study = joblib.load(load_study_from)
        else:
            study = optuna.create_study(
                direction=direction,
                pruner=optuna.pruners.SuccessiveHalvingPruner())

        def objetive(trial):

            params.update({
                "learning_rate":
                trial.suggest_uniform("learning_rate", 0.005, 0.1),
                "max_depth":
                trial.suggest_int("max_depth", 4, 12),
                "min_child_weight":
                trial.suggest_int("min_child_weight", 1, 50),
                "gamma":
                trial.suggest_loguniform("gamma", 1e-4, 1e4),
                "subsample":
                trial.suggest_loguniform("subsample", 0.4, 1),
                "colsample_bytree":
                trial.suggest_loguniform("colsample_bytree", 0.4, 1),
                "alpha":
                trial.suggest_loguniform("alpha", 1e-8, 1e4),
                "lambda":
                trial.suggest_loguniform("lambda", 1e-8, 1e4),
            })

            cv_results = xgboost.cv(params,
                                    d_train,
                                    num_boost_round=10000,
                                    early_stopping_rounds=50,
                                    folds=folds,
                                    metrics=xgboost_metric,
                                    show_stdv=False,
                                    verbose_eval=None,
                                    as_pandas=False)

            rmetric_name = list(cv_results.keys())[2]
            score = cv_results[rmetric_name][
                -1]  # np.min(cv_results[rmetric_name])

            print("Num_boost_round: " + str(len(cv_results[rmetric_name])))

            if save_study_as is not None:
                joblib.dump(study, save_study_as)

            return score

        study.optimize(objetive, n_trials=n_trials, n_jobs=1)

        print(
            "------------------------------------------------------------------------"
        )
        print("Best parameters found: " + str(study.best_params))
        print("Best score achived: " + str(study.best_value))
        print(
            "------------------------------------------------------------------------"
        )

        model.set_params(**study.best_params)

        # num_boost_round optimization
        cv_results = xgboost.cv(model.get_params(),
                                d_train,
                                num_boost_round=10000,
                                early_stopping_rounds=50,
                                folds=folds,
                                metrics=xgboost_metric,
                                show_stdv=False,
                                verbose_eval=None,
                                as_pandas=False)

        rmetric_name = list(cv_results.keys())[2]
        best_boost_round = len(cv_results[rmetric_name])
        best_score_achived = cv_results[rmetric_name][-1]
        print("Best num_boost_round: " + str(best_boost_round))
        print("Best score achived: " + str(best_score_achived))
        print(
            "------------------------------------------------------------------------"
        )
        model.set_params(n_estimators=best_boost_round)

    else:
        print("Searching for a Xgboost model with the step wise method \n")

        if step_wise_start_at <= 0:
            # num_boost_round optimization
            cv_results = xgboost.cv(model.get_params(),
                                    d_train,
                                    num_boost_round=10000,
                                    early_stopping_rounds=50,
                                    folds=folds,
                                    metrics=xgboost_metric,
                                    show_stdv=False,
                                    verbose_eval=None,
                                    as_pandas=False)

            rmetric_name = list(cv_results.keys())[2]
            best_boost_round = len(cv_results[rmetric_name])
            best_score_achived = cv_results[rmetric_name][-1]
            print(
                "------------------------------------------------------------------------"
            )
            print("Best num_boost_round: " + str(best_boost_round))
            print("Best score achived: " + str(best_score_achived))
            print(
                "------------------------------------------------------------------------"
            )
            model.set_params(n_estimators=best_boost_round)

        # Param search
        if step_wise_start_at <= 1:
            param_test = {'max_depth': range(2, 11, 1)}
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=1,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achived: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            pipe_model = search.best_estimator_

        if step_wise_start_at <= 2:
            param_test = {
                'min_child_weight':
                [0, 1, 2, 3, 5, 7, 10, 12, 15, 25, 50, 75, 100]
            }
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=1,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achived: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            pipe_model = search.best_estimator_

        if step_wise_start_at <= 3:
            param_test = {
                'gamma': [
                    0, 0.0001, 0.001, 0.01, 0.04, 0.07, 0.1, 0.4, 0.7, 1, 4, 7,
                    10, 40, 70, 100
                ]
            }
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=1,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y)
            print("Best parameters found: " + str(search.best_params_))
            print("Best score achived: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            pipe_model = search.best_estimator_

        if step_wise_start_at <= 4:
            param_test = {
                'subsample': [
                    0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
                    0.95, 1
                ]
            }
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=1,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y)
            print("Best parameters found: " + str(search.best_params_))
            print("Best score achived: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            pipe_model = search.best_estimator_

        if step_wise_start_at <= 5:
            param_test = {
                'colsample_bytree':
                [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
            }
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=1,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y)
            print("Best parameters found: " + str(search.best_params_))
            print("Best score achived: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            pipe_model = search.best_estimator_

        if step_wise_start_at <= 6:
            param_test = {
                'alpha': [
                    0, 0.0001, 0.001, 0.01, 0.04, 0.07, 0.1, 0.4, 0.7, 1, 4, 7,
                    10, 40, 70, 100, 200
                ]
            }
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=1,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y)
            print("Best parameters found: " + str(search.best_params_))
            print("Best score achived: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            pipe_model = search.best_estimator_

        if step_wise_start_at <= 7:
            param_test = {
                'lambda': [
                    0, 0.0001, 0.001, 0.01, 0.04, 0.07, 0.1, 0.4, 0.7, 1, 4, 7,
                    10, 40, 70, 100, 200
                ]
            }
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=1,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y)
            print("Best parameters found: " + str(search.best_params_))
            print("Best score achived: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            pipe_model = search.best_estimator_

        # Get model from pipeline
        model = pipe_model.named_steps['model']

        # Set final learning rate
        model.set_params(learning_rate=final_learning_rate)

        # num_boost_round optimization
        model.set_params(num_iterations=10000)
        cv_results = xgboost.cv(model.get_params(),
                                d_train,
                                num_boost_round=10000,
                                early_stopping_rounds=50,
                                folds=folds,
                                metrics=xgboost_metric,
                                show_stdv=False,
                                verbose_eval=None,
                                as_pandas=False)

        rmetric_name = list(cv_results.keys())[2]
        best_boost_round = len(cv_results[rmetric_name])
        best_score_achived = cv_results[rmetric_name][-1]
        print(
            "------------------------------------------------------------------------"
        )
        print("Best num_boost_round: " + str(best_boost_round))
        print("Best score achived: " + str(best_score_achived))
        print(
            "------------------------------------------------------------------------"
        )
        model.set_params(n_estimators=best_boost_round)

    return model
示例#17
0
# Build the expanded train/test frames, enrich them with extra features,
# persist the result and fit an XGBRanker on it.
train = base_expanded_df(alpha=0.2, beta=0.05, isValidation=True, save=True)
test = base_expanded_df(alpha=0.2, beta=0.05, isValidation=False, save=True)
# Alternative: reuse previously saved base frames instead of rebuilding them.
#train = pd.read_csv("dataset/expanded/base_expanded_train.csv")
#test = pd.read_csv("dataset/expanded/base_expanded_test.csv")

# Single source of truth for where the completed frames live, so the write
# and the read below can never point at different files.
train_complete_path = 'dataset/expanded/train_complete.csv'
test_complete_path = 'dataset/expanded/test_complete.csv'

train = adding_features(train, isValidation=True)
test = adding_features(test, isValidation=False)
train.to_csv(train_complete_path, index=False)
test.to_csv(test_complete_path, index=False)

# Reload exactly what was just written (previously this read
# "train_complete.csv"/"test_complete.csv" from the working directory, which
# is not where the files above are saved).
train = pd.read_csv(train_complete_path)
test = pd.read_csv(test_complete_path)

# One group size per queried record.
# NOTE(review): groupby() returns sizes ordered by sorted key; this only lines
# up with the rows if `train` is already ordered by queried_record_id - confirm.
group = train.groupby('queried_record_id').size().values
ranker = xgb.XGBRanker()
ranker.fit(train.drop(['queried_record_id', 'target', 'linked_id_idx'],
                      axis=1),
           train['target'],
           group=group)

# Score the test rows with the same feature columns used for training
# ('target' is not dropped here, so it is presumably absent from `test`).
predictions = ranker.predict(
    test.drop(['queried_record_id', 'linked_id_idx'], axis=1))
test['predictions'] = predictions
df_predictions = test[[
    'queried_record_id', 'predicted_record_id', 'predicted_record_id_record',
    'predictions'
]]

rec_pred = []
for (r, p, l, record_id) in zip(df_predictions.predicted_record_id,
示例#18
0
# We also need to supply the number of samples under each query_id.
groups = train_data.groupby('id').size().to_frame(
    'size')['size'].to_numpy()  # count the samples under each query_id

test_data = df.iloc[X_test_inds]
# We need to keep the id for later predictions
X_test = test_data.loc[:, ~test_data.columns.isin(['rank'])]
y_test = test_data.loc[:, test_data.columns.isin(['rank'])]

# Now we can build the model: XGBRanker can be used to train a ranking model.
# In this scenario we can customize neither the objective nor the metric.
# NOTE(review): both learning_rate=0.1 and eta=0.05 are passed below; XGBoost
# documents eta as an alias of learning_rate - confirm which value is intended.
model = xgb.XGBRanker(
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:pairwise',  # rank:ndcg rank:map
    random_state=42,
    learning_rate=0.1,
    colsample_bytree=0.9,
    eta=0.05,
    max_depth=6,
    n_estimators=110,
    subsample=0.75)
model.fit(X_train, y_train, group=groups, verbose=True)


# After training we can predict; since the predict method does not take
# groups, some special handling is needed:
def predict(model, df):
    """Return ``model.predict`` on ``df`` with the 'id' column excluded."""
    return model.predict(df.loc[:, ~df.columns.isin(['id'])])


# Predict separately for each 'id' group of `data`.
predictions = (data.groupby('id').apply(lambda x: predict(model, x)))
"""
    def __init__(self,
                 mode,
                 cluster='no_cluster',
                 kind='kind1',
                 ask_to_load=True,
                 class_weights=False,
                 learning_rate=0.1658,
                 min_child_weight=0.5644,
                 n_estimators=100,
                 max_depth=11,
                 subsample=1,
                 colsample_bytree=1,
                 reg_lambda=65.0,
                 reg_alpha=50.0,
                 max_delta_step=2,
                 scale_pos_weight=20,
                 gamma=0.01,
                 weights_position=False,
                 log_weights=False):
        """Set up the wrapper around an ``xgb.XGBRanker`` with a pairwise objective.

        ``mode`` and ``cluster`` are forwarded to the parent constructor
        together with the fixed name ``'final_stacking'``. ``class_weights``,
        ``weights_position``, ``log_weights``, ``kind`` and ``ask_to_load``
        are stored on the instance unchanged. The remaining arguments are
        hyperparameters handed to ``xgb.XGBRanker``; ``max_depth``,
        ``n_estimators``, ``max_delta_step`` and ``scale_pos_weight`` are
        rounded up with ``math.ceil`` first.
        """
        # The model name is a constant; an earlier (disabled) variant encoded
        # every hyperparameter into the name instead.
        super(XGBoostWrapper, self).__init__(name='final_stacking',
                                             mode=mode,
                                             cluster=cluster)

        # Plain configuration flags kept on the instance.
        self.class_weights = class_weights
        self.weights_position = weights_position
        self.log_weights = log_weights
        self.kind = kind
        self.ask_to_load = ask_to_load

        # Keyword arguments for the underlying ranker; the four math.ceil
        # calls turn possibly fractional values into integers.
        ranker_kwargs = {
            'learning_rate': learning_rate,
            'min_child_weight': min_child_weight,
            'max_depth': math.ceil(max_depth),
            'n_estimators': math.ceil(n_estimators),
            'gamma': gamma,
            'max_delta_step': math.ceil(max_delta_step),
            'scale_pos_weight': math.ceil(scale_pos_weight),
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'reg_lambda': reg_lambda,
            'reg_alpha': reg_alpha,
            'n_jobs': -1,
            'objective': 'rank:pairwise',
        }
        self.xg = xgb.XGBRanker(**ranker_kwargs)

        # Parameters held constant; note these literals are independent of
        # the values passed to this constructor (e.g. ask_to_load is False).
        self.fixed_params_dict = dict(
            mode=mode,
            cluster=cluster,
            kind=kind,
            ask_to_load=False,
            min_child_weight=1,
            subsample=1,
            colsample_bytree=1,
            max_delta_step=0,
            scale_pos_weight=1,
            gamma=0,
        )

        # Hyperparameter name -> 2-tuple (presumably lower/upper search
        # bounds; verify against the code that consumes this dict).
        self.hyperparameters_dict = dict(
            learning_rate=(0.01, 0.3),
            max_depth=(3, 7),
            n_estimators=(700, 1200),
            reg_lambda=(0, 0.5),
            reg_alpha=(0, 0.5),
        )
示例#20
0
def RandomSearch(feature, label, group, metrics, iter_num=1000, scoring=0.5, cv=5, cv_num=3,
                 metrics_min=True, speedy=True, speedy_param=(20000, 0.3), gpu=False,
                 save_model_dir=None
                ):
    """Search XGBRanker hyperparameters with random sampling.

    Args:
        feature: pandas dataframe, model's feature.
        label: pandas series, model's label.
        group: group information forwarded to ``XGBRanker.fit(group=...)``;
               it is split/indexed row-wise together with ``feature``.
        metrics: model metrics function, called as ``metrics(y_true, y_pred)``.
        iter_num: number of random parameter draws to evaluate.
        scoring: baseline metric value a candidate has to beat.
        cv: cross validation fold.
        cv_num: minimum cross validation fold (number of folds/splits
                actually evaluated per candidate).
        metrics_min: metrics value whether the smaller the better.
        speedy: whether use speedy method (repeated random hold-out splits
                instead of stratified k-fold).
        speedy_param: if use speedy method, test_size will be set,
                      test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2).
        gpu: whether use gpu (not supported, see Raises).
        save_model_dir: save model folder; when given, the best model and its
                        params are written to ``xgb_model.pkl`` and
                        ``xgb_params.json`` in that folder.
    Returns:
        a best XGBRanker model params dict (empty if no candidate beat
        ``scoring``).
    Raises:
        AssertionError: installed xgboost is older than ``__xgboost_version__``.
        NotImplementedError: ``gpu`` is true.
    """
    import re

    import xgboost as xgb

    def _version_key(version):
        # Compare versions numerically; plain string comparison would rank
        # "1.10.0" below "1.9.0".
        numbers = [int(n) for n in re.findall(r'\d+', str(version))[:3]]
        numbers += [0] * (3 - len(numbers))
        return tuple(numbers)

    def _save_best(best_model, params_to_save):
        # Context managers make sure both files are flushed and closed.
        with open(os.path.join(save_model_dir, "xgb_model.pkl"), "wb") as f:
            pickle.dump(best_model, f)
        with open(os.path.join(save_model_dir, "xgb_params.json"), 'w') as f:
            json.dump(params_to_save, f)

    assert _version_key(xgb.__version__) >= _version_key(__xgboost_version__), \
        f'xgboost version should be >={__xgboost_version__}.'
    start = time.time()
    if gpu:
        # Raising a bare string is itself a TypeError in Python 3.
        raise NotImplementedError("XGBRanker is not supported currently.")
    best_params = {}
    if speedy:
        test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2)
    tree_method = ['gpu_hist'] if gpu else ['auto', 'exact', 'approx', 'hist']
    n_job = 1 if gpu else int(np.ceil(cpu_count()*0.9))
    weight_dict = Counter(label)
    if len(weight_dict) == 2:
        weight = int(np.ceil(weight_dict[min(weight_dict)]/weight_dict[max(weight_dict)]))
    else:
        # NOTE(review): after inverting to {count: label} this divides label
        # values, not counts - confirm that this is the intended weight.
        weight_dict = {j: i for i, j in weight_dict.items()}
        weight = int(np.ceil(weight_dict[max(weight_dict)]/weight_dict[min(weight_dict)]))

    hp = HyperParametersRandom()
    hp.Float('learning_rate', 0.01, 0.1)
    hp.Int('n_estimators', 100, 850)
    hp.Choice('max_depth', [3, 4, 5, 6, 7])
    hp.Choice('min_child_weight', [1, 2, 3, 4, 5, 6, 7])
    hp.Choice('max_delta_step', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    hp.Choice('reg_alpha', np.concatenate([np.linspace(0, 1, 101), np.linspace(2, 100, 99)]).round(2))
    hp.Choice('reg_lambda', np.concatenate([np.linspace(0, 1, 101), np.linspace(2, 100, 99)]).round(2))
    hp.Choice('subsample', [0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
    hp.Choice('colsample_bytree', [0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
    hp.Choice('colsample_bylevel', [0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
    hp.Choice('colsample_bynode', [0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
    hp.Choice('gamma', np.concatenate([np.linspace(0, 1, 101), np.linspace(2, 100, 99)]).round(2))
    hp.Choice('scale_pos_weight', [1, weight])
    hp.Choice('n_jobs', [n_job])
    hp.Choice('random_state', [27])
    hp.Choice('objective', ['rank:pairwise'])
    hp.Choice('booster', ['gbtree'])
    hp.Choice('tree_method', tree_method)
    hp.Choice('importance_type', ["gain", "weight", "cover", "total_gain", "total_cover"])

    for i in range(1, iter_num+1):
        hp.update()
        model = xgb.XGBRanker(**hp.params)
        score = []
        if speedy:
            for _ in range(cv_num):
                X_train, X_test, y_train, y_test, g_train, g_test = train_test_split(feature, label, group,
                                                                                     test_size=test_size, stratify=label,
                                                                                     random_state=np.random.choice(range(100), 1)[0])
                # `group` is passed by keyword: positional use is deprecated.
                model.fit(X_train, y_train, group=g_train)
                cv_pred = model.predict(X_test)
                score.append(metrics(y_test.values, cv_pred))
        else:
            skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=np.random.choice(range(100), 1)[0])
            group_values = np.asarray(group)
            for n, (train_index, test_index) in enumerate(skf.split(feature, label)):
                if n == cv_num:
                    break
                # split() yields positional indices, so select by position
                # rather than by index label.
                model.fit(feature.iloc[train_index], label.iloc[train_index],
                          group=group_values[train_index])
                cv_pred = model.predict(feature.iloc[test_index])
                score.append(metrics(label.iloc[test_index].values, cv_pred))
        cv_score = round(np.mean(score), 4)
        improved = cv_score < scoring if metrics_min else cv_score > scoring
        if improved:
            scoring = cv_score
            # Snapshot the parameters of this draw (previously referenced an
            # undefined name `params`).
            best_params = dict(hp.params)
            if save_model_dir is not None:
                _save_best(model, best_params)
        sys.stdout.write("XGBRanker random search percent: {}%, run time {} min, best score: {}, best param:{}\r".format(
            round(i/iter_num*100,2), divmod((time.time()-start),60)[0], scoring, best_params))
        sys.stdout.flush()
    print("XGBRanker param finetuning with random search run time: %d min %.2f s" % divmod((time.time() - start), 60))
    return best_params
示例#21
0
# Load the prepared data set; the first CSV column becomes the index.
dftrain = pd.read_csv('dataprep.csv', index_col=0)

# Quick sanity report: per-column minima and maxima, then a preview.
column_minima = dftrain.min()
print(column_minima)
column_maxima = dftrain.max()
print(column_maxima)
preview = dftrain.head()
print(preview)

# Show every row that contains at least one missing value.
has_missing = dftrain.isnull().any(axis=1)
print(dftrain[has_missing])


def groupsize(df):
    """Return the number of rows per ``srch_id``, ordered by ``srch_id``.

    Args:
        df: DataFrame with a ``srch_id`` column.

    Returns:
        A Series named ``srch_id`` whose index holds the distinct search ids
        in ascending order and whose values are their row counts.
    """
    # Count and sort directly. The earlier DataFrame/transpose round-trip
    # depended on the value_counts() result being named after the column,
    # which is no longer the case in pandas >= 2.0 (it is named "count").
    counts = df['srch_id'].value_counts().sort_index()
    return counts.rename('srch_id')


# Ranker configured with the NDCG ranking objective.
params = {'objective': 'rank:ndcg'}
xgb_rank = xgb.XGBRanker(**params)

# Features are every column except the two outcome flags; the relevance
# label is click_bool + 4 * booking_bool.
x_train = dftrain.drop(['click_bool', 'booking_bool'], axis=1)
y_train = np.array(
    dftrain['click_bool']) + 4 * np.array(dftrain['booking_bool'])

# Positional row boundaries of the train / validation / test partitions.
TRAIN_END = 4000003
VAL_END = 4500022
TEST_END = 4958347

# Carve out test and validation first, then shrink the training set; the
# slices are positional and end-exclusive.
x_test = x_train[VAL_END:TEST_END].copy()
y_test = y_train[VAL_END:TEST_END].copy()
x_val = x_train[TRAIN_END:VAL_END].copy()
y_val = y_train[TRAIN_END:VAL_END].copy()
x_train = x_train[0:TRAIN_END]
y_train = y_train[0:TRAIN_END]

resultTest = x_test[['srch_id', 'prop_id']].copy()
# Reuse the labels already sliced for the test partition. Recomputing them
# via dftrain.loc[VAL_END:TEST_END] was label-based and end-inclusive, so it
# only matched x_test when the index happened to be 0..n-1 and the frame
# ended exactly at TEST_END.
resultTest['click_bool'] = y_test
示例#22
0
import xgboost as xgb
import pandas as pd

# Minimal XGBRanker smoke script on the iris data set.
df = pd.read_csv('test/data/iris/iris.csv')

# 'Species' is the target; every other column is a feature.
X = df.drop(columns=['Species'])
y = df['Species']
# y = y.replace(2, 1)

# First 100 rows train the model, the remainder is held out.
X_train = X[:100]
y_train = y[:100]
X_test = X[100:]
y_test = y[100:]

# Two query groups covering the 100 training rows (20 + 80).
group_train = [20, 80]

model = xgb.XGBRanker()
# Pass the group sizes by keyword: positional arguments after (X, y) are
# deprecated in the scikit-learn style XGBoost API.
model.fit(X_train, y_train, group=group_train)
# print(model.predict(X_test))
print(model.feature_importances_)