Example #1
    def search(self, train_data, metrics, valid_data=None,
               iter_num=None, cv=3, metrics_min=True, 
               speedy=True, speedy_param=(20000, 0.3), 
               save_model_dir=None, save_model_name=None):
        """model params search method.

        Args:
            train_data: A list of (X, y, sample_weight) tuple pairs to use as train sets.
            metrics: model metrics function.
            valid_data: A list of (X, y, sample_weight) tuple pairs to use as validation sets.
            iter_num: search count.
            cv: cross validation fold.
            metrics_min: metrics value whether the smaller the better.
            speedy: whether use speedy method.
            speedy_param: if use speedy method, test_size will be set, 
                          test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2).
            save_model_dir: str, save model folder, only work with model='XGBClassifier' or 'XGBRegressor'.
            save_model_name: str, save model name prefix, only work with model='XGBClassifier' or 'XGBRegressor'.
        Returns:
            a best model params dict.
        Raises:
            params error.
        """
        logger = Logger(name=self.params.name)
        logger.info(f"Start hyperparameter {self.params.method} search.")
        import warnings
        warnings.filterwarnings("ignore")
        if speedy:
            test_size = 1-round(min(speedy_param[0], len(train_data[1])*speedy_param[1])/len(train_data[1]), 2)
        if self.params.model_name=='XGBClassifier':
            self._xgb_weight(train_data[1])
        
        if valid_data is not None:
            cv_score_list = []
            
        if self.params.method=='grid':
            if iter_num is None:
                iter_num = self.hp.cardinality()
            else:
                iter_num = min(iter_num, self.hp.cardinality())
        if iter_num is None:
            iter_num = 100
        for i in range(1, iter_num+1):
            self.hp.update(self.best_params)
            self.params.model = self.params.model_init(**self.hp.params)
            score = []
            if speedy:
                for _ in range(cv):
                    index = train_test_split(train_data[0], train_data[1], test_size, seed=np.random.choice(range(100), 1)[0])
                    score.append(self._model_fit_predict(train_data, metrics, index, mode=1))
            else:
                index_list = kfold(train_data[0], train_data[1], n_splits=cv, seed=np.random.choice(range(100), 1)[0])
                for n, index in enumerate(index_list):
                    score.append(self._model_fit_predict(train_data, metrics, index, mode=1))
            cv_score = np.mean(score)
            if valid_data is not None:
                cv_score_list.append(cv_score)
                cv_score_list.sort()
                threshold = cv_score_list[int(len(cv_score_list)*(0.2 if metrics_min else 0.8))]
                if (metrics_min==True and threshold>=cv_score) or (metrics_min==False and threshold<=cv_score):
                    cv_score = self._model_fit_predict(valid_data, metrics, index=None, mode=0)
                else:
                    logger.info(f"Model {self.params.method} search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}", enter=False if i<iter_num else True)
                    continue
            if i==1:
                scoring = cv_score
            if (metrics_min==True and cv_score<=scoring) or (metrics_min==False and cv_score>=scoring):
                scoring = cv_score
                self.best_params = self.hp.params.copy()
                self.best_params_history[i] = {'score':scoring, 'best_params':self.best_params.copy()}
                if self.params.model_name in ['XGBClassifier', 'XGBRegressor']:
                    if save_model_dir is not None:
                        if save_model_name is None:
                            save_model_name = self.params.name
                        self.params.model.save_model(os.path.join(save_model_dir, f"{save_model_name}_model.json"))
                        with open(os.path.join(save_model_dir, f"{save_model_name}_params.json"), 'w') as f:
                            json.dump(self.best_params, f)
            logger.info(f"Model {self.params.method} search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}", enter=False if i<iter_num else True)
        logger.info(f"Model {self.params.method} search best score: {scoring:.4f}", close=True, time_mode=1)
        return self.best_params
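The speedy option documented above trades cross-validation coverage for speed by capping how many rows each fit sees. A quick worked check of the test_size arithmetic from the docstring, as a minimal sketch assuming a hypothetical dataset of 50,000 rows and the default speedy_param=(20000, 0.3):

# Worked check of the speedy test_size arithmetic from the docstring above.
# Assumed example: 50,000 rows and the default speedy_param = (20000, 0.3).
rows = 50_000                                              # hypothetical feature.shape[0]
speedy_param = (20_000, 0.3)                               # default from the signature above
candidate = min(speedy_param[0], rows * speedy_param[1])   # min(20000, 15000) = 15000
test_size = 1 - round(candidate / rows, 2)                 # 1 - round(0.3, 2) = 0.7
assert test_size == 0.7                                    # each speedy split trains on ~15,000 rows

In other words, each speedy iteration fits on at most speedy_param[0] rows (here 15,000, since the 30% fraction is the tighter cap) and scores on the remainder.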
Example #2
    def search(self, feature, label, vaild_data=None, sample_weight=None, 
               metrics=auc_roc, loss='regression', 
               iter_num=100, scoring=0.5, cv=5, cv_num=3, metrics_min=True, 
               speedy=True, speedy_param=(20000, 0.3), 
               save_model_dir=None, save_model_name='lgb'):
        """LGBMRegressor model params search use RandomSearch method.

        Args:
            feature: pandas dataframe, model's feature.
            label: pandas series, model's label.
            vaild_data: A list of (X, y) tuple pairs to use as validation sets, for which metrics will be computed. 
            sample_weight: pd.Series or np.array, sample weight, shape is (n,).
            metrics: model metrics function, default is `la.metircs.auc_roc`.
            loss: LGBMRegressor param 'objective'.
            scoring: metrics error opt base line value.
            cv: cross validation fold.
            cv_num: if use speedy method, minimum cross validation fold.
            metrics_min: metrics value whether the smaller the better.
            speedy: whether use speedy method.
            speedy_param: if use speedy method, test_size will be set, 
                          test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2).
            save_model_dir: str, save model folder.
            save_model_name: str, save model name prefix, "`lgb`_model.json" and "`lgb`_params.json".
        Returns:
            a best LGBMRegressor model params dict.
        Raises:
            params error.
        """
        logger = Logger(name='lgb')
        logger.info(f"api is deprecated and will be removed in 1.5.0")
        logger.info(f"please use la.param_search.RandomSearch")
        import warnings
        warnings.filterwarnings("ignore")
        import lightgbm as lgb
        assert lgb.__version__>=__lightgbm_version__, f'lightgbm version should be >={__lightgbm_version__}.'
        if speedy:
            test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2)
        self.HyperParameter.Choice('objective', [loss])

        if vaild_data is not None:
            cv_score_list = []
            
        logger.info(f"Start LGBMRegressor hyperparameter random search.")
        for i in range(1, iter_num+1):
            self.HyperParameter.update()
            model = lgb.LGBMRegressor(**self.HyperParameter.params)
            score = []
            if speedy:
                for _ in range(cv_num):
                    index_list = train_test_split(feature, label, test_size=test_size, shuffle=True, seed=np.random.choice(range(100), 1)[0])
                    weight = None if sample_weight is None else sample_weight[index_list[0]]
                    model.fit(feature.loc[index_list[0]], label[index_list[0]], sample_weight=weight)
                    cv_pred = pd.Series(model.predict(feature.loc[index_list[1]]), index=label[index_list[1]].index)
                    score.append(metrics(label[index_list[1]], cv_pred))
            else:
                index_list = kfold(feature, label, n_splits=cv, shuffle=True, seed=np.random.choice(range(100), 1)[0])
                for n, index in enumerate(index_list):
                    weight = None if sample_weight is None else sample_weight[index[0]]
                    model.fit(feature.loc[index[0]], label[index[0]], sample_weight=weight)
                    cv_pred = pd.Series(model.predict(feature.loc[index[1]]), index=label[index[1]].index)
                    score.append(metrics(label[index[1]], cv_pred))
            cv_score = np.mean(score)
            if vaild_data is not None:
                cv_score_list.append(cv_score)
                if metrics_min:
                    cv_score_list.sort()
                    if cv_score_list[int(len(cv_score_list)*0.2)]>=cv_score:
                        cv_pred = pd.Series(model.predict(vaild_data[0]), index=vaild_data[1].index)
                        cv_score = metrics(vaild_data[1], cv_pred)
                    else:
                        logger.info(f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}", enter=False if i<iter_num else True)
                        continue
                else:
                    cv_score_list.sort(reverse=True)
                    if cv_score_list[int(len(cv_score_list)*0.2)]<=cv_score:
                        cv_pred = pd.Series(model.predict(vaild_data[0]), index=vaild_data[1].index)
                        cv_score = metrics(vaild_data[1], cv_pred)
                    else:
                        logger.info(f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}", enter=False if i<iter_num else True)
                        continue
            if metrics_min:
                if cv_score<scoring:
                    scoring = cv_score
                    self.best_params = self.HyperParameter.params.copy()
                    self.best_params_history[i] = {'score':scoring, 'best_params':self.best_params.copy()}
                    if save_model_dir is not None:
                        model.booster_.save_model(os.path.join(save_model_dir, f"{save_model_name}_model.json"))
                        with open(os.path.join(save_model_dir, f"{save_model_name}_params.json"), 'w') as f:
                            json.dump(self.best_params, f)
            else:
                if cv_score>scoring:
                    scoring = cv_score
                    self.best_params = self.HyperParameter.params.copy()
                    self.best_params_history[i] = {'score':scoring, 'best_params':self.best_params.copy()}
                    if save_model_dir is not None:
                        model.booster_.save_model(os.path.join(save_model_dir, f"{save_model_name}_model.json"))
                        with open(os.path.join(save_model_dir, f"{save_model_name}_params.json"), 'w') as f:
                            json.dump(self.best_params, f)
            logger.info(f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}", enter=False if i<iter_num else True)
        logger.info(f"LGBMRegressor random search best score: {scoring:.4f}", close=True, time_mode=1)
        return self.best_params
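The vaild_data branch above only re-scores a parameter set on the held-out data when its cross-validation score lands in the best 20% seen so far; everything else is skipped. A minimal standalone sketch of that gate (function and variable names are illustrative, not part of the library):

# Illustrative sketch of the validation gate used in the loops above.
def passes_validation_gate(cv_score, history, metrics_min=True):
    """Return True when cv_score is within the best 20% of scores seen so far."""
    history.append(cv_score)
    # Best scores first: ascending when smaller is better, descending otherwise.
    ranked = sorted(history, reverse=not metrics_min)
    threshold = ranked[int(len(ranked) * 0.2)]
    return cv_score <= threshold if metrics_min else cv_score >= threshold

scores = []
assert passes_validation_gate(0.30, scores)       # first score always passes
assert not passes_validation_gate(0.90, scores)   # outside the best 20%, iteration skipped
assert passes_validation_gate(0.25, scores)       # new best, re-scored on vaild_data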
Example #3
    def search(self,
               feature,
               label,
               vaild_data=None,
               sample_weight=None,
               metrics=auc_roc,
               loss='binary:logistic',
               iter_num=100,
               scoring=0.5,
               cv=5,
               cv_num=3,
               metrics_min=True,
               speedy=True,
               speedy_param=(20000, 0.3),
               gpu_id=-1,
               save_model_dir=None,
               save_model_name='xgb'):
        """XGBClassifier model params search use RandomSearch method.

        Args:
            feature: pandas dataframe, model's feature.
            label: pandas series, model's label.
            vaild_data: A list of (X, y) tuple pairs to use as validation sets, for which metrics will be computed. 
            sample_weight: pd.Series or np.array, sample weight, shape is (n,).
            metrics: model metrics function, default is `la.metircs.auc_roc`.
            loss: XGBClassifier param 'objective'.
            scoring: metrics error opt base line value.
            cv: cross validation fold.
            cv_num: if use speedy method, minimum cross validation fold.
            metrics_min: metrics value whether the smaller the better.
            speedy: whether use speedy method.
            speedy_param: if use speedy method, test_size will be set, 
                          test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2).
            gpu_id: int, use gpu device ordinal, -1 is not use gpu.
            save_model_dir: str, save model folder.
            save_model_name: str, save model name prefix, "`xgb`_model.json" and "`xgb`_params.json".
        Returns:
            a best XGBClassifier model params dict.
        Raises:
            params error.
        """
        logger = Logger(name='xgb')
        logger.info(f"api is deprecated and will be removed in 1.5.0")
        logger.info(f"please use la.param_search.RandomSearch")
        import warnings
        warnings.filterwarnings("ignore")
        import xgboost as xgb
        assert xgb.__version__ >= __xgboost_version__, f'xgboost version should be >={__xgboost_version__}.'
        if speedy:
            test_size = 1 - round(
                min(speedy_param[0], feature.shape[0] * speedy_param[1]) /
                feature.shape[0], 2)
        tree_method = ['gpu_hist'] if gpu_id > -1 else [
            'auto', 'exact', 'approx', 'hist'
        ]
        n_job = int(np.ceil(cpu_count() * 0.8))
        weight_dict = Counter(label)
        if len(weight_dict) == 2:
            weight = int(
                np.ceil(weight_dict[min(weight_dict)] /
                        weight_dict[max(weight_dict)]))
        else:
            weight_dict = {j: i for i, j in weight_dict.items()}
            weight = int(
                np.ceil(weight_dict[max(weight_dict)] /
                        weight_dict[min(weight_dict)]))

        self.HyperParameter.Choice('n_jobs', [n_job])
        self.HyperParameter.Choice('objective', [loss])
        self.HyperParameter.Choice('tree_method', tree_method)
        self.HyperParameter.Choice('gpu_id', [gpu_id])
        self.HyperParameter.Choice('scale_pos_weight', [1, weight])

        if vaild_data is not None:
            cv_score_list = []

        logger.info(f"Start XGBClassifier hyperparameter random search.")
        for i in range(1, iter_num + 1):
            self.HyperParameter.update()
            model = xgb.XGBClassifier(**self.HyperParameter.params)
            score = []
            if speedy:
                for _ in range(cv_num):
                    index_list = train_test_split(feature,
                                                  label,
                                                  test_size=test_size,
                                                  shuffle=True,
                                                  seed=np.random.choice(
                                                      range(100), 1)[0])
                    weight = None if sample_weight is None else sample_weight[
                        index_list[0]]
                    model.fit(feature.loc[index_list[0]],
                              label[index_list[0]],
                              sample_weight=weight)
                    cv_pred = pd.Series(model.predict(
                        feature.loc[index_list[1]]),
                                        index=label[index_list[1]].index)
                    score.append(metrics(label[index_list[1]], cv_pred))
            else:
                index_list = kfold(feature,
                                   label,
                                   n_splits=cv,
                                   shuffle=True,
                                   seed=np.random.choice(range(100), 1)[0])
                for n, index in enumerate(index_list):
                    weight = None if sample_weight is None else sample_weight[
                        index[0]]
                    model.fit(feature.loc[index[0]],
                              label[index[0]],
                              sample_weight=weight)
                    cv_pred = pd.Series(model.predict(feature.loc[index[1]]),
                                        index=label[index[1]].index)
                    score.append(metrics(label[index[1]], cv_pred))
            cv_score = np.mean(score)
            if vaild_data is not None:
                cv_score_list.append(cv_score)
                if metrics_min:
                    cv_score_list.sort()
                    if cv_score_list[int(
                            len(cv_score_list) * 0.2)] >= cv_score:
                        cv_pred = pd.Series(model.predict(vaild_data[0]),
                                            index=vaild_data[1].index)
                        cv_score = metrics(vaild_data[1], cv_pred)
                    else:
                        logger.info(
                            f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                            enter=False if i < iter_num else True)
                        continue
                else:
                    cv_score_list.sort(reverse=True)
                    if cv_score_list[int(
                            len(cv_score_list) * 0.2)] <= cv_score:
                        cv_pred = pd.Series(model.predict(vaild_data[0]),
                                            index=vaild_data[1].index)
                        cv_score = metrics(vaild_data[1], cv_pred)
                    else:
                        logger.info(
                            f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                            enter=False if i < iter_num else True)
                        continue
            if metrics_min:
                if cv_score < scoring:
                    scoring = cv_score
                    self.best_params = self.HyperParameter.params.copy()
                    self.best_params_history[i] = {
                        'score': scoring,
                        'best_params': self.best_params.copy()
                    }
                    if save_model_dir is not None:
                        model.save_model(
                            os.path.join(save_model_dir,
                                         f"{save_model_name}_model.json"))
                        with open(
                                os.path.join(save_model_dir,
                                             f"{save_model_name}_params.json"),
                                'w') as f:
                            json.dump(self.best_params, f)
            else:
                if cv_score > scoring:
                    scoring = cv_score
                    self.best_params = self.HyperParameter.params.copy()
                    self.best_params_history[i] = {
                        'score': scoring,
                        'best_params': self.best_params.copy()
                    }
                    if save_model_dir is not None:
                        model.save_model(
                            os.path.join(save_model_dir,
                                         f"{save_model_name}_model.json"))
                        with open(
                                os.path.join(save_model_dir,
                                             f"{save_model_name}_params.json"),
                                'w') as f:
                            json.dump(self.best_params, f)
            logger.info(
                f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                enter=False if i < iter_num else True)
        logger.info(f"XGBClassifier random search best score: {scoring:.4f}",
                    close=True,
                    time_mode=1)
        return self.best_params
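The scale_pos_weight candidate computed near the top of this method comes straight from the class counts of a binary label: the count of the smallest label value (the 0 class) divided by the count of the largest (the 1 class), rounded up. A worked check of that expression, assuming a hypothetical label with 900 negatives and 100 positives:

# Worked check of the scale_pos_weight candidate used above.
# Assumed example label: 900 negatives (0) and 100 positives (1).
import numpy as np
from collections import Counter

label = [0] * 900 + [1] * 100
weight_dict = Counter(label)
# Same expression as above: count of the smallest label value over the largest.
weight = int(np.ceil(weight_dict[min(weight_dict)] / weight_dict[max(weight_dict)]))
assert weight == 9   # the search then samples scale_pos_weight from {1, 9}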