def search(self, train_data, metrics, valid_data=None, iter_num=None, cv=3, metrics_min=True,
           speedy=True, speedy_param=(20000, 0.3), save_model_dir=None, save_model_name=None):
    """model params search method.

    Args:
        train_data: A list of (X, y, sample_weight) tuple pairs to use as train sets.
        metrics: model metrics function.
        valid_data: A list of (X, y, sample_weight) tuple pairs to use as validation sets.
        iter_num: search count.
        cv: number of cross validation folds.
        metrics_min: whether a smaller metrics value is better.
        speedy: whether to use the speedy method.
        speedy_param: if speedy is used, test_size is set to
            1-round(min(speedy_param[0], len(train_data[1])*speedy_param[1])/len(train_data[1]), 2).
        save_model_dir: str, folder in which to save the model; only works when model is 'XGBClassifier' or 'XGBRegressor'.
        save_model_name: str, save model name prefix; only works when model is 'XGBClassifier' or 'XGBRegressor'.
    Returns:
        a dict of the best model params.
    Raises:
        params error.
    """
    logger = Logger(name=self.params.name)
    logger.info(f"Start hyperparameter {self.params.method} search.")
    import warnings
    warnings.filterwarnings("ignore")
    if speedy:
        test_size = 1-round(min(speedy_param[0], len(train_data[1])*speedy_param[1])/len(train_data[1]), 2)
    if self.params.model_name=='XGBClassifier':
        self._xgb_weight(train_data[1])
    if valid_data is not None:
        cv_score_list = []
    if self.params.method=='grid':
        if iter_num is None:
            iter_num = self.hp.cardinality()
        else:
            iter_num = min(iter_num, self.hp.cardinality())
    if iter_num is None:
        iter_num = 100
    for i in range(1, iter_num+1):
        self.hp.update(self.best_params)
        self.params.model = self.params.model_init(**self.hp.params)
        score = []
        if speedy:
            for _ in range(cv):
                index = train_test_split(train_data[0], train_data[1], test_size,
                                         seed=np.random.choice(range(100), 1)[0])
                score.append(self._model_fit_predict(train_data, metrics, index, mode=1))
        else:
            index_list = kfold(train_data[0], train_data[1], n_splits=cv,
                               seed=np.random.choice(range(100), 1)[0])
            for n, index in enumerate(index_list):
                score.append(self._model_fit_predict(train_data, metrics, index, mode=1))
        cv_score = np.mean(score)
        if valid_data is not None:
            cv_score_list.append(cv_score)
            cv_score_list.sort()
            threshold = cv_score_list[int(len(cv_score_list)*(0.2 if metrics_min else 0.8))]
            if (metrics_min and threshold>=cv_score) or (not metrics_min and threshold<=cv_score):
                cv_score = self._model_fit_predict(valid_data, metrics, index=None, mode=0)
            else:
                logger.info(f"Model {self.params.method} search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                            enter=False if i<iter_num else True)
                continue
        if i==1:
            scoring = cv_score
        if (metrics_min and cv_score<=scoring) or (not metrics_min and cv_score>=scoring):
            scoring = cv_score
            self.best_params = self.hp.params.copy()
            self.best_params_history[i] = {'score':scoring, 'best_params':self.best_params.copy()}
            if self.params.model_name in ['XGBClassifier', 'XGBRegressor']:
                if save_model_dir is not None:
                    if save_model_name is None:
                        save_model_name = self.params.name
                    # the fitted estimator is kept on self.params.model
                    self.params.model.save_model(os.path.join(save_model_dir, f"{save_model_name}_model.json"))
                    with open(os.path.join(save_model_dir, f"{save_model_name}_params.json"), 'w') as f:
                        json.dump(self.best_params, f)
        logger.info(f"Model {self.params.method} search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                    enter=False if i<iter_num else True)
    logger.info(f"Model {self.params.method} search best score: {scoring:.4f}", close=True, time_mode=1)
    return self.best_params
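# Worked example of the `speedy` split size used by `search` above (comment only, not
# executed): with 100,000 training rows and the default speedy_param=(20000, 0.3),
#     min(20000, 100000*0.3) = 20000
#     test_size = 1 - round(20000/100000, 2) = 0.8
# so each of the `cv` random splits fits on roughly 20,000 rows and scores on the
# remaining 80%, which is what makes speedy mode cheaper than full k-fold CV.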
def search(self, feature, label, vaild_data=None, sample_weight=None, metrics=auc_roc,
           loss='regression', iter_num=100, scoring=0.5, cv=5, cv_num=3, metrics_min=True,
           speedy=True, speedy_param=(20000, 0.3), save_model_dir=None, save_model_name='lgb'):
    """LGBMRegressor model params search using the RandomSearch method.

    Args:
        feature: pandas dataframe, model's feature.
        label: pandas series, model's label.
        vaild_data: (X, y) tuple to use as a validation set, for which metrics will be computed.
        sample_weight: pd.Series or np.array, sample weight, shape is (n,).
        metrics: model metrics function, default is `la.metrics.auc_roc`.
        loss: LGBMRegressor param 'objective'.
        iter_num: search count.
        scoring: baseline metrics value; a new parameter set is kept only if it improves on this value.
        cv: number of cross validation folds.
        cv_num: if the speedy method is used, the number of random train/test splits evaluated per iteration.
        metrics_min: whether a smaller metrics value is better.
        speedy: whether to use the speedy method.
        speedy_param: if speedy is used, test_size is set to
            1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2).
        save_model_dir: str, folder in which to save the model.
        save_model_name: str, save model name prefix, "`lgb`_model.json" and "`lgb`_params.json".
    Returns:
        a dict of the best LGBMRegressor model params.
    Raises:
        params error.
    """
    logger = Logger(name='lgb')
    logger.info("api is deprecated and will be removed in 1.5.0")
    logger.info("please use la.param_search.RandomSearch")
    import warnings
    warnings.filterwarnings("ignore")
    import lightgbm as lgb
    assert lgb.__version__>=__lightgbm_version__, f'lightgbm version should be >={__lightgbm_version__}.'
    if speedy:
        test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2)
    self.HyperParameter.Choice('objective', [loss])
    if vaild_data is not None:
        cv_score_list = []
    logger.info("Start LGBMRegressor hyperparameter random search.")
    for i in range(1, iter_num+1):
        self.HyperParameter.update()
        model = lgb.LGBMRegressor(**self.HyperParameter.params)
        score = []
        if speedy:
            for _ in range(cv_num):
                index_list = train_test_split(feature, label, test_size=test_size, shuffle=True,
                                              seed=np.random.choice(range(100), 1)[0])
                weight = None if sample_weight is None else sample_weight[index_list[0]]
                model.fit(feature.loc[index_list[0]], label[index_list[0]], sample_weight=weight)
                cv_pred = pd.Series(model.predict(feature.loc[index_list[1]]), index=label[index_list[1]].index)
                score.append(metrics(label[index_list[1]], cv_pred))
        else:
            index_list = kfold(feature, label, n_splits=cv, shuffle=True,
                               seed=np.random.choice(range(100), 1)[0])
            for n, index in enumerate(index_list):
                weight = None if sample_weight is None else sample_weight[index[0]]
                model.fit(feature.loc[index[0]], label[index[0]], sample_weight=weight)
                cv_pred = pd.Series(model.predict(feature.loc[index[1]]), index=label[index[1]].index)
                score.append(metrics(label[index[1]], cv_pred))
        cv_score = np.mean(score)
        if vaild_data is not None:
            cv_score_list.append(cv_score)
            if metrics_min:
                cv_score_list.sort()
                if cv_score_list[int(len(cv_score_list)*0.2)]>=cv_score:
                    cv_pred = pd.Series(model.predict(vaild_data[0]), index=vaild_data[1].index)
                    cv_score = metrics(vaild_data[1], cv_pred)
                else:
                    logger.info(f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                                enter=False if i<iter_num else True)
                    continue
            else:
                cv_score_list.sort(reverse=True)
                if cv_score_list[int(len(cv_score_list)*0.2)]<=cv_score:
                    cv_pred = pd.Series(model.predict(vaild_data[0]), index=vaild_data[1].index)
                    cv_score = metrics(vaild_data[1], cv_pred)
                else:
                    logger.info(f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                                enter=False if i<iter_num else True)
                    continue
        if (metrics_min and cv_score<scoring) or (not metrics_min and cv_score>scoring):
            scoring = cv_score
            self.best_params = self.HyperParameter.params.copy()
            self.best_params_history[i] = {'score':scoring, 'best_params':self.best_params.copy()}
            if save_model_dir is not None:
                # LGBMRegressor has no save_model(); persist the underlying booster instead
                model.booster_.save_model(os.path.join(save_model_dir, f"{save_model_name}_model.json"))
                with open(os.path.join(save_model_dir, f"{save_model_name}_params.json"), 'w') as f:
                    json.dump(self.best_params, f)
        logger.info(f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                    enter=False if i<iter_num else True)
    logger.info(f"LGBMRegressor random search best score: {scoring:.4f}", close=True, time_mode=1)
    return self.best_params
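# Usage sketch for the deprecated LGBMRegressor random search above. The instance name
# `rs`, the helper metric, and the surrounding data setup are illustrative assumptions,
# not part of this module; only the `search` signature comes from the method itself.
#
#     # X: pandas DataFrame of features, y: pandas Series of targets
#     # rs: an instance of the class that defines this `search` method
#     def my_rmse(y_true, y_pred):                     # any metrics(y_true, y_pred) callable works
#         return float(((y_true - y_pred) ** 2).mean() ** 0.5)
#
#     best_params = rs.search(X, y,
#                             vaild_data=(X_valid, y_valid),
#                             metrics=my_rmse,
#                             loss='regression', iter_num=50,
#                             scoring=1e9,               # high baseline since smaller RMSE is better
#                             metrics_min=True, speedy=True)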
def search(self, feature, label, vaild_data=None, sample_weight=None, metrics=auc_roc,
           loss='binary:logistic', iter_num=100, scoring=0.5, cv=5, cv_num=3, metrics_min=True,
           speedy=True, speedy_param=(20000, 0.3), gpu_id=-1, save_model_dir=None, save_model_name='xgb'):
    """XGBClassifier model params search using the RandomSearch method.

    Args:
        feature: pandas dataframe, model's feature.
        label: pandas series, model's label.
        vaild_data: (X, y) tuple to use as a validation set, for which metrics will be computed.
        sample_weight: pd.Series or np.array, sample weight, shape is (n,).
        metrics: model metrics function, default is `la.metrics.auc_roc`.
        loss: XGBClassifier param 'objective'.
        iter_num: search count.
        scoring: baseline metrics value; a new parameter set is kept only if it improves on this value.
        cv: number of cross validation folds.
        cv_num: if the speedy method is used, the number of random train/test splits evaluated per iteration.
        metrics_min: whether a smaller metrics value is better.
        speedy: whether to use the speedy method.
        speedy_param: if speedy is used, test_size is set to
            1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2).
        gpu_id: int, gpu device ordinal to use; -1 means do not use a gpu.
        save_model_dir: str, folder in which to save the model.
        save_model_name: str, save model name prefix, "`xgb`_model.json" and "`xgb`_params.json".
    Returns:
        a dict of the best XGBClassifier model params.
    Raises:
        params error.
    """
    logger = Logger(name='xgb')
    logger.info("api is deprecated and will be removed in 1.5.0")
    logger.info("please use la.param_search.RandomSearch")
    import warnings
    warnings.filterwarnings("ignore")
    import xgboost as xgb
    assert xgb.__version__>=__xgboost_version__, f'xgboost version should be >={__xgboost_version__}.'
    if speedy:
        test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2)
    tree_method = ['gpu_hist'] if gpu_id>-1 else ['auto', 'exact', 'approx', 'hist']
    n_job = int(np.ceil(cpu_count()*0.8))
    weight_dict = Counter(label)
    if len(weight_dict)==2:
        weight = int(np.ceil(weight_dict[min(weight_dict)]/weight_dict[max(weight_dict)]))
    else:
        weight_dict = {j:i for i,j in weight_dict.items()}
        weight = int(np.ceil(weight_dict[max(weight_dict)]/weight_dict[min(weight_dict)]))
    self.HyperParameter.Choice('n_jobs', [n_job])
    self.HyperParameter.Choice('objective', [loss])
    self.HyperParameter.Choice('tree_method', tree_method)
    self.HyperParameter.Choice('gpu_id', [gpu_id])
    self.HyperParameter.Choice('scale_pos_weight', [1, weight])
    if vaild_data is not None:
        cv_score_list = []
    logger.info("Start XGBClassifier hyperparameter random search.")
    for i in range(1, iter_num+1):
        self.HyperParameter.update()
        model = xgb.XGBClassifier(**self.HyperParameter.params)
        score = []
        if speedy:
            for _ in range(cv_num):
                index_list = train_test_split(feature, label, test_size=test_size, shuffle=True,
                                              seed=np.random.choice(range(100), 1)[0])
                weight = None if sample_weight is None else sample_weight[index_list[0]]
                model.fit(feature.loc[index_list[0]], label[index_list[0]], sample_weight=weight)
                cv_pred = pd.Series(model.predict(feature.loc[index_list[1]]), index=label[index_list[1]].index)
                score.append(metrics(label[index_list[1]], cv_pred))
        else:
            index_list = kfold(feature, label, n_splits=cv, shuffle=True,
                               seed=np.random.choice(range(100), 1)[0])
            for n, index in enumerate(index_list):
                weight = None if sample_weight is None else sample_weight[index[0]]
                model.fit(feature.loc[index[0]], label[index[0]], sample_weight=weight)
                cv_pred = pd.Series(model.predict(feature.loc[index[1]]), index=label[index[1]].index)
                score.append(metrics(label[index[1]], cv_pred))
        cv_score = np.mean(score)
        if vaild_data is not None:
            cv_score_list.append(cv_score)
            if metrics_min:
                cv_score_list.sort()
                if cv_score_list[int(len(cv_score_list)*0.2)]>=cv_score:
                    cv_pred = pd.Series(model.predict(vaild_data[0]), index=vaild_data[1].index)
                    cv_score = metrics(vaild_data[1], cv_pred)
                else:
                    logger.info(f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                                enter=False if i<iter_num else True)
                    continue
            else:
                cv_score_list.sort(reverse=True)
                if cv_score_list[int(len(cv_score_list)*0.2)]<=cv_score:
                    cv_pred = pd.Series(model.predict(vaild_data[0]), index=vaild_data[1].index)
                    cv_score = metrics(vaild_data[1], cv_pred)
                else:
                    logger.info(f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                                enter=False if i<iter_num else True)
                    continue
        if (metrics_min and cv_score<scoring) or (not metrics_min and cv_score>scoring):
            scoring = cv_score
            self.best_params = self.HyperParameter.params.copy()
            self.best_params_history[i] = {'score':scoring, 'best_params':self.best_params.copy()}
            if save_model_dir is not None:
                model.save_model(os.path.join(save_model_dir, f"{save_model_name}_model.json"))
                with open(os.path.join(save_model_dir, f"{save_model_name}_params.json"), 'w') as f:
                    json.dump(self.best_params, f)
        logger.info(f"Random search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                    enter=False if i<iter_num else True)
    logger.info(f"XGBClassifier random search best score: {scoring:.4f}", close=True, time_mode=1)
    return self.best_params
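# Worked example of the class-weight heuristic computed near the top of the XGBClassifier
# search above (comment only): for a binary label with 90,000 zeros and 10,000 ones,
#     Counter(label) -> {0: 90000, 1: 10000}
#     weight = ceil(90000 / 10000) = 9
# so `scale_pos_weight` is sampled from {1, 9}, i.e. either no re-weighting or weighting
# positives by the negative/positive count ratio.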