def main():
    """Run split-wise cross-validation over pre-split sparse feature files.

    Relies on module-level globals: `params` (LightGBM params dict including
    'num_iterations'), `cnt` (number of splits) and `root_path` (directory
    holding train_{i}.npz / label_{i}.csv pairs).
    """
    cv_numiterations = params['num_iterations']
    print(params)
    for i in range(cnt):
        # Load the i-th split's sparse feature matrix and its label column.
        train_fea = scipy.sparse.load_npz(root_path + 'train_{}.npz'.format(i))
        train_lab = pd.read_csv(root_path +
                                'label_{}.csv'.format(i)).loc[:, 'label'].values
        # valid_fea = scipy.sparse.load_npz(root_path + 'train_{}.npz'.format((i + 1) % cnt))
        # valid_lab = pd.read_csv(root_path +
        #     'label_{}.csv'.format((i + 1) % cnt)).loc[:, 'label'].values
        lgb_train = lgb.Dataset(train_fea, label=train_lab)
        # lgb_valid = lgb.Dataset(valid_fea, label=valid_lab, reference=lgb_train)
        print('cross-valid cnt={}/{}'.format(i + 1, cnt))
        # Kept from a previous experiment that trained a single model per split:
        # solver = lgb.train(cv_params, lgb_train, valid_sets=[lgb_train],
        #                    valid_names=['train'], verbose_eval=True,
        #                    num_boost_round=cv_numiterations,
        #                    early_stopping_rounds=cv_early_stopping_round)
        # print(solver.feature_importance())
        lgb.cv(params, lgb_train,
               verbose_eval=True,
               num_boost_round=cv_numiterations)
def fit(self, X, y):
    """Fit by running lgbm.cv on the label-encoded targets.

    Stores the encoder in self._le and the CV metric history in self.results.
    """
    self.classes_ = np.unique(y)
    self.n_classes_ = len(self.classes_)
    self._le = _LGBMLabelEncoder().fit(y)
    training_labels = self._le.transform(y)
    xgdmat = lgbm.Dataset(X, label=training_labels)
    #xgdmat.construct()
    self.param_map.update({'objective': 'binary'})
    #print('before lgbm.cv')
    #print(self.param_map)
    # to verify:
    # if self.n_classes_ > 2:
    #     self.param_map.update({'num_class':self.n_classes_})
    #     self.param_map.update({'objective':'multi:softprob'})
    # Note: lgbm.cv resets the value of max_bin to 255
    # NOTE(review): every argument below is passed positionally; this assumes
    # the ordering matches the installed lightgbm's cv() signature exactly —
    # verify against the pinned lightgbm version.
    self.results = lgbm.cv(self.param_map, xgdmat, self.num_boost_round,
                           self.folds, self.nfold, self.stratified,
                           self.shuffle, self.metrics, self.fobj, self.feval,
                           self.init_model, self.feature_name,
                           self.categorical_feature,
                           self.early_stopping_rounds, self.fpreproc,
                           self.verbose_eval, self.show_stdv, self.seed,
                           self.callbacks)
def train_all(tdf, ydf, Tdf):
    """Cross-validate a binary LightGBM model, retrain on all data, predict Tdf.

    Parameters
    ----------
    tdf, ydf : training features / binary labels
    Tdf : test features (same columns as tdf)

    Returns
    -------
    Array of predicted probabilities for Tdf.
    """
    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': ['auc', 'binary_logloss'],
        'num_leaves': 31,
        # BUG FIX: was misspelled 'min_deal_in_leaf', which LightGBM ignores.
        'min_data_in_leaf': 1500,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'lambda_l1': 1.0,
        'lambda_l2': 1.0,
        'bagging_freq': 1,
        'learning_rate': 0.05,
        'max_bin': 255,
        'verbose': 0,
    }
    lgtrain = lgb.Dataset(tdf, ydf)
    cvr = lgb.cv(
        lgbm_params,
        lgtrain,
        num_boost_round=1400,
        nfold=5,
        verbose_eval=5,
    )
    print(cvr.keys())
    # BUG FIX: lgb.cv returns a dict of metric histories, not a booster, so
    # the original `lgb_clf.predict(Tdf)` raised AttributeError.  Train a
    # final model on the full data for the CV-chosen number of rounds.
    best_rounds = len(cvr['binary_logloss-mean'])
    lgb_clf = lgb.train(lgbm_params, lgtrain, num_boost_round=best_rounds)
    preds = lgb_clf.predict(Tdf)
    return preds
def fit(self, X_train, y_train):
    """Choose a round count via 3-fold CV, then train self.model on all data.

    Reads self.params, self.metric, self.maximize and self.verbose.
    """
    dtrain = lgb.Dataset(data=X_train, label=y_train)
    if self.verbose:
        bst = lgb.cv(self.params, dtrain, num_boost_round=10000, nfold=3,
                     early_stopping_rounds=50, verbose_eval=50)
    else:
        bst = lgb.cv(self.params, dtrain, num_boost_round=10000, nfold=3,
                     early_stopping_rounds=50)
    # Pick the round where mean-minus-std peaks (or mean-plus-std bottoms out),
    # then scale by 1.5 — presumably to compensate for the final fit seeing
    # more data than each CV fold (TODO confirm this heuristic).
    if self.maximize:
        best_rounds = int(np.argmax(np.array(bst[self.metric + '-mean']) -
                                    np.array(bst[self.metric + '-stdv'])) * 1.5)
    else:
        best_rounds = int(np.argmin(np.array(bst[self.metric + '-mean']) +
                                    np.array(bst[self.metric + '-stdv'])) * 1.5)
    if self.verbose:
        print('Best Iteration: {}'.format(best_rounds))
    self.model = lgb.train(self.params, dtrain, best_rounds)
def step_xgb(params):
    """One hyperopt step: run 6-fold LightGBM CV and report the loss."""
    fix_params(params)
    history = lgb.cv(params, dtrain,
                     num_boost_round=10000,
                     early_stopping_rounds=50,
                     nfold=6,
                     seed=params['seed'])
    losses = history['binary_logloss-mean']
    rounds = np.argmin(losses)
    score = np.min(losses)
    print(cname, score, rounds, params, self.now())
    return dict(loss=score, status=STATUS_OK)
def lgb_cv(train_x, train_y, params, rounds, folds):
    """Run LightGBM CV with the custom auc_f1 feval plus built-in AUC.

    Returns (rounds_used, last feval mean, last auc mean).
    """
    # BUG FIX: time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()
    log(str(train_x.columns))
    dtrain = lgb.Dataset(train_x, label=train_y)
    log('run cv: ' + 'round: ' + str(rounds))
    res = lgb.cv(params, dtrain, rounds, nfold=folds,
                 metrics=['eval_auc_f1', 'auc'], feval=eval_auc_f1,
                 early_stopping_rounds=200, verbose_eval=5)
    elapsed = time.perf_counter() - start
    log('Time used:' + str(elapsed) + 's')
    # Report the final (early-stopped) values of both metric histories.
    return (len(res['feval-mean']),
            res['feval-mean'][-1],
            res['auc-mean'][-1])
def cv(self, params, num_boost_round, feval):
    """Cross-validate with the instance's stratified folds; return best rounds.

    `feval` is expected to report a metric named 'gini' (hence 'gini-mean').
    """
    dtrain = lgbm.Dataset(data=self.X, label=self.y)
    bst = lgbm.cv(
        params=params,
        train_set=dtrain,
        nfold=self.N,
        folds=self.skf.split(self.X, self.y),
        num_boost_round=num_boost_round,
        metrics=['auc'],
        feval=feval,
        early_stopping_rounds=50,
        verbose_eval=10,
    )
    # +1 because argmax is a 0-based index while boosting rounds are 1-based.
    best_rounds = np.argmax(bst['gini-mean']) + 1
    best_score = np.max(bst['gini-mean'])
    logging.info('best rounds : {0}'.format(best_rounds))
    logging.info('best score : {0}'.format(best_score))
    logging.info('lightGBM params : \n{0}'.format(params))
    return best_rounds
def tune_n_iterations(df, label, params):
    """
    find best number of iterations by cv score
    return cv_results (as a DataFrame of the lgb.cv metric history)
    """
def gbm_cv(x_train, y_train, params):
    """Run 5-fold LightGBM cross-validation and return the metric-history dict."""
    dataset = lgb.Dataset(x_train, y_train)
    history = lgb.cv(
        params,
        dataset,
        num_boost_round=5000,
        nfold=5,
        early_stopping_rounds=100,
    )
    return history
def rand_obj(space):
    """ Defines some of the random search parameter space and objective function

    Returns [loss, sampled-parameter dict, best n_estimators].
    """
    subsample_dist = list(np.linspace(0.5, 1, 100))
    if space['boosting_type'] == 'goss':
        # GOSS performs its own gradient-based sampling; bagging stays off.
        space['subsample'] = 1.0
    else:
        space['subsample'] = random.sample(subsample_dist, 1)[0]
    cv_result = lgb.cv(space, train_set, num_boost_round=NUM_BOOST_ROUNDS,
                       nfold=N_FOLDS,
                       early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                       metrics=['auc', 'binary', 'xentropy'], seed=SEED)
    best_score = np.max(cv_result['auc-mean'])
    loss = 1 - best_score
    n_estimators = int(np.argmax(cv_result['auc-mean']) + 1)
    # BUG FIX: the original returned the undefined name `params`; the sampled
    # hyperparameter dict is `space`.
    return [loss, space, n_estimators]
def test_cv(self):
    """Exercise lgb.cv across shuffle modes, callbacks, custom folds, lambdarank."""
    X, y = load_boston(True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1,
                                              random_state=42)
    params = {'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train)
    # shuffle = False, override metric in params
    params_with_metric = {'metric': 'l2', 'verbose': -1}
    lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3,
           stratified=False, shuffle=False, metrics='l1', verbose_eval=False)
    # shuffle = True, callbacks
    lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False,
           shuffle=True, metrics='l1', verbose_eval=False,
           callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
    # self defined folds
    tss = TimeSeriesSplit(3)
    folds = tss.split(X_train)
    lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds,
           stratified=False, verbose_eval=False)
    # lambdarank: grouped queries loaded from the bundled example data
    X_train, y_train = load_svmlight_file(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.train'))
    q_train = np.loadtxt(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.train.query'))
    params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
    lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
           stratified=False, metrics='l2', verbose_eval=False)
def _get_score(self, params: Dict[str, Any], n: int) -> List[float]:
    """Score a candidate tree-parameter dict by repeated single-round CV.

    Parameters
    ----------
    params
        Tree parameters being searched over by this class.
    n
        Number of cross-validation repetitions (distinct seeds) used to
        estimate the hyperparameters' quality.

    Returns
    -------
    List of mean CV AUC values, one per seed.
    """
    default_tree_params = {
        "boosting_type": "gbdt",
        "learning_rate": 1,
        "objective": "binary",
        "bagging_freq": 1,
        "bagging_fraction": 1,
        "feature_fraction": 1,
        "bagging_seed": 323,
        "n_jobs": 1,
        "verbosity": -1,
    }
    # The fixed defaults win over the searched params on key collisions.
    unite_params = {**params, **default_tree_params}
    score = []
    for seed in range(n):
        cv_results = lgb.cv(params=unite_params, train_set=self._lgb_train,
                            num_boost_round=1, nfold=5, metrics='auc',
                            stratified=True, shuffle=True, seed=seed)
        # BUG FIX: "auc-mean" is a list (one entry per boosting round); the
        # original appended the whole list, breaking the declared
        # List[float] return type.  Append the final float instead.
        score.append(cv_results["auc-mean"][-1])
    return score
def adj_eta(self, lgbdata, seed=66, nfold=5, early_stopping_rounds=100):
    """Grid-search the learning rate via CV and record the best value.

    `self.config.flag` selects the metric direction: -1 means
    higher-is-better (take max), otherwise lower-is-better (take min).
    """
    best_params = {}
    for eta in [0.01, 0.015, 0.025, 0.05, 0.1]:
        self.config.params['learning_rate'] = eta
        cv_results = lgb.cv(self.config.params, lgbdata, seed=seed,
                            nfold=nfold,
                            num_boost_round=self.config.num_boost_round,
                            early_stopping_rounds=early_stopping_rounds,
                            verbose_eval=0)
        if self.config.flag == -1:
            mean_merror = pd.Series(
                cv_results[self.config.params['metric'] + '-mean']).max()
        else:
            mean_merror = pd.Series(
                cv_results[self.config.params['metric'] + '-mean']).min()
        # Multiplying both sides by flag folds both directions into one "<".
        if mean_merror * self.config.flag < self.config.min_merror * self.config.flag:
            self.config.min_merror = mean_merror
            best_params['learning_rate'] = eta
    # NOTE(review): the loop tunes self.config.params but the winner is
    # written to self.params — confirm these refer to the same dict.
    self.params.update(best_params)
def __call__(self, trial: trial_module.Trial) -> float: params = self._get_params(trial) # type: Dict[str, Any] dataset = copy.copy(self.dataset) callbacks = self._get_callbacks(trial) # type: List[Callable] eval_hist = lgb.cv( params, dataset, callbacks=callbacks, early_stopping_rounds=self.early_stopping_rounds, feval=self.feval, fobj=self.fobj, folds=self.cv, init_model=self.init_model, num_boost_round=self.n_estimators, ) # Dict[str, List[float]] values = eval_hist[ "{}-mean".format(self.eval_name) ] # type: List[float] best_iteration = len(values) # type: int trial.set_user_attr("best_iteration", best_iteration) trial_path = self.model_dir / "trial_{}".format(trial.number) trial_path.mkdir(exist_ok=True, parents=True) boosters = callbacks[0].boosters_ # type: ignore for i, b in enumerate(boosters): b.best_iteration = best_iteration b.free_dataset() booster_path = trial_path / "fold_{}.pkl".format(i) with booster_path.open("wb") as f: pickle.dump(b, f) return values[-1]
def select_best_auc_for_cat(self, func_cat=None, group='ID', cat_feats=None,
                            lgb_params=None, df_id=None):
    # func_cat is the group-wise categorical aggregation function from _aggFeature
    """For each categorical feature, build count/rank variants, aggregate them
    with `func_cat`, CV-score the result, and rank features by AUC.

    Returns a list of (cv_auc, feature_name) tuples, best first.
    """
    auc_list = []
    df = self.data.copy()
    for feature in tqdm_notebook(cat_feats):
        # 0.1 count feature via value_counts
        print(feature, '####################################################')
        ftr_ = df[[group, feature]].copy()
        a = ftr_[feature].value_counts()
        a = pd.DataFrame(list(zip(a.index, a.values)),
                         columns=[feature, 'vcounts'])
        ftr_ = ftr_.merge(a, 'left', on=feature)
        # 0.2 rank feature (label-encoded order)
        a = LabelEncoder()
        a_ = a.fit_transform(ftr_[feature])
        ftr_['rank'] = a_
        # 0.3 treat vcounts and rank as categorical features, run the
        # aggregation (feature addition/subtraction) and evaluate this
        # single feature's CV AUC
        new_df = func_cat(ftr_, group=group, feats=[feature, 'vcounts', 'rank'])
        fs = FeatureSelector(new_df.drop([group], 1))  # FeatureSelector must be imported
        fs.identify_collinear(correlation_threshold=0.98)
        new_df.drop(fs.ops['collinear'], axis=1, inplace=True)
        if df_id is not None:
            new_df = df_id.merge(new_df, 'left', on=group)
        y = new_df['LABEL'].copy()
        X = new_df.drop([group, 'LABEL'], axis=1).copy()
        lgb_data = lgb.Dataset(X, y)
        model_cv = lgb.cv(
            lgb_params,
            lgb_data,
            num_boost_round=2000,
            nfold=5,
            stratified=False,  # use stratified=False for regression-style folds
            early_stopping_rounds=100,
            verbose_eval=50,
            show_stdv=True)
        auc_list.append((model_cv['auc-mean'][-1], feature))
    auc_list.sort(reverse=True)
    return auc_list
def objective(params, n_folds=N_FOLDS):
    """Hyperopt objective: CV one parameter sample and append the result to disk."""
    global ITERATION
    ITERATION += 1

    # Flatten the nested boosting_type choice and recover its subsample.
    boosting_choice = params['boosting_type']
    subsample = boosting_choice.get('subsample', 1.0)
    params['boosting_type'] = boosting_choice['boosting_type']
    params['subsample'] = subsample

    # Hyperopt samples floats; LightGBM needs these as integers.
    for name in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
        params[name] = int(params[name])

    start = timer()
    cv_results = lgb.cv(params, train_set, num_boost_round=10000,
                        nfold=n_folds, early_stopping_rounds=100,
                        metrics='auc', seed=50)
    run_time = timer() - start

    auc_history = cv_results['auc-mean']
    best_score = np.max(auc_history)
    loss = 1 - best_score
    n_estimators = int(np.argmax(auc_history) + 1)

    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([loss, params, ITERATION, n_estimators, run_time])

    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'estimators': n_estimators, 'train_time': run_time,
            'status': STATUS_OK}
def test_fpreproc(self):
    """Verify lgb.cv applies fpreproc to each fold's train/test pair and params."""
    def preprocess_data(dtrain, dtest, params):
        # Shift the first feature and relabel the last 5 rows to a new class,
        # proving the hook may alter data, labels and params per fold.
        train_data = dtrain.construct().get_data()
        test_data = dtest.construct().get_data()
        train_data[:, 0] += 1
        test_data[:, 0] += 1
        dtrain.label[-5:] = 3
        dtest.label[-5:] = 3
        dtrain = lgb.Dataset(train_data, dtrain.label)
        dtest = lgb.Dataset(test_data, dtest.label, reference=dtrain)
        params['num_class'] = 4
        return dtrain, dtest, params

    X, y = load_iris(True)
    dataset = lgb.Dataset(X, y, free_raw_data=False)
    params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
    results = lgb.cv(params, dataset, num_boost_round=10,
                     fpreproc=preprocess_data)
    self.assertIn('multi_logloss-mean', results)
    self.assertEqual(len(results['multi_logloss-mean']), 10)
def lgb_eval(num_leaves, max_depth, lambda_l2, lambda_l1, min_child_samples,
             bagging_fraction, feature_fraction, min_child_weight):
    """Bayesian-optimization target: 3-fold CV AUC for the given hyperparameters."""
    hyper = {
        "objective": "binary",
        "metric": "auc",
        "num_leaves": int(num_leaves),
        "max_depth": int(max_depth),
        "lambda_l2": lambda_l2,
        "lambda_l1": lambda_l1,
        "num_threads": 32,
        "min_child_samples": int(min_child_samples),
        "min_child_weight": min_child_weight,
        "learning_rate": 0.05,
        "bagging_fraction": bagging_fraction,
        "feature_fraction": feature_fraction,
        "seed": 2020,
        "verbosity": -1,
    }
    dataset = lgb.Dataset(train_X, train_set.label)
    history = lgb.cv(hyper, dataset, num_boost_round=1000,
                     early_stopping_rounds=50, verbose_eval=False, nfold=3)
    # The final mean AUC of the early-stopped history is the score.
    return history['auc-mean'][-1]
def test_one_param(df, label, params):
    """Estimate num_iterations per outer KFold split via an inner lgb.cv, then train.

    Returns the list of chosen iteration counts (one per outer fold).
    """
    iterations = []
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    for i, (train_index, test_index) in enumerate(kf.split(df)):
        x_train, x_test, y_train, y_test = \
            df.loc[train_index, :], df.loc[test_index, :], \
            label.loc[train_index, :], label.loc[test_index, :]
        lgb_train = lgb.Dataset(x_train, y_train["label"].values, silent=True,
                                weight=y_train["weight"].values)
        lgb_test = lgb.Dataset(x_test, y_test["label"].values, silent=True,
                               weight=y_test["weight"].values)
        # NOTE(review): lgb.cv runs its own internal 5-fold split on the outer
        # fold's training data; the outer test split is only used as the
        # validation set of the subsequent lgb.train call.
        cv_results = lgb.cv(params, lgb_train, num_boost_round=1000, nfold=5,
                            stratified=False, shuffle=True,
                            early_stopping_rounds=50, seed=0)
        print(cv_results)
        # Early stopping truncates the history, so its length is the chosen count.
        params['num_iterations'] = len(cv_results['multi_logloss-mean'])
        iterations.append(params['num_iterations'])
        bst = lgb.train(
            params,
            lgb_train,
            valid_sets=lgb_test,
        )
    return iterations
def lgbm_cv(y, lgtrain):
    """5-fold stratified CV for a multiclass LightGBM model.

    Returns (params dict, optimal boosting round index, best CV logloss).
    """
    print("Light Gradient Boosting Classifier: ")
    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': len(set(y)),
        'metric': ['multi_logloss'],
        "learning_rate": 0.05,
        "num_leaves": 80,
        "max_depth": 6,
        "feature_fraction": 0.70,
        "bagging_fraction": 0.75,
        "reg_alpha": 0.15,
        "reg_lambda": 0.15,
        "min_child_weight": 0,
        "verbose": 0
    }
    modelstart = time.time()  # NOTE(review): recorded but never used here
    # Find Optimal Parameters / Boosting Rounds
    lgb_cv = lgb.cv(params=lgbm_params,
                    train_set=lgtrain,
                    num_boost_round=2000,
                    stratified=True,
                    nfold=5,
                    verbose_eval=100,
                    seed=23,
                    early_stopping_rounds=75)
    loss = lgbm_params["metric"][0]
    # Best round = index of the minimum mean logloss over the CV history.
    optimal_rounds = np.argmin(lgb_cv[str(loss) + '-mean'])
    best_cv_score = min(lgb_cv[str(loss) + '-mean'])
    print("\nOptimal Round: {}\nOptimal Score: {} + {}".format(
        optimal_rounds, best_cv_score,
        lgb_cv[str(loss) + '-stdv'][optimal_rounds]))
    return lgbm_params, optimal_rounds, best_cv_score
def cv(self, df_train):
    """Run 5-fold stratified CV with the instance's params and print results.

    Lazily builds self.d_train from df_train on first use; returns the
    lgb.cv metric-history dict.
    """
    # FIX: compare with None by identity (`is None`), not equality (PEP 8);
    # `==` can misbehave for objects overriding __eq__.
    if self.d_train is None:
        self.init_cv(df_train)
    # Deep-copy so lgb.cv's in-place param mutations don't leak back.
    params = copy.deepcopy(self.params)
    cv_result = lgb.cv(params, self.d_train, num_boost_round=3000, nfold=5,
                       stratified=True, shuffle=True, init_model=None,
                       feature_name='auto', categorical_feature='auto',
                       early_stopping_rounds=300, fpreproc=None,
                       verbose_eval=True, show_stdv=True, seed=1234,
                       callbacks=None)
    self.print_cv_result(cv_result)
    # Best results so far; final choice: max_depth=4, leaves=16, lr=0.02, ff=0.95
    # best 6, 48, 555, lr=0.02                                       0.193635
    # best 6, 32, 564, lr=0.02                                       0.193498
    # best 4, 32 or 16, 1034, lr=0.02                                0.193194
    # best 4, 32 or 16, 1016, ff=0.95 lr=0.02                        0.193161
    # best 4, 16, 982, ff=0.95, lr=0.02, full candidates, features   0.191541
    # best 4, 16, 1100, ff=0.9, lr=0.02, full candidates, features   0.191541
    # best 4, 16, 843, ff=0.6, lr=0.02, full + extra user_cat_20 detail features, 0.19145244312795784
    return cv_result
def LGB(train, test):
    """CV a binary LightGBM model, retrain with the CV round count, write a submission.

    `train` must contain 'orderType' (target) and 'userid'; predictions are
    written to ../result/lgb_baseline<version>.csv.
    """
    train_x = train.drop(['orderType', 'userid'], axis=1)
    train_y = train.orderType.values
    print(train_x.shape)
    print(len(train_y))
    import lightgbm as lgb
    param = {}
    param['task'] = 'train'
    param['boosting_type'] = 'gbdt'
    param['objective'] = 'binary'
    param['metric'] = 'auc'
    param['min_sum_hessian_in_leaf'] = 0.1
    param['learning_rate'] = 0.01
    param['verbosity'] = 2
    param['tree_learner'] = 'feature'
    param['num_leaves'] = 128
    param['feature_fraction'] = 0.7
    param['bagging_fraction'] = 0.7
    param['bagging_freq'] = 1
    param['num_threads'] = 16
    dtrain = lgb.Dataset(train_x, label=train_y)
    # 5-fold CV to find the early-stopped round count.
    res1gb = lgb.cv(param, dtrain, 5500, nfold=5, early_stopping_rounds=100,
                    verbose_eval=20)
    ro = len(res1gb['auc-mean'])
    # Retrain on all data for that many rounds.
    model = lgb.train(param, dtrain, ro, valid_sets=[dtrain], verbose_eval=500)
    # Align test columns with the training feature order before predicting.
    test_x = test[train_x.columns.values]
    y = model.predict(test_x)
    test['orderType'] = y
    test[['userid', 'orderType'
          ]].to_csv('../result/lgb_baseline' + str(version) + '.csv',
                    index=False)
def lgb_modelling(train, test, option_params, ID, y):
    """Grid-search LightGBM hyperparameters by CV, refit the best, predict test.

    Returns (pred_test, min_score, bst): exp-transformed test predictions,
    the best CV score, and the trained booster.
    """
    # Convert to LightGBM datasets
    X_train, train_lgb = make_dataset(train, ID, y)
    X_test, _ = make_dataset(test, ID, y)
    # Build the grid of hyperparameter candidates
    params = list(ParameterGrid(option_params))
    # Grid search
    min_score = np.inf
    for param in tqdm(params):
        result = lgb.cv(params=param, num_boost_round=1000,
                        train_set=train_lgb, early_stopping_rounds=10,
                        stratified=True, verbose_eval=False)
        metric = param['metric']
        score = min(result[metric + '-mean'])
        print(param, score)
        if min_score > score:
            min_score = score
            # Early stopping truncates the history at the best iteration.
            min_score_iteration = len(result[metric + '-mean'])
            best_hyper_param = param
    # Retrain once with the winning hyperparameters
    bst = lgb.train(params=best_hyper_param,
                    num_boost_round=min_score_iteration,
                    train_set=train_lgb, verbose_eval=False)
    # Predict on the test data (model was trained on log targets)
    pred_test = bst.predict(X_test)
    pred_test = np.exp(pred_test)
    # BUG FIX: the original used `==` (a discarded comparison, a no-op);
    # clamp negative predictions to zero with an actual assignment.
    pred_test[pred_test < 0] = 0
    return pred_test, min_score, bst
def objective(space_params):
    """Hyperopt objective for LightGBM regression: final CV l1 loss of a sample."""
    # cast integer params from float to int
    for param in integer_params:
        space_params[param] = int(space_params[param])
    # extract nested conditional parameters
    if space_params['boosting']['boosting'] == 'goss':
        top_rate = space_params['boosting'].get('top_rate')
        other_rate = space_params['boosting'].get('other_rate')
        # enforce 0 <= top_rate + other_rate <= 1 by clamping each to [0, 0.5]
        top_rate = max(top_rate, 0)
        top_rate = min(top_rate, 0.5)
        other_rate = max(other_rate, 0)
        other_rate = min(other_rate, 0.5)
        space_params['top_rate'] = top_rate
        space_params['other_rate'] = other_rate
    subsample = space_params['boosting'].get('subsample', 1.0)
    # flatten the nested boosting choice into plain LightGBM params
    space_params['boosting'] = space_params['boosting']['boosting']
    space_params['subsample'] = subsample
    space_params['feature_pre_filter'] = False
    train = lgb.Dataset(data, labels)
    # for classification, set stratified=True and metrics=EVAL_METRIC_LGBM_CLASS
    cv_results = lgb.cv(space_params, train, nfold=N_FOLDS, stratified=False,
                        early_stopping_rounds=100,
                        metrics=EVAL_METRIC_LGBM_REG, seed=42)
    best_loss = cv_results['l1-mean'][-1]  # 'l2-mean' for rmse
    # for classification, comment out the line above and uncomment the line below:
    # best_loss = 1 - cv_results['auc-mean'][-1]
    # if necessary, replace 'auc-mean' with '[your-preferred-metric]-mean'
    return {'loss': best_loss, 'status': STATUS_OK}
def cv(self, X=None, y=None, k_fold=5, dataset_train=None):
    """Run LightGBM CV and return the final mean score of the eval metric.

    Either pass raw (X, y) or a ready-made `dataset_train`.
    """
    logger.warning(
        "Warning: Running GBM cross-validation. This is currently unstable."
    )
    try_import_lightgbm()
    import lightgbm as lgb
    if dataset_train is None:
        dataset_train, _ = self.generate_datasets(X_train=X, Y_train=y)
    gc.collect()
    params = copy.deepcopy(self.params)
    eval_metric = self.get_eval_metric()
    # TODO: Either edit lgb.cv to return models / oof preds or make custom implementation!
    cv_params = {
        'params': params,
        'train_set': dataset_train,
        'num_boost_round': self.num_boost_round,
        'nfold': k_fold,
        'early_stopping_rounds': 150,
        'verbose_eval': 1000,
        'seed': 0,
    }
    # A callable metric goes through feval; a plain string goes into params.
    if type(eval_metric) != str:
        cv_params['feval'] = eval_metric
        cv_params['params']['metric'] = 'None'
    else:
        cv_params['params']['metric'] = eval_metric
    if self.problem_type == REGRESSION:
        cv_params['stratified'] = False
    logger.log(15, 'Current parameters:')
    logger.log(15, params)
    eval_hist = lgb.cv(
        **cv_params
    )  # TODO: Try to use custom early stopper to enable dart
    best_score = eval_hist[self.eval_metric_name + '-mean'][-1]
    logger.log(15, 'Best num_boost_round: %s ',
               len(eval_hist[self.eval_metric_name + '-mean']))
    logger.log(15, 'Best CV score: %s' % best_score)
    return best_score
def adj_min_child_weight(self, seed=66, nfold=5, early_stopping_rounds=100):
    """Tune min_child_weight by CV l1 score; keep the best value in self.params."""
    chosen = {}
    lgb_train = lgb.Dataset(self.df_train[self.best_feature].values,
                            self.df_train[self.label_name].values)
    for candidate in [1, 3, 5, 7]:
        self.params['min_child_weight'] = candidate
        cv_results = lgb.cv(self.params, lgb_train, seed=seed, nfold=nfold,
                            num_boost_round=self.num_boost_round,
                            early_stopping_rounds=early_stopping_rounds,
                            verbose_eval=0)
        # Lowest mean l1 over the early-stopped history is this candidate's score.
        best_l1 = pd.Series(cv_results['l1-mean']).min()
        if best_l1 <= self.min_merror:
            self.min_merror = best_l1
            chosen['min_child_weight'] = candidate
    self.params.update(chosen)
def score_feature_selection(df=None, train_features=None, cat_feats=None, target=None):
    """CV-score the given feature subset with a fixed LightGBM configuration.

    Returns (mean AUC, AUC std) at the last boosting round kept by early stopping.
    """
    lgb_params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'learning_rate': .1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'num_leaves': 31,
        'max_depth': -1,
        'seed': 13,
        'n_jobs': 4,
        'min_split_gain': .00001,
        'reg_alpha': .00001,
        'reg_lambda': .00001,
        'metric': 'auc',
    }
    dtrain = lgb.Dataset(df[train_features], target,
                         free_raw_data=False, silent=True)
    hist = lgb.cv(params=lgb_params, train_set=dtrain, num_boost_round=2000,
                  categorical_feature=cat_feats, nfold=5, stratified=True,
                  shuffle=True, early_stopping_rounds=50, verbose_eval=0,
                  seed=17)
    return hist['auc-mean'][-1], hist['auc-stdv'][-1]
def num_estimators(self, X, y, n_folds=3, learning_rate=0.3,
                   n_estimators=500, early_stop=20, **kwargs):
    """
    Setting up a high learning rate and using early stopping
    to find the number of estimators.
    """
    params = {}
    params['learning_rate'] = learning_rate
    # overwrite if something is passed as kwargs
    for k, v in params.items():
        if k in kwargs.keys():
            params[k] = kwargs[k]
    params['verbose'] = -1
    dtrain = lgb.Dataset(X, label=y, feature_name=self.feature_names,
                         categorical_feature=self.categorical_feat,
                         free_raw_data=False)
    self.data = dtrain
    # BUG FIX: the fold count and early-stopping patience were hard-coded to
    # 3 and 20, silently ignoring the n_folds / early_stop arguments.  Their
    # defaults match the old literals, so default behavior is unchanged.
    cv_result = lgb.cv(params, dtrain, nfold=n_folds, metrics=self.metric,
                       num_boost_round=n_estimators,
                       early_stopping_rounds=early_stop, stratified=False)
    # The early-stopped history length is the tuned round count.
    self.params['n_estimators'] = len(cv_result[self.metric + '-mean'])
    self.params['learning_rate'] = learning_rate
    return self
def sample(dtrain, dtest):
    """Template: build LightGBM datasets, CV for a round count, train, predict.

    NOTE(review): `predictors`, `dummies` and `target` are empty placeholders
    here — they must be filled in before this function is usable.
    """
    predictors = []
    dummies = []
    target = None
    # 01. train set and test set
    train_data = lgb.Dataset(dtrain[predictors], label=dtrain[target],
                             feature_name=list(dtrain[predictors].columns),
                             categorical_feature=dummies)
    test_data = lgb.Dataset(dtest[predictors], label=dtest[target],
                            feature_name=list(dtest[predictors].columns),
                            categorical_feature=dummies)
    # // 02. parameters
    param = {
        # num_leaves = 2^(max_depth)
        'max_depth': 6,
        'num_leaves': 64,
        'learning_rate': 0.03,
        'scale_pos_weight': 1,
        'num_threads': 40,
        'objective': 'binary',
        # Bagging params: bagging_fraction + bagging_freq (must be set
        # together), plus feature_fraction
        'bagging_fraction': 0.7,
        'bagging_freq': 1,
        'min_sum_hessian_in_leaf': 100
    }
    # class-imbalanced data set
    param['is_unbalance'] = 'True'
    param['metric'] = 'auc'
    # NOTE(review): 'stratified' is a lgb.cv keyword argument, not a params
    # entry — placing it here likely has no effect; confirm.
    param['stratified'] = 'True'
    # // 03. cv and train
    bst = lgb.cv(param, train_data, num_boost_round=1000, nfold=3,
                 early_stopping_rounds=30)
    estimators = lgb.train(param, train_data,
                           num_boost_round=len(bst['auc-mean']))
    # // 04. test predict
    ypred = estimators.predict(dtest[predictors])
def lgbm_eval(num_leaves, colsample_bytree, subsample, max_depth, reg_alpha,
              reg_lambda, min_split_gain, min_child_weight, min_data_in_leaf):
    """Bayesian-optimization target: negative CV RMSE for the sampled params."""
    params = dict()
    params["learning_rate"] = 0.01
    # params["silent"] = True
    params['device'] = 'gpu'
    # params["nthread"] = 16
    params['objective'] = 'regression'
    # BUG FIX: a trailing comma made this the tuple (326,); the seed must be
    # a plain int.
    params['seed'] = 326
    params["num_leaves"] = int(num_leaves)
    # Clamp ratio params into [0, 1]; the optimizer may propose values outside.
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['subsample'] = max(min(subsample, 1), 0)
    params['max_depth'] = int(max_depth)
    params['reg_alpha'] = max(reg_alpha, 0)
    params['reg_lambda'] = max(reg_lambda, 0)
    params['min_split_gain'] = min_split_gain
    params['min_child_weight'] = min_child_weight
    params['min_data_in_leaf'] = int(min_data_in_leaf)
    params['verbose'] = -1
    folds = get_folds(df=TRAIN_DF['totals.pageviews_MEAN'].reset_index(),
                      n_splits=NUM_FOLDS)
    clf = lightgbm.cv(
        params=params,
        train_set=lgbm_train,
        metrics=['rmse'],
        nfold=NUM_FOLDS,
        folds=folds,
        num_boost_round=10000,  # deliberately large; early stopping truncates
        early_stopping_rounds=200,
        verbose_eval=100,
        seed=47,
    )
    gc.collect()
    # Negated because the Bayesian optimizer maximizes its target.
    return -clf['rmse-mean'][-1]
def objective(params):
    """Hyperopt objective: 5-fold CV RMSE for the sampled parameters.

    Tracks the global best score and appends improvements to cv_lightgbm.txt.
    """
    global cnt, new_max
    # NOTE(review): the float-valued params are formatted to *strings* here;
    # confirm the pinned LightGBM version parses numeric strings.
    params = {
        'num_leaves': int(params['num_leaves']),
        'max_bin': int(params['max_bin']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        'lambda_l2': int(params['lambda_l2']),
    }
    cv_data = lg.cv(params, data, num_boost_round=4000, nfold=5, seed=2332,
                    stratified=False, early_stopping_rounds=5, metrics='rmse')
    score = cv_data['rmse-mean'][-1]
    # saving score to a file (new_max tracks the best-so-far, i.e. lowest, score)
    if score < new_max:
        new_max = score
        print("############### Score: {0}".format(score))
        print("############### Prms: ", params)
        print('..........................')
        with open("cv_lightgbm.txt", "a") as myfile:
            myfile.write(f'''
############### Score: {cnt}
############### Score: {score}
############### Prms:{params}
\n
''')
    cnt += 1
    return {
        'loss': score,
        'status': STATUS_OK,
        'eval_time': time.time(),
    }
def lgbfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5,
           early_stopping_rounds=50, dtest=None):
    """Optionally tune n_estimators via CV, fit `alg`, and report train metrics.

    `alg` is a sklearn-style LightGBM estimator; `dtrain` must contain a
    'target' column alongside the `predictors` columns.
    """
    starttime = datetime.datetime.now()
    if useTrainCV:
        lgb_param = alg.get_params()
        ltrain = lgb.Dataset(dtrain[predictors].values,
                             label=dtrain['target'].values)
        # ltest = lgb.Dataset(dtest[predictors].values)
        cvresult = lgb.cv(lgb_param, ltrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=False, metrics='auc')
        # The early-stopped history length is the tuned round count.
        alg.set_params(n_estimators=len(cvresult['auc-mean']))
        print("cv score:", cvresult['auc-mean'][-1])
    # fit
    alg.fit(dtrain[predictors], dtrain['target'], eval_metric='auc')
    # prediction on train set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    endtime = datetime.datetime.now()
    # output (note: these are *training-set* metrics, optimistic by design)
    print("accuracy: ", metrics.accuracy_score(dtrain['target'].values,
                                               dtrain_predictions))
    print("AUC score:", metrics.roc_auc_score(dtrain['target'],
                                              dtrain_predprob))
    print("time spent: ", (endtime - starttime).seconds, "s")
def objective(hyperparameters, iteration):
    """Objective function for grid and random search. Returns the cross
    validation score from a set of hyperparameters."""
    # n_estimators is determined by early stopping, so drop any preset value.
    hyperparameters.pop('n_estimators', None)

    cv_results = lgb.cv(hyperparameters, train_set, num_boost_round=10000,
                        nfold=N_FOLDS, early_stopping_rounds=100,
                        metrics='auc', seed=42)

    auc_history = cv_results['auc-mean']
    score = auc_history[-1]
    hyperparameters['n_estimators'] = len(auc_history)

    return [score, hyperparameters, iteration]
def train_model(train_df, train_y, test_df):
    """Train a LightGBM model, print feature importances, and CV-score it.

    Returns (model, test predictions, lgb.cv result dict).
    """
    # NOTE(review): the format string has one placeholder but two arguments;
    # train_y.shape is silently dropped by str.format.
    print("Training on: {}".format(train_df.shape, train_y.shape))
    train_lgb = lgb.Dataset(train_df, train_y)
    model = lgb.train(params, train_lgb, num_boost_round=ROUNDS)
    preds = model.predict(test_df)
    print("Features importance...")
    gain = model.feature_importance('gain')
    # Normalize gain to percentages for readability.
    ft = pd.DataFrame({
        'feature': model.feature_name(),
        'split': model.feature_importance('split'),
        'gain': 100 * gain / gain.sum()
    }).sort_values('gain', ascending=False)
    print(ft.head(25))
    # RMSE cross-validation with lgb.cv defaults for folds/rounds.
    score = lgb.cv(params, train_lgb, metrics="l2_root")
    '''
    plt.figure()
    ft[['feature','gain']].head(25).plot(kind='barh', x='feature', y='gain',
                                         legend=False, figsize=(10, 20))
    plt.gcf().savefig('features_importance.png')
    '''
    return model, preds, score
def cv(self, X, y, nfold=5, num_round=8000, early_stopping=10, verbose=True,
       params=None):
    """Cross-validate on (X, y) with the instance's base params.

    Extra `params` override the instance defaults for this call only.
    Returns the lgb.cv metric-history dict.
    """
    # BUG FIX: `params={}` was a mutable default argument, and
    # `trainParam = self.params` aliased the instance dict, so every call
    # permanently mutated self.params.  Copy instead (None-sentinel default
    # keeps the interface backward compatible).
    trainParam = dict(self.params)
    if params:
        trainParam.update(params)
    trainData = lgb.Dataset(X, label=y, feature_name=self.feaName,
                            categorical_feature=self.cateFea)
    result = lgb.cv(trainParam, trainData, feature_name=self.feaName,
                    categorical_feature=self.cateFea,
                    num_boost_round=num_round, nfold=nfold,
                    early_stopping_rounds=early_stopping,
                    verbose_eval=verbose)
    return result
def adj_fraction(self, seed=66, nfold=5, early_stopping_rounds=100):
    """Grid-search feature_fraction x bagging_fraction by CV l1; keep the best."""
    best_params = {}
    lgb_train = lgb.Dataset(self.df_train[self.best_feature].values,
                            self.df_train[self.label_name].values)
    for feature_fraction in [0.6, 0.7, 0.8, 0.9, 1]:
        for bagging_fraction in [0.6, 0.7, 0.8, 0.9, 1]:
            self.params['feature_fraction'] = feature_fraction
            self.params['bagging_fraction'] = bagging_fraction
            cv_results = lgb.cv(
                self.params,
                lgb_train,
                seed=seed,
                nfold=nfold,
                num_boost_round=self.num_boost_round,
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=0)
            # Lowest mean l1 across the early-stopped history is this pair's score.
            mean_merror = pd.Series(cv_results['l1-mean']).min()
            if mean_merror <= self.min_merror:
                self.min_merror = mean_merror
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
    self.params.update(best_params)
def adj_bin_leafdata(self, seed=66, nfold=5, early_stopping_rounds=100):
    """Grid-search max_bin x min_data_in_leaf by CV l1; keep the best pair."""
    best_params = {}
    lgb_train = lgb.Dataset(self.df_train[self.best_feature].values,
                            self.df_train[self.label_name].values)
    for max_bin in range(100, 255, 10):
        for min_data_in_leaf in range(10, 200, 10):
            self.params['max_bin'] = max_bin
            self.params['min_data_in_leaf'] = min_data_in_leaf
            cv_results = lgb.cv(
                self.params,
                lgb_train,
                seed=seed,
                nfold=nfold,
                num_boost_round=self.num_boost_round,
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=0)
            # Lowest mean l1 across the early-stopped history is this pair's score.
            mean_merror = pd.Series(cv_results['l1-mean']).min()
            if mean_merror <= self.min_merror:
                self.min_merror = mean_merror
                best_params['max_bin'] = max_bin
                best_params['min_data_in_leaf'] = min_data_in_leaf
    self.params.update(best_params)
def test_cv(self):
    """Smoke-test lgb.cv with an l1 metric and a learning-rate decay callback."""
    lgb_train, _ = template.test_template(return_data=True)
    decay = lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
    lgb.cv({'verbose': -1}, lgb_train,
           num_boost_round=20,
           nfold=5,
           metrics='l1',
           verbose_eval=False,
           callbacks=[decay])
'learning_rate': learning_rate, 'max_depth': max_depth, 'metric':'auc', 'min_data_in_leaf': min_data_in_leaf, 'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf, 'num_leaves': num_leaves, 'n_jobs': 30, 'tree_learner': 'serial', 'objective': 'binary', 'verbosity': -1 } print(">>>>",i,"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<") print(param) model = lgb.cv(param, trn_data, 1000000, feature_name=X.columns.tolist(), verbose_eval=500, early_stopping_rounds = 4000, nfold=4) num_leaves_list.append(num_leaves) learning_rate_list.append(learning_rate) feature_fraction_list.append(feature_fraction) bagging_fraction_list.append(bagging_fraction) #max_bin_list.append(max_bin) scale_pos_weight_list.append(scale_pos_weight) max_depth_list.append(max_depth) bagging_freq_list.append(bagging_freq) min_data_in_leaf_list.append(min_data_in_leaf) min_sum_hessian_in_leaf_list.append(min_sum_hessian_in_leaf) nround_p.append(len(model['auc-mean'])) auc_cv_mean.append(model['auc-mean'][len(model['auc-mean'])-1]) auc_cv_std.append(model['auc-stdv'][len(model['auc-mean'])-1])
def run_cv():
    """Per-label LightGBM CV + out-of-fold training on word/char n-gram features.

    For each label in ``label_candidates``: find the best boosting-round count
    via ``lgbm.cv`` (AUC, early stopping), then train one model per fold,
    filling out-of-fold predictions for the validation rows and accumulating
    the fold-averaged test predictions.  Writes per-fold validation preds and
    an averaged test submission under ../data/output/preds/gbdt/.

    Relies on module-level helpers/constants: read_train_data, read_test_data,
    get_extractor_word, get_extractor_char, conduct_transform, get_idx_trn_val,
    get_gbdt_params, COMMENT_COL, ID_COL, label_candidates, n_classes, K,
    num_boost_round, early_stopping_rounds, verbose_eval.
    """
    # read train/test data
    df_train = read_train_data()
    X_train = df_train[COMMENT_COL].values
    id_train = df_train[ID_COL].values.tolist()
    y_true = df_train[label_candidates].values
    df_test = read_test_data()
    X_test = df_test[COMMENT_COL].values
    id_test = df_test[ID_COL].values.tolist()

    # extract n-gram word level feature (extractor fitted on train+test text)
    print('Extracting n-gram features')
    extractor_word = get_extractor_word()
    extractor_word.fit(pd.concat((df_train.loc[:, COMMENT_COL], df_test.loc[:, COMMENT_COL])))
    X_train_word = conduct_transform(extractor_word, X_train)
    X_test_word = conduct_transform(extractor_word, X_test)
    print('Train word data shape : {0}'.format(X_train_word.shape))
    print('Test word data shape : {0}'.format(X_test_word.shape))

    # extract n-gram char level feature
    extractor_char = get_extractor_char()
    extractor_char.fit(pd.concat((df_train.loc[:, COMMENT_COL], df_test.loc[:, COMMENT_COL])))
    X_train_char = conduct_transform(extractor_char, X_train)
    X_test_char = conduct_transform(extractor_char, X_test)
    print('Train char data shape : {0}'.format(X_train_char.shape))
    print('Test char data shape : {0}'.format(X_test_char.shape))

    # combine word and char blocks; CSR for fast row slicing in the fold loop
    X_train_word_char = hstack([X_train_word, X_train_char])
    X_test_word_char = hstack([X_test_word, X_test_char])
    X_train_word_char = X_train_word_char.tocsr()
    X_test_word_char = X_test_word_char.tocsr()
    print('Train word char data shape : {0}'.format(X_train_word_char.shape))
    print('Test word char data shape : {0}'.format(X_test_word_char.shape))

    # get idx of trn/val for each fold
    print('Getting array index of train/valid for each fold')
    idx_trn_val = get_idx_trn_val(id_train)

    # preds on test/valid
    preds_test = np.zeros((X_test_word_char.shape[0], n_classes))
    preds_valid = np.zeros((X_train_word_char.shape[0], n_classes))

    # cv and train/predict, one binary model per label
    for label_col, label_name in enumerate(label_candidates):
        print('\nlabel column : {0}'.format(label_col))
        print('label name : {0}'.format(label_name))
        # cv best boost rounds
        train_set = lgbm.Dataset(data=X_train_word_char, label=y_true[:, label_col])
        params = get_gbdt_params(label_name)
        print('lgbm params : {0}'.format(params))
        hist = lgbm.cv(
            params=params,
            train_set=train_set,
            folds=idx_trn_val,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            metrics=['auc'],
            verbose_eval=verbose_eval,
        )
        bst_boost_rounds = np.argmax(hist['auc-mean']) + 1  # rounds are 1-based
        bst_acc_score = np.max(hist['auc-mean'])
        print('label column : {0}'.format(label_col))
        print('label name : {0}'.format(label_name))
        print('best boost rounds {0}, best auc score {1}'.format(bst_boost_rounds, bst_acc_score))

        # oof train and predict
        for fold, (idx_trn, idx_val) in enumerate(idx_trn_val):
            print('fold {0}'.format(fold))
            print(' train')
            dat_trn = lgbm.Dataset(data=X_train_word_char[idx_trn, :],
                                   label=y_true[idx_trn, label_col])
            model = lgbm.train(
                params=params,
                train_set=dat_trn,
                num_boost_round=bst_boost_rounds,
                verbose_eval=verbose_eval,
            )
            print(' predict')
            preds_valid[idx_val, label_col] = model.predict(
                data=X_train_word_char[idx_val, :], num_iteration=bst_boost_rounds)
            # BUGFIX: accumulate (+=) so the test prediction is the average of
            # the K fold models; plain '=' kept only the last fold's preds / K.
            preds_test[:, label_col] += model.predict(
                data=X_test_word_char, num_iteration=bst_boost_rounds) / K
            del model

    # ensemble cv score (macro AUC over the out-of-fold predictions)
    score = roc_auc_score(y_true=y_true, y_score=preds_valid, average='macro')
    print('\ncv score : {0}'.format(score))

    # divide data for ensemble: one CSV of validation preds per fold
    for fold, (_, idx_val) in enumerate(idx_trn_val):
        preds_valid_fold = preds_valid[idx_val, :].T
        df_preds_val = pd.DataFrame()
        idx_val_set = set(idx_val)  # set for O(1) membership in the filter below
        df_preds_val[ID_COL] = [
            id for idx, id in enumerate(id_train) if idx in idx_val_set
        ]
        for idx, label in enumerate(label_candidates):
            df_preds_val[label] = preds_valid_fold[idx]
        df_preds_val.to_csv('../data/output/preds/gbdt/{0}fold_valid.csv'.format(fold), index=False)

    # record ensemble result (fold-averaged test predictions)
    preds_test = preds_test.T
    df_preds_test = pd.DataFrame()
    df_preds_test[ID_COL] = id_test
    for idx, label in enumerate(label_candidates):
        df_preds_test[label] = preds_test[idx]
    df_preds_test.to_csv('../data/output/preds/gbdt/avg_submit.csv', index=False)
# --- fragment of a grid search over LightGBM regression hyper-parameters ---
# NOTE(review): the head of the `params` dict, the enclosing loop, `lgb_train`,
# `all_data`, `i` and the *_list accumulators are defined outside this chunk,
# and the final `pd.DataFrame({` call continues past its end.
    'metric': 'rmse',
    'num_leaves': num_leaves,
    'learning_rate': learning_rate,
    'feature_fraction': feature_fraction,
    'bagging_fraction': bagging_fraction,
    'max_bin': max_bin,
    'max_depth': max_depth,
    'bagging_freq': bagging_freq,
    'verbose': 0
}
print(">>>>",i,"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
print(params)
# CV with a huge round cap; early stopping (50 rounds) picks the real length.
# lgb.cv returns a dict of per-round metric lists ('rmse-mean'/'rmse-stdv').
model = lgb.cv(params, lgb_train, num_boost_round=100000, early_stopping_rounds=50, feature_name=all_data.columns.tolist(), nfold=4, stratified=False)
# Record the sampled hyper-parameters for this trial.
num_leaves_list.append(num_leaves)
learning_rate_list.append(learning_rate)
feature_fraction_list.append(feature_fraction)
bagging_fraction_list.append(bagging_fraction)
max_bin_list.append(max_bin)
max_depth_list.append(max_depth)
bagging_freq_list.append(bagging_freq)
# Number of rounds actually run, plus CV RMSE mean/stdv at the final round.
nround_p.append(len(model['rmse-mean']))
rmse_cv_mean.append(model['rmse-mean'][len(model['rmse-mean'])-1])
rmse_cv_std.append(model['rmse-stdv'][len(model['rmse-mean'])-1])
i = i + 1
grid = pd.DataFrame({
## Training num_round = 10 bst = lgb.train(param, train_data, num_round, valid_sets=[test_data]) bst = lgb.train(param, train_data, num_round) bst.save_model('tmp.model') # The trained model can also be dumped to JSON format: json_model = bst.dump_model() print(json_model) # 好长... ################################################################## ## Booster bst = lgb.Booster(model_file='tmp.model') # 这里把加载文件设置为参数了 ################################################################## ## CV # Training with 5-fold CV: num_round = 10 lgb.cv(param, train_data, num_round, nfold=5) ################################################################## ## Early Stopping # If you have a validation set, you can use early stopping to find the optimal number of boosting rounds. # Early stopping requires at least one set in valid_sets. If there is more than one, it will use all of them: bst = lgb.train(param, train_data, num_round, valid_sets=valid_sets, early_stopping_rounds=10) bst.save_model('model.txt', num_iteration=bst.best_iteration) # The model will train until the validation score stops improving. # Validation error needs to improve at least every early_stopping_rounds to continue training. # If early stopping occurs, the model will have an additional field: bst.best_iteration. # Note that train() will return a model from the last iteration, not the best one. # And you can set num_iteration=bst.best_iteration when saving model. # This works with both metrics to minimize (L2, log loss, etc.) and to maximize (NDCG, AUC). # Note that if you specify more than one evaluation metric, all of them will be used for early stopping.
# --- stacking fragment: per-label LightGBM CV on first-level model scores ---
# NOTE(review): `test_submit`, `n_classes`, `y_true`, `preds_score`,
# `idx_trn_val`, `label_candidates` and `get_gbdt_params` are defined outside
# this chunk, and the trailing `lgbm.train(` call continues past its end.
preds_test = np.zeros((test_submit.shape[0], n_classes))
preds_valid = np.zeros((y_true.shape[0], n_classes))
params = get_gbdt_params()
for label_col, label_name in enumerate(label_candidates):
    print('label column : {0}'.format(label_col))
    print('label name : {0}'.format(label_name))
    # cv best boost rounds
    train_set = lgbm.Dataset(data=preds_score, label=y_true[:, label_col])
    hist = lgbm.cv(
        params=params,
        train_set=train_set,
        folds=idx_trn_val,
        num_boost_round=2000,
        early_stopping_rounds=50,
        metrics=['auc'],
        verbose_eval=10,
    )
    # Best round = 1-based index of the maximum mean CV AUC.
    bst_boost_rounds = np.argmax(hist['auc-mean']) + 1
    bst_acc_score = np.max(hist['auc-mean'])
    print('label column : {0}'.format(label_col))
    print('label name : {0}'.format(label_name))
    print('best boost rounds {0}, best auc score {1}'.format(bst_boost_rounds, bst_acc_score))
    # oof train and predict
    for fold, (idx_trn, idx_val) in enumerate(idx_trn_val):
        print('oof train for label : {0}'.format(label_name))
        dat_trn = lgbm.Dataset(data=preds_score[idx_trn, :], label=y_true[idx_trn, label_col])
        model = lgbm.train(
            # NOTE(review): call arguments continue beyond this chunk.
def run_cv():
    """Per-label LightGBM CV + out-of-fold training on TF-IDF + numeric features.

    Builds word/char TF-IDF features plus min-max-scaled numeric indicator
    columns, then for each label: finds the best boosting-round count via
    ``lgbm.cv`` (AUC, early stopping), trains one model per fold, fills
    out-of-fold validation predictions and accumulates fold-averaged test
    predictions.  Writes per-fold valid preds and an averaged submission
    under ../data/output/preds/gbdt/.

    Relies on module-level helpers/constants: read_train_data, read_test_data,
    get_indicators_and_clean_comments, char_analyzer, get_idx_trn_val,
    get_gbdt_params, ID_COL, label_candidates, n_classes, K,
    num_boost_round, early_stopping_rounds, verbose_eval.
    """
    # read train/test data
    df_train = read_train_data()
    id_train = df_train[ID_COL].values.tolist()
    y_true = df_train[label_candidates].values
    df_test = read_test_data()
    id_test = df_test[ID_COL].values.tolist()

    # add hand-crafted indicator columns and a cleaned comment column in place
    get_indicators_and_clean_comments(df=df_train)
    get_indicators_and_clean_comments(df=df_test)

    # numeric feature columns = everything except text/id/label columns
    num_features = [f_ for f_ in df_train.columns
                    if f_ not in ["comment_text", "clean_comment", "id",
                                  "remaining_chars", 'has_ip_address'] + label_candidates]
    skl = MinMaxScaler()
    train_num_features = csr_matrix(skl.fit_transform(df_train[num_features]))
    # BUGFIX: use transform (not fit_transform) on the test set so test rows
    # are scaled with the statistics fitted on train; re-fitting made the
    # train/test numeric features inconsistent.
    test_num_features = csr_matrix(skl.transform(df_test[num_features]))

    # Get TF-IDF features (vectorizers fitted on train+test text)
    train_text = df_train['clean_comment']
    test_text = df_test['clean_comment']
    all_text = pd.concat([train_text, test_text])

    # First on real words
    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        stop_words='english',
        ngram_range=(1, 2),
        max_features=20000)
    word_vectorizer.fit(all_text)
    train_word_features = word_vectorizer.transform(train_text)
    test_word_features = word_vectorizer.transform(test_text)
    del word_vectorizer
    gc.collect()

    # Now use the char_analyzer to get another TFIDF
    char_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        tokenizer=char_analyzer,
        analyzer='word',
        ngram_range=(1, 1),
        max_features=50000)
    char_vectorizer.fit(all_text)
    train_char_features = char_vectorizer.transform(train_text)
    test_char_features = char_vectorizer.transform(test_text)
    del char_vectorizer
    gc.collect()

    # combine all sparse blocks; CSR for fast row slicing in the fold loop
    X_train_word_char = hstack([train_char_features, train_word_features, train_num_features]).tocsr()
    X_test_word_char = hstack([test_char_features, test_word_features, test_num_features]).tocsr()

    # get idx of trn/val for each fold
    print('Getting array index of train/valid for each fold')
    idx_trn_val = get_idx_trn_val(id_train)

    # preds on test/valid
    preds_test = np.zeros((X_test_word_char.shape[0], n_classes))
    preds_valid = np.zeros((X_train_word_char.shape[0], n_classes))

    # cv and train/predict, one binary model per label
    for label_col, label_name in enumerate(label_candidates):
        print('\nlabel column : {0}'.format(label_col))
        print('label name : {0}'.format(label_name))
        # cv best boost rounds
        train_set = lgbm.Dataset(data=X_train_word_char, label=y_true[:, label_col])
        params = get_gbdt_params(label_name)
        print('lgbm params : {0}'.format(params))
        hist = lgbm.cv(
            params=params,
            train_set=train_set,
            folds=idx_trn_val,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            metrics=['auc'],
            verbose_eval=verbose_eval,
        )
        bst_boost_rounds = np.argmax(hist['auc-mean']) + 1  # rounds are 1-based
        bst_acc_score = np.max(hist['auc-mean'])
        print('label column : {0}'.format(label_col))
        print('label name : {0}'.format(label_name))
        print('best boost rounds {0}, best auc score {1}'.format(bst_boost_rounds, bst_acc_score))

        # oof train and predict
        for fold, (idx_trn, idx_val) in enumerate(idx_trn_val):
            print('fold {0}'.format(fold))
            print(' train')
            dat_trn = lgbm.Dataset(data=X_train_word_char[idx_trn, :],
                                   label=y_true[idx_trn, label_col])
            model = lgbm.train(
                params=params,
                train_set=dat_trn,
                num_boost_round=bst_boost_rounds,
                verbose_eval=verbose_eval,
            )
            print(' predict')
            preds_valid[idx_val, label_col] = model.predict(
                data=X_train_word_char[idx_val, :], num_iteration=bst_boost_rounds)
            # BUGFIX: accumulate (+=) so the test prediction is the average of
            # the K fold models; plain '=' kept only the last fold's preds / K.
            preds_test[:, label_col] += model.predict(
                data=X_test_word_char, num_iteration=bst_boost_rounds) / K
            del model

    # ensemble cv score (macro AUC over the out-of-fold predictions)
    score = roc_auc_score(y_true=y_true, y_score=preds_valid, average='macro')
    print('\ncv score : {0}'.format(score))

    # divide data for ensemble: one CSV of validation preds per fold
    for fold, (_, idx_val) in enumerate(idx_trn_val):
        preds_valid_fold = preds_valid[idx_val, :].T
        df_preds_val = pd.DataFrame()
        idx_val_set = set(idx_val)  # set for O(1) membership in the filter below
        df_preds_val[ID_COL] = [
            id for idx, id in enumerate(id_train) if idx in idx_val_set
        ]
        for idx, label in enumerate(label_candidates):
            df_preds_val[label] = preds_valid_fold[idx]
        df_preds_val.to_csv('../data/output/preds/gbdt/{0}fold_valid.csv'.format(fold), index=False)

    # record ensemble result (fold-averaged test predictions)
    preds_test = preds_test.T
    df_preds_test = pd.DataFrame()
    df_preds_test[ID_COL] = id_test
    for idx, label in enumerate(label_candidates):
        df_preds_test[label] = preds_test[idx]
    df_preds_test.to_csv('../data/output/preds/gbdt/avg_submit.csv', index=False)
# --- tail of a ParameterGrid search with LightGBM CV and a custom CCC metric ---
# NOTE(review): the head of the `params` dict, `X_train`/`y_train`,
# `X_val`/`y_val` and `metric_ccc` are defined outside this chunk.
    'num_leaves':[100],
    'bagging_fraction': [0.8],
    'bagging_freq':[5],
    'min_data_in_leaf':[10],
    'min_gain_to_split':[0],
    'num_iterations':[500],
    'lambda_l1':[1],
    'lambda_l2':[0.1],
    'verbose':[1]
    # 'is_unbalance':[True]
}
# Expand the grid into a list of concrete parameter dicts.
params=list(ParameterGrid(params))
lgbtrain=lgb.Dataset(X_train,label=y_train[:,1])
lgbeval=lgb.Dataset(X_val,label=y_val[:,1],reference=lgbtrain)
best_ccc=0
for param in params:
    print(param)
    # NOTE(review): lgb.cv returns a dict of per-round metric lists, not a
    # booster, so the commented-out code below (clf.best_score, clf.predict,
    # clf.best_iteration) would only work with lgb.train — confirm before
    # re-enabling it.
    clf = lgb.cv(param, lgbtrain, nfold=5, num_boost_round=param['num_iterations'], \
                 early_stopping_rounds=50, feval=metric_ccc, verbose_eval=True)
    print(clf)
    # if clf.best_score['valid_0']['ccc value:']>best_ccc:
    #     best_ccc=clf.best_score['valid_0']['ccc value:']
    #     best_param=param
    #     best_it=clf.best_iteration
    # print('noval best interation: '+str(clf.best_iteration))
    # y_pred=clf.predict(X_val)
    # y_pred2 = np.clip(correct(y_train[:,1], y_pred),-1,1)
    # ccc2,_=calccc.ccc(y_val[:,1],correct(y_train[:,1], y_pred2))
    # print('best ccc:',best_ccc,'(',ccc2,')')
    # print('best param:',best_param)
    # print('best iteration:',best_it)