Пример #1
0
def main():
    """Run LightGBM cross-validation over each of the `cnt` training shards.

    Relies on module-level globals: `params`, `cnt`, `root_path`, plus the
    `lgb`, `scipy` and `pd` imports.
    """
    num_rounds = params['num_iterations']
    print(params)

    for shard in range(cnt):
        # Load sparse features and their labels for this shard.
        features = scipy.sparse.load_npz(root_path + 'train_{}.npz'.format(shard))
        labels = pd.read_csv(
            root_path + 'label_{}.csv'.format(shard)).loc[:, 'label'].values

        train_ds = lgb.Dataset(features, label=labels)

        print('cross-valid cnt={}/{}'.format(shard + 1, cnt))

        lgb.cv(params, train_ds,
               verbose_eval=True,
               num_boost_round=num_rounds)
Пример #2
0
    def fit(self, X, y):
        """Fit via LightGBM cross-validation, storing the CV history.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Class labels; encoded internally with _LGBMLabelEncoder.

        Side effects: sets classes_, n_classes_, _le and results.
        """
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        self._le = _LGBMLabelEncoder().fit(y)
        training_labels = self._le.transform(y)
        xgdmat = lgbm.Dataset(X, label=training_labels)
        self.param_map.update({'objective': 'binary'})
        # NOTE(review): the multiclass branch (n_classes_ > 2) was commented
        # out; the objective is always forced to 'binary'. Confirm callers
        # only pass binary targets.
        # Note: lgbm.cv resets the value of max_bin to 255.
        # Keyword arguments replace the previous long positional list, which
        # silently breaks if lgbm.cv's parameter order ever changes.
        self.results = lgbm.cv(self.param_map,
                               xgdmat,
                               num_boost_round=self.num_boost_round,
                               folds=self.folds,
                               nfold=self.nfold,
                               stratified=self.stratified,
                               shuffle=self.shuffle,
                               metrics=self.metrics,
                               fobj=self.fobj,
                               feval=self.feval,
                               init_model=self.init_model,
                               feature_name=self.feature_name,
                               categorical_feature=self.categorical_feature,
                               early_stopping_rounds=self.early_stopping_rounds,
                               fpreproc=self.fpreproc,
                               verbose_eval=self.verbose_eval,
                               show_stdv=self.show_stdv,
                               seed=self.seed,
                               callbacks=self.callbacks)
Пример #3
0
def train_all(tdf, ydf, Tdf):
  """Cross-validate a binary LightGBM model on (tdf, ydf), then train a
  final booster on all of the data and return predictions for Tdf.

  Bug fixes versus the previous version:
  - 'min_deal_in_leaf' was a typo silently ignored by LightGBM; the
    intended parameter is 'min_data_in_leaf'.
  - lgb.cv() returns a dict of metric histories, not a model, so calling
    .predict() on its result raised AttributeError; a booster is now
    trained explicitly with the CV-selected round count.
  """
  lgbm_params = {
      'task': 'train',
      'boosting_type': 'gbdt',
      'objective': 'binary',
      'metric': ['auc', 'binary_logloss'],
      'num_leaves': 31,
      'min_data_in_leaf': 1500,
      'feature_fraction': 0.7,
      'bagging_fraction': 0.7,
      'lambda_l1': 1.0,
      'lambda_l2': 1.0,
      'bagging_freq': 1,
      'learning_rate': 0.05,
      'max_bin': 255,
      'verbose': 0
  }
  lgtrain = lgb.Dataset(tdf, ydf)
  cvr = lgb.cv(
      lgbm_params,
      lgtrain,
      num_boost_round=1400,
      nfold=5,
      verbose_eval=5
  )
  print(cvr.keys())
  # The length of a metric history equals the number of rounds CV ran;
  # use it to fit the final model on the full training set.
  best_rounds = len(cvr['auc-mean'])
  lgb_clf = lgb.train(lgbm_params, lgtrain, num_boost_round=best_rounds)
  preds = lgb_clf.predict(Tdf)
  return preds
Пример #4
0
    def fit(self, X_train, y_train):
        """Choose a boosting-round count via 3-fold CV, then train the model."""
        dtrain = lgb.Dataset(data=X_train, label=y_train)

        cv_kwargs = dict(num_boost_round=10000, nfold=3, early_stopping_rounds=50)
        if self.verbose:
            cv_kwargs['verbose_eval'] = 50
        bst = lgb.cv(self.params, dtrain, **cv_kwargs)

        means = np.array(bst[self.metric + '-mean'])
        stdvs = np.array(bst[self.metric + '-stdv'])
        # Pessimistic score (mean adjusted by stdv), then pad the selected
        # round index by 50%.
        if self.maximize:
            best_rounds = int(np.argmax(means - stdvs) * 1.5)
        else:
            best_rounds = int(np.argmin(means + stdvs) * 1.5)

        if self.verbose:
            print('Best Iteration: {}'.format(best_rounds))

        self.model = lgb.train(self.params, dtrain, best_rounds)
 def step_xgb(params):
     """Hyperopt-style objective: run LightGBM CV for one parameter set.

     Returns a dict whose 'loss' is the best mean binary_logloss (lower is
     better) so the surrounding optimizer can minimize it.
     """
     # NOTE(review): relies on `fix_params`, `dtrain`, `cname` and `self`
     # from the enclosing scope; assumes `params` carries a 'seed' key.
     fix_params(params)
     cv = lgb.cv(params, dtrain,
                 num_boost_round=10000,
                 early_stopping_rounds=50,
                 nfold=6,
                 seed=params['seed'])
     # Best round = index of the minimum mean logloss across folds.
     rounds = np.argmin(cv['binary_logloss-mean'])
     score = np.min(cv['binary_logloss-mean'])
     print(cname, score, rounds, params, self.now())
     return dict(loss=score, status=STATUS_OK)
Пример #6
0
def lgb_cv(train_x, train_y, params, rounds, folds):
    """Run LightGBM CV with a custom auc/f1 eval and log the elapsed time.

    Returns
    -------
    tuple
        (number of rounds run, last 'feval' mean, last 'auc' mean).
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for interval timing.
    start = time.perf_counter()
    log(str(train_x.columns))
    dtrain = lgb.Dataset(train_x, label=train_y)
    log('run cv: ' + 'round: ' + str(rounds))
    res = lgb.cv(params, dtrain, rounds, nfold=folds,
                 metrics=['eval_auc_f1', 'auc'], feval=eval_auc_f1,
                 early_stopping_rounds=200, verbose_eval=5)
    elapsed = (time.perf_counter() - start)
    log('Time used:' + str(elapsed) + 's')
    # The last entry of each history is the score at the final round.
    return len(res['feval-mean']), res['feval-mean'][-1], res['auc-mean'][-1]
Пример #7
0
 def cv(self, params, num_boost_round, feval):
     """Cross-validate with this instance's stratified folds and return the
     1-based round index that maximizes the gini mean."""
     train_set = lgbm.Dataset(data=self.X, label=self.y)
     history = lgbm.cv(
         params=params,
         train_set=train_set,
         nfold=self.N,
         folds=self.skf.split(self.X, self.y),
         num_boost_round=num_boost_round,
         metrics=['auc'],
         feval=feval,
         early_stopping_rounds=50,
         verbose_eval=10,
     )
     gini_means = history['gini-mean']
     best_rounds = np.argmax(gini_means) + 1
     best_score = np.max(gini_means)
     logging.info('best rounds : {0}'.format(best_rounds))
     logging.info('best score : {0}'.format(best_score))
     logging.info('lightGBM params : \n{0}'.format(params))
     return best_rounds
Пример #8
0
def tune_n_iterations(df,
                      params,
                      n_splits=5,
                      target="target",
                      metrics=['multi_logloss'],
                      RANDOM_STATE=42,
                      verbose=1):
    """
    find best number of iterations by cv score
    return cv_results (as a pandas DataFrame)

    Fixes:
    - StratifiedKFold requires shuffle=True when random_state is set
      (scikit-learn raises ValueError otherwise; without shuffling the
      random_state had no effect anyway).
    - DataFrame.drop's positional axis argument was removed in pandas 2.0;
      use the axis keyword.
    """
    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
                             random_state=RANDOM_STATE)
    features = df.drop(target, axis=1)
    labels = df[target]
    folds = skfold.split(X=features, y=labels.values)
    trn_ds = lgb.Dataset(features, label=labels)
    cv_results = lgb.cv(params=params,
                        train_set=trn_ds,
                        folds=folds,
                        metrics=metrics,
                        verbose_eval=verbose)
    return pd.DataFrame(cv_results)
Пример #9
0
def gbm_cv(x_train, y_train, params):
    """Run 5-fold LightGBM CV (up to 5000 rounds, early stopping after 100
    stagnant rounds) and return the per-round metric history dict."""
    dataset = lgb.Dataset(x_train, y_train)
    history = lgb.cv(params, dataset,
                     num_boost_round=5000,
                     nfold=5,
                     early_stopping_rounds=100)
    return history
Пример #10
0
        def rand_obj(space):
            """
            Defines some of the random search parameter space and objective function
            """
            # Candidate subsample values in [0.5, 1.0].
            subsample_dist = list(np.linspace(0.5, 1, 100))

            # GOSS does not use bagging, so subsample is pinned to 1.0;
            # otherwise draw one candidate at random.
            if space['boosting_type'] == 'goss':
                space['subsample'] = 1.0
            else:
                space['subsample'] = random.sample(subsample_dist, 1)[0]

            cv_result = lgb.cv(space, train_set, num_boost_round=NUM_BOOST_ROUNDS,
                    nfold=N_FOLDS, early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                    metrics=['auc', 'binary', 'xentropy'], seed=SEED)

            best_score = np.max(cv_result['auc-mean'])
            loss = 1 - best_score
            # 1-based index of the round with the best mean AUC.
            n_estimators = int(np.argmax(cv_result['auc-mean']) + 1)

            # NOTE(review): `params` is not defined in this function — it is
            # presumably captured from the enclosing scope. Confirm it is not
            # meant to be `space`.
            return [loss, params, n_estimators]
Пример #11
0
 def test_cv(self):
     """Smoke-test lgb.cv across shuffle modes, callbacks, user-supplied
     folds, and a lambdarank objective (no assertions; passes if no raise).
     """
     X, y = load_boston(True)
     X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     # shuffle = False; the metrics= argument overrides 'metric' in params
     params_with_metric = {'metric': 'l2', 'verbose': -1}
     lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=False,
            metrics='l1', verbose_eval=False)
     # shuffle = True, with a learning-rate decay callback
     lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=True,
            metrics='l1', verbose_eval=False,
            callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
     # self-defined folds via a TimeSeriesSplit iterator
     tss = TimeSeriesSplit(3)
     folds = tss.split(X_train)
     lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds, stratified=False, verbose_eval=False)
     # lambdarank: ranking data with per-query group sizes
     X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train'))
     q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query'))
     params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
     lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, stratified=False, metrics='l2', verbose_eval=False)
Пример #12
0
 def test_cv(self):
     """Smoke-test lgb.cv across shuffle modes, callbacks, user-supplied
     folds, and a lambdarank objective (no assertions; passes if no raise).
     """
     X, y = load_boston(True)
     X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     # shuffle = False; the metrics= argument overrides 'metric' in params
     params_with_metric = {'metric': 'l2', 'verbose': -1}
     lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=False,
            metrics='l1', verbose_eval=False)
     # shuffle = True, with a learning-rate decay callback
     lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=True,
            metrics='l1', verbose_eval=False,
            callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
     # self-defined folds via a TimeSeriesSplit iterator
     tss = TimeSeriesSplit(3)
     folds = tss.split(X_train)
     lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds, stratified=False, verbose_eval=False)
     # lambdarank: ranking data with per-query group sizes
     X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train'))
     q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query'))
     params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
     lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, stratified=False, metrics='l2', verbose_eval=False)
Пример #13
0
    def _get_score(self, params: Dict[str, Any], n: int) -> List[float]:
        """Score a parameter set by repeated 5-fold cross-validation.

        Parameters
        ----------
        params
            tree parameters being searched by this class
        n
            number of cross-validation repeats used to score the parameters

        Returns
        -------
        List[float]
            one mean-AUC value per seed in range(n).
        """
        default_tree_params = {
            "boosting_type": "gbdt",
            "learning_rate": 1,
            "objective": "binary",
            "bagging_freq": 1,
            "bagging_fraction": 1,
            "feature_fraction": 1,
            "bagging_seed": 323,
            "n_jobs": 1,
            "verbosity": -1
        }
        # NOTE(review): the defaults are merged LAST, so they override any
        # overlapping keys in the searched `params` — confirm the key sets
        # are disjoint, or swap the merge order.
        unite_params = {**params, **default_tree_params}

        score = []
        for seed in range(n):
            cv_results = lgb.cv(params=unite_params,
                                train_set=self._lgb_train,
                                num_boost_round=1,
                                nfold=5,
                                metrics='auc',
                                stratified=True,
                                shuffle=True,
                                seed=seed)
            # "auc-mean" is a per-round list; with num_boost_round=1 take its
            # single (last) element so the return value actually matches the
            # declared List[float] (it previously appended the whole list).
            score.append(cv_results["auc-mean"][-1])
        return score
Пример #14
0
    def adj_eta(self, lgbdata, seed=66, nfold=5, early_stopping_rounds=100):
        """Grid-search the learning rate via LightGBM CV.

        Tries a fixed candidate list; whichever rate improves the stored
        best metric (direction controlled by config.flag) is recorded.
        """
        best_params = {}
        for eta in [0.01, 0.015, 0.025, 0.05, 0.1]:
            self.config.params['learning_rate'] = eta
            cv_results = lgb.cv(self.config.params,
                                lgbdata,
                                seed=seed,
                                nfold=nfold,
                                num_boost_round=self.config.num_boost_round,
                                early_stopping_rounds=early_stopping_rounds,
                                verbose_eval=0)
            # flag == -1: metric is maximized, so take the max mean;
            # otherwise minimized, so take the min mean.
            if self.config.flag == -1:
                mean_merror = pd.Series(
                    cv_results[self.config.params['metric'] + '-mean']).max()
            else:
                mean_merror = pd.Series(
                    cv_results[self.config.params['metric'] + '-mean']).min()

            # Multiplying both sides by flag folds the two comparison
            # directions into a single inequality.
            if mean_merror * self.config.flag < self.config.min_merror * self.config.flag:
                self.config.min_merror = mean_merror
                best_params['learning_rate'] = eta
        # NOTE(review): the loop mutates self.config.params but the winner is
        # written to self.params — confirm this asymmetry is intentional.
        self.params.update(best_params)
Пример #15
0
    def __call__(self, trial: trial_module.Trial) -> float:
        """Optuna objective: run LightGBM CV for this trial's parameters,
        pickle each fold's booster under model_dir/trial_<n>/, and return
        the final mean value of the evaluation metric.
        """
        params = self._get_params(trial)  # type: Dict[str, Any]
        # copy.copy: work on a shallow copy of the shared dataset —
        # presumably so self.dataset stays reusable across trials (confirm).
        dataset = copy.copy(self.dataset)
        callbacks = self._get_callbacks(trial)  # type: List[Callable]
        eval_hist = lgb.cv(
            params,
            dataset,
            callbacks=callbacks,
            early_stopping_rounds=self.early_stopping_rounds,
            feval=self.feval,
            fobj=self.fobj,
            folds=self.cv,
            init_model=self.init_model,
            num_boost_round=self.n_estimators,
        )  # Dict[str, List[float]]
        values = eval_hist[
            "{}-mean".format(self.eval_name)
        ]  # type: List[float]
        # With early stopping the history length equals the best iteration.
        best_iteration = len(values)  # type: int

        trial.set_user_attr("best_iteration", best_iteration)

        trial_path = self.model_dir / "trial_{}".format(trial.number)

        trial_path.mkdir(exist_ok=True, parents=True)

        # assumes callbacks[0] is the booster-collecting callback whose
        # boosters_ holds one model per CV fold — TODO confirm ordering.
        boosters = callbacks[0].boosters_  # type: ignore

        for i, b in enumerate(boosters):
            b.best_iteration = best_iteration

            # Drop the training data reference before pickling.
            b.free_dataset()

            booster_path = trial_path / "fold_{}.pkl".format(i)

            with booster_path.open("wb") as f:
                pickle.dump(b, f)

        return values[-1]
Пример #16
0
    def select_best_auc_for_cat(self, func_cat=None, group='ID',cat_feats=None,lgb_params=None,df_id=None):
        """Evaluate each categorical feature individually by CV AUC.

        For each feature: derive count and rank encodings, aggregate them
        with func_cat, drop collinear columns, then run LightGBM CV and
        record the final mean AUC. Returns (auc, feature) pairs sorted
        descending by AUC.
        """
        # func_cat is the vertical (group-wise) categorical aggregation
        # function from _aggFeature.
        auc_list = []
        df = self.data.copy()
        for feature in tqdm_notebook(cat_feats):
            # 0.1 count feature via value_counts
            print(feature, '####################################################')
            ftr_ = df[[group, feature]].copy()
            a = ftr_[feature].value_counts()
            a = pd.DataFrame(list(zip(a.index, a.values)), columns=[feature, 'vcounts'])
            ftr_ = ftr_.merge(a, 'left', on=feature)
            # 0.2 rank feature (label encoding)
            a = LabelEncoder()
            a_ = a.fit_transform(ftr_[feature])
            ftr_['rank'] = a_
            # 0.3 treat the derived vcounts and rank as categorical features,
            # run the aggregation feature addition/subtraction, and evaluate
            # the single feature on its own
            new_df = func_cat(ftr_, group=group, feats=[feature,'vcounts', 'rank'])
            fs = FeatureSelector(new_df.drop([group], 1))  # remember to import FeatureSelector
            fs.identify_collinear(correlation_threshold=0.98)
            new_df.drop(fs.ops['collinear'], axis=1, inplace=True)
            if df_id is not None:
                new_df = df_id.merge(new_df, 'left', on=group)

            y = new_df['LABEL'].copy()
            X = new_df.drop([group, 'LABEL'], axis=1).copy()
            lgb_data = lgb.Dataset(X, y)

            model_cv = lgb.cv(
                lgb_params,
                lgb_data,
                num_boost_round=2000,
                nfold=5,
                stratified=False,  # stratified=False for regression
                early_stopping_rounds=100,
                verbose_eval=50,
                show_stdv=True)
            auc_list.append((model_cv['auc-mean'][-1], feature))
        auc_list.sort(reverse=True)
        return auc_list
Пример #17
0
def objective(params, n_folds = N_FOLDS):
    """Hyperopt objective: cross-validate one LightGBM parameter set.

    Appends a CSV row (loss, params, iteration, n_estimators, runtime) to
    `out_file` and returns the hyperopt result dict with loss = 1 - best
    mean AUC.
    """
    global ITERATION
    ITERATION += 1
    # 'boosting_type' arrives as a nested dict from the search space;
    # flatten it and pull out its conditional 'subsample'.
    subsample = params['boosting_type'].get('subsample', 1.0)
    params['boosting_type'] = params['boosting_type']['boosting_type']
    params['subsample'] = subsample
    # The sampler returns floats; LightGBM expects integers for these.
    for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
        params[parameter_name] = int(params[parameter_name])
    start = timer()
    cv_results = lgb.cv(params, train_set, num_boost_round = 10000, nfold = n_folds,
                        early_stopping_rounds = 100, metrics = 'auc', seed = 50)

    run_time = timer() - start
    best_score = np.max(cv_results['auc-mean'])
    loss = 1 - best_score
    n_estimators = int(np.argmax(cv_results['auc-mean']) + 1)
    # Fix: the file handle was previously opened and never closed; a
    # context manager guarantees it is flushed and closed even on error.
    with open(out_file, 'a') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, params, ITERATION, n_estimators, run_time])
    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'estimators': n_estimators,
            'train_time': run_time, 'status': STATUS_OK}
Пример #18
0
    def test_fpreproc(self):
        """Verify lgb.cv applies a user fpreproc: the hook edits data,
        labels and params per fold, and the resulting CV history must
        reflect the modified params (multi_logloss over 10 rounds)."""
        def preprocess_data(dtrain, dtest, params):
            # Materialize the raw arrays, shift the first column, and
            # relabel the last five rows of each split to a new class 3.
            train_data = dtrain.construct().get_data()
            test_data = dtest.construct().get_data()
            train_data[:, 0] += 1
            test_data[:, 0] += 1
            dtrain.label[-5:] = 3
            dtest.label[-5:] = 3
            dtrain = lgb.Dataset(train_data, dtrain.label)
            dtest = lgb.Dataset(test_data, dtest.label, reference=dtrain)
            # The extra class introduced above requires num_class = 4.
            params['num_class'] = 4
            return dtrain, dtest, params

        X, y = load_iris(True)
        # free_raw_data=False keeps the numpy data accessible to fpreproc.
        dataset = lgb.Dataset(X, y, free_raw_data=False)
        params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
        results = lgb.cv(params,
                         dataset,
                         num_boost_round=10,
                         fpreproc=preprocess_data)
        self.assertIn('multi_logloss-mean', results)
        self.assertEqual(len(results['multi_logloss-mean']), 10)
Пример #19
0
def lgb_eval(num_leaves,  max_depth, lambda_l2,lambda_l1, min_child_samples, bagging_fraction,
             feature_fraction, min_child_weight):
    """Bayesian-optimization target: 3-fold CV AUC for one parameter draw.

    Integer-valued hyperparameters arrive as floats from the optimizer and
    are cast to int. Returns the last mean AUC of the CV run.
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        "num_leaves": int(num_leaves),
        "max_depth": int(max_depth),
        "lambda_l2": lambda_l2,
        "lambda_l1": lambda_l1,
        "num_threads": 32,
        "min_child_samples": int(min_child_samples),
        "min_child_weight": min_child_weight,
        "learning_rate": 0.05,
        "bagging_fraction": bagging_fraction,
        "feature_fraction": feature_fraction,
        "seed": 2020,
        "verbosity": -1
    }
    # NOTE(review): `train_X` and `train_set` are module-level globals;
    # presumably train_set.label holds the targets aligned with train_X —
    # confirm at the call site.
    train_df = lgb.Dataset(train_X, train_set.label)
    scores = lgb.cv(params, train_df, num_boost_round=1000, early_stopping_rounds=50, verbose_eval=False,
                     nfold=3)['auc-mean'][-1]
    return scores
def test_one_param(df, label, params):
    """For every outer KFold split, choose num_iterations via lgb.cv, then
    train a model validated on the held-out fold.

    Returns the list of chosen iteration counts, one per split. Note that
    `params` is mutated in place ('num_iterations' key).
    """
    iterations = []
    splitter = KFold(n_splits=5, shuffle=True, random_state=0)

    for tr_idx, te_idx in splitter.split(df):
        x_tr = df.loc[tr_idx, :]
        x_te = df.loc[te_idx, :]
        y_tr = label.loc[tr_idx, :]
        y_te = label.loc[te_idx, :]

        train_ds = lgb.Dataset(x_tr,
                               y_tr["label"].values,
                               silent=True,
                               weight=y_tr["weight"].values)
        valid_ds = lgb.Dataset(x_te,
                               y_te["label"].values,
                               silent=True,
                               weight=y_te["weight"].values)

        # Inner 5-fold CV with early stopping picks the round count.
        cv_hist = lgb.cv(params,
                         train_ds,
                         num_boost_round=1000,
                         nfold=5,
                         stratified=False,
                         shuffle=True,
                         early_stopping_rounds=50,
                         seed=0)

        print(cv_hist)
        params['num_iterations'] = len(cv_hist['multi_logloss-mean'])
        iterations.append(params['num_iterations'])

        bst = lgb.train(
            params,
            train_ds,
            valid_sets=valid_ds,
        )

    return iterations
Пример #21
0
def lgbm_cv(y, lgtrain):
    """Cross-validate a multiclass LightGBM model and report the optimal
    boosting round with its score.

    Returns (params dict, optimal round index, best CV logloss).
    """
    print("Light Gradient Boosting Classifier: ")
    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': len(set(y)),
        'metric': ['multi_logloss'],
        "learning_rate": 0.05,
        "num_leaves": 80,
        "max_depth": 6,
        "feature_fraction": 0.70,
        "bagging_fraction": 0.75,
        "reg_alpha": 0.15,
        "reg_lambda": 0.15,
        "min_child_weight": 0,
        "verbose": 0
    }

    modelstart = time.time()
    # Stratified 5-fold CV with early stopping to find the best round.
    history = lgb.cv(params=lgbm_params,
                     train_set=lgtrain,
                     num_boost_round=2000,
                     stratified=True,
                     nfold=5,
                     verbose_eval=100,
                     seed=23,
                     early_stopping_rounds=75)

    loss = lgbm_params["metric"][0]
    mean_curve = history[str(loss) + '-mean']
    optimal_rounds = np.argmin(mean_curve)
    best_cv_score = min(mean_curve)

    print("\nOptimal Round: {}\nOptimal Score: {} + {}".format(
        optimal_rounds, best_cv_score,
        history[str(loss) + '-stdv'][optimal_rounds]))

    return lgbm_params, optimal_rounds, best_cv_score
Пример #22
0
    def cv(self, df_train):
        """Run 5-fold stratified LightGBM CV on the cached training Dataset.

        Lazily builds the Dataset via init_cv() on first use, prints the
        result summary, and returns the lgb.cv history dict.
        """
        # Fix: `is None` is the correct identity test; `== None` delegates
        # to __eq__ and can misbehave on objects that override it.
        if self.d_train is None:
            self.init_cv(df_train)

        params = copy.deepcopy(self.params)
        cv_result = lgb.cv(params, self.d_train, num_boost_round=3000, nfold=5, stratified=True,
                           shuffle=True, init_model=None, feature_name='auto',
                           categorical_feature='auto', early_stopping_rounds=300,
                           fpreproc=None, verbose_eval=True, show_stdv=True, seed=1234, callbacks=None)

        self.print_cv_result(cv_result)
        # best results shown as follow, finally, choose max_depth=4, leavs=16, lr=0.02, ff=0.95

        # best 6, 48, 555, lr=0.02 0.193635
        # best 6, 32, 564, lr=0.02 0.193498
        # best 4, 32 or 16, 1034, lr=0.02 0.193194
        # best 4, 32 or 16, 1016, ff=0.95 lr=0.02 0.193161
        # best 4, 16, 982, ff=0.95, lr=0.02, full candidates, features 0.191541
        # best 4, 16, 1100, ff=0.9, lr=0.02, full candidates, features 0.191541

        # best 4, 16, 843, ff=0.6, lr=0.02, full + extra user_cat_20 detail features, 0.19145244312795784
        return cv_result
Пример #23
0
def LGB(train, test):
    """Train a binary LightGBM model on `train` (round count chosen by CV)
    and write predictions for `test` to the versioned result CSV."""
    train_x = train.drop(['orderType', 'userid'], axis=1)
    train_y = train.orderType.values

    print(train_x.shape)
    print(len(train_y))

    import lightgbm as lgb

    param = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_sum_hessian_in_leaf': 0.1,
        'learning_rate': 0.01,
        'verbosity': 2,
        'tree_learner': 'feature',
        'num_leaves': 128,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 1,
        'num_threads': 16,
    }

    dtrain = lgb.Dataset(train_x, label=train_y)
    # CV with early stopping; the history length is the chosen round count.
    cv_hist = lgb.cv(param,
                     dtrain,
                     5500,
                     nfold=5,
                     early_stopping_rounds=100,
                     verbose_eval=20)
    n_rounds = len(cv_hist['auc-mean'])
    model = lgb.train(param, dtrain, n_rounds, valid_sets=[dtrain], verbose_eval=500)
    test_x = test[train_x.columns.values]
    test['orderType'] = model.predict(test_x)
    test[['userid', 'orderType'
          ]].to_csv('../result/lgb_baseline' + str(version) + '.csv',
                    index=False)
Пример #24
0
def lgb_modelling(train, test, option_params, ID, y):
    """Grid-search LightGBM hyperparameters by CV, retrain with the best
    set, and return (test predictions, best CV score, trained booster).

    Bug fix: the negative-value clamp previously used `==` (a no-op
    comparison whose result was discarded) instead of `=` (assignment).
    """
    # Convert to LightGBM datasets.
    X_train, train_lgb = make_dataset(train, ID, y)
    X_test, _ = make_dataset(test, ID, y)

    # Build the hyperparameter grid.
    params = list(ParameterGrid(option_params))

    # Grid search: keep the parameter set with the lowest CV metric.
    min_score = np.inf
    for param in tqdm(params):
        result = lgb.cv(params=param,
                        num_boost_round=1000,
                        train_set=train_lgb,
                        early_stopping_rounds=10,
                        stratified=True,
                        verbose_eval=False)

        metric = param['metric']
        score = min(result[metric + '-mean'])
        print(param, score)
        if min_score > score:
            min_score = score
            # Rounds actually run = length of the metric history.
            min_score_iteration = len(result[metric + '-mean'])
            best_hyper_param = param

    # Retrain on the full training set with the best hyperparameters.
    bst = lgb.train(params=best_hyper_param,
                    num_boost_round=min_score_iteration,
                    train_set=train_lgb,
                    verbose_eval=False)

    # Predict on the test data; np.exp presumably inverts a log-transformed
    # target — confirm against make_dataset.
    pred_test = bst.predict(X_test)
    pred_test = np.exp(pred_test)
    pred_test[pred_test < 0] = 0  # clamp negatives (was `==`, a no-op)

    return pred_test, min_score, bst
Пример #25
0
        def objective(space_params):
            """Hyperopt objective: evaluate one sampled LightGBM parameter
            set with cross-validation and return its loss (last l1 mean).
            """
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])

            #extract nested conditional parameters
            if space_params['boosting']['boosting'] == 'goss':
                top_rate = space_params['boosting'].get('top_rate')
                other_rate = space_params['boosting'].get('other_rate')
                #0 <= top_rate + other_rate <= 1
                top_rate = max(top_rate, 0)
                top_rate = min(top_rate, 0.5)
                other_rate = max(other_rate, 0)
                other_rate = min(other_rate, 0.5)
                space_params['top_rate'] = top_rate
                space_params['other_rate'] = other_rate

            # Flatten the nested 'boosting' dict and pull its conditional
            # 'subsample' (defaults to 1.0 when absent).
            subsample = space_params['boosting'].get('subsample', 1.0)
            space_params['boosting'] = space_params['boosting']['boosting']
            space_params['subsample'] = subsample
            space_params['feature_pre_filter'] = False

            # `data` and `labels` come from the enclosing scope.
            train = lgb.Dataset(data, labels)

            #for classification, set stratified=True and metrics=EVAL_METRIC_LGBM_CLASS
            cv_results = lgb.cv(space_params,
                                train,
                                nfold=N_FOLDS,
                                stratified=False,
                                early_stopping_rounds=100,
                                metrics=EVAL_METRIC_LGBM_REG,
                                seed=42)

            best_loss = cv_results['l1-mean'][-1]  #'l2-mean' for rmse
            #for classification, comment out the line above and uncomment the line below:
            #best_loss = 1 - cv_results['auc-mean'][-1]
            #if necessary, replace 'auc-mean' with '[your-preferred-metric]-mean'
            return {'loss': best_loss, 'status': STATUS_OK}
Пример #26
0
    def cv(self, X=None, y=None, k_fold=5, dataset_train=None):
        """Run k-fold LightGBM CV and return the best (last) mean score.

        Either pass a pre-built `dataset_train` or raw (X, y), from which a
        Dataset is generated. The configured eval metric is routed through
        `feval` when it is a callable, or through params['metric'] when it
        is a string.
        """
        logger.warning(
            "Warning: Running GBM cross-validation. This is currently unstable."
        )
        try_import_lightgbm()
        import lightgbm as lgb
        if dataset_train is None:
            dataset_train, _ = self.generate_datasets(X_train=X, Y_train=y)
        gc.collect()
        params = copy.deepcopy(self.params)
        eval_metric = self.get_eval_metric()
        # TODO: Either edit lgb.cv to return models / oof preds or make custom implementation!
        cv_params = {
            'params': params,
            'train_set': dataset_train,
            'num_boost_round': self.num_boost_round,
            'nfold': k_fold,
            'early_stopping_rounds': 150,
            'verbose_eval': 1000,
            'seed': 0,
        }
        # A non-string metric is a custom eval function; disable the builtin
        # metric so only feval is reported.
        if type(eval_metric) != str:
            cv_params['feval'] = eval_metric
            cv_params['params']['metric'] = 'None'
        else:
            cv_params['params']['metric'] = eval_metric
        # Stratification is undefined for continuous targets.
        if self.problem_type == REGRESSION:
            cv_params['stratified'] = False

        logger.log(15, 'Current parameters:')
        logger.log(15, params)
        eval_hist = lgb.cv(
            **cv_params
        )  # TODO: Try to use customer early stopper to enable dart
        # Last entry of the history = score at the final round reached.
        best_score = eval_hist[self.eval_metric_name + '-mean'][-1]
        logger.log(15, 'Best num_boost_round: %s ',
                   len(eval_hist[self.eval_metric_name + '-mean']))
        logger.log(15, 'Best CV score: %s' % best_score)
        return best_score
Пример #27
0
    def adj_min_child_weight(self,
                             seed=66,
                             nfold=5,
                             early_stopping_rounds=100):
        """Tune min_child_weight by LightGBM CV over a fixed candidate set,
        keeping whichever value achieves the lowest best l1 mean."""
        best_params = {}
        train_ds = lgb.Dataset(self.df_train[self.best_feature].values,
                               self.df_train[self.label_name].values)
        for candidate in [1, 3, 5, 7]:
            self.params['min_child_weight'] = candidate
            history = lgb.cv(self.params,
                             train_ds,
                             seed=seed,
                             nfold=nfold,
                             num_boost_round=self.num_boost_round,
                             early_stopping_rounds=early_stopping_rounds,
                             verbose_eval=0)
            best_l1 = pd.Series(history['l1-mean']).min()
            if best_l1 <= self.min_merror:
                self.min_merror = best_l1
                best_params['min_child_weight'] = candidate
        self.params.update(best_params)
Пример #28
0
def score_feature_selection(df=None,
                            train_features=None,
                            cat_feats=None,
                            target=None):
    """Evaluate a feature subset with 5-fold LightGBM cross-validation.

    Returns the final mean AUC and its standard deviation across folds.
    """
    # Reusable dataset over the candidate feature columns.
    dtrain = lgb.Dataset(df[train_features],
                         target,
                         free_raw_data=False,
                         silent=True)
    # Fixed, lightly regularised binary GBDT configuration.
    lgb_params = dict(objective='binary',
                      boosting_type='gbdt',
                      learning_rate=0.1,
                      subsample=0.8,
                      colsample_bytree=0.8,
                      num_leaves=31,
                      max_depth=-1,
                      seed=13,
                      n_jobs=4,
                      min_split_gain=0.00001,
                      reg_alpha=0.00001,
                      reg_lambda=0.00001,
                      metric='auc')

    # Stratified 5-fold CV with early stopping.
    hist = lgb.cv(params=lgb_params,
                  train_set=dtrain,
                  num_boost_round=2000,
                  categorical_feature=cat_feats,
                  nfold=5,
                  stratified=True,
                  shuffle=True,
                  early_stopping_rounds=50,
                  verbose_eval=0,
                  seed=17)
    # Last mean / std values of the AUC curve.
    return hist['auc-mean'][-1], hist['auc-stdv'][-1]
Пример #29
0
    def num_estimators(self,
                       X,
                       y,
                       n_folds=3,
                       learning_rate=0.3,
                       n_estimators=500,
                       early_stop=20,
                       **kwargs):
        """Find a good number of boosting rounds via early stopping.

        Uses a (deliberately high) learning rate and LightGBM CV; the
        early-stopped round count is stored in ``self.params``.

        Parameters
        ----------
        X, y : training features and target.
        n_folds : number of CV folds.
        learning_rate : boosting learning rate used for the search.
        n_estimators : maximum number of boosting rounds to try.
        early_stop : rounds without improvement before stopping.
        kwargs : overrides for keys already present in ``params``.

        Returns ``self`` for chaining.
        """
        params = {}
        params['learning_rate'] = learning_rate

        # Overwrite if something is passed as kwargs (only keys that
        # already exist in params are considered, as before).
        for k, v in params.items():
            if k in kwargs.keys(): params[k] = kwargs[k]

        params['verbose'] = -1
        dtrain = lgb.Dataset(X,
                             label=y,
                             feature_name=self.feature_names,
                             categorical_feature=self.categorical_feat,
                             free_raw_data=False)
        self.data = dtrain
        # BUG FIX: honour the n_folds and early_stop arguments; the
        # original hard-coded nfold=3 and early_stopping_rounds=20,
        # silently ignoring what the caller passed.
        cv_result = lgb.cv(params,
                           dtrain,
                           nfold=n_folds,
                           metrics=self.metric,
                           num_boost_round=n_estimators,
                           early_stopping_rounds=early_stop,
                           stratified=False)
        self.params['n_estimators'] = len(cv_result[self.metric + '-mean'])
        self.params['learning_rate'] = learning_rate

        return self
Пример #30
0
def sample(dtrain, dtest):
    """Template workflow: CV to pick the round count, train, then predict.

    NOTE: ``predictors``/``dummies``/``target`` are placeholders that
    must be filled in before this function is usable.
    """
    predictors =[]
    dummies = []
    target = None
    # 01.    train    set and test    set
    train_data = lgb.Dataset(dtrain[predictors], label=dtrain[target], feature_name=list(dtrain[predictors].columns),
                             categorical_feature=dummies)

    test_data = lgb.Dataset(dtest[predictors], label=dtest[target], feature_name=list(dtest[predictors].columns),
                            categorical_feature=dummies)

    # // 02.    parameters
    param = {
        # num_leaves = 2 ** max_depth
        'max_depth': 6,
        'num_leaves': 64,
        'learning_rate': 0.03,
        'scale_pos_weight': 1,
        'num_threads': 40,
        'objective': 'binary',
        # Bagging: bagging_fraction + bagging_freq (must be set together), feature_fraction.
        'bagging_fraction': 0.7,
        'bagging_freq': 1,
        'min_sum_hessian_in_leaf': 100
    }

    # Imbalanced class distribution.
    # BUG FIX: use a real boolean instead of the string 'True'.
    param['is_unbalance'] = True
    param['metric'] = 'auc'

    # // 03.    cv and train
    # BUG FIX: 'stratified' is an lgb.cv keyword argument, not a model
    # parameter — passing it inside `param` had no effect.
    bst = lgb.cv(param, train_data, num_boost_round=1000, nfold=3,
                 early_stopping_rounds=30, stratified=True)

    estimators = lgb.train(param, train_data, num_boost_round=len(bst['auc-mean']))

    # // 04.    test    predict
    ypred = estimators.predict(dtest[predictors])
def lgbm_eval(num_leaves, colsample_bytree, subsample, max_depth, reg_alpha,
              reg_lambda, min_split_gain, min_child_weight, min_data_in_leaf):
    """Bayesian-optimisation objective: CV RMSE for one hyperparameter set.

    Casts/clips the optimiser's continuous suggestions to valid LightGBM
    values and returns the negated final mean RMSE (the optimiser maximises).
    """
    params = dict()
    params["learning_rate"] = 0.01
    #    params["silent"] = True
    params['device'] = 'gpu'
    #    params["nthread"] = 16
    params['objective'] = 'regression'
    # BUG FIX: the original line ended with a stray comma, which made the
    # seed a one-element tuple (326,) instead of the int 326.
    params['seed'] = 326

    # Integer params must be cast; fractions clipped into [0, 1];
    # regularisation terms clipped to be non-negative.
    params["num_leaves"] = int(num_leaves)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['subsample'] = max(min(subsample, 1), 0)
    params['max_depth'] = int(max_depth)
    params['reg_alpha'] = max(reg_alpha, 0)
    params['reg_lambda'] = max(reg_lambda, 0)
    params['min_split_gain'] = min_split_gain
    params['min_child_weight'] = min_child_weight
    params['min_data_in_leaf'] = int(min_data_in_leaf)
    params['verbose'] = -1

    folds = get_folds(df=TRAIN_DF['totals.pageviews_MEAN'].reset_index(),
                      n_splits=NUM_FOLDS)

    clf = lightgbm.cv(
        params=params,
        train_set=lgbm_train,
        metrics=['rmse'],
        nfold=NUM_FOLDS,
        folds=folds,
        num_boost_round=10000,  # large cap: early stopping decides the real count
        early_stopping_rounds=200,
        verbose_eval=100,
        seed=47,
    )
    gc.collect()
    return -clf['rmse-mean'][-1]
Пример #32
0
def objective(params):
    """Hyperopt objective: minimise 5-fold CV RMSE of a LightGBM model.

    Tracks the best score seen so far in the global ``new_max`` and
    appends every improvement to ``cv_lightgbm.txt``.
    """
    global cnt, new_max
    # BUG FIX: store real numbers instead of '{:.3f}'-formatted strings;
    # rounding to 3 decimals preserves the original precision intent
    # while keeping the values usable as floats downstream.
    params = {
        'num_leaves': int(params['num_leaves']),
        'max_bin': int(params['max_bin']),
        'colsample_bytree': round(float(params['colsample_bytree']), 3),
        'learning_rate': round(float(params['learning_rate']), 3),
        'lambda_l2': int(params['lambda_l2']),
    }
    cv_data = lg.cv(params,
                    data,
                    num_boost_round=4000,
                    nfold=5,
                    seed=2332,
                    stratified=False,
                    early_stopping_rounds=5,
                    metrics='rmse')
    score = cv_data['rmse-mean'][-1]

    # saving score to a file
    if score < new_max:
        new_max = score
        print("############### Score: {0}".format(score))
        print("############### Prms: ", params)
        print('..........................')
        with open("cv_lightgbm.txt", "a") as myfile:
            myfile.write(f'''
            ############### Score: {cnt}
            ############### Score: {score}
            ############### Prms:{params}
            \n
            ''')
    cnt += 1
    return {
        'loss': score,
        'status': STATUS_OK,
        'eval_time': time.time(),
    }
Пример #33
0
def lgbfit(alg,
           dtrain,
           predictors,
           useTrainCV=True,
           cv_folds=5,
           early_stopping_rounds=50,
           dtest=None):
    """Optionally tune n_estimators via CV, then fit ``alg`` and report
    train-set accuracy/AUC and the elapsed time.

    Parameters
    ----------
    alg : LightGBM sklearn-style estimator.
    dtrain : DataFrame containing ``predictors`` and a 'target' column.
    predictors : feature column names.
    useTrainCV : when True, set n_estimators from early-stopped CV.
    cv_folds : number of CV folds.
    early_stopping_rounds : CV early-stopping patience.
    dtest : unused; kept for interface compatibility.
    """
    starttime = datetime.datetime.now()
    if useTrainCV:
        lgb_param = alg.get_params()
        ltrain = lgb.Dataset(dtrain[predictors].values,
                             label=dtrain['target'].values)
        #        ltest = lgb.Dataset(dtest[predictors].values)
        cvresult = lgb.cv(lgb_param,
                          ltrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=False,
                          metrics='auc')
        # Use the early-stopped round count as the final n_estimators.
        alg.set_params(n_estimators=len(cvresult['auc-mean']))
        print("cv score:", cvresult['auc-mean'][-1])

    #fit
    alg.fit(dtrain[predictors], dtrain['target'], eval_metric='auc')

    #prediction on train set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    endtime = datetime.datetime.now()

    #output
    print("accuracy: ",
          metrics.accuracy_score(dtrain['target'].values, dtrain_predictions))
    print("AUC score:", metrics.roc_auc_score(dtrain['target'],
                                              dtrain_predprob))
    # BUG FIX: timedelta.seconds is only the within-day remainder; use
    # total_seconds() so runs longer than 24 hours report correctly.
    print("time spent: ", (endtime - starttime).total_seconds(), "s")
def objective(hyperparameters, iteration):
    """Cross-validation objective for grid and random search.

    Returns ``[cv AUC, hyperparameters (with n_estimators filled in),
    iteration]``.
    """
    # n_estimators is determined by early stopping, so drop any preset.
    hyperparameters.pop('n_estimators', None)

    # n-fold cross validation with early stopping.
    cv_results = lgb.cv(hyperparameters,
                        train_set,
                        num_boost_round=10000,
                        nfold=N_FOLDS,
                        early_stopping_rounds=100,
                        metrics='auc',
                        seed=42)

    # The curve length is the early-stopped round count; its last entry
    # is the final CV score.
    auc_history = cv_results['auc-mean']
    hyperparameters['n_estimators'] = len(auc_history)

    return [auc_history[-1], hyperparameters, iteration]
Пример #35
0
def train_model(train_df, train_y, test_df):
    """Train a LightGBM model, predict the test set, print feature
    importances, and compute a CV score with the RMSE metric.

    Returns ``(model, test predictions, cv score dict)``.
    """
    print("Training on: {}".format(train_df.shape, train_y.shape))

    dataset = lgb.Dataset(train_df, train_y)
    booster = lgb.train(params, dataset, num_boost_round=ROUNDS)
    predictions = booster.predict(test_df)

    print("Features importance...")
    gain_importance = booster.feature_importance('gain')
    importance_df = pd.DataFrame({
        'feature': booster.feature_name(),
        'split': booster.feature_importance('split'),
        'gain': 100 * gain_importance / gain_importance.sum()
    }).sort_values('gain', ascending=False)
    print(importance_df.head(25))

    cv_score = lgb.cv(params, dataset, metrics="l2_root")
    '''
    plt.figure()
    ft[['feature','gain']].head(25).plot(kind='barh', x='feature', y='gain', legend=False, figsize=(10, 20))
    plt.gcf().savefig('features_importance.png')
    '''
    return booster, predictions, cv_score
Пример #36
0
Файл: lgb2.py Проект: zyjcs/test
 def cv(self,
        X,
        y,
        nfold=5,
        num_round=8000,
        early_stopping=10,
        verbose=True,
        params=None):
     """Run LightGBM cross-validation with this model's parameters.

     Parameters
     ----------
     X, y : training features and labels.
     nfold : number of CV folds.
     num_round : maximum boosting rounds.
     early_stopping : early-stopping patience in rounds.
     verbose : forwarded to ``verbose_eval``.
     params : optional overrides merged on top of ``self.params``.

     Returns the ``lgb.cv`` history dict.
     """
     # BUG FIX: the original used a mutable default argument (params={})
     # and aliased self.params directly, so per-call overrides mutated
     # the model's own parameter dict. Work on a copy instead.
     trainParam = dict(self.params)
     trainParam.update(params or {})
     trainData = lgb.Dataset(X,
                             label=y,
                             feature_name=self.feaName,
                             categorical_feature=self.cateFea)
     result = lgb.cv(trainParam,
                     trainData,
                     feature_name=self.feaName,
                     categorical_feature=self.cateFea,
                     num_boost_round=num_round,
                     nfold=nfold,
                     early_stopping_rounds=early_stopping,
                     verbose_eval=verbose)
     return result
Пример #37
0
    def adj_fraction(self, seed=66, nfold=5, early_stopping_rounds=100):
        """Grid-search feature_fraction / bagging_fraction via CV.

        For each pair of fractions, runs lgb.cv and keeps the pair whose
        best mean l1 score improves on ``self.min_merror``; the winning
        values (if any) are merged into ``self.params``.
        """
        best_params = {}
        lgb_train = lgb.Dataset(self.df_train[self.best_feature].values,
                                self.df_train[self.label_name].values)
        for feature_fraction in [0.6, 0.7, 0.8, 0.9, 1]:
            for bagging_fraction in [0.6, 0.7, 0.8, 0.9, 1]:
                self.params['feature_fraction'] = feature_fraction
                self.params['bagging_fraction'] = bagging_fraction
                cv_results = lgb.cv(
                    self.params,
                    lgb_train,
                    seed=seed,
                    nfold=nfold,
                    num_boost_round=self.num_boost_round,
                    early_stopping_rounds=early_stopping_rounds,
                    verbose_eval=0)
                # Best (minimum) mean l1 along the CV learning curve.
                mean_merror = pd.Series(cv_results['l1-mean']).min()

                if mean_merror <= self.min_merror:
                    self.min_merror = mean_merror
                    best_params['feature_fraction'] = feature_fraction
                    best_params['bagging_fraction'] = bagging_fraction
        self.params.update(best_params)
Пример #38
0
    def adj_bin_leafdata(self, seed=66, nfold=5, early_stopping_rounds=100):
        """Grid-search max_bin / min_data_in_leaf via CV.

        For each (max_bin, min_data_in_leaf) pair, runs lgb.cv and keeps
        the pair whose best mean l1 score improves on
        ``self.min_merror``; winning values (if any) are merged into
        ``self.params``.
        """
        best_params = {}
        lgb_train = lgb.Dataset(self.df_train[self.best_feature].values,
                                self.df_train[self.label_name].values)
        for max_bin in range(100, 255, 10):
            for min_data_in_leaf in range(10, 200, 10):
                self.params['max_bin'] = max_bin
                self.params['min_data_in_leaf'] = min_data_in_leaf
                cv_results = lgb.cv(
                    self.params,
                    lgb_train,
                    seed=seed,
                    nfold=nfold,
                    num_boost_round=self.num_boost_round,
                    early_stopping_rounds=early_stopping_rounds,
                    verbose_eval=0)
                # Best (minimum) mean l1 along the CV learning curve.
                mean_merror = pd.Series(cv_results['l1-mean']).min()

                if mean_merror <= self.min_merror:
                    self.min_merror = mean_merror
                    best_params['max_bin'] = max_bin
                    best_params['min_data_in_leaf'] = min_data_in_leaf
        self.params.update(best_params)
Пример #39
0
 def test_cv(self):
     """Smoke-test lgb.cv with a reset_parameter callback that decays
     the learning rate by 0.001 each boosting round."""
     lgb_train, _ = template.test_template(return_data=True)
     lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5,
            metrics='l1', verbose_eval=False,
            callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
Пример #40
0
     'learning_rate': learning_rate,
     'max_depth': max_depth,
     'metric':'auc',
     'min_data_in_leaf': min_data_in_leaf,
     'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
     'num_leaves': num_leaves,
     'n_jobs': 30,
     'tree_learner': 'serial',
     'objective': 'binary',
     'verbosity': -1
 }
 print(">>>>",i,"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
 print(param)
 model = lgb.cv(param, trn_data, 1000000,
            feature_name=X.columns.tolist(),
            verbose_eval=500,
            early_stopping_rounds = 4000,
            nfold=4)
 num_leaves_list.append(num_leaves)
 learning_rate_list.append(learning_rate)
 feature_fraction_list.append(feature_fraction)
 bagging_fraction_list.append(bagging_fraction)
 #max_bin_list.append(max_bin)
 scale_pos_weight_list.append(scale_pos_weight)
 max_depth_list.append(max_depth)
 bagging_freq_list.append(bagging_freq)
 min_data_in_leaf_list.append(min_data_in_leaf)
 min_sum_hessian_in_leaf_list.append(min_sum_hessian_in_leaf)
 nround_p.append(len(model['auc-mean']))
 auc_cv_mean.append(model['auc-mean'][len(model['auc-mean'])-1])
 auc_cv_std.append(model['auc-stdv'][len(model['auc-mean'])-1])
Пример #41
0
def run_cv():
    """End-to-end CV pipeline for multi-label comment classification.

    Builds word + char TF-IDF features, finds the best boosting-round
    count per label via lgb.cv, trains out-of-fold models, and writes
    per-fold validation predictions plus the K-fold-averaged test
    submission.
    """
    # read train/test data
    df_train = read_train_data()
    X_train = df_train[COMMENT_COL].values
    id_train = df_train[ID_COL].values.tolist()
    y_true = df_train[label_candidates].values

    df_test = read_test_data()
    X_test = df_test[COMMENT_COL].values
    id_test = df_test[ID_COL].values.tolist()

    # extract n-gram word level feature (vectorizer fit on train+test text)
    print('Extracting n-gram features')
    extractor_word = get_extractor_word()
    extractor_word.fit(pd.concat((df_train.loc[:, COMMENT_COL], df_test.loc[:, COMMENT_COL])))
    X_train_word = conduct_transform(extractor_word, X_train)
    X_test_word = conduct_transform(extractor_word, X_test)
    print('Train word data shape : {0}'.format(X_train_word.shape))
    print('Test word data shape : {0}'.format(X_test_word.shape))

    # extract n-gram char level feature
    extractor_char = get_extractor_char()
    extractor_char.fit(pd.concat((df_train.loc[:, COMMENT_COL], df_test.loc[:, COMMENT_COL])))
    X_train_char = conduct_transform(extractor_char, X_train)
    X_test_char = conduct_transform(extractor_char, X_test)
    print('Train char data shape : {0}'.format(X_train_char.shape))
    print('Test char data shape : {0}'.format(X_test_char.shape))

    # combine word and char features; CSR format for fast row slicing
    X_train_word_char = hstack([X_train_word, X_train_char])
    X_test_word_char = hstack([X_test_word, X_test_char])
    X_train_word_char = X_train_word_char.tocsr()
    X_test_word_char = X_test_word_char.tocsr()
    print('Train word char data shape : {0}'.format(X_train_word_char.shape))
    print('Test word char data shape : {0}'.format(X_test_word_char.shape))

    # get idx of trn/val for each fold
    print('Getting array index of train/valid for each fold')
    idx_trn_val = get_idx_trn_val(id_train)

    # preds on test/valid
    preds_test = np.zeros((X_test_word_char.shape[0], n_classes))
    preds_valid = np.zeros((X_train_word_char.shape[0], n_classes))

    # cv and train/predict, one binary model per label column
    for label_col, label_name in enumerate(label_candidates):
        print('\nlabel column : {0}'.format(label_col))
        print('label name : {0}'.format(label_name))
        # cv best boost rounds
        train_set = lgbm.Dataset(data=X_train_word_char, label=y_true[:, label_col])
        params = get_gbdt_params(label_name)
        print('lgbm params : {0}'.format(params))
        hist = lgbm.cv(
            params=params,
            train_set=train_set,
            folds=idx_trn_val,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            metrics=['auc'],
            verbose_eval=verbose_eval,
        )
        bst_boost_rounds = np.argmax(hist['auc-mean']) + 1
        bst_acc_score = np.max(hist['auc-mean'])
        print('label column : {0}'.format(label_col))
        print('label name : {0}'.format(label_name))
        print('best boost rounds {0}, best auc score {1}'.format(bst_boost_rounds, bst_acc_score))

        # oof train and predict
        for fold, (idx_trn, idx_val) in enumerate(idx_trn_val):
            print('fold {0}'.format(fold))
            print('    train')
            dat_trn = lgbm.Dataset(data=X_train_word_char[idx_trn, :], label=y_true[idx_trn, label_col])
            model = lgbm.train(
                params=params,
                train_set=dat_trn,
                num_boost_round=bst_boost_rounds,
                verbose_eval=verbose_eval,
            )
            print('    predict')
            preds_valid[idx_val, label_col] = model.predict(
                data=X_train_word_char[idx_val,:], num_iteration=bst_boost_rounds)
            # BUG FIX: accumulate (+=) so that dividing each fold by K
            # yields the K-fold average; plain assignment kept only the
            # last fold's scaled-down predictions.
            preds_test[:,label_col] += model.predict(data=X_test_word_char, num_iteration=bst_boost_rounds) / K
            del model

    # ensemble cv score
    score = roc_auc_score(y_true=y_true, y_score=preds_valid, average='macro')
    print('\ncv score : {0}'.format(score))
    # divide data for ensemble
    for fold, (_, idx_val) in enumerate(idx_trn_val):
        preds_valid_fold = preds_valid[idx_val,:].T
        df_preds_val = pd.DataFrame()
        idx_val_set = set(idx_val)
        df_preds_val[ID_COL] = [ id for idx, id in enumerate(id_train) if idx in idx_val_set ]
        for idx, label in enumerate(label_candidates):
            df_preds_val[label] = preds_valid_fold[idx]
        df_preds_val.to_csv('../data/output/preds/gbdt/{0}fold_valid.csv'.format(fold), index=False)

    # record ensemble result
    preds_test = preds_test.T
    df_preds_test = pd.DataFrame()
    df_preds_test[ID_COL] = id_test
    for idx, label in enumerate(label_candidates):
        df_preds_test[label] = preds_test[idx]
    df_preds_test.to_csv('../data/output/preds/gbdt/avg_submit.csv', index=False)
Пример #42
0
             'metric': 'rmse',
             'num_leaves': num_leaves,
             'learning_rate': learning_rate,
             'feature_fraction': feature_fraction,
             'bagging_fraction': bagging_fraction,
             'max_bin': max_bin,
             'max_depth': max_depth,
             'bagging_freq': bagging_freq,
             'verbose': 0
         }
 print(">>>>",i,"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
 print(params)
 model = lgb.cv(params,
                lgb_train,
                num_boost_round=100000,
                early_stopping_rounds=50,
                feature_name=all_data.columns.tolist(),
                nfold=4,
                stratified=False)
 num_leaves_list.append(num_leaves)
 learning_rate_list.append(learning_rate)
 feature_fraction_list.append(feature_fraction)
 bagging_fraction_list.append(bagging_fraction)
 max_bin_list.append(max_bin)
 max_depth_list.append(max_depth)
 bagging_freq_list.append(bagging_freq)
 nround_p.append(len(model['rmse-mean']))
 rmse_cv_mean.append(model['rmse-mean'][len(model['rmse-mean'])-1])
 rmse_cv_std.append(model['rmse-stdv'][len(model['rmse-mean'])-1])
 i = i + 1
 grid = pd.DataFrame({
Пример #43
0
## Training
num_round = 10
bst = lgb.train(param, train_data, num_round, valid_sets=[test_data])
bst = lgb.train(param, train_data, num_round)
bst.save_model('tmp.model')
# The trained model can also be dumped to JSON format:
json_model = bst.dump_model()
print(json_model)  # very long output...
##################################################################
## Booster
bst = lgb.Booster(model_file='tmp.model')  # the saved model file is passed as a parameter here
##################################################################
## CV
# Training with 5-fold CV:
num_round = 10
lgb.cv(param, train_data, num_round, nfold=5)
##################################################################
## Early Stopping
# If you have a validation set, you can use early stopping to find the optimal number of boosting rounds.
# Early stopping requires at least one set in valid_sets. If there is more than one, it will use all of them:
bst = lgb.train(param, train_data, num_round, valid_sets=valid_sets, early_stopping_rounds=10)
bst.save_model('model.txt', num_iteration=bst.best_iteration)
# The model will train until the validation score stops improving.
# Validation error needs to improve at least every early_stopping_rounds to continue training.

# If early stopping occurs, the model will have an additional field: bst.best_iteration.
# Note that train() will return a model from the last iteration, not the best one.
# And you can set num_iteration=bst.best_iteration when saving model.

# This works with both metrics to minimize (L2, log loss, etc.) and to maximize (NDCG, AUC).
# Note that if you specify more than one evaluation metric, all of them will be used for early stopping.
Пример #44
0
    preds_test = np.zeros((test_submit.shape[0], n_classes))
    preds_valid = np.zeros((y_true.shape[0], n_classes))

    params = get_gbdt_params()

    for label_col, label_name in enumerate(label_candidates):
        print('label column : {0}'.format(label_col))
        print('label name : {0}'.format(label_name))
        # cv best boost rounds
        train_set = lgbm.Dataset(data=preds_score, label=y_true[:, label_col])
        hist = lgbm.cv(
            params=params,
            train_set=train_set,
            folds=idx_trn_val,
            num_boost_round=2000,
            early_stopping_rounds=50,
            metrics=['auc'],
            verbose_eval=10,
        )
        bst_boost_rounds = np.argmax(hist['auc-mean']) + 1
        bst_acc_score = np.max(hist['auc-mean'])
        print('label column : {0}'.format(label_col))
        print('label name : {0}'.format(label_name))
        print('best boost rounds {0}, best auc score {1}'.format(bst_boost_rounds, bst_acc_score))

        # oof train and predict
        for fold, (idx_trn, idx_val) in enumerate(idx_trn_val):
            print('oof train for label : {0}'.format(label_name))
            dat_trn = lgbm.Dataset(data=preds_score[idx_trn, :], label=y_true[idx_trn, label_col])
            model = lgbm.train(
def run_cv():
    """End-to-end CV pipeline using numeric indicators + TF-IDF features.

    Cleans the comments, scales numeric indicator features, builds word
    and char TF-IDF matrices, finds the best boosting rounds per label
    via lgb.cv, trains out-of-fold models, and writes per-fold
    validation predictions plus the K-fold-averaged test submission.
    """
    # read train/test data
    df_train = read_train_data()
    id_train = df_train[ID_COL].values.tolist()
    y_true = df_train[label_candidates].values

    df_test = read_test_data()
    id_test = df_test[ID_COL].values.tolist()

    get_indicators_and_clean_comments(df=df_train)
    get_indicators_and_clean_comments(df=df_test)

    # numeric indicator columns: everything except text/id/label columns
    num_features = [f_ for f_ in df_train.columns
                    if f_ not in ["comment_text", "clean_comment", "id", "remaining_chars",
                                  'has_ip_address'] + label_candidates]
    skl = MinMaxScaler()
    train_num_features = csr_matrix(skl.fit_transform(df_train[num_features]))
    # BUG FIX: reuse the scaler fitted on the training data; the original
    # re-fitted it on the test frame, so train and test columns were
    # scaled with different min/max bounds.
    test_num_features = csr_matrix(skl.transform(df_test[num_features]))

    # Get TF-IDF features
    train_text = df_train['clean_comment']
    test_text = df_test['clean_comment']
    all_text = pd.concat([train_text, test_text])

    # First on real words
    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        stop_words='english',
        ngram_range=(1, 2),
        max_features=20000)
    word_vectorizer.fit(all_text)
    train_word_features = word_vectorizer.transform(train_text)
    test_word_features = word_vectorizer.transform(test_text)
    del word_vectorizer
    gc.collect()

    # Now use the char_analyzer to get another TFIDF
    char_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        tokenizer=char_analyzer,
        analyzer='word',
        ngram_range=(1, 1),
        max_features=50000)
    char_vectorizer.fit(all_text)
    train_char_features = char_vectorizer.transform(train_text)
    test_char_features = char_vectorizer.transform(test_text)

    del char_vectorizer
    gc.collect()

    # stack all feature blocks; CSR format for fast row slicing
    X_train_word_char = hstack([train_char_features, train_word_features, train_num_features]).tocsr()
    X_test_word_char = hstack([test_char_features, test_word_features, test_num_features]).tocsr()

    # get idx of trn/val for each fold
    print('Getting array index of train/valid for each fold')
    idx_trn_val = get_idx_trn_val(id_train)

    # preds on test/valid
    preds_test = np.zeros((X_test_word_char.shape[0], n_classes))
    preds_valid = np.zeros((X_train_word_char.shape[0], n_classes))

    # cv and train/predict, one binary model per label column
    for label_col, label_name in enumerate(label_candidates):
        print('\nlabel column : {0}'.format(label_col))
        print('label name : {0}'.format(label_name))
        # cv best boost rounds
        train_set = lgbm.Dataset(data=X_train_word_char, label=y_true[:, label_col])
        params = get_gbdt_params(label_name)
        print('lgbm params : {0}'.format(params))
        hist = lgbm.cv(
            params=params,
            train_set=train_set,
            folds=idx_trn_val,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            metrics=['auc'],
            verbose_eval=verbose_eval,
        )
        bst_boost_rounds = np.argmax(hist['auc-mean']) + 1
        bst_acc_score = np.max(hist['auc-mean'])
        print('label column : {0}'.format(label_col))
        print('label name : {0}'.format(label_name))
        print('best boost rounds {0}, best auc score {1}'.format(bst_boost_rounds, bst_acc_score))

        # oof train and predict
        for fold, (idx_trn, idx_val) in enumerate(idx_trn_val):
            print('fold {0}'.format(fold))
            print('    train')
            dat_trn = lgbm.Dataset(data=X_train_word_char[idx_trn, :], label=y_true[idx_trn, label_col])
            model = lgbm.train(
                params=params,
                train_set=dat_trn,
                num_boost_round=bst_boost_rounds,
                verbose_eval=verbose_eval,
            )
            print('    predict')
            preds_valid[idx_val, label_col] = model.predict(
                data=X_train_word_char[idx_val,:], num_iteration=bst_boost_rounds)
            # BUG FIX: accumulate (+=) so that dividing each fold by K
            # yields the K-fold average; plain assignment kept only the
            # last fold's scaled-down predictions.
            preds_test[:,label_col] += model.predict(data=X_test_word_char, num_iteration=bst_boost_rounds) / K
            del model

    # ensemble cv score
    score = roc_auc_score(y_true=y_true, y_score=preds_valid, average='macro')
    print('\ncv score : {0}'.format(score))
    # divide data for ensemble
    for fold, (_, idx_val) in enumerate(idx_trn_val):
        preds_valid_fold = preds_valid[idx_val,:].T
        df_preds_val = pd.DataFrame()
        idx_val_set = set(idx_val)
        df_preds_val[ID_COL] = [ id for idx, id in enumerate(id_train) if idx in idx_val_set ]
        for idx, label in enumerate(label_candidates):
            df_preds_val[label] = preds_valid_fold[idx]
        df_preds_val.to_csv('../data/output/preds/gbdt/{0}fold_valid.csv'.format(fold), index=False)

    # record ensemble result
    preds_test = preds_test.T
    df_preds_test = pd.DataFrame()
    df_preds_test[ID_COL] = id_test
    for idx, label in enumerate(label_candidates):
        df_preds_test[label] = preds_test[idx]
    df_preds_test.to_csv('../data/output/preds/gbdt/avg_submit.csv', index=False)
    'num_leaves':[100],
    'bagging_fraction': [0.8],
    'bagging_freq':[5],
    'min_data_in_leaf':[10],
    'min_gain_to_split':[0],
    'num_iterations':[500],
    'lambda_l1':[1],
    'lambda_l2':[0.1],
    'verbose':[1]
#     'is_unbalance':[True]
}
# Expand the parameter grid into a list of concrete parameter dicts.
params=list(ParameterGrid(params))
lgbtrain=lgb.Dataset(X_train,label=y_train[:,1])
lgbeval=lgb.Dataset(X_val,label=y_val[:,1],reference=lgbtrain)
best_ccc=0
# Grid search: 5-fold CV with the custom CCC metric for each setting.
for param in params:
    print(param)
    clf = lgb.cv(param, lgbtrain, nfold=5, num_boost_round=param['num_iterations'], \
                    early_stopping_rounds=50, feval=metric_ccc, verbose_eval=True)
    print(clf)
#     if clf.best_score['valid_0']['ccc value:']>best_ccc:
#         best_ccc=clf.best_score['valid_0']['ccc value:']
#         best_param=param
#         best_it=clf.best_iteration
#     print('noval best interation: '+str(clf.best_iteration))
# y_pred=clf.predict(X_val)
# y_pred2 = np.clip(correct(y_train[:,1], y_pred),-1,1)
# ccc2,_=calccc.ccc(y_val[:,1],correct(y_train[:,1], y_pred2))
# print('best ccc:',best_ccc,'(',ccc2,')')
# print('best param:',best_param)
# print('best iteration:',best_it)