Example #1
import warnings

from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, RegressorMixin

class LGBMWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, categorical_feature=None, **params):
        self.model = LGBMRegressor(**params)
        self.categorical_feature = categorical_feature

    def fit(self, X, y):
        with warnings.catch_warnings():
            cats = None if self.categorical_feature is None else list(
                X.columns.intersection(self.categorical_feature))
            # Silence LightGBM's "categorical_feature in Dataset is
            # overridden" warning triggered by passing it to fit().
            warnings.filterwarnings(
                "ignore",
                "categorical_feature in Dataset is overridden")
            self.model.fit(
                X, y, **({} if cats is None else {
                    "categorical_feature": cats
                }))
            self.feature_importances_ = self.model.feature_importances_
            return self

    def predict(self, X):
        return self.model.predict(X)

    def get_params(self, deep=True):
        params = self.model.get_params(deep)
        params["categorical_feature"] = self.categorical_feature
        return params

    def set_params(self, **params):
        ctf = params.pop("categorical_feature", None)
        if ctf is not None:
            self.categorical_feature = ctf
        self.model.set_params(**params)
        return self
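
A minimal usage sketch for the wrapper (toy data; the column names and sizes are illustrative, not from the original source):

import numpy as np
import pandas as pd

X = pd.DataFrame({
    "num_col": np.random.rand(200),
    "cat_col": pd.Categorical(np.random.choice(["a", "b", "c"], 200)),
})
y = np.random.rand(200)

model = LGBMWrapper(categorical_feature=["cat_col"], n_estimators=50)
model.fit(X, y)
print(model.predict(X.head()))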
Example #2

import numpy as np
from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, RegressorMixin

class LGBMUncertainty(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.lgb = LGBMRegressor(**kwargs)

    def fit(self, X, y):
        self.lgb.fit(X, y)
        return self

    def predict(self, X, y=None):
        # Leaf index of every tree for each sample: shape (n_samples, n_trees).
        leaf_idx = self.lgb.predict(X, pred_leaf=True)

        # Look up each tree's raw leaf output to get one additive
        # contribution per tree per sample.
        ind_pred = []
        for row in leaf_idx:
            ind_pred.append([
                self.lgb.booster_.get_leaf_output(i, j)
                for i, j in enumerate(row)
            ])
        ind_pred = np.vstack(ind_pred)

        # The per-tree contributions sum to the model's prediction; their
        # spread across trees is used as a rough uncertainty estimate.
        pred_mean = ind_pred.sum(axis=1)
        pred_std = ind_pred.std(axis=1)

        return pred_mean, pred_std

    def get_params(self, deep=True):
        return self.lgb.get_params()

    def set_params(self, **params):
        self.lgb.set_params(**params)
        return self
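
A quick usage sketch (synthetic data, illustrative only):

import numpy as np

X = np.random.rand(500, 5)
y = X.sum(axis=1) + np.random.normal(scale=0.1, size=500)

model = LGBMUncertainty(n_estimators=100).fit(X, y)
mean, std = model.predict(X[:5])
print(mean, std)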
Example #3
def train_lgb(model=False):
    global log

    params = grid_search_lgb(True)

    clf = LGBMRegressor().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'lgb'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', num_leaves: %d' % params['num_leaves']
    log += ', min_split_gain: %.1f' % params['min_split_gain']
    log += ', min_child_weight: %.4f' % params['min_child_weight']
    log += ', min_child_samples: %d' % params['min_child_samples']
    log += ', subsample: %.1f' % params['subsample']
    log += ', colsample_bytree: %.1f' % params['colsample_bytree']
    log += '\n\n'

    return train(clf)
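
This example assumes a module-level log string and two helpers defined elsewhere, grid_search_lgb and train. Hypothetical stand-ins, only to show the expected shapes:

log = ''

def grid_search_lgb(cached=True):
    # In the original code this presumably runs a hyperparameter search;
    # the values below are placeholders, not the tuned results.
    return {'learning_rate': 0.05, 'n_estimators': 500, 'num_leaves': 31,
            'min_split_gain': 0.0, 'min_child_weight': 1e-3,
            'min_child_samples': 20, 'subsample': 0.8,
            'colsample_bytree': 0.8}

def train(clf):
    # Presumably fits clf on prepared splits and returns a score;
    # here it just returns the estimator as a placeholder.
    return clf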
# (earlier lines defining boosting_type, learning_rate, subsample,
#  subsample_freq, max_depth, num_leaves, class_weight, objective and the
#  train/test splits are not shown in this excerpt)
colsample_bytree = 0.8
colsample_bynode = 0.8
min_child_samples = 180
max_bin = 256

Regressor = LGBMRegressor(n_estimators=100000, boosting_type=boosting_type, learning_rate=learning_rate, random_state=0,
                          subsample=subsample, subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
                          colsample_bynode=colsample_bynode, min_child_samples=min_child_samples,
                          max_depth=max_depth, num_leaves=num_leaves, class_weight=class_weight, max_bin=max_bin,
                          importance_type='split', objective=objective)
Regressor.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
              early_stopping_rounds=1000, )

best_iteration = Regressor.best_iteration_

print(Regressor.get_params())


Regressor = LGBMRegressor(n_estimators=best_iteration, boosting_type=boosting_type, learning_rate=learning_rate,
                          random_state=0, class_weight=class_weight, min_child_samples=min_child_samples,
                          colsample_bynode=colsample_bynode, colsample_bytree=colsample_bytree,
                          subsample=subsample, subsample_freq=subsample_freq, max_bin=max_bin,
                          max_depth=max_depth, num_leaves=num_leaves, top_k=1000, tree_learner='voting',
                          importance_type='split', objective=objective)


scores = cross_val_score(Regressor, X, y, cv=5,
                         scoring='neg_mean_squared_error')
print('Cross-validation results: {} +/- {} (std. dev.)'.format(scores.mean(), scores.std()))
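
Note that early_stopping_rounds as a fit() keyword only works on older LightGBM releases; since LightGBM 4.0 it is passed as a callback instead. The equivalent call under the newer API would look like:

import lightgbm

Regressor.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_test, y_test)],
              callbacks=[lightgbm.early_stopping(stopping_rounds=1000)])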

def lgb_model(all_df):
    # Assumes a module-level `features` column list, sklearn's KFold and
    # metrics, and a write_params_features helper are available.

    # Keep the full test rows (all columns) here because the id/date
    # columns are needed later when building the submission.
    test = all_df[58327370:]

    train_set_X = all_df[:58327370]
    train_set_y = train_set_X['target']

    train_set_X = train_set_X[features]

    # test set (features only)
    test_set = all_df[58327370:]
    test_set = test_set[features]

    del all_df

    # model run
    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True)
    splits = folds.split(train_set_X, train_set_y)

    y_preds = np.zeros(test.shape[0])
    y_oof = np.zeros(train_set_X.shape[0])

    feature_importances = pd.DataFrame()
    feature_importances['feature'] = train_set_X.columns
    mean_score = []

    for fold_n, (train_index, valid_index) in enumerate(splits):
        print('Fold:', fold_n + 1)

        X_train, X_valid = train_set_X.iloc[train_index], train_set_X.iloc[
            valid_index]
        y_train, y_valid = train_set_y.iloc[train_index], train_set_y.iloc[
            valid_index]

        lgb = LGBMRegressor(boosting_type='gbdt',
                            num_leaves=4000,
                            colsample_bytree=0.8,
                            subsample=0.8,
                            n_estimators=600,
                            learning_rate=0.2,
                            n_jobs=-1,
                            device='gpu')
        lgb.fit(X_train,
                y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=50,
                verbose=True)

        # record per-fold feature importances
        feature_importances[f'fold_{fold_n + 1}'] = lgb.feature_importances_

        # validation predict
        y_pred_valid = lgb.predict(X_valid, num_iteration=lgb.best_iteration_)

        y_oof[valid_index] = y_pred_valid

        val_score = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_valid))

        print(f'val rmse score is {val_score}')

        mean_score.append(val_score)

        y_preds += lgb.predict(test_set,
                               num_iteration=lgb.best_iteration_) / n_fold

        del X_train, X_valid, y_train, y_valid

    print('mean rmse score over folds is', np.mean(mean_score))
    test['target'] = y_preds

    # predict
    sub = pd.read_csv('inputs/sample_submission.csv')

    predictions = test[['id', 'date', 'target']]
    predictions = pd.pivot(predictions,
                           index='id',
                           columns='date',
                           values='target').reset_index()
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

    evaluation_rows = [row for row in sub['id'] if 'evaluation' in row]
    evaluation = sub[sub['id'].isin(evaluation_rows)]

    validation = sub[['id']].merge(predictions, on='id')
    final = pd.concat([validation, evaluation])
    final.to_csv('submissions/submission.csv', index=False)

    # NB: assigning to `features` here would make it a local name and
    # break the `train_set_X[features]` lookup above, so use a new name.
    used_features = list(train_set_X.columns)
    params = lgb.get_params()
    write_params_features(used_features, params)
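
A usage sketch under the assumptions above (train_df and test_df are hypothetical frames; the concatenated frame must match the hard-coded split point):

all_df = pd.concat([train_df, test_df], ignore_index=True)
lgb_model(all_df)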