import warnings

from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, RegressorMixin


class LGBMWrapper(BaseEstimator, RegressorMixin):
    """Scikit-learn style wrapper that forwards an explicit
    categorical_feature list to LGBMRegressor.fit()."""

    def __init__(self, categorical_feature=None, **params):
        self.model = LGBMRegressor(**params)
        self.categorical_feature = categorical_feature

    def fit(self, X, y):
        # Keep only the categorical columns actually present in X.
        cats = None if self.categorical_feature is None else list(
            X.columns.intersection(self.categorical_feature))
        with warnings.catch_warnings():
            # Silence LightGBM's warning about categorical_feature in the
            # Dataset being overridden (warnings message matching is
            # case-insensitive, so the original .lower() call was redundant).
            warnings.filterwarnings(
                "ignore", "categorical_feature in Dataset is overridden")
            self.model.fit(
                X, y,
                **({} if cats is None else {"categorical_feature": cats}))
        self.feature_importances_ = self.model.feature_importances_
        return self

    def predict(self, X):
        return self.model.predict(X)

    def get_params(self, deep=True):
        return {**self.model.get_params(deep),
                "categorical_feature": self.categorical_feature}

    def set_params(self, **params):
        ctf = params.pop("categorical_feature", None)
        if ctf is not None:
            self.categorical_feature = ctf
        self.model.set_params(**params)  # fixed: params must be unpacked
        return self
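# --- Usage sketch (added; not from the original source) ---
# A minimal, hedged example of plugging LGBMWrapper into GridSearchCV so
# that the categorical_feature list survives sklearn's clone(). The
# DataFrame, column names, and grid values below are illustrative
# assumptions, not part of the original project.
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

rng = np.random.RandomState(0)
X_demo = pd.DataFrame({'cat_col': rng.randint(0, 5, 200),  # hypothetical categorical column
                       'num_col': rng.rand(200)})
y_demo = rng.rand(200)

search = GridSearchCV(LGBMWrapper(categorical_feature=['cat_col'], n_estimators=50),
                      {'learning_rate': [0.05, 0.1]}, cv=3)
search.fit(X_demo, y_demo)
print(search.best_params_)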
import numpy as np

from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, RegressorMixin


class LGBMUncertainty(BaseEstimator, RegressorMixin):
    """Estimates prediction uncertainty from the spread of the per-tree
    leaf outputs of a fitted LGBMRegressor."""

    def __init__(self, **kwargs):
        self.lgb = LGBMRegressor(**kwargs)

    def fit(self, X, y):
        self.lgb.fit(X, y)
        return self

    def predict(self, X):
        # pred_leaf=True returns, for each sample, the index of the leaf
        # it falls into in every tree.
        leaf_idx = self.lgb.predict(X, pred_leaf=True)
        ind_pred = []
        for row in leaf_idx:
            # Look up the raw output of the selected leaf in each tree.
            ind_pred.append([
                self.lgb.booster_.get_leaf_output(tree, leaf)
                for tree, leaf in enumerate(row)
            ])
        ind_pred = np.vstack(ind_pred)
        pred_mean = ind_pred.sum(axis=1)  # sum of leaf outputs = raw prediction
        pred_std = ind_pred.std(axis=1)   # tree-to-tree spread as an uncertainty proxy
        return pred_mean, pred_std

    def get_params(self, deep=True):
        return self.lgb.get_params(deep)

    def set_params(self, **params):
        self.lgb.set_params(**params)
        return self
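# --- Usage sketch (added; not from the original source) ---
# Demonstrates the (mean, std) return of LGBMUncertainty.predict() on
# synthetic data; the data shape and hyper-parameters are assumptions.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(500, 8)
y_demo = 3.0 * X_demo[:, 0] + 0.1 * rng.randn(500)

model = LGBMUncertainty(n_estimators=200, learning_rate=0.05).fit(X_demo, y_demo)
mean, std = model.predict(X_demo[:5])
for m, s in zip(mean, std):
    print(f'pred = {m:.3f} +/- {s:.3f}')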
def train_lgb(model=False):
    global log
    params = grid_search_lgb(True)
    clf = LGBMRegressor().set_params(**params)
    if model:
        return clf
    params = clf.get_params()
    log += 'lgb'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', num_leaves: %d' % params['num_leaves']
    log += ', min_split_gain: %.1f' % params['min_split_gain']
    log += ', min_child_weight: %.4f' % params['min_child_weight']
    log += ', min_child_samples: %d' % params['min_child_samples']
    log += ', subsample: %.1f' % params['subsample']
    log += ', colsample_bytree: %.1f' % params['colsample_bytree']
    log += '\n\n'
    return train(clf)
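# --- Hedged sketch (assumption; not from the original source) ---
# grid_search_lgb() and train() are defined elsewhere in the original
# project. A plausible minimal grid_search_lgb, using sklearn's
# GridSearchCV, is sketched purely for illustration; the parameter grid
# and the X_train/y_train names are assumptions.
from sklearn.model_selection import GridSearchCV

def grid_search_lgb_sketch(refit=True):
    grid = {'learning_rate': [0.05, 0.1], 'num_leaves': [31, 63]}
    search = GridSearchCV(LGBMRegressor(), grid,
                          scoring='neg_mean_squared_error', cv=3)
    search.fit(X_train, y_train)  # assumed to be in scope, as in the project
    return search.best_params_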
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score

colsample_bytree = 0.8
colsample_bynode = 0.8
min_child_samples = 180
max_bin = 256

# First pass: fit with a very large n_estimators and let early stopping
# pick the best iteration on the eval set (early_stopping_rounds as a
# fit() argument is the pre-4.0 LightGBM API).
Regressor = LGBMRegressor(n_estimators=100000,
                          boosting_type=boosting_type,
                          learning_rate=learning_rate,
                          random_state=0,
                          subsample=subsample,
                          subsample_freq=subsample_freq,
                          colsample_bytree=colsample_bytree,
                          colsample_bynode=colsample_bynode,
                          min_child_samples=min_child_samples,
                          max_depth=max_depth,
                          num_leaves=num_leaves,
                          class_weight=class_weight,
                          max_bin=max_bin,
                          importance_type='split',
                          objective=objective)
Regressor.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_test, y_test)],
              early_stopping_rounds=1000)
best_iteration = Regressor.best_iteration_
print(Regressor.get_params())

# Second pass: rebuild with n_estimators fixed at the best iteration and
# score the configuration with 5-fold cross-validation.
Regressor = LGBMRegressor(n_estimators=best_iteration,
                          boosting_type=boosting_type,
                          learning_rate=learning_rate,
                          random_state=0,
                          class_weight=class_weight,
                          min_child_samples=min_child_samples,
                          colsample_bynode=colsample_bynode,
                          colsample_bytree=colsample_bytree,
                          subsample=subsample,
                          subsample_freq=subsample_freq,
                          max_bin=max_bin,
                          max_depth=max_depth,
                          num_leaves=num_leaves,
                          top_k=1000,
                          tree_learner='voting',
                          importance_type='split',
                          objective=objective)
scores = cross_val_score(Regressor, X, y, cv=5,
                         scoring='neg_mean_squared_error')
print('Cross Validation Results: {} +- Std.Dev. {}'.format(scores.mean(),
                                                           scores.std()))
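# --- Added convenience (not from the original source) ---
# neg_mean_squared_error yields negative MSE values; per-fold RMSE can be
# recovered directly from the scores array:
import numpy as np

rmse_per_fold = np.sqrt(-scores)
print('CV RMSE: {:.4f} +- {:.4f}'.format(rmse_per_fold.mean(),
                                         rmse_per_fold.std()))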
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn import metrics
from sklearn.model_selection import KFold

# `features` and write_params_features() are expected to be defined
# elsewhere in the project.


def lgb_model(all_df):
    # Keep the raw test rows; they are needed later for the merge.
    # (.copy() avoids pandas' SettingWithCopyWarning when assigning
    # test['target'] below.)
    test = all_df[58327370:].copy()
    train_set_X = all_df[:58327370]
    train_set_y = train_set_X['target']
    train_set_X = train_set_X[features]
    # Test set restricted to the model features.
    test_set = test[features]
    del all_df

    # model run
    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True)
    splits = folds.split(train_set_X, train_set_y)
    y_preds = np.zeros(test.shape[0])
    y_oof = np.zeros(train_set_X.shape[0])
    feature_importances = pd.DataFrame()
    feature_importances['feature'] = train_set_X.columns
    mean_score = []

    for fold_n, (train_index, valid_index) in enumerate(splits):
        print('Fold:', fold_n + 1)
        X_train, X_valid = (train_set_X.iloc[train_index],
                            train_set_X.iloc[valid_index])
        y_train, y_valid = (train_set_y.iloc[train_index],
                            train_set_y.iloc[valid_index])
        lgb = LGBMRegressor(boosting_type='gbdt', num_leaves=4000,
                            colsample_bytree=0.8, subsample=0.8,
                            n_estimators=600, learning_rate=0.2,
                            n_jobs=-1, device='gpu')
        # early_stopping_rounds/verbose as fit() arguments is the
        # pre-4.0 LightGBM API.
        lgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=50, verbose=True)

        # Record per-fold feature importances.
        feature_importances[f'fold_{fold_n + 1}'] = lgb.feature_importances_

        # Out-of-fold validation predictions.
        y_pred_valid = lgb.predict(X_valid, num_iteration=lgb.best_iteration_)
        y_oof[valid_index] = y_pred_valid
        val_score = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_valid))
        print(f'val rmse score is {val_score}')
        mean_score.append(val_score)
        y_preds += lgb.predict(test_set,
                               num_iteration=lgb.best_iteration_) / n_fold
        del X_train, X_valid, y_train, y_valid

    print('mean rmse score over folds is', np.mean(mean_score))
    test['target'] = y_preds

    # Build the submission: pivot per-day predictions into F1..F28 columns.
    sub = pd.read_csv('inputs/sample_submission.csv')
    predictions = test[['id', 'date', 'target']]
    predictions = pd.pivot(predictions, index='id', columns='date',
                           values='target').reset_index()
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    evaluation_rows = [row for row in sub['id'] if 'evaluation' in row]
    evaluation = sub[sub['id'].isin(evaluation_rows)]
    validation = sub[['id']].merge(predictions, on='id')
    final = pd.concat([validation, evaluation])
    final.to_csv('submissions/submission.csv', index=False)

    # Renamed from `features`: rebinding that name here would make it a
    # local variable and raise UnboundLocalError at the read above.
    features_used = train_set_X.columns
    params = lgb.get_params()
    write_params_features(features_used, params)
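# --- Hedged sketch (assumption; not from the original source) ---
# write_params_features() is an external helper in the original project.
# A plausible minimal version, appending the feature list and model
# parameters to a text file, is sketched below; the file path is an
# assumption.
def write_params_features(features, params, path='submissions/run_log.txt'):
    with open(path, 'a') as f:
        f.write('features: ' + ', '.join(map(str, features)) + '\n')
        f.write('params: ' + repr(params) + '\n\n')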