def setup(self, config):
    """Read hyper-parameters from ``config`` and build the classifier.

    Each option is fetched with ``config.get`` and falls back to the default
    shown below when missing. An unfitted ``GradientBoostingClassifier``
    built from those options is stored on ``self.model`` and its parameters
    are printed.

    Args:
        config: Object exposing ``get(key, default)`` (e.g. a dict).
    """
    lookup = config.get

    # Boosting hyper-parameters; only n_estimators is coerced to int.
    self.n_estimators = int(lookup("n_estimators", 200))
    self.max_depth = lookup("max_depth", 3)
    self.subsample = lookup("subsample", 0.5)
    self.max_features = lookup("max_features", 0.5)
    self.learning_rate = lookup("learning_rate", 0.1)
    self.min_samples_split = lookup("min_samples_split", 2)
    self.min_samples_leaf = lookup("min_samples_leaf", 1)
    self.warm_start = lookup("warm_start", False)

    # Note the attribute is ``varw`` while the config key is "varslist".
    self.varw = lookup("varslist", ['met_tight_tst_et'])
    self.run_mode = lookup("run_mode", "local")

    # Imported here, after the options are read, as in the original.
    from sklearn.ensemble import GradientBoostingClassifier

    estimator_options = {
        name: getattr(self, name)
        for name in (
            "n_estimators",
            "max_depth",
            "subsample",
            "max_features",
            "learning_rate",
            "min_samples_split",
            "min_samples_leaf",
            "warm_start",
        )
    }
    classifier = GradientBoostingClassifier(**estimator_options)
    print(classifier.get_params())
    self.model = classifier
def train_gb(x_train, y_train, x_test, y_test, x_val, y_val, gb_gridsearch):
    """Train a gradient boosting classifier and print evaluation metrics.

    Args:
        x_train, y_train: Data the model (or the grid search) is fitted on.
        x_test, y_test: Data used for the test confusion matrix / accuracy.
        x_val, y_val: Data used for the validation confusion matrix / accuracy.
        gb_gridsearch: When truthy, select the model with a 10-fold
            ``GridSearchCV`` scored by ``f1_weighted``; otherwise fit a
            classifier with fixed parameters.

    Returns:
        The fitted estimator (``best_estimator_`` of the search, or the
        directly fitted classifier).
    """
    print('Training model gradient boosting with sklearn...')
    cls = GradientBoostingClassifier()

    if gb_gridsearch:
        print('Tuning parameters...')
        grid_params_gb = [{
            'learning_rate': [0.05],
            'n_estimators': [1000],
            'max_depth': [6],
            'subsample': [1],
            'min_samples_split': [2],
            'min_samples_leaf': [1],
            'max_features': ['sqrt'],
            'verbose': [1],
        }]
        gs_gb = GridSearchCV(estimator=cls,
                             param_grid=grid_params_gb,
                             scoring='f1_weighted',
                             cv=10,
                             verbose=10,
                             n_jobs=-1)
        gs_gb.fit(x_train, y_train)

        print('Best params: %s' % gs_gb.best_params_)
        # best_score_ is the mean cross-validated value of the `scoring`
        # metric (f1_weighted here), not an accuracy.
        print('Best CV f1_weighted score: %.3f' % gs_gb.best_score_)
        model = gs_gb.best_estimator_
    else:
        params_gb = {
            'learning_rate': 0.05,
            'n_estimators': 500,
            'max_depth': 3,
            'subsample': 1,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'verbose': 2,
        }
        cls.set_params(**params_gb)
        model = cls.fit(x_train, y_train)

    # Report the parameters of the model that is actually returned; after a
    # grid search `cls` still holds the untuned defaults.
    print(model.get_params())

    print('Test predictions with trained mode...')
    y_pred = model.predict(x_test)
    print('Train predictions with trained mode...')
    y_pred_t = model.predict(x_train)
    print('Validation predictions with trained mode...')
    y_pred_val = model.predict(x_val)

    print('Confussion matrix test:')
    print(confusion_matrix(y_test, y_pred))
    print('Confussion matrix validation:')
    print(confusion_matrix(y_val, y_pred_val))

    print('Prediction accuracy for test: %.3f ' % accuracy_score(y_test, y_pred))
    print('Prediction accuracy for train: %.3f ' % accuracy_score(y_train, y_pred_t))
    print('Prediction accuracy for validation: %.3f ' % accuracy_score(y_val, y_pred_val))
    return model
def train_gbdt(model=False):
    """Build, train and pickle a GBDT classifier using grid-searched params.

    Args:
        model: When truthy, return the configured but unfitted classifier
            without logging, training or saving anything.

    Returns:
        The unfitted classifier when ``model`` is truthy, otherwise the
        result of ``train(clf)`` (the same object that is written to
        ``gdbt-model.pkl``).
    """
    print('# train_gbdt')
    global log

    params = grid_search_gbdt(True)
    clf = GradientBoostingClassifier().set_params(**params)
    if model:
        return clf

    # Record the effective hyper-parameters in the module-level log.
    params = clf.get_params()
    log += 'gbdt'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_samples_split: %d' % params['min_samples_split']
    log += ', min_samples_leaf: %d' % params['min_samples_leaf']
    log += ', subsample: %.1f' % params['subsample']
    log += '\n\n'

    # Train exactly once so the pickled model and the returned model are the
    # same object (the previous version called train() a second time).
    trained = train(clf)
    with open('gdbt-model.pkl', 'wb') as model_file:
        pickle.dump(trained, model_file)

    print('# train_gbdt end')
    return trained
def search_bestparam_GradientBoostingClassifier(X, y, df_search_best_param):
    """Run ``search_bestparam`` for a default GradientBoostingClassifier.

    Prints the estimator's supported parameters, then delegates the search
    over a fixed grid of ``n_estimators``, ``learning_rate``, ``subsample``
    and ``max_depth`` values.

    Args:
        X: Feature data forwarded to ``search_bestparam``.
        y: Target data forwarded to ``search_bestparam``.
        df_search_best_param: Forwarded unchanged to ``search_bestparam``.
    """
    print("Search best params for GradientBoostingClassifier ...")

    estimator = GradientBoostingClassifier()
    print("Supported params", estimator.get_params())

    candidates = {}
    candidates['n_estimators'] = [1, 10, 100, 1000]
    candidates['learning_rate'] = [0.001, 0.01, 0.05, 0.1, 0.5]
    candidates['subsample'] = [0.1, 0.5, 1.0]
    candidates['max_depth'] = [1, 3, 5, 10, 20, 50, 100]

    search_bestparam(estimator, candidates, X, y, df_search_best_param)
def test_gradient_precise():
    """Brute-force search around a known-good GBDT configuration.

    Fits a baseline classifier (20 estimators, learning rate 0.2, subsample
    0.6), then scans ``n_estimators`` in [15, 35), ``learning_rate`` in
    [0.12, 0.25) and ``subsample`` in [0.45, 0.65) in steps of 0.01, scoring
    each model on the test split. Prints progress every 10 fits, every
    improvement, and finally the best accuracy with the parameters that
    produced it.
    """
    x_train, y_train, x_test, y_test = load_data()

    # Baseline configuration; the search only replaces it on improvement.
    best_estimators = 20
    best_learning = 0.2
    best_subsamples = 0.6
    clf = GradientBoostingClassifier(n_estimators=best_estimators,
                                     learning_rate=best_learning,
                                     subsample=best_subsamples,
                                     max_depth=3,
                                     max_features=3)
    clf.fit(x_train, y_train)
    best_accuracy = clf.score(x_test, y_test)
    print(best_accuracy, best_estimators, best_learning, best_subsamples)

    counter = 0
    for estimators in range(15, 35):
        for learning_pct in range(12, 25):
            learning = learning_pct * 0.01
            for subsample_pct in range(45, 65, 1):
                subsample = subsample_pct * 0.01
                clf = GradientBoostingClassifier(n_estimators=estimators,
                                                 learning_rate=learning,
                                                 subsample=subsample,
                                                 max_depth=3,
                                                 max_features=3)
                clf.fit(x_train, y_train)
                accuracy = clf.score(x_test, y_test)

                counter += 1
                if counter % 10 == 0:
                    print(str(counter) + ' completed.')

                if accuracy > best_accuracy:
                    print(clf.get_params())
                    print('New best accuracy: ' + str(accuracy))
                    # Keep the winning settings together with the score so
                    # the final report is consistent.
                    best_accuracy = accuracy
                    best_estimators = estimators
                    best_learning = learning
                    best_subsamples = subsample

    print(best_accuracy, best_estimators, best_learning, best_subsamples)
def train():
    """Fit a small GBDT on the first 10 rows of the training data.

    Loads ``train.npz`` / ``train.label``, keeps the first 10 samples, splits
    them 95/5 with a fixed seed, fits the classifier on the training part and
    pickles the classifier's parameter dict to ``gbct.ml``.
    """
    X = load_npz("train.npz")[:10]
    print(X.shape)
    y = np.fromfile("train.label", dtype=np.int64)[:10]
    print(y.shape)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, train_size=0.95, random_state=1314)

    gbct = GradientBoostingClassifier(max_depth=2,
                                      n_estimators=30,
                                      warm_start=True)
    gbct.fit(X_train, y_train)

    # NOTE(review): this stores the hyper-parameter dict, not the fitted
    # estimator - confirm that is what consumers of gbct.ml expect.
    model = gbct.get_params()
    # pickle.dump needs a binary file object; passing the path string raised
    # TypeError.
    with open("gbct.ml", "wb") as out_file:
        pickle.dump(model, out_file)
def gboost_Classifier():
    """Fit a GBDT, write a submission CSV and print model diagnostics.

    Uses the module-level ``X_train``, ``y_train``, ``X_val``, ``y_val`` and
    ``test`` objects. Predictions for ``test`` are stored in the ``predict``
    column of ``./data/submit_sample.csv`` and saved to
    ``./result/gboost_submission.csv``.
    """
    booster = GradientBoostingClassifier(
        random_state=42,
        verbose=1,
        n_estimators=200,
        max_depth=4,
    )

    print("fitting ...")
    targets = y_train.values.ravel()
    booster.fit(X_train, targets)
    print("fitted ...")

    predictions = booster.predict(test)

    submission = pd.read_csv('./data/submit_sample.csv')
    submission['predict'] = predictions
    submission.to_csv('./result/gboost_submission.csv', index=False)

    validation_score = booster.score(X_val, y_val)
    print(validation_score)  # 86.84 noted by the original author

    # Importance of each feature.
    print(booster.feature_importances_)
    # All estimator parameters.
    print(booster.get_params())
def train_gbdt(model=False):
    """Configure a GBDT from grid-searched params, log them and train it.

    Args:
        model: When truthy, return the configured, unfitted classifier
            immediately (nothing is logged or trained).

    Returns:
        The unfitted classifier when ``model`` is truthy, otherwise whatever
        ``train(clf)`` returns.
    """
    global log

    clf = GradientBoostingClassifier().set_params(**grid_search_gbdt(True))
    if model:
        return clf

    settings = clf.get_params()
    # (format, parameter name) pairs, appended to the log in this order.
    log_fields = (
        (', learning_rate: %.3f', 'learning_rate'),
        (', n_estimators: %d', 'n_estimators'),
        (', max_depth: %d', 'max_depth'),
        (', min_samples_split: %d', 'min_samples_split'),
        (', min_samples_leaf: %d', 'min_samples_leaf'),
        (', subsample: %.1f', 'subsample'),
    )
    log += 'gbdt'
    for template, key in log_fields:
        log += template % settings[key]
    log += '\n\n'

    return train(clf)
def gridsearch_gbc(
    X_train,
    y_train,
    resamp,
    resample_wt=.5,
    learning_rate=.1,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0,
    loss='deviance',
    criterion='friedman_mse',
):
    """Fit one GBDT configuration on oversampled data and score it.

    The training data is first passed through ``ms.oversample``. The fitted
    model is evaluated on ``X_test`` / ``y_test``, which are not parameters
    and are taken from the enclosing (module) scope.

    Args:
        X_train, y_train: Training data before oversampling.
        resamp: Forwarded to ``ms.oversample``.
        resample_wt: Accepted but not used by this function.
        learning_rate, max_depth, min_samples_split, min_samples_leaf,
        subsample, max_features, max_leaf_nodes, min_impurity_decrease,
        loss, criterion: Passed straight to ``GradientBoostingClassifier``.

    Returns:
        Tuple ``(params, recall, precision, f1)`` where ``params`` is the
        estimator's ``get_params()`` dict.
    """
    X_resampled, y_resampled = ms.oversample(X_train, y_train, resamp)

    estimator_kwargs = dict(
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        subsample=subsample,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        loss=loss,
        criterion=criterion,
    )
    classifier = GradientBoostingClassifier(**estimator_kwargs)
    classifier.fit(X_resampled, y_resampled)

    predicted = classifier.predict(X_test)
    recall = recall_score(y_test, predicted)
    precision = precision_score(y_test, predicted)
    f1 = f1_score(y_test, predicted)

    return classifier.get_params(), recall, precision, f1
class GBDT_LR(BaseEstimator, ClassifierMixin):
    """Logistic regression trained on one-hot encoded GBDT leaf indices.

    A gradient boosting classifier is fitted first; the index of the leaf
    each sample reaches in every tree is one-hot encoded and used as the
    feature vector of a logistic regression.
    """

    def __init__(self, gbdt_params: Dict, lr_params: Dict):
        """Build the three components.

        Args:
            gbdt_params: Keyword arguments for ``GradientBoostingClassifier``.
            lr_params: Keyword arguments for ``LogisticRegression``.
        """
        self.gbdt = GradientBoostingClassifier(**gbdt_params)
        self.lr = LogisticRegression(**lr_params)
        self.enc = OneHotEncoder()

    def _leaf_indices(self, X):
        """Return one row of leaf indices per sample.

        ``apply`` yields one leaf index per sample, fitted tree and class
        column. Flattening everything after the sample axis keeps one row
        per sample even for multiclass targets or when early stopping
        fitted fewer trees than ``n_estimators`` (reshaping to
        ``(-1, n_estimators)`` broke in both cases).
        """
        leaves = self.gbdt.apply(X)
        return leaves.reshape(leaves.shape[0], -1)

    def fit(self, X, y=None):
        """Fit the GBDT, the leaf encoder and the logistic regression.

        Returns:
            ``self``.
        """
        self.gbdt.fit(X, y)
        indices = self._leaf_indices(X)
        self.enc.fit(indices)
        self.lr.fit(self.enc.transform(indices), y)
        return self

    def predict_proba(self, X):
        """Return the logistic regression's class probabilities for ``X``."""
        indices = self._leaf_indices(X)
        return self.lr.predict_proba(self.enc.transform(indices))

    def predict(self, X):
        """Return the most probable class label for each sample in ``X``."""
        scores = self.predict_proba(X)
        return self.lr.classes_[np.argmax(scores, axis=1).reshape(-1)]

    def get_params(self, deep=True):
        """Return the parameters of both sub-estimators.

        The result is ``{"gbdt": {...}, "lr": {...}}``; ``deep`` is accepted
        but not used.
        NOTE(review): the keys do not match the ``__init__`` argument names,
        so utilities that rebuild the estimator from ``get_params()`` will
        likely not work - confirm before using this class in a search/clone.
        """
        return {"gbdt": self.gbdt.get_params(), "lr": self.lr.get_params()}
print(scores_df) sns.factorplot('Scores', 'Classifier', data=scores_df, size=6) # Two best classifiers happened to be Gradient Boost and Logistic Regression. Since Logistic Regression got sligthly lower score and is rather easily overfitted, we will use Gradient Boost. # ### Selecting the parameters # Now that we've chosen the algorithm, we need to select the best parameters for it. There are many options, and sometimes it's almost impossible to know the best set of parameters. That's why we will use Grid Search to test out different options and choose the best ones. # # But first let's take a look at all the possible parameters of Gradient Boosting classifier: # In[ ]: g_boost = GradientBoostingClassifier() g_boost.get_params().keys() # We will test different options for min_samples_leaf, min_samples_split, max_depth, and loss parameters. I will set n_estimators to 100, but it can be increased since Gradient Boosting algorithms generally don't tend to overfit. # In[ ]: from sklearn.model_selection import GridSearchCV param_grid = { 'loss': ['deviance', 'exponential'], 'min_samples_leaf': [2, 5, 10], 'min_samples_split': [2, 5, 10], 'n_estimators': [100], 'max_depth': [3, 5, 10, 20]
if __name__ == "__main__":
    # Either reuse the cached sample or parse the feature file and cache it.
    if not load_sample:
        header, x_array, label, _ = read_features(feature_file)
        save = [header, x_array, label]
        save_obj(save, sample_save_file)
    else:
        header, x_array, label = load_obj(sample_save_file)

    # Only the training part of the split is used below.
    train_set, test_set, _ = split_train_test(seed, x_array, label, 0.9)
    train_x, train_y = train_set
    # print "size of train_x is", train_x.shape

    gbrt = GradientBoostingClassifier()

    # Search space: discrete ranges and continuous distributions; the two
    # min_samples_* options are pinned to the configured values.
    parameter_grid = dict(
        n_estimators=range(10, 110, 10),
        learning_rate=uniform(loc=0.01, scale=0.99),
        max_depth=range(1, 6, 1),
        max_features=uniform(loc=0.1, scale=0.89),
        subsample=uniform(loc=0.2, scale=0.79),
        min_samples_leaf=[min_samples_leaf],
        min_samples_split=[min_samples_split],
    )
    gbrt_gridsearch = grid_search.RandomizedSearchCV(
        gbrt,
        parameter_grid,
        scoring=auc,
        cv=4,
        n_jobs=2,
        n_iter=100,
        verbose=5,
        refit=False,
    )
    gbrt_gridsearch.fit(train_x, train_y)

    # refit=False above, so the winning configuration is fitted explicitly.
    gbrt_best = GradientBoostingClassifier(verbose=2,
                                           **gbrt_gridsearch.best_params_)
    gbrt_best.fit(train_x, train_y)

    print("parameters: %s" % (gbrt_best.get_params(),))
    save_obj(gbrt_best, model_save_file)
dire_flying_courier_time 71132 ----' acquired by "dire" in the part of battles dire_first_ward_time 95404 ---------> subject "ward" was not used by "dire" ''' # filling NA with 0 with accordance with task description #3 X_train = features.fillna(value = 0, axis = 'columns') # definition of a target variable #4 y_train = train_df['radiant_win'] print '\nradiant_win' print "\n", "GradientBoostingClassifier:" # using KFold by task description #5 kf = KFold(totalLenght, n_folds = 5, shuffle = True, random_state = 1013) # learning with defualt GradientBoostingClassifier settings clf = GradientBoostingClassifier(random_state = 1013) params = clf.get_params() start_time = datetime.now() score = cross_val_score(estimator = clf, X=X_train, y=y_train, scoring='roc_auc', cv=kf).mean() print '\tn_estimators:', params['n_estimators'],\ '\tmax_depth:', params['max_depth'],\ '\tscore:', score,\ '\ttimeElapsed:', datetime.now() - start_time #looks like good result for a long-long very long time ''' n_estimators: 100 max_depth: 3 score: 0.70661221449 timeElapsed: 0:07:13.214000 ''' # try to found out compromise beetwen good score in execution time for maxDepth in [1,2,3,5]: for treeCount in [10, 20, 30, 40, 50, 80, 100, 200, 500]:
return gsearch.best_params_ if __name__ == '__main__': train, test = load_data() gbdt_model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, max_depth=3, max_features='sqrt') model_fit(train, {'n_estimators': 100}) param_search = {'n_estimators': range(50, 220, 30)} param_find = grid_search(gbdt_model, param_search, train) gbdt_model.set_params(**param_find) param_search = { 'max_depth': range(3, 14, 2), 'min_samples_split': range(1, 301, 50) } param_find = grid_search(gbdt_model, param_search, train) gbdt_model.set_params(**param_find) param_search = {'min_samples_leaf': range(1, 101, 20)} param_find = grid_search(gbdt_model, param_search, train) gbdt_model.set_params(**param_find) gbdt_model = model_fit(train, gbdt_model.get_params())
# - Example: `parameters = {'parameter' : [list of values]}`. # - **Note:** Avoid tuning the `max_features` parameter of your learner if that parameter is available! # - Use `make_scorer` to create an `fbeta_score` scoring object (with $\beta = 0.5$). # - Perform grid search on the classifier `clf` using the `'scorer'`, and store it in `grid_obj`. # - Fit the grid search object to the training data (`X_train`, `y_train`), and store it in `grid_fit`. # # **Note:** Depending on the algorithm chosen and the parameter list, the following implementation may take some time to run! # In[14]: # TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries from sklearn.grid_search import GridSearchCV from sklearn.metrics import make_scorer, r2_score, fbeta_score # TODO: Initialize the classifier clf = GradientBoostingClassifier(random_state=100) clf.get_params() # TODO: Create the parameters list you wish to tune, using a dictionary if needed. # HINT: parameters = {'parameter_1': [value1, value2], 'parameter_2': [value1, value2]} parameters = {'n_estimators': [50, 100, 200], 'learning_rate': [0.5, 1.5, 2.5]} # TODO: Make an fbeta_score scoring object using make_scorer() scorer = make_scorer(fbeta_score, beta=0.5) # TODO: Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV() grid_obj = GridSearchCV(clf, parameters, scoring=scorer) # TODO: Fit the grid search object to the training data and find the optimal parameters using fit() grid_fit = grid_obj.fit(X_train, y_train) # Get the estimator best_clf = grid_fit.best_estimator_
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y0[:, 0]) plt.show() pca_pipe = make_pipeline(StandardScaler(), PCA(10)) X_pca = pca_pipe.fit_transform(X) y0 = y.reset_index().drop('index', 1).values X_embedded = TSNE().fit_transform(X_pca) plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y0[:, 0]) plt.show() #%% xtr, xte, ytr, yte = train_test_split(X, y, train_size=0.75) gbc = GradientBoostingClassifier() gbc.get_params() bayes = BayesSearchCV() #%% xtr, xte, ytr, yte = train_test_split(X, y, train_size=0.75) model_gnb = GaussianNB() model_gnb.fit(xtr, ytr) bayes_score = model_gnb.score(xte, yte) model_lr = LogisticRegression() model_lr.fit(xtr, ytr) logreg_score = model_lr.score(xte, yte) model_rc = RidgeClassifier()
from sklearn.ensemble import GradientBoostingClassifier

# Display name, search grid and base estimator for gradient boosting.
gbc_name = 'GradientBoostingClassifier'

# Values after "also:" are candidates that are currently switched off.
gbc_params_grid = dict(
    learning_rate=[0.01],  # also: 0.1, 0.05, 0.5
    loss=['deviance'],  # also: 'exponential'
    max_depth=[3, 5],  # also: 10
    max_features=[None, 'auto'],  # also: 'sqrt', 'log2'
    n_estimators=[200],  # also: 10, 50, 100
)

gbc = GradientBoostingClassifier(random_state=42)

if __name__ == "__main__":
    # Show the estimator's full parameter set when run as a script.
    print(gbc.get_params())
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
param_dist = {
    "max_depth": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, None],
    "learning_rate": [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09],
}

gbrt = GradientBoostingClassifier()
gbrt_cv = RandomizedSearchCV(estimator=gbrt,
                             param_distributions=param_dist,
                             cv=5)
gbrt_cv.fit(X_train, y_train)

print("Tuned Decision Tree Parameters:{}".format(gbrt_cv.best_params_))
print("Best score is {}".format(gbrt_cv.best_score_))

#%%
# Names of all tunable parameters (displayed when run as a cell).
gbrt.get_params().keys()

#%% Hyperparameter tuning -- GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

# Single-step pipeline wrapping a default gradient boosting classifier.
steps = [('GBRT', GradientBoostingClassifier())]
pipeline = Pipeline(steps=steps)
pipeline.fit(X_train, y_train)

print("Test score:{:.2f}".format(pipeline.score(X_test, y_test)))
# Print the feature ranking print("Feature ranking:") for f in range(x2.shape[1]): print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) # Plot the feature importances of the forest plt.figure() plt.title("Feature importances") plt.bar(range(x2.shape[1]), importances[indices], color="r", align="center") plt.xticks(range(x2.shape[1]), indices) plt.xlim([-1, x2.shape[1]]) plt.show() feature_imp(X) clf.get_params() param_grid = [ {'learning_rate': [0.05, 0.1, 0.2, 0.25], 'max_depth': [3,4,5,6], 'min_samples_leaf': [1,2], 'n_estimators': [100,200,300]}, ] svr = GradientBoostingClassifier() from sklearn import grid_search clf = grid_search.GridSearchCV(svr, param_grid) clf.fit(x2,training_target) print("Best parameters set found on development set:") print() print(clf.best_params_) print()
# Evaluate the fitted GBDT on the validation set.
y_pred_gbdt = gbdt_model.predict_proba(X_valid)[:, 1]
log_loss_gbdt = log_loss(y_valid, y_pred_gbdt)
print('log loss of GBDT on valid set: %.5f' % log_loss_gbdt)

## store the pre-trained gbdt_model
# Context managers make sure the file handle is flushed and closed.
with open(fp_gbdt_model, 'wb') as model_file:
    pickle.dump(gbdt_model, model_file)

del X_train_gbdt
del y_train_gbdt
gc.collect()

# Reload the model from the file that was just written.
with open(fp_gbdt_model, 'rb') as model_file:
    gbdt_model = pickle.load(model_file)

#----- data for LR (one-hot encoding with GDBT output) -----#
# One column name per boosting stage: tree1 .. treeN.
id_cols = [
    'tree' + str(i)
    for i in range(1, gbdt_model.get_params()['n_estimators'] + 1)
]
oh_enc = OneHotEncoder(id_cols)


def chunker(seq, size):
    """Yield consecutive slices of ``seq`` holding at most ``size`` items."""
    return (seq[pos: pos + size] for pos in range(0, len(seq), size))


## oh_enc fit the train_set
# Leaf index of every training sample in every tree (first class column).
df_train_id = pd.DataFrame(gbdt_model.apply(X_train_org)[:, :, 0],
                           columns=id_cols,
                           dtype=np.int8)

# Feed the encoder in chunks of 50000 rows.
for chunk in chunker(df_train_id, 50000):
    oh_enc.fit(chunk)

del df_train_id
del X_train_org
# A parameter grid for XGBoost params = { 'max_depth': [3], 'subsample': [0.8], 'n_estimators': [150], 'max_leaf_nodes': [80], } # In[47]: roc_auc_scorer = make_scorer(roc_auc_score) # In[48]: XG_clf.get_params().keys() # In[49]: grid_search = GridSearchCV(XG_clf, params, scoring=roc_auc_scorer, cv=5, return_train_score=True) grid_search.fit(X_train, y_train) # In[50]: grid_search_1 = GridSearchCV(XG_clf, params,
as the winner of f1 score is the GradientBoostingClassifier, and f1 score takes both recall and precision into account, and it also wins in the accuracy test. As I think here false negatives and false negatives are of crucial role, remember high quality wines otnumber low quality by a big margin, there are circa 6 times more low quality wines than high So the double Winning GradientBoostingClassifier is a go to there/ """ from sklearn.model_selection import RandomizedSearchCV, GridSearchCV X, y = attributes, features model = GradientBoostingClassifier() default_params = model.get_params() print( default_params ) #aby wiedziec co mozna zmieniac, pozniej i tak musialem czytac dokumentacje, aby wiedziec co jest czym.. #%% #LICZY OKOLO GODZINE, poniezej moje wyniki, #nie skorzystalem z grid search, ponieważ za duza 'mapa' parametrow #i szukanie trwaloby wiele lat parameters = { "loss": ["deviance"], "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2], "min_samples_split": np.linspace(0.1, 0.5, 12), "min_samples_leaf": np.linspace(0.1, 0.5, 12), "max_depth": [3, 5, 8], "max_features": ["log2", "sqrt"], "criterion": ["friedman_mse", "mae"],
print(f'The f1 score for our train dataset is {f1_score(y_train, y_pred_gbc_train)}') print(confusion_matrix(y_train, y_pred_gbc_train)) # using only 10 features yield a more accurate model than using all features """### Hyperparameter Tuning """ # let us see what our optimal parameters for our random forest are # Look at parameters used by our current forest gradient = GradientBoostingClassifier(random_state = 42) # Look at parameters used by our current forest print('Parameters currently in use:\n') pprint(gradient.get_params()) #We will try adjusting the following set of hyperparameters: #n_estimators = number of trees in the foreset #max_depth = max number of levels in each decision tree #min_samples_split = min number of data points placed in a node before the node is split #learning rate: shrinks the contribution of each tree from sklearn.model_selection import RandomizedSearchCV # Number of trees in random forest n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)] # Maximum number of levels in tree max_depth = [5,10,15,20] # Minimum number of samples required to split a node min_samples_split = [2, 5, 10] # learning rate
class gbClf(BaseModel):
    """Model using a gradient boosting classifier."""

    def __init__(self, train_data_fname=None, nrows=None, **kwargs):
        """Initialize the data frame."""
        super(gbClf, self).__init__(train_data_fname, nrows, **kwargs)

    def set_model(self, **kwargs):
        """Set the classifier.

        Recognised keyword arguments (with defaults): ``verbose`` (0),
        ``n_estimators`` (3000), ``max_depth`` (3), ``min_samples_leaf`` (1),
        ``min_samples_split`` (2), ``max_features`` (None),
        ``learning_rate`` (0.1), ``subsample`` (1.0), ``random_state`` (24).
        The estimator is stored on ``self.learner`` and its parameters are
        printed.
        """
        verbose = kwargs.get('verbose', 0)
        n_estimators = kwargs.get('n_estimators', 3000)
        max_depth = kwargs.get('max_depth', 3)
        min_samples_leaf = kwargs.get('min_samples_leaf', 1)
        min_samples_split = kwargs.get('min_samples_split', 2)
        max_features = kwargs.get('max_features', None)
        learning_rate = kwargs.get('learning_rate', 0.1)
        subsample = kwargs.get('subsample', 1.0)
        random_state = kwargs.get('random_state', 24)

        # `subsample` used to be read but never forwarded, so callers'
        # values were silently ignored.
        self.learner = GradientBoostingClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            max_features=max_features,
            subsample=subsample,
            verbose=verbose,
            random_state=random_state)

        print('\n\nGradient Boosting set with parameters:')
        for name, value in self.learner.get_params().items():
            print('{}: {}'.format(name, value))
        print('\n\n')

    def fitNscore(self, **kwargs):
        """Fit classifier and produce score and related plots.

        Keyword arguments:
            features: Column names of ``self.df_train`` used as features.
            bids_path: Path handed to ``prepare_data`` when the data has not
                been cleaned yet (default ``'data/bids.csv'``).
        All keyword arguments are also forwarded to ``prepare_data`` and
        ``fitModel``. The ``'outcome'`` column is the target. Blocks on a
        console prompt at the end so the figures stay open.
        """
        col2fit = kwargs.get('features')

        # cleaning
        bids_path = kwargs.get('bids_path', 'data/bids.csv')
        if not self.iscleaned:
            print('Preparing the data...')
            self.prepare_data(bids_path, **kwargs)

        print('columns for fit=\n{}'.format(self.df_train.columns))

        test_size = 0.2  # fraction kept for testing
        rnd_seed = 24  # for reproducibility

        # Single stratified split; the loop body runs once (n_iter=1).
        sss = StratifiedShuffleSplit(self.df_train['outcome'].values,
                                     n_iter=1,
                                     test_size=test_size,
                                     random_state=rnd_seed)
        for train_index, test_index in sss:
            features_train = self.df_train[col2fit].values[train_index]
            features_test = self.df_train[col2fit].values[test_index]
            target_train = self.df_train['outcome'].values[train_index]
            target_test = self.df_train['outcome'].values[test_index]

        # Fit Classifier
        self.fitModel(features_train, target_train, **kwargs)

        # Predict on the rest of the sample
        print('\nPredicting...')
        predictions = self.learner.predict(features_test)
        probas = self.learner.predict_proba(features_test)

        # Feature index ordered by importance
        ord_idx = np.argsort(self.learner.feature_importances_)
        print("Feature ranking:")
        for ifeaturindex in ord_idx[::-1]:
            print('{0} \t: {1}'.format(
                col2fit[ifeaturindex],
                round(self.learner.feature_importances_[ifeaturindex], 2)))

        # Score
        print('(Self) Score={}'.format(
            self.learner.score(features_test, target_test)))

        # Plots
        # Feature importances
        maxfeat2show = 30  # number of features to show in plots
        importances = self.learner.feature_importances_
        indices = np.argsort(importances)[::-1]
        indices = indices[:min(maxfeat2show, len(indices))]  # truncate if > maxfeat2show
        ordered_names = [col2fit[i] for i in indices]

        fig_import = plt.figure(figsize=(10, 10))
        plt.title("Feature importances, GB")
        plt.barh(range(len(indices)), importances[indices],
                 color="b", align="center")
        plt.yticks(range(len(indices)), ordered_names)
        plt.ylim([-1, len(indices)])
        plt.ylim(plt.ylim()[::-1])  # most important feature at the top
        plt.subplots_adjust(left=0.22)
        fig_import.show()

        # confusion matrix
        cm = confusion_matrix(target_test.astype(int), predictions.astype(int))
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        # Cap the colour scale at 0.5.
        cm_normalized = np.clip(cm_normalized, 0.0, 0.5)

        fig_cm = plt.figure()
        ax_cm = fig_cm.add_subplot(1, 1, 1)
        im_cm = ax_cm.imshow(cm_normalized, interpolation='nearest')
        plt.title('Normalized confusion mtx, GB')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        fig_cm.colorbar(im_cm)
        fig_cm.show()

        # ROC curve
        # Built from class-1 probabilities rather than hard predictions;
        # this one seems to reflect better the LB score.
        false_pos, true_pos, thr = roc_curve(target_test, probas[:, 1])
        roc_auc = auc(false_pos, true_pos)
        fig_roc = plt.figure()
        plt.plot(false_pos, true_pos,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")
        fig_roc.show()
        print('ROC_AUC = {}'.format(roc_auc))

        # raw_input only exists on Python 2; fall back to input on Python 3.
        try:
            wait_for_enter = raw_input
        except NameError:
            wait_for_enter = input
        wait_for_enter('press enter when finished...')
def main():
    """Train a GBDT from command-line settings and write sorted predictions.

    Command-line arguments: ``sys.argv[1]`` = n_estimators (int),
    ``sys.argv[2]`` = max_depth (int), ``sys.argv[3]`` = learning_rate
    (float). The model is fitted on the module-level ``X_train`` /
    ``y_train`` with a ``Monitor`` on ``X_vldt`` / ``y_vldt``. Class-1
    probabilities for every file returned by ``get_files(test_file)`` are
    written to ``out.csv``, which is then sorted numerically on its first
    column into ``out.sorted.csv`` and copied to a file name that embeds
    the three settings.
    """
    import shutil

    n_estimators = int(sys.argv[1])
    max_depth = int(sys.argv[2])
    learning_rate = float(sys.argv[3])
    # n_estimators = 100
    # max_depth = 3
    # learning_rate = 0.3

    cls = GradientBoostingClassifier(n_estimators=n_estimators,
                                     max_depth=max_depth,
                                     loss='deviance',
                                     learning_rate=learning_rate,
                                     min_samples_split=10,
                                     min_samples_leaf=10,
                                     subsample=0.8,
                                     verbose=2)
    cls.fit(X_train, y_train, monitor=Monitor(X_vldt, y_vldt))

    # Single formatted string: same text as the old Python 2 print
    # statement, and valid on Python 3 too.
    print('vldt auc =  %s , vars =  %s' % (
        roc_auc_score(y_vldt, cls.predict_proba(X_vldt)[:, 1]),
        cls.get_params()))

    with open('out.csv', 'w') as f:
        f.write('instance_id,prob\n')
        tests = get_files(test_file)
        for t in tests:
            print(t)
            X_test, y_test = load_svmlight_file(t)
            proba = cls.predict_proba(X_test)[:, 1]
            # The label column of the test files is written as the id.
            for i, v in enumerate(y_test):
                f.write('{0},{1}\n'.format(int(v), proba[i]))

    # out.csv is closed (and flushed) before it is sorted. The commands are
    # passed as argument lists so no shell is involved.
    subprocess.check_call(
        ['sort', '-t,', '-k1', '-n', '-o', 'out.sorted.csv', 'out.csv'])
    shutil.copyfile(
        'out.sorted.csv',
        'out.sorted.{0}-{1}-{2}.csv'.format(n_estimators, max_depth,
                                            learning_rate))
'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'criterion': criterion, 'learning_rate': learning_rate, 'loss': loss, 'max_leaf_nodes': max_leaf_nodes, } RTClassifier = RandomForestClassifier(random_state=42) pprint(RTClassifier.get_params()) xgbooster = GradientBoostingClassifier(random_state=42) pprint(xgbooster.get_params()) hyperparameters(classifier=xgbooster, X=X, y=y, grid=xgboost_grid, iterations=10000, fold=4) hyperparameters(classifier=RTClassifier, X=X, y=y, grid=random_grid, iterations=10000, fold=4)
# Gradient boosting model evaluated with 10-fold cross-validation below.
classifier = GradientBoostingClassifier(n_estimators=500, learning_rate=0.075)
#classifier.fit(X_train, y_train)
#y_pred = classifier.predict(X_val)
#
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import accuracy_score
##y_pred = classifier.predict(X_val)
#cm = confusion_matrix(y_val, y_pred)
#accuracy = accuracy_score(y_val, y_pred)

#Grid Search
#from sklearn.model_selection import GridSearchCV
#parameters = {
#    'n_estimators' : [200, 400]
#}
#
#grid_search = GridSearchCV(estimator = classifier, param_grid = parameters, scoring = 'accuracy', cv = 10)
#grid_search = grid_search.fit(X_train, y_train)
#best_score = grid_search.best_score_
#best_parameters = grid_search.best_params_

from sklearn.model_selection import cross_val_score
kcv = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)

# One row per evaluated model, one column per fold. The previous
# `model_performance.append([kcv])` discarded its result (DataFrame.append
# never modified the frame in place and no longer exists in pandas 2), so
# the table stayed empty.
model_performance = pd.DataFrame([kcv])

mean_kcv = kcv.mean()
std_kcv = kcv.std()

classifier.get_params()
y, test_size=0.33, random_state=20) boostc = GradientBoostingClassifier(n_estimators=30, max_depth=6) s1 = time.time() boostc.fit(X_train, y_train) e1 = time.time() # s2=time.time() # y_prd=boostc.predict(X_test) # e2=time.time() t1 = e1 - s1 #t2=e2-s2 y_test = np.array(y_test) #y_prd = np.array(y_prd) print(boostc.get_params()) #print('Test Accuracy: %.8f' % accuracy_score(y_test,y_prd)) print("Training time: ", t1) #print("Testing time: ",t2) plot_learning_curve(boostc, 'Learning Curve for boosting', X_train, y_train, (0, 1.01), cv=5) boostc1 = GradientBoostingClassifier() plot_validation_curve(X_train, y_train, boostc1, 'k1') plot_validation_curve(X_train, y_train, boostc1, 'k2') #plot_validation_curve(X_train,y_train,boostc1,'k3') # plot_validation_curve(X_train,y_train,clf1,'dtree3')
def _append_rows(frame, rows):
    """Return ``frame`` plus ``rows``, mimicking the removed ``DataFrame.append``."""
    if isinstance(rows, pd.DataFrame):
        addition = rows
    elif isinstance(rows, pd.Series):
        # A Series is one row whose index holds the column names.
        addition = rows.to_frame().T.infer_objects()
    else:
        addition = pd.DataFrame([rows])
    return pd.concat([frame, addition], ignore_index=True)


def train_test_model(df_model,
                     predictor='max_dt_3days',
                     model_type='quantile_discharge',
                     loss='far'):
    """Fit a flood-prediction model per district and collect its scores.

    Districts whose ``flood`` column has fewer than two distinct values are
    skipped.

    Args:
        df_model: DataFrame with at least ``district``, ``station``,
            ``time``, ``flood`` and the ``predictor`` column.
        predictor: Name of the column used as model input.
        model_type: ``'quantile_discharge'`` tries, for every station, the
            50th..99th percentile of the 6-month maxima of ``predictor`` as
            a threshold and keeps the (station, quantile) pair with the
            smallest ``loss``. ``'bdt_discharge'`` fits a gradient boosting
            classifier on one row per time step (one feature per station)
            and scores it on the same data.
        loss: Column of the performance table minimised in quantile mode.

    Returns:
        DataFrame with one row of performance scores per processed
        district, including ``parameters`` and ``district`` columns.
    """
    from collections import Counter

    performance_scores = pd.DataFrame()

    # loop over districts
    for district in df_model.district.unique():
        df_district = df_model[df_model['district'] == district]
        if df_district.flood.nunique() < 2:
            continue

        if model_type == 'quantile_discharge':
            performance_model = pd.DataFrame(
                columns=['parameters', 'pod', 'far', 'pofd', 'csi'])

            # loop over stations and test all possible quantiles
            for station in df_district['station'].unique():
                # Copy so the predictions column is not written to a slice.
                df_station = df_district[
                    df_district['station'] == station].copy()
                extreme_dis = df_station.set_index('time')[predictor].groupby(
                    pd.Grouper(freq='6M')).max()
                for q in range(50, 100):
                    threshold = extreme_dis.quantile(q / 100.)
                    df_station['predictions'] = np.where(
                        (df_station[predictor] >= threshold), 1, 0)
                    perf = df_station.groupby(['district', 'station']).\
                        apply(lambda row: calc_performance_scores(
                            row['flood'], row['predictions']))
                    perf['parameters'] = str((station, str(q)))
                    perf['district'] = district
                    performance_model = _append_rows(performance_model, perf)

            # find the couple (station, quantile) that minimizes loss function
            # idxmin must be *called*; the cast guards against an
            # object-dtype column. The index is 0..n-1 after the appends.
            best_row = performance_model[loss].astype(float).idxmin()
            best_performance = performance_model.loc[best_row]

            # save performance
            performance_scores = _append_rows(performance_scores,
                                              best_performance)

        elif model_type == 'bdt_discharge':
            # prepare training data: one sample per time step, one feature
            # per station
            X, y = [], []
            df_ordered = df_district.groupby(['station',
                                              'time'])[predictor].max()
            for time in df_district.time.unique():
                X.append([
                    df_ordered.loc[(station, time)]
                    for station in df_district.station.unique()
                ])
                y.append(df_district[df_district['time'] == time]
                         ['flood'].values[0])

            # train and predict
            model = GradientBoostingClassifier(max_features='auto',
                                               loss='exponential')
            # Inverse class-frequency weights, counted once per class.
            class_counts = Counter(y)
            sample_weight = [len(y) / class_counts[label] for label in y]
            model.fit(X, y, sample_weight)
            predictions = model.predict(X)

            # save performance
            best_performance = calc_performance_scores(pd.Series(y),
                                                       pd.Series(predictions))
            best_performance['parameters'] = str(model.get_params())
            best_performance['district'] = district
            performance_scores = _append_rows(performance_scores,
                                              best_performance)

    return performance_scores
# Evaluate the fitted GBDT on the validation set.
y_pred_gbdt = gbdt_model.predict_proba(X_valid)[:, 1]
log_loss_gbdt = log_loss(y_valid, y_pred_gbdt)
print('log loss of GBDT on valid set: %.5f' % log_loss_gbdt)

## store the pre-trained gbdt_model
# Context managers make sure the file handle is flushed and closed.
with open(fp_gbdt_model, 'wb') as model_file:
    pickle.dump(gbdt_model, model_file)

del X_train_gbdt
del y_train_gbdt
gc.collect()

# Reload the model from the file that was just written.
with open(fp_gbdt_model, 'rb') as model_file:
    gbdt_model = pickle.load(model_file)

#----- data for LR (one-hot encoding with GDBT output) -----#
# One column name per boosting stage: tree1 .. treeN.
id_cols = [
    'tree' + str(i)
    for i in range(1, gbdt_model.get_params()['n_estimators'] + 1)
]
oh_enc = OneHotEncoder(id_cols)


def chunker(seq, size):
    """Yield consecutive slices of ``seq`` holding at most ``size`` items."""
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))


## oh_enc fit the train_set
# Leaf index of every training sample in every tree (first class column).
df_train_id = pd.DataFrame(gbdt_model.apply(X_train_org)[:, :, 0],
                           columns=id_cols,
                           dtype=np.int8)

# Feed the encoder in chunks of 50000 rows.
for chunk in chunker(df_train_id, 50000):
    oh_enc.fit(chunk)
class RuleFitCustom(BaseEstimator, TransformerMixin):
    """Rulefit class

    Fits a tree ensemble, turns it into a RuleEnsemble and then fits an
    L1-penalised linear model (LassoCV or LogisticRegressionCV) on the rule
    features and/or the (optionally standardised) original features.

    Parameters
    ----------
        tree_size:      Number of terminal nodes in generated trees. If
                        exp_rand_tree_size=True, this will be the mean number
                        of terminal nodes.
        sample_fract:   fraction of randomly chosen training observations used
                        to produce each tree. FP 2004 (Sec. 2)
                        NOTE(review): fit() does not read this value; it always
                        uses min(0.5, (100 + 6 * sqrt(N)) / N).
        max_rules:      approximate total number of rules generated for
                        fitting. Note that actual number of rules will usually
                        be lower than this due to duplicates.
        memory_par:     scale multiplier (shrinkage factor) applied to each new
                        tree when sequentially induced. FP 2004 (Sec. 2)
        rfmode:         'regress' for regression or 'classify' for binary
                        classification.
        lin_standardise: If True, the linear terms will be standardised as per
                        Friedman Sec 3.2 by multiplying the winsorised variable
                        by 0.4/stdev.
        lin_trim_quantile: If lin_standardise is True, this quantile will be
                        used to trim linear terms before standardisation.
        exp_rand_tree_size: If True, each boosted tree will have a different
                        maximum number of terminal nodes based on an
                        exponential distribution about tree_size.
                        (Friedman Sec 3.3)
        model_type:     'r': rules only; 'l': linear terms only; 'rl': both
                        rules and linear terms
        Cs:             Regression: None uses 100 automatically chosen alphas,
                        a sequence is converted to alphas = 1 / Cs, any other
                        value is used as the number of alphas.
                        Classification: passed to LogisticRegressionCV
                        (10 when None).
        cv:             Passed as ``cv`` to LassoCV / LogisticRegressionCV.
        random_state:   Integer to initialise random objects and provide
                        repeatability.
        simple_rules:   Passed as ``weigh_rules`` to RuleEnsemble.transform.
        tree_generator: Optional: this object will be used as provided to
                        generate the rules. This will override almost all the
                        other properties above. Must be
                        GradientBoostingRegressor / RandomForestRegressor
                        ('regress') or GradientBoostingClassifier /
                        RandomForestClassifier (otherwise),
                        optional (default=None)

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """

    def __init__(self,
                 tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='regress',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 model_type='rl',
                 Cs=None,
                 cv=3,
                 random_state=None,
                 simple_rules=False):
        self.tree_generator = tree_generator
        self.rfmode = rfmode
        self.lin_trim_quantile = lin_trim_quantile
        self.lin_standardise = lin_standardise
        self.friedscale = FriedScale(trim_quantile=lin_trim_quantile)
        self.exp_rand_tree_size = exp_rand_tree_size
        self.max_rules = max_rules
        self.sample_fract = sample_fract
        self.memory_par = memory_par
        self.tree_size = tree_size
        self.random_state = random_state
        self.model_type = model_type
        self.cv = cv
        self.Cs = Cs
        # TODO: make this a global parameter; we want to be able to apply it
        # inside fit so the same nodes are kept and comparison is easier.
        self.simple_rules = simple_rules

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        Parameters
        ----------
        X : numpy array, shape=(n_samples, n_features)
        y : target values passed to the tree generator and the linear model
        feature_names : list of column names; ``feature_<i>`` is used when
            omitted.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``tree_generator`` is not one of the supported estimator types
            for the selected ``rfmode``.
        """
        ## Enumerate features if feature names not provided
        N = X.shape[0]
        if feature_names is None:
            self.feature_names = [
                'feature_' + str(x) for x in range(0, X.shape[1])
            ]
        else:
            self.feature_names = feature_names

        if 'r' in self.model_type:
            ## initialise tree generator
            if self.tree_generator is None:
                n_estimators_default = int(
                    np.ceil(self.max_rules / self.tree_size))
                self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(N)) / N)
                if self.rfmode == 'regress':
                    self.tree_generator = GradientBoostingRegressor(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)
                else:
                    self.tree_generator = GradientBoostingClassifier(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)

            if self.rfmode == 'regress':
                if type(self.tree_generator) not in [
                        GradientBoostingRegressor, RandomForestRegressor
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingRegressor"
                    )
            else:
                if type(self.tree_generator) not in [
                        GradientBoostingClassifier, RandomForestClassifier
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingClassifier"
                    )

            ## fit tree generator
            if not self.exp_rand_tree_size:
                # simply fit with constant tree size
                self.tree_generator.fit(X, y)
            else:
                # randomise tree size as per Friedman 2005 Sec 3.3
                np.random.seed(self.random_state)
                draws = np.random.exponential(
                    scale=self.tree_size - 2,
                    size=int(np.ceil(self.max_rules * 2 / self.tree_size)))
                tree_sizes = (2 + np.floor(draws)).astype(int)
                # Keep the shortest prefix whose sizes add up to max_rules.
                # The length bound stops the loop when the drawn sizes sum to
                # less than max_rules (it used to spin forever in that case).
                n_trees = int(len(tree_sizes) / 4)
                while (n_trees < len(tree_sizes)
                       and np.sum(tree_sizes[0:n_trees]) < self.max_rules):
                    n_trees += 1
                tree_sizes = tree_sizes[0:n_trees]

                # X and y do not change between trees: copy them once.
                X_contiguous = np.copy(X, order='C')
                y_contiguous = np.copy(y, order='C')
                random_state_add = self.random_state if self.random_state else 0

                self.tree_generator.set_params(warm_start=True)
                for i_size, size in enumerate(tree_sizes):
                    # grow the ensemble by exactly one tree of this size
                    self.tree_generator.set_params(n_estimators=i_size + 1)
                    self.tree_generator.set_params(max_leaf_nodes=size)
                    # warm_start=True seems to reset random_state, such that
                    # the trees are highly correlated, unless we manually
                    # change the random_state here.
                    self.tree_generator.set_params(random_state=i_size +
                                                   random_state_add)
                    self.tree_generator.fit(X_contiguous, y_contiguous)
                self.tree_generator.set_params(warm_start=False)

            tree_list = self.tree_generator.estimators_
            if isinstance(self.tree_generator,
                          (RandomForestRegressor, RandomForestClassifier)):
                # wrap each forest tree in a list, matching the nested layout
                # of the boosting estimators_
                tree_list = [[x] for x in self.tree_generator.estimators_]

            ## extract rules
            self.rule_ensemble = RuleEnsemble(tree_list=tree_list,
                                              feature_names=self.feature_names)

            ## concatenate original features and rules
            X_rules = self.rule_ensemble.transform(
                X, weigh_rules=self.simple_rules)
            self.X_rules = X_rules

        ## standardise linear variables if requested (for regression model only)
        if 'l' in self.model_type:
            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()

        ## Compile Training data
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            X_concat = np.concatenate((X_concat, X_regn), axis=1)
        if 'r' in self.model_type:
            if X_rules.shape[0] > 0:
                X_concat = np.concatenate((X_concat, X_rules), axis=1)
        self.X_concat = X_concat

        ## fit Lasso
        if self.rfmode == 'regress':
            if self.Cs is None:
                # use defaults
                n_alphas = 100
                alphas = None
            elif hasattr(self.Cs, "__len__"):
                n_alphas = None
                # asarray so that plain lists/tuples can be inverted too
                alphas = 1. / np.asarray(self.Cs)
            else:
                n_alphas = self.Cs
                alphas = None
            self.lscv = LassoCV(n_alphas=n_alphas,
                                alphas=alphas,
                                cv=self.cv,
                                random_state=self.random_state)
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_
            self.intercept_ = self.lscv.intercept_
        else:
            Cs = 10 if self.Cs is None else self.Cs
            self.lscv = LogisticRegressionCV(Cs=Cs,
                                             cv=self.cv,
                                             penalty='l1',
                                             random_state=self.random_state,
                                             solver='liblinear')
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_[0]
            self.intercept_ = self.lscv.intercept_[0]

        return self

    def predict(self, X):
        """Predict outcome for X

        Builds the same column layout as in fit (linear terms first, then
        rules) and delegates to the fitted linear model.
        """
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_concat = np.concatenate((X_concat, self.friedscale.scale(X)),
                                          axis=1)
            else:
                X_concat = np.concatenate((X_concat, X), axis=1)
        if 'r' in self.model_type:
            # bug correction. upstreamed at
            # https://github.com/christophM/rulefit/issues/23
            rule_coefs = self.coef_[-len(self.rule_ensemble.rules):]
            if len(rule_coefs) > 0:
                X_rules = self.rule_ensemble.transform(
                    X, coefs=rule_coefs, weigh_rules=self.simple_rules)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return self.lscv.predict(X_concat)

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self, exclude_zero_coef=False):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True, returns only the rules with an estimated
                           coefficient not equal to zero (default False).

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the
               rule, 'type' is 'linear' or 'rule', 'coef' holds the
               coefficients, 'support' the support of the rule in the training
               data set (X) and 'rule_number' the position of the rule in the
               ensemble (0 for linear terms).
        """
        n_features = len(self.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []
        ## Add coefficients for linear effects
        for i in range(0, n_features):
            if self.lin_standardise:
                coef = self.coef_[i] * self.friedscale.scale_multipliers[i]
            else:
                coef = self.coef_[i]
            # TODO remove the trailing rule number, only there for debugging
            output_rules.append((self.feature_names[i], 'linear', coef, 1, 0))
        ## Add rules
        for i, rule in enumerate(rule_ensemble):
            coef = self.coef_[i + n_features]
            # TODO remove the trailing rule number, only there for debugging
            output_rules.append((rule.__str__(), 'rule', coef, rule.support,
                                 i))
        # TODO remove the "rule_number" column, only there for debugging
        rules = pd.DataFrame(
            output_rules,
            columns=["rule", "type", "coef", "support", "rule_number"])
        if exclude_zero_coef:
            # .ix was removed from pandas; a boolean mask works with .loc
            rules = rules.loc[rules.coef != 0]
        return rules

    def rules_complexity(self):
        """Return the total number of conditions over all rules whose fitted
        coefficient is non-zero."""
        n_features = len(self.coef_) - len(self.rule_ensemble.rules)
        res = 0
        for i, rule in enumerate(self.rule_ensemble.rules):
            if self.coef_[i + n_features] != 0:
                res += len(rule.conditions)
        return res
plt.show()

# Gradient Boost Classifier
print("\n")
print("Gradient Boost")

# Hyper-parameters are collected first so they can be read at a glance.
# NOTE(review): criterion='mae' and max_features='auto' are rejected by recent
# scikit-learn releases -- confirm against the installed version.
gb = GradientBoostingClassifier(**{
    'loss': 'exponential',
    'max_depth': 3,
    'learning_rate': 0.01,
    'n_estimators': 100,
    'subsample': 1.0,
    'criterion': 'mae',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'auto',
    'random_state': 5,
})
print(gb.get_params())

# Train, then report confusion-matrix statistics on the held-out split.
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
print_cm_stats(y_pred, y_test)

# ROC curve
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
plot_roc_curve(X_test, y_test, gb, ax)
ax.set_title("Gradient Boost")
plt.show()

# Feature importances
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
plot_feature_importances(df.columns, gb, ax)
ax.set_title("Gradient Boost")
plt.show()
log_classifier_summary(gbc, X_train, X_test, y_train, y_test)

# tests
exp = neptune.get_experiment()

# check logs: one chart entry plus one entry per metric and class index 0..9
correct_logs_set = {'charts_sklearn'} | {
    '{}_class_{}_test_sklearn'.format(name, i)
    for name in ['precision', 'recall', 'fbeta_score', 'support']
    for i in range(10)
}
from_exp_logs = set(exp.get_logs().keys())
assert correct_logs_set == from_exp_logs, '{} - incorrect logs'.format(exp)

# check sklearn parameters: experiment properties must mirror the estimator
assert set(exp.get_properties().keys()) == set(
    gbc.get_params().keys()), '{} parameters do not match'.format(exp)

# check neptune parameters: experiment parameters must mirror `parameters`
assert set(exp.get_parameters().keys()) == set(
    parameters.keys()), '{} parameters do not match'.format(exp)

## Step 5: Stop Neptune experiment after logging summary
neptune.stop()

## Explore Results

# Scikit-learn KMeans clustering

## Step 1: Create KMeans object and example data
class RuleFit(BaseEstimator, TransformerMixin):
    """Rulefit class

    Fits a tree ensemble, turns it into a RuleEnsemble and then fits an
    L1-penalised linear model (LassoCV or LogisticRegressionCV) on the rule
    features and/or the (optionally standardised) original features.

    Parameters
    ----------
        tree_size:      Number of terminal nodes in generated trees. If
                        exp_rand_tree_size=True, this will be the mean number
                        of terminal nodes.
        sample_fract:   fraction of randomly chosen training observations used
                        to produce each tree. FP 2004 (Sec. 2)
                        NOTE(review): fit() does not read this value; it always
                        uses min(0.5, (100 + 6 * sqrt(N)) / N).
        max_rules:      approximate total number of rules generated for
                        fitting. Note that actual number of rules will usually
                        be lower than this due to duplicates.
        memory_par:     scale multiplier (shrinkage factor) applied to each new
                        tree when sequentially induced. FP 2004 (Sec. 2)
        rfmode:         'regress' for regression or 'classify' for binary
                        classification.
        lin_standardise: If True, the linear terms will be standardised as per
                        Friedman Sec 3.2 by multiplying the winsorised variable
                        by 0.4/stdev.
        lin_trim_quantile: If lin_standardise is True, this quantile will be
                        used to trim linear terms before standardisation.
        exp_rand_tree_size: If True, each boosted tree will have a different
                        maximum number of terminal nodes based on an
                        exponential distribution about tree_size.
                        (Friedman Sec 3.3)
        model_type:     'r': rules only; 'l': linear terms only; 'rl': both
                        rules and linear terms
        Cs:             Regression: None uses 100 automatically chosen alphas,
                        a sequence is converted to alphas = 1 / Cs, any other
                        value is used as the number of alphas.
                        Classification: passed to LogisticRegressionCV
                        (10 when None).
        cv:             Passed as ``cv`` to LassoCV / LogisticRegressionCV.
        random_state:   Integer to initialise random objects and provide
                        repeatability.
        tree_generator: Optional: this object will be used as provided to
                        generate the rules. This will override almost all the
                        other properties above. Must be
                        GradientBoostingRegressor / RandomForestRegressor
                        ('regress') or GradientBoostingClassifier /
                        RandomForestClassifier (otherwise),
                        optional (default=None)
        tol:            The tolerance for the optimization for LassoCV or
                        LogisticRegressionCV: if the updates are smaller than
                        `tol`, the optimization code checks the dual gap for
                        optimality and continues until it is smaller than
                        `tol`.
        max_iter:       The maximum number of iterations for LassoCV or
                        LogisticRegressionCV. None (default) selects 1000 for
                        rfmode='regress' and 100 otherwise.
        n_jobs:         Number of CPUs to use during the cross validation in
                        LassoCV or LogisticRegressionCV. None means 1 unless in
                        a joblib.parallel_backend context. -1 means using all
                        processors.

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """

    def __init__(self,
                 tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='regress',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 model_type='rl',
                 Cs=None,
                 cv=3,
                 tol=0.0001,
                 max_iter=None,
                 n_jobs=None,
                 random_state=None):
        self.tree_generator = tree_generator
        self.rfmode = rfmode
        self.lin_trim_quantile = lin_trim_quantile
        self.lin_standardise = lin_standardise
        self.winsorizer = Winsorizer(trim_quantile=lin_trim_quantile)
        self.friedscale = FriedScale(self.winsorizer)
        self.stddev = None
        self.mean = None
        self.exp_rand_tree_size = exp_rand_tree_size
        self.max_rules = max_rules
        self.sample_fract = sample_fract
        self.memory_par = memory_par
        self.tree_size = tree_size
        self.random_state = random_state
        self.model_type = model_type
        self.cv = cv
        self.tol = tol
        # Stored as given; fit() substitutes the solver default when None.
        # (The previous `1000 if 'regress' else 100` always evaluated to 1000
        # and discarded the caller's value.)
        self.max_iter = max_iter
        self.n_jobs = n_jobs
        self.Cs = Cs

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        Parameters
        ----------
        X : numpy array, shape=(n_samples, n_features)
        y : target values passed to the tree generator and the linear model
        feature_names : list of column names; ``feature_<i>`` is used when
            omitted.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``tree_generator`` is not one of the supported estimator types
            for the selected ``rfmode``.
        """
        ## Enumerate features if feature names not provided
        N = X.shape[0]
        if feature_names is None:
            self.feature_names = [
                'feature_' + str(x) for x in range(0, X.shape[1])
            ]
        else:
            self.feature_names = feature_names

        if 'r' in self.model_type:
            ## initialise tree generator
            if self.tree_generator is None:
                n_estimators_default = int(
                    np.ceil(self.max_rules / self.tree_size))
                self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(N)) / N)
                if self.rfmode == 'regress':
                    self.tree_generator = GradientBoostingRegressor(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)
                else:
                    self.tree_generator = GradientBoostingClassifier(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)

            if self.rfmode == 'regress':
                if type(self.tree_generator) not in [
                        GradientBoostingRegressor, RandomForestRegressor
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingRegressor"
                    )
            else:
                if type(self.tree_generator) not in [
                        GradientBoostingClassifier, RandomForestClassifier
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingClassifier"
                    )

            ## fit tree generator
            if not self.exp_rand_tree_size:
                # simply fit with constant tree size
                self.tree_generator.fit(X, y)
            else:
                # randomise tree size as per Friedman 2005 Sec 3.3
                np.random.seed(self.random_state)
                draws = np.random.exponential(
                    scale=self.tree_size - 2,
                    size=int(np.ceil(self.max_rules * 2 / self.tree_size)))
                tree_sizes = (2 + np.floor(draws)).astype(int)
                # Keep the shortest prefix whose sizes add up to max_rules.
                # The length bound stops the loop when the drawn sizes sum to
                # less than max_rules (it used to spin forever in that case).
                n_trees = int(len(tree_sizes) / 4)
                while (n_trees < len(tree_sizes)
                       and np.sum(tree_sizes[0:n_trees]) < self.max_rules):
                    n_trees += 1
                tree_sizes = tree_sizes[0:n_trees]

                # X and y do not change between trees: copy them once.
                X_contiguous = np.copy(X, order='C')
                y_contiguous = np.copy(y, order='C')
                random_state_add = self.random_state if self.random_state else 0

                self.tree_generator.set_params(warm_start=True)
                for i_size, size in enumerate(tree_sizes):
                    # grow the ensemble by exactly one tree of this size
                    self.tree_generator.set_params(n_estimators=i_size + 1)
                    self.tree_generator.set_params(max_leaf_nodes=size)
                    # warm_start=True seems to reset random_state, such that
                    # the trees are highly correlated, unless we manually
                    # change the random_state here.
                    self.tree_generator.set_params(random_state=i_size +
                                                   random_state_add)
                    self.tree_generator.fit(X_contiguous, y_contiguous)
                self.tree_generator.set_params(warm_start=False)

            tree_list = self.tree_generator.estimators_
            if isinstance(self.tree_generator,
                          (RandomForestRegressor, RandomForestClassifier)):
                # wrap each forest tree in a list, matching the nested layout
                # of the boosting estimators_
                tree_list = [[x] for x in self.tree_generator.estimators_]

            ## extract rules
            self.rule_ensemble = RuleEnsemble(tree_list=tree_list,
                                              feature_names=self.feature_names)

            ## concatenate original features and rules
            X_rules = self.rule_ensemble.transform(X)

        ## standardise linear variables if requested (for regression model only)
        if 'l' in self.model_type:
            ## standard deviation and mean of winsorized features
            self.winsorizer.train(X)
            winsorized_X = self.winsorizer.trim(X)
            self.stddev = np.std(winsorized_X, axis=0)
            self.mean = np.mean(winsorized_X, axis=0)

            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()

        ## Compile Training data
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            X_concat = np.concatenate((X_concat, X_regn), axis=1)
        if 'r' in self.model_type:
            if X_rules.shape[0] > 0:
                X_concat = np.concatenate((X_concat, X_rules), axis=1)

        ## fit Lasso
        # LassoCV default max_iter is 1000 while LogisticRegressionCV 100.
        max_iter = self.max_iter
        if max_iter is None:
            max_iter = 1000 if self.rfmode == 'regress' else 100

        if self.rfmode == 'regress':
            if self.Cs is None:
                # use defaults
                n_alphas = 100
                alphas = None
            elif hasattr(self.Cs, "__len__"):
                n_alphas = None
                # asarray so that plain lists/tuples can be inverted too
                alphas = 1. / np.asarray(self.Cs)
            else:
                n_alphas = self.Cs
                alphas = None
            self.lscv = LassoCV(n_alphas=n_alphas,
                                alphas=alphas,
                                cv=self.cv,
                                max_iter=max_iter,
                                tol=self.tol,
                                n_jobs=self.n_jobs,
                                random_state=self.random_state)
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_
            self.intercept_ = self.lscv.intercept_
        else:
            Cs = 10 if self.Cs is None else self.Cs
            self.lscv = LogisticRegressionCV(Cs=Cs,
                                             cv=self.cv,
                                             penalty='l1',
                                             max_iter=max_iter,
                                             tol=self.tol,
                                             n_jobs=self.n_jobs,
                                             random_state=self.random_state,
                                             solver='liblinear')
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_[0]
            self.intercept_ = self.lscv.intercept_[0]

        return self

    def _prediction_matrix(self, X):
        """Build the input of the linear model: linear terms, then rules."""
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_concat = np.concatenate((X_concat, self.friedscale.scale(X)),
                                          axis=1)
            else:
                X_concat = np.concatenate((X_concat, X), axis=1)
        if 'r' in self.model_type:
            # the rule coefficients are the trailing entries of coef_
            rule_coefs = self.coef_[-len(self.rule_ensemble.rules):]
            if len(rule_coefs) > 0:
                X_rules = self.rule_ensemble.transform(X, coefs=rule_coefs)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return X_concat

    def predict(self, X):
        """Predict outcome for X

        """
        return self.lscv.predict(self._prediction_matrix(X))

    def predict_proba(self, X):
        """Predict outcome probability for X, if model type supports
        probability prediction method

        Raises
        ------
        ValueError
            If the fitted linear model has no ``predict_proba``.
        """
        if 'predict_proba' not in dir(self.lscv):
            error_message = ''' Probability prediction using predict_proba not available for model type {lscv} '''.format(
                lscv=self.lscv)
            raise ValueError(error_message)
        return self.lscv.predict_proba(self._prediction_matrix(X))

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self, exclude_zero_coef=False, subregion=None):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True, returns only the rules with an estimated
                           coefficient not equal to zero (default False).

        subregion: If None (default) returns global importances
                   (FP 2004 eq. 28/29), else returns importance over
                   subregion of inputs (FP 2004 eq. 30/31/32).

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the
               rule, 'type' is 'linear' or 'rule', 'coef' holds the
               coefficients, 'support' the support of the rule in the training
               data set (X) and 'importance' the importance described above.
        """
        n_features = len(self.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []

        trimmed_subregion = None
        if subregion is not None:
            # Convert once, so linear terms and rules see the same array.
            subregion = np.array(subregion)
            if n_features > 0:
                # Winsorise once instead of once per feature; assumes
                # Winsorizer.trim has no per-call side effects.
                trimmed_subregion = self.winsorizer.trim(subregion)

        ## Add coefficients for linear effects
        for i in range(0, n_features):
            if self.lin_standardise:
                coef = self.coef_[i] * self.friedscale.scale_multipliers[i]
            else:
                coef = self.coef_[i]
            if subregion is None:
                importance = abs(coef) * self.stddev[i]
            else:
                importance = sum(
                    abs(coef) * abs([x[i] for x in trimmed_subregion] -
                                    self.mean[i])) / len(subregion)
            output_rules.append(
                (self.feature_names[i], 'linear', coef, 1, importance))

        ## Add rules
        for i, rule in enumerate(rule_ensemble):
            coef = self.coef_[i + n_features]
            if subregion is None:
                importance = abs(coef) * (rule.support *
                                          (1 - rule.support))**(1 / 2)
            else:
                rkx = rule.transform(subregion)
                importance = sum(
                    abs(coef) * abs(rkx - rule.support)) / len(subregion)
            output_rules.append(
                (rule.__str__(), 'rule', coef, rule.support, importance))

        rules = pd.DataFrame(
            output_rules,
            columns=["rule", "type", "coef", "support", "importance"])
        if exclude_zero_coef:
            # .ix was removed from pandas; a boolean mask works with .loc
            rules = rules.loc[rules.coef != 0]
        return rules