Exemplo n.º 1
0
    def setup(self, config):
        """Read hyperparameters from ``config`` and build the classifier.

        Each hyperparameter is stored on ``self`` under its own name and then
        forwarded to ``GradientBoostingClassifier``; keys absent from
        ``config`` fall back to the defaults listed below.  ``varslist`` is
        kept as ``self.varw`` and ``run_mode`` as ``self.run_mode``.  The
        unfitted estimator is printed (its parameters) and stored in
        ``self.model``.

        Args:
            config: Object exposing ``get(key, default)``.
        """
        # n_estimators is the only value explicitly coerced to int.
        self.n_estimators = int(config.get("n_estimators", 200))
        fallbacks = (
            ("max_depth", 3),
            ("subsample", 0.5),
            ("max_features", 0.5),
            ("learning_rate", 0.1),
            ("min_samples_split", 2),
            ("min_samples_leaf", 1),
            ("warm_start", False),
        )
        for key, fallback in fallbacks:
            setattr(self, key, config.get(key, fallback))

        self.varw = config.get("varslist", ['met_tight_tst_et'])
        self.run_mode = config.get("run_mode", "local")

        # Function-scope import, executed each time setup() runs.
        from sklearn.ensemble import GradientBoostingClassifier
        hyperparameters = {"n_estimators": self.n_estimators}
        hyperparameters.update(
            (key, getattr(self, key)) for key, _ in fallbacks)
        model = GradientBoostingClassifier(**hyperparameters)
        print(model.get_params())
        self.model = model
Exemplo n.º 2
0
def train_gb(x_train, y_train, x_test, y_test, x_val, y_val, gb_gridsearch):
    """Train a gradient boosting classifier and print evaluation metrics.

    Args:
        x_train, y_train: Training features and labels.
        x_test, y_test: Test features and labels (confusion matrix, accuracy).
        x_val, y_val: Validation features and labels (confusion matrix,
            accuracy).
        gb_gridsearch: When truthy, the model is selected with a 10-fold
            ``GridSearchCV`` scored by ``f1_weighted``; otherwise a fixed
            parameter set is fitted directly.

    Returns:
        The fitted estimator (the grid search's ``best_estimator_`` or the
        directly fitted classifier).
    """
    print('Training model gradient boosting with sklearn...')
    cls = GradientBoostingClassifier()
    if gb_gridsearch:
        print('Tuning parameters...')
        grid_params_gb = [{
            'learning_rate': [0.05],
            'n_estimators': [1000],
            'max_depth': [6],
            'subsample': [1],
            'min_samples_split': [2],
            'min_samples_leaf': [1],
            'max_features': ['sqrt'],
            'verbose': [1]
        }]
        gs_gb = GridSearchCV(estimator=cls,
                             param_grid=grid_params_gb,
                             scoring='f1_weighted',
                             cv=10,
                             verbose=10,
                             n_jobs=-1)
        gs_gb.fit(x_train, y_train)
        print('Best params: %s' % gs_gb.best_params_)
        # best_score_ is the mean cross-validated f1_weighted score, not a
        # training accuracy, so it is labelled accordingly.
        print('Best cross-validated f1_weighted: %.3f' % gs_gb.best_score_)
        model = gs_gb.best_estimator_
    else:
        params_gb = {
            'learning_rate': 0.05,
            'n_estimators': 500,
            'max_depth': 3,
            'subsample': 1,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'verbose': 2
        }
        cls.set_params(**params_gb)
        model = cls.fit(x_train, y_train)
    # Report the parameters of the model that is actually returned; in the
    # grid-search branch `cls` itself still carries the default parameters.
    print(model.get_params())
    print('Test predictions with trained mode...')
    y_pred = model.predict(x_test)
    print('Train predictions with trained mode...')
    y_pred_t = model.predict(x_train)
    print('Validation predictions with trained mode...')
    y_pred_val = model.predict(x_val)
    print('Confussion matrix test:')
    print(confusion_matrix(y_test, y_pred))
    print('Confussion matrix validation:')
    print(confusion_matrix(y_val, y_pred_val))
    print('Prediction accuracy for test: %.3f ' %
          accuracy_score(y_test, y_pred))
    print('Prediction accuracy for train: %.3f ' %
          accuracy_score(y_train, y_pred_t))
    print('Prediction accuracy for validation: %.3f ' %
          accuracy_score(y_val, y_pred_val))
    return model
Exemplo n.º 3
0
def train_gbdt(model=False):
    """Configure, train and persist a gradient boosting classifier.

    The hyperparameters come from ``grid_search_gbdt(True)``.  A summary of
    the main settings is appended to the module-level ``log`` string, the
    classifier is passed to ``train`` and the result is pickled to
    ``gdbt-model.pkl``.

    Args:
        model: When truthy, return the configured but untrained classifier
            without logging, training or writing any file.

    Returns:
        The configured classifier when ``model`` is truthy, otherwise
        whatever ``train(clf)`` returns.
    """
    print('# train_gbdt')

    global log

    params = grid_search_gbdt(True)
    clf = GradientBoostingClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'gbdt'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_samples_split: %d' % params['min_samples_split']
    log += ', min_samples_leaf: %d' % params['min_samples_leaf']
    log += ', subsample: %.1f' % params['subsample']
    log += '\n\n'

    # Train exactly once: the object written to disk is the one returned.
    trained = train(clf)
    with open('gdbt-model.pkl', 'wb') as model_file:
        pickle.dump(trained, model_file)
    print('# train_gbdt end')

    return trained
Exemplo n.º 4
0
def search_bestparam_GradientBoostingClassifier(X, y, df_search_best_param):
    """Run ``search_bestparam`` for a default ``GradientBoostingClassifier``.

    Prints the parameters the estimator supports, then hands the estimator,
    a fixed grid over ``n_estimators``, ``learning_rate``, ``subsample`` and
    ``max_depth``, and the caller's arguments to ``search_bestparam``.
    Nothing is returned.
    """
    print("Search best params for GradientBoostingClassifier ...")
    estimator = GradientBoostingClassifier()
    print("Supported params", estimator.get_params())
    candidates = dict(
        n_estimators=[1, 10, 100, 1000],
        learning_rate=[0.001, 0.01, 0.05, 0.1, 0.5],
        subsample=[0.1, 0.5, 1.0],
        max_depth=[1, 3, 5, 10, 20, 50, 100],
    )
    search_bestparam(estimator, candidates, X, y, df_search_best_param)
Exemplo n.º 5
0
def test_gradient_precise():
    """Brute-force a fine grid around a baseline boosting configuration.

    A baseline model (20 estimators, learning rate 0.2, subsample 0.6) is
    fitted first.  Every combination of ``n_estimators`` in [15, 35),
    ``learning_rate`` in 0.12..0.24 and ``subsample`` in 0.45..0.64 (both in
    steps of 0.01) is then fitted and scored on the test split.  Whenever a
    combination beats the best accuracy so far, its parameters are printed
    and remembered, so the final line reports the winning configuration.

    NOTE(review): candidates are ranked on the same split used for the
    reported accuracy, so the final figure is presumably optimistic.
    """
    x_train, y_train, x_test, y_test = load_data()

    best_estimators = 20
    best_learning = 0.2
    best_subsamples = 0.6

    clf = GradientBoostingClassifier(n_estimators=best_estimators,
                                     learning_rate=best_learning,
                                     subsample=best_subsamples,
                                     max_depth=3,
                                     max_features=3)
    clf.fit(x_train, y_train)

    best_accuracy = clf.score(x_test, y_test)

    print(best_accuracy, best_estimators, best_learning, best_subsamples)

    counter = 0

    for estimators in range(15, 35):
        for learning_pct in range(12, 25):
            learning = learning_pct * 0.01
            for subsample_pct in range(45, 65):
                subsample = subsample_pct * 0.01
                clf = GradientBoostingClassifier(n_estimators=estimators,
                                                 learning_rate=learning,
                                                 subsample=subsample,
                                                 max_depth=3,
                                                 max_features=3)

                clf.fit(x_train, y_train)

                accuracy = clf.score(x_test, y_test)

                counter += 1

                if counter % 10 == 0:
                    print(str(counter) + ' completed.')

                if accuracy > best_accuracy:
                    print(clf.get_params())
                    print('New best accuracy: ' + str(accuracy))
                    # Remember the winning combination so the summary
                    # printed below matches best_accuracy.
                    best_accuracy = accuracy
                    best_estimators = estimators
                    best_learning = learning
                    best_subsamples = subsample

    print(best_accuracy, best_estimators, best_learning, best_subsamples)
Exemplo n.º 6
0
def train():
    """Fit a small gradient boosting model and pickle it to ``gbct.ml``.

    Reads the first 10 rows of ``train.npz`` and the first 10 labels of
    ``train.label``, splits them 95/5 and fits the classifier on the
    training part.

    The previous version passed the file *name* to ``pickle.dump`` (which
    needs an open binary file) and tried to store only ``get_params()``,
    i.e. the hyperparameters without any fitted trees.  The fitted
    estimator is now written instead.
    """
    X = load_npz("train.npz")[:10]
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(X.shape)
    y = np.fromfile("train.label", dtype=np.int64)[:10]
    print(y.shape)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, train_size=0.95, random_state=1314)

    gbct = GradientBoostingClassifier(max_depth=2, n_estimators=30, warm_start=True)
    gbct.fit(X_train, y_train)
    with open("gbct.ml", "wb") as model_file:
        pickle.dump(gbct, model_file)
def gboost_Classifier():
    """Fit a gradient boosting model and write a submission file.

    Relies on the module-level names ``X_train``, ``y_train``, ``X_val``,
    ``y_val`` and ``test``.  Predictions for ``test`` are stored in the
    ``predict`` column of the sample submission and saved to
    ``./result/gboost_submission.csv``; the validation score, feature
    importances and estimator parameters are printed.
    """
    booster = GradientBoostingClassifier(
        random_state=42,
        verbose=1,
        n_estimators=200,
        max_depth=4,
    )
    print("fitting ...")
    booster.fit(X_train, y_train.values.ravel())
    print("fitted ...")
    predictions = booster.predict(test)

    submission = pd.read_csv('./data/submit_sample.csv')
    submission['predict'] = predictions
    submission.to_csv('./result/gboost_submission.csv', index=False)

    # Validation score (original author's note: 86.84).
    print(booster.score(X_val, y_val))
    # Print the importance value of every feature.
    print(booster.feature_importances_)
    # Print all estimator parameters.
    print(booster.get_params())
Exemplo n.º 8
0
def train_gbdt(model=False):
    """Configure a gradient boosting classifier and hand it to ``train``.

    Hyperparameters come from ``grid_search_gbdt(True)``.  With a truthy
    ``model`` the configured, untrained classifier is returned right away.
    Otherwise a one-line summary of the main settings is appended to the
    module-level ``log`` string and the result of ``train(clf)`` is
    returned.
    """
    global log

    clf = GradientBoostingClassifier().set_params(**grid_search_gbdt(True))
    if model:
        return clf

    settings = clf.get_params()
    summary_fields = (
        (', learning_rate: %.3f', 'learning_rate'),
        (', n_estimators: %d', 'n_estimators'),
        (', max_depth: %d', 'max_depth'),
        (', min_samples_split: %d', 'min_samples_split'),
        (', min_samples_leaf: %d', 'min_samples_leaf'),
        (', subsample: %.1f', 'subsample'),
    )
    log += 'gbdt'
    for template, key in summary_fields:
        log += template % settings[key]
    log += '\n\n'

    return train(clf)
Exemplo n.º 9
0
def gridsearch_gbc(
    X_train,
    y_train,
    resamp,
    resample_wt=.5,
    learning_rate=.1,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0,
    loss='deviance',
    criterion='friedman_mse',
):
    """Fit one gradient boosting configuration and score it.

    The training data is first passed through ``ms.oversample`` with
    ``resamp``; the classifier is then fitted with the given
    hyperparameters and evaluated on the module-level ``X_test`` /
    ``y_test`` (they are not parameters of this function).

    NOTE(review): ``resample_wt`` is accepted but not used in this body.

    Returns:
        Tuple ``(params, recall, precision, f1)`` where ``params`` is the
        estimator's ``get_params()`` dictionary.
    """
    X_train, y_train = ms.oversample(X_train, y_train, resamp)

    hyperparameters = dict(
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        subsample=subsample,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        loss=loss,
        criterion=criterion,
    )
    model = GradientBoostingClassifier(**hyperparameters)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Same evaluation order as before: recall, precision, then F1.
    recall, precision, f1 = [
        metric(y_test, y_pred)
        for metric in (recall_score, precision_score, f1_score)
    ]

    return model.get_params(), recall, precision, f1
Exemplo n.º 10
0
class GBDT_LR(BaseEstimator, ClassifierMixin):
    """Logistic regression stacked on one-hot encoded GBDT leaf indices.

    A ``GradientBoostingClassifier`` is fitted first; the index of the leaf
    each sample lands in, for every tree, is one-hot encoded and used as
    the feature vector of a ``LogisticRegression``.

    NOTE(review): ``get_params`` returns the parameters of the two inner
    estimators rather than this class's constructor arguments, so
    ``sklearn.base.clone`` presumably cannot rebuild an instance - confirm
    before using this class inside scikit-learn model selection tools.
    """

    def __init__(self, gbdt_params: Dict, lr_params: Dict):
        """Build the inner estimators from their keyword dictionaries."""
        self.gbdt = GradientBoostingClassifier(**gbdt_params)
        self.lr = LogisticRegression(**lr_params)
        self.enc = OneHotEncoder()

    def _leaf_indices(self, X):
        """Return a 2-D array with one row of leaf indices per sample."""
        leaves = self.gbdt.apply(X)
        # apply() yields one column block per class for multiclass targets;
        # flattening everything after the sample axis keeps exactly one row
        # per sample.  reshape(-1, n_estimators) did so only for binary
        # targets and otherwise produced more rows than there are labels.
        return leaves.reshape(leaves.shape[0], -1)

    def fit(self, X, y=None):
        """Fit the GBDT, the encoder and the logistic regression; return self."""
        self.gbdt.fit(X, y)
        indices = self._leaf_indices(X)
        self.enc.fit(indices)
        self.lr.fit(self.enc.transform(indices), y)
        return self

    def predict_proba(self, X):
        """Return the class probabilities given by the logistic regression."""
        indices = self._leaf_indices(X)
        return self.lr.predict_proba(self.enc.transform(indices))

    def predict(self, X):
        """Return, for each sample, the class with the highest probability."""
        scores = self.predict_proba(X)
        return self.lr.classes_[np.argmax(scores, axis=1).reshape(-1)]

    def get_params(self, deep=True):
        """Return the parameters of both inner estimators.

        ``deep`` is accepted for signature compatibility and is not used.
        """
        return {"gbdt": self.gbdt.get_params(), "lr": self.lr.get_params()}
print(scores_df)
sns.factorplot('Scores', 'Classifier', data=scores_df, size=6)


# The two best classifiers turned out to be Gradient Boosting and Logistic Regression. Since Logistic Regression got a slightly lower score and overfits rather easily, we will use Gradient Boosting.

# ### Selecting the parameters
# Now that we've chosen the algorithm, we need to select the best parameters for it. There are many options, and sometimes it's almost impossible to know the best set of parameters. That's why we will use Grid Search to test out different options and choose the best ones.
# 
# But first let's take a look at all the possible parameters of Gradient Boosting classifier:

# In[ ]:


g_boost = GradientBoostingClassifier()
g_boost.get_params().keys()


# We will test different options for min_samples_leaf, min_samples_split, max_depth, and loss parameters. I will set n_estimators to 100, but it can be increased since Gradient Boosting algorithms generally don't tend to overfit.

# In[ ]:


from sklearn.model_selection import GridSearchCV

param_grid = {
    'loss': ['deviance', 'exponential'],
    'min_samples_leaf': [2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [100],
    'max_depth': [3, 5, 10, 20]
Exemplo n.º 12
0
if __name__ == "__main__":
	if load_sample:
		header, x_array, label = load_obj(sample_save_file)
	else:
		header, x_array, label, _ = read_features(feature_file)
		save = [header, x_array, label]
		save_obj(save, sample_save_file)

	train_set, test_set, _ = split_train_test(seed, x_array, label, 0.9)

	train_x, train_y = train_set
	# print "size of train_x is", train_x.shape

	gbrt = GradientBoostingClassifier()
	parameter_grid = {'n_estimators' : range(10, 110, 10),
	                 'learning_rate': uniform(loc = 0.01, scale = 0.99),
	                 'max_depth' : range(1, 6, 1),
		             'max_features' : uniform(loc = 0.1, scale = 0.89),
		             'subsample' : uniform(loc = 0.2, scale = 0.79),
		             'min_samples_leaf' : [min_samples_leaf],
		             'min_samples_split' : [min_samples_split]}
	gbrt_gridsearch = grid_search.RandomizedSearchCV(gbrt, \
	                parameter_grid, scoring = auc, \
	                cv=4, n_jobs=2, n_iter=100, verbose=5, refit=False)
	gbrt_gridsearch.fit(train_x, train_y)

	gbrt_best = GradientBoostingClassifier(verbose=2, **gbrt_gridsearch.best_params_)
	gbrt_best.fit(train_x, train_y)
	print "parameters:", gbrt_best.get_params()
	save_obj(gbrt_best, model_save_file)
Exemplo n.º 13
0
dire_flying_courier_time		71132  ----'      acquired by "dire" in the part of battles
dire_first_ward_time			95404  ---------> subject "ward" was not used by "dire"
'''

# filling NA with 0 with accordance with task description											#3
X_train = features.fillna(value = 0, axis = 'columns')
# definition of a target variable														#4
y_train = train_df['radiant_win']
print '\nradiant_win'

print "\n", "GradientBoostingClassifier:"
# using KFold by task description 														#5
kf = KFold(totalLenght, n_folds = 5, shuffle = True, random_state = 1013)
# fit with the default GradientBoostingClassifier settings
clf = GradientBoostingClassifier(random_state = 1013)
params = clf.get_params()
start_time = datetime.now()
score = cross_val_score(estimator = clf, X=X_train, y=y_train, scoring='roc_auc', cv=kf).mean()
print	'\tn_estimators:', params['n_estimators'],\
		'\tmax_depth:', params['max_depth'],\
		'\tscore:', score,\
		'\ttimeElapsed:', datetime.now() - start_time

# looks like a good result, but it takes a very long time to compute
'''
	n_estimators: 100	max_depth: 3	score: 0.70661221449	timeElapsed: 0:07:13.214000
'''

# try to find a compromise between a good score and execution time
for maxDepth in [1,2,3,5]:
	for treeCount in [10, 20, 30, 40, 50, 80, 100, 200, 500]:
Exemplo n.º 14
0
    return gsearch.best_params_


if __name__ == '__main__':
    # Stage-wise tuning: each grid search starts from the parameters found
    # by the previous one, and the final configuration is passed to
    # model_fit at the end.
    train, test = load_data()
    gbdt_model = GradientBoostingClassifier(learning_rate=0.1,
                                            n_estimators=100,
                                            subsample=1.0,
                                            min_samples_split=2,
                                            min_samples_leaf=1,
                                            max_depth=3,
                                            max_features='sqrt')
    # Baseline run before any tuning.
    model_fit(train, {'n_estimators': 100})

    # Stage 1: number of boosting stages.
    param_search = {'n_estimators': range(50, 220, 30)}
    param_find = grid_search(gbdt_model, param_search, train)

    # Stage 2: tree depth together with the minimum split size.
    gbdt_model.set_params(**param_find)
    param_search = {
        'max_depth': range(3, 14, 2),
        # An integer min_samples_split below 2 is rejected by scikit-learn,
        # so the first candidate is 2 instead of 1; the remaining values
        # (51, 101, ..., 251) are unchanged.
        'min_samples_split': [2] + list(range(51, 301, 50)),
    }
    param_find = grid_search(gbdt_model, param_search, train)

    # Stage 3: minimum leaf size.
    gbdt_model.set_params(**param_find)
    param_search = {'min_samples_leaf': range(1, 101, 20)}
    param_find = grid_search(gbdt_model, param_search, train)

    # Final fit with everything found above.
    gbdt_model.set_params(**param_find)
    gbdt_model = model_fit(train, gbdt_model.get_params())
Exemplo n.º 15
0
#  - Example: `parameters = {'parameter' : [list of values]}`.
#  - **Note:** Avoid tuning the `max_features` parameter of your learner if that parameter is available!
# - Use `make_scorer` to create an `fbeta_score` scoring object (with $\beta = 0.5$).
# - Perform grid search on the classifier `clf` using the `'scorer'`, and store it in `grid_obj`.
# - Fit the grid search object to the training data (`X_train`, `y_train`), and store it in `grid_fit`.
#
# **Note:** Depending on the algorithm chosen and the parameter list, the following implementation may take some time to run!

# In[14]:

# TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
# GridSearchCV lives in sklearn.model_selection in current scikit-learn; the
# old sklearn.grid_search module is kept only as a fallback for very old
# installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer, r2_score, fbeta_score
# TODO: Initialize the classifier
clf = GradientBoostingClassifier(random_state=100)
clf.get_params()
# TODO: Create the parameters list you wish to tune, using a dictionary if needed.
# HINT: parameters = {'parameter_1': [value1, value2], 'parameter_2': [value1, value2]}
parameters = {'n_estimators': [50, 100, 200], 'learning_rate': [0.5, 1.5, 2.5]}

# TODO: Make an fbeta_score scoring object using make_scorer()
scorer = make_scorer(fbeta_score, beta=0.5)

# TODO: Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

# TODO: Fit the grid search object to the training data and find the optimal parameters using fit()
grid_fit = grid_obj.fit(X_train, y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_
Exemplo n.º 16
0
# NOTE(review): X_embedded and y0 are used here before the assignments
# further down; presumably they were defined by an earlier cell.
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y0[:, 0])
plt.show()

# Standardise the features and run PCA(10) before the t-SNE embedding.
pca_pipe = make_pipeline(StandardScaler(), PCA(10))
X_pca = pca_pipe.fit_transform(X)

# Labels as a plain array, used to colour the scatter plot.
y0 = y.reset_index().drop('index', 1).values
X_embedded = TSNE().fit_transform(X_pca)
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y0[:, 0])
plt.show()

#%%
xtr, xte, ytr, yte = train_test_split(X, y, train_size=0.75)

gbc = GradientBoostingClassifier()
gbc.get_params()

# NOTE(review): BayesSearchCV is constructed without an estimator or a
# search space; presumably this call cannot succeed as written - confirm
# against the library's constructor signature.
bayes = BayesSearchCV()

#%%
# A new split is drawn here (no random_state), so it differs from the one
# made in the previous cell.
xtr, xte, ytr, yte = train_test_split(X, y, train_size=0.75)

# Baseline: Gaussian naive Bayes, scored on the held-out part.
model_gnb = GaussianNB()
model_gnb.fit(xtr, ytr)
bayes_score = model_gnb.score(xte, yte)

# Baseline: logistic regression with default settings.
model_lr = LogisticRegression()
model_lr.fit(xtr, ytr)
logreg_score = model_lr.score(xte, yte)

# Ridge classifier; it is only instantiated within the lines shown here.
model_rc = RidgeClassifier()
from sklearn.ensemble import GradientBoostingClassifier

# Display name for the estimator configured below.
gbc_name = 'GradientBoostingClassifier'

# Parameter grid for the classifier.  The comment under each entry lists
# further candidate values that are currently commented out.
# NOTE(review): 'deviance' and 'auto' are legacy option names; confirm they
# are still accepted by the installed scikit-learn version.
gbc_params_grid = {
    'learning_rate': [0.01],
    # disabled candidates: 0.1, 0.05, 0.5
    'loss': ['deviance'],
    # disabled candidate: 'exponential'
    'max_depth': [3, 5],
    # disabled candidate: 10
    'max_features': [None, 'auto'],
    # disabled candidates: 'sqrt', 'log2'
    'n_estimators': [200]
    # disabled candidates: 10, 50, 100
}

# Unfitted estimator with a fixed seed.
gbc = GradientBoostingClassifier(random_state=42)

if __name__ == "__main__":
    # When run as a script, show the estimator's full parameter set.
    print(gbc.get_params())
Exemplo n.º 18
0
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search below.
param_dist = {"max_depth":[10,9,8,7,6,5,4,3,2,1,None],
              "learning_rate":[0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09]}

gbrt = GradientBoostingClassifier()

# 5-fold randomized search over param_dist.
gbrt_cv = RandomizedSearchCV(gbrt,param_dist,cv=5)

gbrt_cv.fit(X_train,y_train)

# NOTE(review): the message says "Decision Tree" although the tuned
# estimator is a GradientBoostingClassifier.
print("Tuned Decision Tree Parameters:{}".format(gbrt_cv.best_params_))
print("Best score is {}".format(gbrt_cv.best_score_))
#%%
# Names of all parameters the estimator accepts.
gbrt.get_params().keys()

#%%Hyperparameter tuning --GridsearchCV
# NOTE(review): GridSearchCV, Imputer, SVC and StandardScaler are imported
# but not used in the lines shown; Imputer presumably no longer exists in
# recent scikit-learn releases - confirm before running.
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

# Single-step pipeline wrapping a default gradient boosting classifier.
steps = [('GBRT', GradientBoostingClassifier())]

pipeline = Pipeline(steps)

pipeline.fit(X_train,y_train)
print("Test score:{:.2f}".format(pipeline.score(X_test,y_test)))
Exemplo n.º 19
0
    # Print the feature ranking
print("Feature ranking:")    
for f in range(x2.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))  
    # Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(x2.shape[1]), importances[indices],
    color="r", align="center")
plt.xticks(range(x2.shape[1]), indices)
plt.xlim([-1, x2.shape[1]])
plt.show()  

feature_imp(X)
clf.get_params()

# Grid of candidate hyperparameters for the exhaustive search below.
param_grid = [
    {
        'learning_rate': [0.05, 0.1, 0.2, 0.25],
        'max_depth': [3, 4, 5, 6],
        'min_samples_leaf': [1, 2],
        'n_estimators': [100, 200, 300],
    },
]

svr = GradientBoostingClassifier()
from sklearn import grid_search
clf = grid_search.GridSearchCV(svr, param_grid)
clf.fit(x2, training_target)

# These prints were indented without an enclosing block, which is an
# IndentationError; they belong at module level.
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
# Log loss of the GBDT on the validation set, using the second column of
# predict_proba.
y_pred_gbdt = gbdt_model.predict_proba(X_valid)[:, 1]
log_loss_gbdt = log_loss(y_valid, y_pred_gbdt)
print('log loss of GBDT on valid set: %.5f' % log_loss_gbdt)

## store the pre-trained gbdt_model
# Context managers close the file handle even if pickling fails.
with open(fp_gbdt_model, 'wb') as model_file:
    pickle.dump(gbdt_model, model_file)

del X_train_gbdt
del y_train_gbdt
gc.collect()

# The file read here is the one written just above; pickle.load must never
# be pointed at a file from an untrusted source.
with open(fp_gbdt_model, 'rb') as model_file:
    gbdt_model = pickle.load(model_file)
#----- data for LR (one-hot encoding with GDBT output) -----#
# One column name per boosting stage: tree1 ... tree<n_estimators>.
id_cols = ['tree' + str(i)
           for i in range(1, gbdt_model.get_params()['n_estimators'] + 1)]
oh_enc = OneHotEncoder(id_cols)

def chunker(seq, size):
    """Return a generator over consecutive slices of ``seq``.

    Each slice holds up to ``size`` items; the last one may be shorter.
    ``seq`` must support ``len()`` and slicing.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

## oh_enc fit the train_set
# Leaf index of every training sample in every tree (third axis fixed at 0),
# one column per entry of id_cols.
# NOTE(review): dtype=np.int8 presumably assumes leaf indices stay below
# 128 - confirm for the tree depth in use.
df_train_id = pd.DataFrame(gbdt_model.apply(X_train_org)[:, :, 0], columns=id_cols, dtype=np.int8)

# Feed the encoder 50000 rows at a time.
# NOTE(review): this only works if oh_enc.fit accumulates categories
# across calls instead of starting over - verify against the encoder.
for chunk in chunker(df_train_id, 50000):
    oh_enc.fit(chunk)
    
del df_train_id

del X_train_org
# A parameter grid for XGBoost
params = {
    'max_depth': [3],
    'subsample': [0.8],
    'n_estimators': [150],
    'max_leaf_nodes': [80],
}

# In[47]:

roc_auc_scorer = make_scorer(roc_auc_score)

# In[48]:

XG_clf.get_params().keys()

# In[49]:

grid_search = GridSearchCV(XG_clf,
                           params,
                           scoring=roc_auc_scorer,
                           cv=5,
                           return_train_score=True)

grid_search.fit(X_train, y_train)

# In[50]:

grid_search_1 = GridSearchCV(XG_clf,
                             params,
Exemplo n.º 22
0
as the winner of f1 score is the GradientBoostingClassifier, 
and f1 score takes both recall and precision into account,
and it also wins in the accuracy test.
As I think here false negatives and false negatives are of crucial role,
remember high quality wines otnumber low quality by a big margin, 
there are circa 6 times more low quality wines than high

So the double Winning GradientBoostingClassifier is a go to there/
"""

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

X, y = attributes, features

model = GradientBoostingClassifier()
default_params = model.get_params()
print(
    default_params
)  #aby wiedziec co mozna zmieniac, pozniej i tak musialem czytac dokumentacje, aby wiedziec co jest czym..
#%%
#LICZY OKOLO GODZINE, poniezej moje wyniki,
#nie skorzystalem z grid search, ponieważ za duza 'mapa' parametrow
#i szukanie trwaloby wiele lat
parameters = {
    "loss": ["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth": [3, 5, 8],
    "max_features": ["log2", "sqrt"],
    "criterion": ["friedman_mse", "mae"],
Exemplo n.º 23
0
print(f'The f1 score for our train dataset is {f1_score(y_train, y_pred_gbc_train)}')
print(confusion_matrix(y_train, y_pred_gbc_train))

# using only 10 features yield a more accurate model than using all features



"""### Hyperparameter Tuning """

# let us see what the optimal parameters for our gradient boosting model are

# Look at the parameters used by our current gradient boosting model
gradient = GradientBoostingClassifier(random_state = 42)
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(gradient.get_params())

#We will try adjusting the following set of hyperparameters:
#n_estimators = number of trees (boosting stages) in the ensemble
#max_depth = max number of levels in each decision tree
#min_samples_split = min number of data points placed in a node before the node is split
#learning rate: shrinks the contribution of each tree

from sklearn.model_selection import RandomizedSearchCV
# Number of trees (boosting stages) in the ensemble
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)]
# Maximum number of levels in tree
max_depth = [5,10,15,20]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# learning rate
Exemplo n.º 24
0
Arquivo: gb.py Projeto: jfraj/khor
class gbClf(BaseModel):

    """Model using a gradient boosting classifier."""

    def __init__(self, train_data_fname=None, nrows=None, **kwargs):
        """Initialize the data frame via the base-class constructor."""
        super(gbClf, self).__init__(train_data_fname, nrows, **kwargs)

    def set_model(self, **kwargs):
        """Set the classifier.

        Recognised keyword arguments (defaults in parentheses): ``verbose``
        (0), ``n_estimators`` (3000), ``max_depth`` (3),
        ``min_samples_leaf`` (1), ``min_samples_split`` (2),
        ``max_features`` (None), ``learning_rate`` (0.1), ``subsample``
        (1.0) and ``random_state`` (24).  The estimator is stored in
        ``self.learner`` and its full parameter set is printed.
        """
        verbose = kwargs.get('verbose', 0)
        n_estimators = kwargs.get('n_estimators', 3000)
        max_depth = kwargs.get('max_depth', 3)
        min_samples_leaf = kwargs.get('min_samples_leaf', 1)
        min_samples_split = kwargs.get('min_samples_split', 2)
        max_features = kwargs.get('max_features', None)
        learning_rate = kwargs.get('learning_rate', 0.1)
        subsample = kwargs.get('subsample', 1.0)
        random_state = kwargs.get('random_state', 24)

        # subsample used to be read from kwargs but never forwarded, so a
        # caller-supplied value was silently ignored.
        self.learner = GradientBoostingClassifier(n_estimators=n_estimators,
                                                  max_depth=max_depth,
                                                  learning_rate=learning_rate,
                                                  min_samples_leaf=min_samples_leaf,
                                                  min_samples_split=min_samples_split,
                                                  max_features=max_features,
                                                  subsample=subsample,
                                                  verbose=verbose,
                                                  random_state=random_state)
        print('\n\nGradient Boosting set with parameters:')
        par_dict = self.learner.get_params()
        for ipar in par_dict.keys():
            print('{}: {}'.format(ipar, par_dict[ipar]))
        print('\n\n')

    def fitNscore(self, **kwargs):
        """Fit classifier and produce score and related plots.

        Keyword arguments read here: ``features`` (columns used for the
        fit) and ``bids_path`` (default ``'data/bids.csv'``, passed to
        ``prepare_data`` when the data is not cleaned yet).  All keyword
        arguments are also forwarded to ``prepare_data`` and ``fitModel``.

        One stratified 80/20 split of ``self.df_train`` is drawn, the model
        is fitted on the larger part and evaluated on the rest: feature
        ranking, accuracy, a feature-importance bar chart, a normalized
        confusion matrix and a ROC curve.  The method then waits for the
        user to press enter.
        """
        col2fit = kwargs.get('features')
        # cleaning
        bids_path = kwargs.get('bids_path', 'data/bids.csv')
        if not self.iscleaned:
            # Single-argument print(...) gives the same output on Python 2
            # and 3 and matches every other print call in this class.
            print('Preparing the data...')
            self.prepare_data(bids_path, **kwargs)
        print('columns for fit=\n{}'.format(self.df_train.columns))

        test_size = 0.2  # fraction kept for testing
        rnd_seed = 24  # for reproducibility

        sss = StratifiedShuffleSplit(self.df_train['outcome'].values,
                                     n_iter=1,
                                     test_size=test_size,
                                     random_state=rnd_seed)
        # n_iter=1, so this loop body runs for a single split.
        for train_index, test_index in sss:
            features_train = self.df_train[col2fit].values[train_index]
            features_test = self.df_train[col2fit].values[test_index]
            target_train = self.df_train['outcome'].values[train_index]
            target_test = self.df_train['outcome'].values[test_index]

        # Fit Classifier
        self.fitModel(features_train, target_train, **kwargs)

        # Predict on the rest of the sample
        print('\nPredicting...')
        predictions = self.learner.predict(features_test)
        probas = self.learner.predict_proba(features_test)

        # Feature index ordered by importance
        ord_idx = np.argsort(self.learner.feature_importances_)
        print("Feature ranking:")
        for ifeaturindex in ord_idx[::-1]:
            print('{0} \t: {1}'.format(col2fit[ifeaturindex],
                                       round(self.learner.feature_importances_[ifeaturindex], 2)))

        # Score
        print('(Self) Score={}'.format(self.learner.score(features_test, target_test)))

        # Plots

        # Feature importances
        maxfeat2show = 30 # number of features to show in plots
        importances = self.learner.feature_importances_
        indices = np.argsort(importances)[::-1]
        indices = indices[:min(maxfeat2show, len(indices))]  # truncate if > maxfeat2show
        ordered_names = [col2fit[i] for i in indices]

        fig_import = plt.figure(figsize=(10, 10))
        plt.title("Feature importances, GB")
        plt.barh(range(len(indices)), importances[indices],
                 color="b", align="center")
        plt.yticks(range(len(indices)), ordered_names)
        plt.ylim([-1, len(indices)])
        # Flip the axis so the most important feature is drawn at the top.
        plt.ylim(plt.ylim()[::-1])
        plt.subplots_adjust(left=0.22)
        fig_import.show()

        # confusion matrix
        cm = confusion_matrix(target_test.astype(int), predictions.astype(int))
        # Row-normalize so each row sums to one.
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        # Values above 0.5 are clipped before plotting.
        cm_normalized = np.clip(cm_normalized, 0.0, 0.5)

        fig_cm = plt.figure()
        ax_cm = fig_cm.add_subplot(1,1,1)
        im_cm = ax_cm.imshow(cm_normalized, interpolation='nearest')
        plt.title('Normalized confusion mtx, GB')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        fig_cm.colorbar(im_cm)
        fig_cm.show()

        # ROC curve
        # Built from the probabilities in the second predict_proba column
        # rather than from hard predictions; the original author notes this
        # seems to reflect the LB score better.
        false_pos, true_pos, thr = roc_curve(target_test, probas[:, 1])
        roc_auc = auc(false_pos, true_pos)
        fig_roc = plt.figure()
        plt.plot(false_pos, true_pos,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")
        fig_roc.show()
        print('ROC_AUC = {}'.format(roc_auc))


        # Keep the figures open until the user confirms.
        raw_input('press enter when finished...')
Exemplo n.º 25
0
def main():


    #
    n_estimators = int(sys.argv[1])
    max_depth = int(sys.argv[2])
    learning_rate = float(sys.argv[3])


    # n_estimators = 100
    # max_depth = 3
    # learning_rate = 0.3

    cls = GradientBoostingClassifier(n_estimators= n_estimators, max_depth= max_depth, loss='deviance',  learning_rate= learning_rate ,
                                     min_samples_split = 10,
                                     min_samples_leaf= 10,
                                     subsample=0.8,verbose=2)
    cls.fit(X_train,y_train, monitor=Monitor(X_vldt,y_vldt))
    # cls.fit(X_train,y_train)



    print 'vldt auc = ' , roc_auc_score(y_vldt,  cls.predict_proba(X_vldt)[:,1] ),  ', vars = ' , cls.get_params()

    with open('out.csv','w') as f:
        f.write('instance_id,prob\n')

        tests = get_files(test_file)
        for t in tests:
            print t
            X_test , y_test = load_svmlight_file(t)
            proba = cls.predict_proba(X_test)[:,1]

            for i ,v in enumerate(y_test):
                f.write('{0},{1}\n'.format(int(v), proba[i]))
    subprocess.check_call('cat out.csv | sort -t, -k1 -n >out.sorted.csv',shell=True)

    subprocess.check_call('cp out.sorted.csv out.sorted.{0}-{1}-{2}.csv'.format(n_estimators, max_depth, learning_rate),shell=True)
Exemplo n.º 26
0
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'criterion': criterion,
    'learning_rate': learning_rate,
    'loss': loss,
    'max_leaf_nodes': max_leaf_nodes,
}

# Random forest with a fixed seed; print its full parameter set.
RTClassifier = RandomForestClassifier(random_state=42)
pprint(RTClassifier.get_params())

# Despite the "xgbooster" name this is scikit-learn's
# GradientBoostingClassifier.
xgbooster = GradientBoostingClassifier(random_state=42)
pprint(xgbooster.get_params())

# Hyperparameter search for the gradient boosting model over xgboost_grid.
# NOTE(review): the meaning of iterations/fold depends on hyperparameters(),
# which is defined elsewhere.
hyperparameters(classifier=xgbooster,
                X=X,
                y=y,
                grid=xgboost_grid,
                iterations=10000,
                fold=4)

# Same search for the random forest, over random_grid.
hyperparameters(classifier=RTClassifier,
                X=X,
                y=y,
                grid=random_grid,
                iterations=10000,
                fold=4)
Exemplo n.º 27
0
classifier = GradientBoostingClassifier(n_estimators=500, learning_rate=0.075)
#classifier.fit(X_train, y_train)
#y_pred = classifier.predict(X_val)
#
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import accuracy_score
##y_pred = classifier.predict(X_val)
#cm = confusion_matrix(y_val, y_pred)
#accuracy = accuracy_score(y_val, y_pred)

#Grid Search
#from sklearn.model_selection import GridSearchCV
#parameters = {
#        'n_estimators' : [200, 400]
#}
#
#grid_search = GridSearchCV(estimator = classifier, param_grid = parameters, scoring = 'accuracy', cv = 10)
#grid_search = grid_search.fit(X_train, y_train)
#best_score = grid_search.best_score_
#best_parameters = grid_search.best_params_

model_performance = pd.DataFrame()

from sklearn.model_selection import cross_val_score
# 10-fold cross-validation scores of the classifier on the training data.
kcv = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
# DataFrame.append returned a new frame that was thrown away (and the
# method no longer exists in pandas 2), so the scores were never recorded.
# pd.concat stores them as one row with one column per fold.
model_performance = pd.concat(
    [model_performance, pd.DataFrame([kcv])], ignore_index=True)
mean_kcv = kcv.mean()
std_kcv = kcv.std()

classifier.get_params()
Exemplo n.º 28
0
                                                    y,
                                                    test_size=0.33,
                                                    random_state=20)

# Fit a 30-stage, depth-6 gradient boosting classifier and time the training.
boostc = GradientBoostingClassifier(n_estimators=30, max_depth=6)
s1 = time.time()
boostc.fit(X_train, y_train)
e1 = time.time()
# s2=time.time()
# y_prd=boostc.predict(X_test)
# e2=time.time()
t1 = e1 - s1  # wall-clock training time in seconds
#t2=e2-s2
y_test = np.array(y_test)
#y_prd = np.array(y_prd)
print(boostc.get_params())
#print('Test Accuracy: %.8f' % accuracy_score(y_test,y_prd))
print("Training time: ", t1)
#print("Testing time: ",t2)
# `plot_learning_curve` is a project helper defined elsewhere; the tuple is
# presumably the y-axis limits — TODO confirm against the helper's signature.
plot_learning_curve(boostc,
                    'Learning Curve for boosting',
                    X_train,
                    y_train, (0, 1.01),
                    cv=5)

# Fresh default-parameter model for the validation curves. The meaning of the
# 'k1' / 'k2' tags is defined by `plot_validation_curve` (not visible here).
boostc1 = GradientBoostingClassifier()
plot_validation_curve(X_train, y_train, boostc1, 'k1')
plot_validation_curve(X_train, y_train, boostc1, 'k2')
#plot_validation_curve(X_train,y_train,boostc1,'k3')
# plot_validation_curve(X_train,y_train,clf1,'dtree3')
def train_test_model(df_model,
                     predictor='max_dt_3days',
                     model_type='quantile_discharge',
                     loss='far'):
    """Fit one flood model per district and collect its performance scores.

    Districts whose ``flood`` column holds fewer than two distinct values are
    skipped.

    Parameters
    ----------
    df_model : pandas.DataFrame
        Must provide the columns ``district``, ``station``, ``time``,
        ``flood`` and the column named by ``predictor``.
    predictor : str
        Name of the column used as model input.
    model_type : str
        ``'quantile_discharge'``: for every station, threshold the predictor
        at the 50th..99th percentile of its 6-month maxima and keep, per
        district, the (station, quantile) pair that minimises ``loss``.
        ``'bdt_discharge'``: fit one GradientBoostingClassifier per district
        on the per-station predictor values and score it on the same data it
        was trained on. Any other value yields an empty result.
    loss : str
        Score column minimised in quantile mode.

    Returns
    -------
    pandas.DataFrame
        One row of scores per processed district, including the
        ``parameters`` and ``district`` columns.

    Notes
    -----
    Relies on ``DataFrame.append``, which only exists in pandas < 2.0.
    """

    performance_scores = pd.DataFrame()

    # loop over districts
    for district in df_model.district.unique():
        df_district = df_model[df_model['district'] == district]
        if df_district.flood.nunique() < 2:
            continue

        if model_type == 'quantile_discharge':
            performance_model = pd.DataFrame(
                columns=['parameters', 'pod', 'far', 'pofd', 'csi'])

            # loop over stations and test all possible quantiles
            for station in df_district['station'].unique():
                # Explicit copy: the 'predictions' column is written below and
                # must not be assigned onto a slice of df_district.
                df_station = df_district[df_district['station'] ==
                                         station].copy()
                extreme_dis = df_station.set_index('time')[predictor].groupby(
                    pd.Grouper(freq='6M')).max()

                for q in range(50, 100):
                    threshold = extreme_dis.quantile(q / 100.)
                    df_station['predictions'] = np.where(
                        (df_station[predictor] >= threshold), 1, 0)

                    perf = df_station.groupby(['district', 'station']).\
                        apply(lambda row: calc_performance_scores(row['flood'], row['predictions']))
                    perf['parameters'] = str((station, str(q)))
                    perf['district'] = district
                    performance_model = performance_model.append(
                        perf, ignore_index=True)

            # find the couple (station, quantile) that minimizes loss function.
            # idxmin must be *called*: the original passed the bound method to
            # .iloc. Labels equal positions here because every append above
            # uses ignore_index=True.
            best_performance = performance_model.iloc[
                performance_model[loss].idxmin()]
            # save performance
            performance_scores = performance_scores.append(best_performance,
                                                           ignore_index=True)

        elif model_type == 'bdt_discharge':
            # prepare training data: one sample per time step, one feature
            # per station
            X, y = [], []
            df_ordered = df_district.groupby(['station',
                                              'time'])[predictor].max()
            stations = df_district.station.unique()
            # `timestamp` rather than `time`, so the loop variable does not
            # shadow the `time` module name.
            for timestamp in df_district.time.unique():
                X.append([
                    df_ordered.loc[(station, timestamp)]
                    for station in stations
                ])
                y.append(df_district[df_district['time'] == timestamp]
                         ['flood'].values[0])
            # train and predict
            # NOTE(review): max_features='auto' is rejected by recent
            # scikit-learn releases — confirm the pinned version.
            model = GradientBoostingClassifier(max_features='auto',
                                               loss='exponential')
            # Inverse class-frequency weights; count each label once instead
            # of rescanning y for every sample.
            class_counts = {label: y.count(label) for label in set(y)}
            sample_weight = [len(y) / class_counts[label] for label in y]
            model.fit(X, y, sample_weight)
            predictions = model.predict(X)
            # save performance
            best_performance = calc_performance_scores(pd.Series(y),
                                                       pd.Series(predictions))
            best_performance['parameters'] = str(model.get_params())
            best_performance['district'] = district
            performance_scores = performance_scores.append(best_performance,
                                                           ignore_index=True)

    return performance_scores
Exemplo n.º 30
0
# Score the trained GBDT on the validation split using the probability of
# class index 1.
y_pred_gbdt = gbdt_model.predict_proba(X_valid)[:, 1]
log_loss_gbdt = log_loss(y_valid, y_pred_gbdt)
print('log loss of GBDT on valid set: %.5f' % log_loss_gbdt)

## store the pre-trained gbdt_model
# Context manager so the file handle is flushed and closed deterministically.
with open(fp_gbdt_model, 'wb') as model_file:
    pickle.dump(gbdt_model, model_file)

# Drop the GBDT training arrays before building the LR features.
del X_train_gbdt
del y_train_gbdt
gc.collect()

# Reload the model that was just written. pickle must only ever be used on
# files this pipeline produced itself.
with open(fp_gbdt_model, 'rb') as model_file:
    gbdt_model = pickle.load(model_file)
#----- data for LR (one-hot encoding with GDBT output) -----#
# One id column per boosting stage: 'tree1' .. 'tree<n_estimators>'.
id_cols = []
for i in range(1, gbdt_model.get_params()['n_estimators'] + 1):
    id_cols.append('tree' + str(i))
# `OneHotEncoder` is called with the column list positionally, so it is
# presumably a project-specific encoder rather than scikit-learn's — TODO confirm.
oh_enc = OneHotEncoder(id_cols)


def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    The final slice is shorter when ``len(seq)`` is not a multiple of
    ``size``. ``seq`` must support ``len()`` and slicing.
    """
    # The start offsets are computed eagerly, so an invalid ``size`` fails
    # at call time rather than on first iteration.
    starts = range(0, len(seq), size)
    return (seq[begin:begin + size] for begin in starts)


## oh_enc fit the train_set
# One column per boosting stage (names from id_cols); the trailing index 0
# keeps the first slice of the last axis returned by gbdt_model.apply.
# NOTE(review): values are cast to int8 — if they are leaf indices above 127
# (deep trees) this cast would not hold them; confirm the tree depth used.
df_train_id = pd.DataFrame(gbdt_model.apply(X_train_org)[:, :, 0],
                           columns=id_cols,
                           dtype=np.int8)

# Feed the frame to the encoder in slices of 50000 rows.
# NOTE(review): this assumes oh_enc.fit accumulates state across calls; if it
# refits from scratch, only the last chunk is retained — verify the encoder.
for chunk in chunker(df_train_id, 50000):
    oh_enc.fit(chunk)
class RuleFitCustom(BaseEstimator, TransformerMixin):
    """Rulefit class


    Parameters
    ----------
        tree_size:      Number of terminal nodes in generated trees. If exp_rand_tree_size=True, 
                        this will be the mean number of terminal nodes.
        sample_fract:   fraction of randomly chosen training observations used to produce each tree. 
                        FP 2004 (Sec. 2). Stored only: ``fit`` always derives its own
                        fraction (``sample_fract_``) from the number of samples.
        max_rules:      approximate total number of rules generated for fitting. Note that actual
                        number of rules will usually be lower than this due to duplicates.
        memory_par:     scale multiplier (shrinkage factor) applied to each new tree when 
                        sequentially induced. FP 2004 (Sec. 2)
        rfmode:         'regress' for regression or 'classify' for binary classification.
        lin_standardise: If True, the linear terms will be standardised as per Friedman Sec 3.2
                        by multiplying the winsorised variable by 0.4/stdev.
        lin_trim_quantile: If lin_standardise is True, this quantile will be used to trim linear 
                        terms before standardisation.
        exp_rand_tree_size: If True, each boosted tree will have a different maximum number of 
                        terminal nodes based on an exponential distribution about tree_size. 
                        (Friedman Sec 3.3)
        model_type:     'r': rules only; 'l': linear terms only; 'rl': both rules and linear terms
        Cs:             Regression: None uses 100 automatically chosen alphas, a sequence is
                        converted to ``alphas = 1 / Cs``, any other value is used as the
                        number of alphas. Classification: passed to LogisticRegressionCV
                        (10 when None).
        cv:             Cross-validation setting passed to LassoCV / LogisticRegressionCV.
        random_state:   Integer to initialise random objects and provide repeatability.
        tree_generator: Optional: this object will be used as provided to generate the rules. 
                        This will override almost all the other properties above. 
                        Must be GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)
        simple_rules:   Forwarded as ``weigh_rules`` to ``RuleEnsemble.transform``.

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """
    def __init__(self,
                 tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='regress',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 model_type='rl',
                 Cs=None,
                 cv=3,
                 random_state=None,
                 simple_rules=False):
        """Store the hyper-parameters and build the linear-term scaler."""
        self.tree_generator = tree_generator
        self.rfmode = rfmode
        self.lin_trim_quantile = lin_trim_quantile
        self.lin_standardise = lin_standardise
        self.friedscale = FriedScale(trim_quantile=lin_trim_quantile)
        self.exp_rand_tree_size = exp_rand_tree_size
        self.max_rules = max_rules
        self.sample_fract = sample_fract
        self.memory_par = memory_par
        self.tree_size = tree_size
        self.random_state = random_state
        self.model_type = model_type
        self.cv = cv
        self.Cs = Cs
        # TODO: make this a global parameter; we want to be able to apply it
        # in fit so the same nodes are kept, which makes comparison easier.
        self.simple_rules = simple_rules

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        Steps: (1) if rules are requested, fit the tree generator and extract
        a RuleEnsemble from its trees; (2) if linear terms are requested,
        optionally scale X; (3) concatenate linear terms and rule features and
        fit LassoCV ('regress') or an L1 LogisticRegressionCV (otherwise).

        Parameters
        ----------
        X : numpy array, shape=(n_samples, n_features)
        y : target values passed to the tree generator and the linear model
        feature_names : optional list of column names; defaults to
            'feature_0', 'feature_1', ...

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``tree_generator`` is not one of the supported sklearn
            ensemble types for the chosen ``rfmode``.
        """
        ## Enumerate features if feature names not provided
        N = X.shape[0]
        if feature_names is None:
            self.feature_names = [
                'feature_' + str(x) for x in range(0, X.shape[1])
            ]
        else:
            self.feature_names = feature_names
        if 'r' in self.model_type:
            ## initialise tree generator
            if self.tree_generator is None:
                n_estimators_default = int(
                    np.ceil(self.max_rules / self.tree_size))
                self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(N)) / N)
                if self.rfmode == 'regress':
                    self.tree_generator = GradientBoostingRegressor(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)
                else:
                    self.tree_generator = GradientBoostingClassifier(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)

            # Exact type match on purpose: subclasses are rejected, exactly
            # as before.
            if self.rfmode == 'regress':
                if type(self.tree_generator) not in [
                        GradientBoostingRegressor, RandomForestRegressor
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingRegressor"
                    )
            else:
                if type(self.tree_generator) not in [
                        GradientBoostingClassifier, RandomForestClassifier
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingClassifier"
                    )

            ## fit tree generator
            if not self.exp_rand_tree_size:  # simply fit with constant tree size
                self.tree_generator.fit(X, y)
            else:  # randomise tree size as per Friedman 2005 Sec 3.3
                np.random.seed(self.random_state)
                tree_sizes = np.random.exponential(
                    scale=self.tree_size - 2,
                    size=int(np.ceil(self.max_rules * 2 / self.tree_size)))
                # every tree gets at least 2 leaves
                tree_sizes = (2 + np.floor(tree_sizes)).astype(int)
                # Use as many trees as needed for their leaf counts to reach
                # max_rules. The bound on i stops the loop when even all the
                # sampled sizes together stay below max_rules (previously an
                # endless loop).
                i = int(len(tree_sizes) / 4)
                while (i < len(tree_sizes)
                       and np.sum(tree_sizes[0:i]) < self.max_rules):
                    i = i + 1
                tree_sizes = tree_sizes[0:i]
                self.tree_generator.set_params(warm_start=True)
                random_state_add = self.random_state if self.random_state else 0
                for i_size, size in enumerate(tree_sizes):
                    # Grow one more estimator per pass with its own leaf
                    # budget. warm_start=True seems to reset random_state,
                    # such that the trees are highly correlated, unless we
                    # manually change the random_state here.
                    self.tree_generator.set_params(
                        n_estimators=i_size + 1,
                        max_leaf_nodes=size,
                        random_state=i_size + random_state_add)
                    self.tree_generator.fit(np.copy(X, order='C'),
                                            np.copy(y, order='C'))
                self.tree_generator.set_params(warm_start=False)
            tree_list = self.tree_generator.estimators_
            # Random forests expose a flat list of trees; wrap each one so
            # the layout matches the nested one of gradient boosting.
            if isinstance(self.tree_generator,
                          RandomForestRegressor) or isinstance(
                              self.tree_generator, RandomForestClassifier):
                tree_list = [[x] for x in self.tree_generator.estimators_]

            ## extract rules
            self.rule_ensemble = RuleEnsemble(tree_list=tree_list,
                                              feature_names=self.feature_names)

            ## concatenate original features and rules
            X_rules = self.rule_ensemble.transform(
                X, weigh_rules=self.simple_rules)
            self.X_rules = X_rules

        ## standardise linear variables if requested (for regression model only)
        if 'l' in self.model_type:
            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()

        ## Compile Training data
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            X_concat = np.concatenate((X_concat, X_regn), axis=1)
        if 'r' in self.model_type:
            if X_rules.shape[0] > 0:
                X_concat = np.concatenate((X_concat, X_rules), axis=1)
        self.X_concat = X_concat

        ## fit Lasso
        if self.rfmode == 'regress':
            if self.Cs is None:  # use defaults
                n_alphas = 100
                alphas = None
            elif hasattr(self.Cs, "__len__"):
                n_alphas = None
                # asarray so plain lists/tuples work as well as numpy arrays
                alphas = 1. / np.asarray(self.Cs, dtype=float)
            else:
                n_alphas = self.Cs
                alphas = None
            self.lscv = LassoCV(n_alphas=n_alphas,
                                alphas=alphas,
                                cv=self.cv,
                                random_state=self.random_state)
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_
            self.intercept_ = self.lscv.intercept_
        else:
            Cs = 10 if self.Cs is None else self.Cs
            self.lscv = LogisticRegressionCV(Cs=Cs,
                                             cv=self.cv,
                                             penalty='l1',
                                             random_state=self.random_state,
                                             solver='liblinear')
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_[0]
            self.intercept_ = self.lscv.intercept_[0]

        return self

    def predict(self, X):
        """Predict outcome for X

        Rebuilds the feature matrix used in ``fit`` (linear terms followed by
        rule features) and delegates to the fitted linear model.
        """
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_concat = np.concatenate((X_concat, self.friedscale.scale(X)),
                                          axis=1)
            else:
                X_concat = np.concatenate((X_concat, X), axis=1)
        if 'r' in self.model_type:
            # The rule coefficients are the trailing entries of coef_
            # (https://github.com/christophM/rulefit/issues/23). With zero
            # rules, `coef_[-0:]` would select *every* coefficient, so that
            # case is handled explicitly.
            n_rules = len(self.rule_ensemble.rules)
            rule_coefs = self.coef_[-n_rules:] if n_rules else self.coef_[:0]
            if len(rule_coefs) > 0:
                X_rules = self.rule_ensemble.transform(
                    X, coefs=rule_coefs, weigh_rules=self.simple_rules)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return self.lscv.predict(X_concat)

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.
        y : ignored

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self, exclude_zero_coef=False):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True, returns only the rules with an estimated
                           coefficient not equal to zero (default: False).

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the rule, 'coef' holds
               the coefficients and 'support' the support of the rule in the training
               data set (X). 'type' is 'linear' or 'rule'; 'rule_number' is the
               rule's position in the ensemble (0 for linear terms).
        """

        n_features = len(self.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []
        ## Add coefficients for linear effects
        for i in range(0, n_features):
            if self.lin_standardise:
                coef = self.coef_[i] * self.friedscale.scale_multipliers[i]
            else:
                coef = self.coef_[i]
            # TODO remove the trailing rule_number field (debugging aid)
            output_rules += [(self.feature_names[i], 'linear', coef, 1, 0)]
        ## Add rules
        for i, rule in enumerate(rule_ensemble):
            coef = self.coef_[i + n_features]
            # TODO remove the trailing rule_number field (debugging aid)
            output_rules += [(rule.__str__(), 'rule', coef, rule.support, i)]
        rules = pd.DataFrame(
            output_rules,
            columns=["rule", "type", "coef", "support",
                     "rule_number"])  # TODO remove rule_number (debugging aid)
        if exclude_zero_coef:
            # `.ix` was removed from pandas; a boolean mask with `.loc`
            # selects the same rows.
            rules = rules.loc[rules['coef'] != 0]
        return rules

    def rules_complexity(self):
        """Return the total number of conditions over all rules whose fitted
        coefficient is non-zero."""
        n_features = len(self.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)

        res = 0
        for i, rule in enumerate(rule_ensemble):
            # rule coefficients follow the n_features linear coefficients
            if self.coef_[i + n_features] != 0:
                res += len(rule.conditions)

        return res
Exemplo n.º 32
0
        plt.show()

    # Gradient Boost Classifier
    print("\n")
    print("Gradient Boost")
    gb = GradientBoostingClassifier(loss='exponential',
                                    max_depth=3,
                                    learning_rate=0.01,
                                    n_estimators=100,
                                    subsample=1.0,
                                    criterion='mae',
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    max_features='auto',
                                    random_state=5)
    print(gb.get_params())
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    print_cm_stats(y_pred, y_test)

    # ROC curve
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    plot_roc_curve(X_test, y_test, gb, ax)
    ax.set_title("Gradient Boost")
    plt.show()

    # Feature importances
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    plot_feature_importances(df.columns, gb, ax)
    ax.set_title("Gradient Boost")
    plt.show()
# Log the classifier summary for `gbc` (helper and model come from outside
# this snippet), then verify what ended up in the Neptune experiment.
log_classifier_summary(gbc, X_train, X_test, y_train, y_test)

# tests
exp = neptune.get_experiment()

# check logs
# Expected log names: 'charts_sklearn' plus one entry per metric and per class
# index 0..9, i.e. 1 + 4 * 10 names in total.
correct_logs_set = {'charts_sklearn'}
for name in ['precision', 'recall', 'fbeta_score', 'support']:
    for i in range(10):
        correct_logs_set.add('{}_class_{}_test_sklearn'.format(name, i))
from_exp_logs = set(exp.get_logs().keys())
# The two name sets must match exactly (no missing and no extra logs).
assert correct_logs_set == from_exp_logs, '{} - incorrect logs'.format(exp)

# check sklearn parameters
# Experiment property names must equal the estimator's parameter names.
assert set(exp.get_properties().keys()) == set(
    gbc.get_params().keys()), '{} parameters do not match'.format(exp)

# check neptune parameters
# `parameters` is defined earlier in the script (not visible here).
assert set(exp.get_parameters().keys()) == set(
    parameters.keys()), '{} parameters do not match'.format(exp)

## Step 5: Stop Neptune experiment after logging summary

neptune.stop()

## Explore Results

# Scikit-learn KMeans clustering

## Step 1: Create KMeans object and example data
Exemplo n.º 34
0
class RuleFit(BaseEstimator, TransformerMixin):
    """Rulefit class


    Parameters
    ----------
        tree_size:      Number of terminal nodes in generated trees. If exp_rand_tree_size=True,
                        this will be the mean number of terminal nodes.
        sample_fract:   fraction of randomly chosen training observations used to produce each tree.
                        FP 2004 (Sec. 2)
        max_rules:      approximate total number of rules generated for fitting. Note that actual
                        number of rules will usually be lower than this due to duplicates.
        memory_par:     scale multiplier (shrinkage factor) applied to each new tree when
                        sequentially induced. FP 2004 (Sec. 2)
        rfmode:         'regress' for regression or 'classify' for binary classification.
        lin_standardise: If True, the linear terms will be standardised as per Friedman Sec 3.2
                        by multiplying the winsorised variable by 0.4/stdev.
        lin_trim_quantile: If lin_standardise is True, this quantile will be used to trim linear
                        terms before standardisation.
        exp_rand_tree_size: If True, each boosted tree will have a different maximum number of
                        terminal nodes based on an exponential distribution about tree_size.
                        (Friedman Sec 3.3)
        model_type:     'r': rules only; 'l': linear terms only; 'rl': both rules and linear terms
        random_state:   Integer to initialise random objects and provide repeatability.
        tree_generator: Optional: this object will be used as provided to generate the rules.
                        This will override almost all the other properties above.
                        Must be GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)
        tol:            The tolerance for the optimization for LassoCV or LogisticRegressionCV:
                        if the updates are smaller than `tol`, the optimization code checks the dual
                        gap for optimality and continues until it is smaller than `tol`.
        max_iter:       The maximum number of iterations for LassoCV or LogisticRegressionCV.
        n_jobs:         Number of CPUs to use during the cross validation in LassoCV or
                        LogisticRegressionCV. None means 1 unless in a joblib.parallel_backend
                        context. -1 means using all processors.

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """
    def __init__(self,
                 tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='regress',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 model_type='rl',
                 Cs=None,
                 cv=3,
                 tol=0.0001,
                 max_iter=None,
                 n_jobs=None,
                 random_state=None):
        """Store the hyper-parameters and build the winsorizer and scaler.

        ``max_iter=None`` selects the default of the linear model that
        ``rfmode`` implies; an explicit value is used as given.
        """
        self.tree_generator = tree_generator
        self.rfmode = rfmode
        self.lin_trim_quantile = lin_trim_quantile
        self.lin_standardise = lin_standardise
        self.winsorizer = Winsorizer(trim_quantile=lin_trim_quantile)
        self.friedscale = FriedScale(self.winsorizer)
        # Set by fit() when linear terms are used.
        self.stddev = None
        self.mean = None
        self.exp_rand_tree_size = exp_rand_tree_size
        self.max_rules = max_rules
        self.sample_fract = sample_fract
        self.memory_par = memory_par
        self.tree_size = tree_size
        self.random_state = random_state
        self.model_type = model_type
        self.cv = cv
        self.tol = tol
        # LassoCV default max_iter is 1000 while LogisticRegressionCV 100.
        # The original tested the literal string 'regress' (always truthy),
        # so it always stored 1000 and ignored the caller's value.
        if max_iter is None:
            max_iter = 1000 if rfmode == 'regress' else 100
        self.max_iter = max_iter
        self.n_jobs = n_jobs
        self.Cs = Cs

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        Steps: (1) if rules are requested, fit the tree generator and extract
        a RuleEnsemble from its trees; (2) if linear terms are requested,
        record mean/stddev of the winsorized features and optionally scale X;
        (3) concatenate linear terms and rule features and fit LassoCV
        ('regress') or an L1 LogisticRegressionCV (otherwise).

        Parameters
        ----------
        X : numpy array, shape=(n_samples, n_features)
        y : target values passed to the tree generator and the linear model
        feature_names : optional list of column names; defaults to
            'feature_0', 'feature_1', ...

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``tree_generator`` is not one of the supported sklearn
            ensemble types for the chosen ``rfmode``.
        """
        ## Enumerate features if feature names not provided
        N = X.shape[0]
        if feature_names is None:
            self.feature_names = [
                'feature_' + str(x) for x in range(0, X.shape[1])
            ]
        else:
            self.feature_names = feature_names
        if 'r' in self.model_type:
            ## initialise tree generator
            if self.tree_generator is None:
                n_estimators_default = int(
                    np.ceil(self.max_rules / self.tree_size))
                self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(N)) / N)
                if self.rfmode == 'regress':
                    self.tree_generator = GradientBoostingRegressor(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)
                else:
                    self.tree_generator = GradientBoostingClassifier(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)

            # Exact type match on purpose: subclasses are rejected, exactly
            # as before.
            if self.rfmode == 'regress':
                if type(self.tree_generator) not in [
                        GradientBoostingRegressor, RandomForestRegressor
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingRegressor"
                    )
            else:
                if type(self.tree_generator) not in [
                        GradientBoostingClassifier, RandomForestClassifier
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingClassifier"
                    )

            ## fit tree generator
            if not self.exp_rand_tree_size:  # simply fit with constant tree size
                self.tree_generator.fit(X, y)
            else:  # randomise tree size as per Friedman 2005 Sec 3.3
                np.random.seed(self.random_state)
                tree_sizes = np.random.exponential(
                    scale=self.tree_size - 2,
                    size=int(np.ceil(self.max_rules * 2 / self.tree_size)))
                # every tree gets at least 2 leaves
                tree_sizes = (2 + np.floor(tree_sizes)).astype(int)
                # Use as many trees as needed for their leaf counts to reach
                # max_rules. The bound on i stops the loop when even all the
                # sampled sizes together stay below max_rules (previously an
                # endless loop).
                i = int(len(tree_sizes) / 4)
                while (i < len(tree_sizes)
                       and np.sum(tree_sizes[0:i]) < self.max_rules):
                    i = i + 1
                tree_sizes = tree_sizes[0:i]
                self.tree_generator.set_params(warm_start=True)
                random_state_add = self.random_state if self.random_state else 0
                for i_size, size in enumerate(tree_sizes):
                    # Grow one more estimator per pass with its own leaf
                    # budget. warm_start=True seems to reset random_state,
                    # such that the trees are highly correlated, unless we
                    # manually change the random_state here.
                    self.tree_generator.set_params(
                        n_estimators=i_size + 1,
                        max_leaf_nodes=size,
                        random_state=i_size + random_state_add)
                    self.tree_generator.fit(np.copy(X, order='C'),
                                            np.copy(y, order='C'))
                self.tree_generator.set_params(warm_start=False)
            tree_list = self.tree_generator.estimators_
            # Random forests expose a flat list of trees; wrap each one so
            # the layout matches the nested one of gradient boosting.
            if isinstance(self.tree_generator,
                          RandomForestRegressor) or isinstance(
                              self.tree_generator, RandomForestClassifier):
                tree_list = [[x] for x in self.tree_generator.estimators_]

            ## extract rules
            self.rule_ensemble = RuleEnsemble(tree_list=tree_list,
                                              feature_names=self.feature_names)

            ## concatenate original features and rules
            X_rules = self.rule_ensemble.transform(X)

        ## standardise linear variables if requested (for regression model only)
        if 'l' in self.model_type:

            ## standard deviation and mean of winsorized features
            self.winsorizer.train(X)
            winsorized_X = self.winsorizer.trim(X)
            self.stddev = np.std(winsorized_X, axis=0)
            self.mean = np.mean(winsorized_X, axis=0)

            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()

        ## Compile Training data
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            X_concat = np.concatenate((X_concat, X_regn), axis=1)
        if 'r' in self.model_type:
            if X_rules.shape[0] > 0:
                X_concat = np.concatenate((X_concat, X_rules), axis=1)

        ## fit Lasso
        if self.rfmode == 'regress':
            if self.Cs is None:  # use defaults
                n_alphas = 100
                alphas = None
            elif hasattr(self.Cs, "__len__"):
                n_alphas = None
                # asarray so plain lists/tuples work as well as numpy arrays
                alphas = 1. / np.asarray(self.Cs, dtype=float)
            else:
                n_alphas = self.Cs
                alphas = None
            self.lscv = LassoCV(n_alphas=n_alphas,
                                alphas=alphas,
                                cv=self.cv,
                                max_iter=self.max_iter,
                                tol=self.tol,
                                n_jobs=self.n_jobs,
                                random_state=self.random_state)
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_
            self.intercept_ = self.lscv.intercept_
        else:
            Cs = 10 if self.Cs is None else self.Cs
            self.lscv = LogisticRegressionCV(Cs=Cs,
                                             cv=self.cv,
                                             penalty='l1',
                                             max_iter=self.max_iter,
                                             tol=self.tol,
                                             n_jobs=self.n_jobs,
                                             random_state=self.random_state,
                                             solver='liblinear')
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_[0]
            self.intercept_ = self.lscv.intercept_[0]

        return self

    def predict(self, X):
        """Predict outcome for X

        Rebuilds the design matrix the same way it is assembled for
        fitting (linear terms first, then rule columns) and delegates to
        the fitted ``self.lscv`` estimator.

        Parameters
        ----------
        X : 2-D array with a ``shape`` attribute, (n_samples, n_features)
            Input samples.

        Returns
        -------
        The result of ``self.lscv.predict`` on the assembled matrix.
        """
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_lin = self.friedscale.scale(X)
            else:
                X_lin = X
            X_concat = np.concatenate((X_concat, X_lin), axis=1)
        if 'r' in self.model_type:
            n_rules = len(self.rule_ensemble.rules)
            # With zero rules, ``coef_[-0:]`` would be the *whole* vector and
            # the linear coefficients would be mistaken for rule
            # coefficients, so take an explicit empty slice in that case.
            if n_rules > 0:
                rule_coefs = self.coef_[-n_rules:]
            else:
                rule_coefs = self.coef_[:0]
            if len(rule_coefs) > 0:
                X_rules = self.rule_ensemble.transform(X, coefs=rule_coefs)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return self.lscv.predict(X_concat)

    def predict_proba(self, X):
        """Predict outcome probability for X, if model type supports probability prediction method

        Rebuilds the design matrix the same way it is assembled for
        fitting (linear terms first, then rule columns) and delegates to
        ``self.lscv.predict_proba``.

        Parameters
        ----------
        X : 2-D array with a ``shape`` attribute, (n_samples, n_features)
            Input samples.

        Returns
        -------
        The result of ``self.lscv.predict_proba`` on the assembled matrix.

        Raises
        ------
        ValueError
            If the fitted ``self.lscv`` estimator has no ``predict_proba``
            method.
        """

        if not hasattr(self.lscv, 'predict_proba'):

            error_message = '''
            Probability prediction using predict_proba not available for
            model type {lscv}
            '''.format(lscv=self.lscv)
            raise ValueError(error_message)

        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_lin = self.friedscale.scale(X)
            else:
                X_lin = X
            X_concat = np.concatenate((X_concat, X_lin), axis=1)
        if 'r' in self.model_type:
            n_rules = len(self.rule_ensemble.rules)
            # With zero rules, ``coef_[-0:]`` would be the *whole* vector and
            # the linear coefficients would be mistaken for rule
            # coefficients, so take an explicit empty slice in that case.
            if n_rules > 0:
                rule_coefs = self.coef_[-n_rules:]
            else:
                rule_coefs = self.coef_[:0]
            if len(rule_coefs) > 0:
                X_rules = self.rule_ensemble.transform(X, coefs=rule_coefs)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return self.lscv.predict_proba(X_concat)

    def transform(self, X=None, y=None):
        """Transform dataset.

        Thin wrapper that hands ``X`` to the fitted rule ensemble.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.
        y : ignored
            Accepted but never read by this method.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Whatever ``self.rule_ensemble.transform(X)`` returns.
        """
        ensemble = self.rule_ensemble
        X_transformed = ensemble.transform(X)
        return X_transformed

    def get_rules(self, exclude_zero_coef=False, subregion=None):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True, returns only the terms with an estimated
                           coefficient not equal to zero. Defaults to False
                           (all terms are returned).

        subregion: If None (default) returns global importances (FP 2004 eq. 28/29), else returns importance over
                           subregion of inputs (FP 2004 eq. 30/31/32).

        Returns
        -------
        rules: pandas.DataFrame with one row per linear term followed by one
               row per rule. Column 'rule' describes the term, 'type' is
               'linear' or 'rule', 'coef' holds the coefficient, 'support'
               the support of the rule (1 for linear terms) and
               'importance' the computed importance.
        """

        rule_list = list(self.rule_ensemble.rules)
        # Leading coefficients belong to linear terms, trailing ones to rules.
        n_features = len(self.coef_) - len(rule_list)

        trimmed_subregion = None
        if subregion is not None:
            # Convert once so both loops below see the same array.
            subregion = np.array(subregion)
            if n_features > 0:
                # Only touch the winsorizer when linear terms exist; its
                # result does not depend on the feature index.
                trimmed_subregion = self.winsorizer.trim(subregion)

        output_rules = []
        ## Add coefficients for linear effects
        for i in range(n_features):
            if self.lin_standardise:
                coef = self.coef_[i] * self.friedscale.scale_multipliers[i]
            else:
                coef = self.coef_[i]
            if subregion is None:
                importance = abs(coef) * self.stddev[i]
            else:
                column = np.asarray([row[i] for row in trimmed_subregion])
                importance = sum(
                    abs(coef) * abs(column - self.mean[i])) / len(subregion)
            output_rules.append(
                (self.feature_names[i], 'linear', coef, 1, importance))

        ## Add rules
        for i, rule in enumerate(rule_list):
            coef = self.coef_[i + n_features]

            if subregion is None:
                # 0.5 rather than (1 / 2) so the exponent never depends on
                # integer-division semantics.
                importance = abs(coef) * (rule.support *
                                          (1 - rule.support)) ** 0.5
            else:
                rkx = rule.transform(subregion)
                importance = sum(
                    abs(coef) * abs(rkx - rule.support)) / len(subregion)

            output_rules.append(
                (str(rule), 'rule', coef, rule.support, importance))

        rules = pd.DataFrame(
            output_rules,
            columns=["rule", "type", "coef", "support", "importance"])
        if exclude_zero_coef:
            # ``DataFrame.ix`` was removed in pandas 1.0; a boolean mask via
            # ``.loc`` selects the same rows.
            rules = rules.loc[rules["coef"] != 0]
        return rules