test = pd.read_csv('./../data/testDatafinalData.csv')
x_test = test.drop(['id'], axis=1)

min_samples = range(2, 19, 4)
max_depth = range(1, 20, 4)
max_feature = ['auto', 'sqrt', 'log2']

#roc_auc
parameters = {
    'min_samples_split': min_samples,
    'max_depth': max_depth,
    'max_features': max_feature
}

clf_tree = tree.DecisionTreeClassifier()
model = GridSearchCV(clf_tree, parameters, scoring='roc_auc')
#model = tree.DecisionTreeClassifier()
model.fit(x, y)
print model.best_params_
print model.grid_scores_

score_sqrt = []
score_auto = []
score_log2 = []
for a in model.grid_scores_:
    if (a[0]['max_features'] == 'log2'):
        score_log2.append(a[1])

for a in model.grid_scores_:
    if (a[0]['max_features'] == 'auto'):
        score_auto.append(a[1])
Пример #2
0
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [0.001, 0.0001],
                     'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]}]

# Metrics to optimise; one independent grid search is run per metric.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # The per-fold scores get their own name: the original unpacked them into
    # `scores`, overwriting the metric list that the outer loop iterates.
    for params, mean_score, cv_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, cv_scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
Пример #3
0
### parameter tuning with grid search
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

select = SelectKBest()
steps = [('feature_selection', select), ('random_forest', clf)]
parameters = dict(feature_selection__k=[10, 15, 'all'],
                  random_forest__n_estimators=[5, 10, 15, 20],
                  random_forest__criterion=['gini', 'entropy'],
                  random_forest__max_features=[1, 2, 3, 4],
                  random_forest__min_samples_split=[2, 3, 4, 5])
pipeline = Pipeline(steps)
cv = StratifiedKFold(labels, n_folds=10)
grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv)
grid_search.fit(features, labels)
print 'Best score: {}'.format(grid_search.best_score_)
print 'best parameters: {}'.format(grid_search.best_params_)
'''
Best score: 0.861
best parameters: {'random_forest__min_samples_split': 4, 'random_forest__n_estimators': 20, 'feature_selection__k': 'all', 'random_forest__max_features': 4, 'random_forest__criterion': 'entropy'}
'''
'''
Finalize the model
'''
### plot the cross-validation scores
clf = RandomForestClassifier(max_features=4,
                             min_samples_split=4,
                             criterion='entropy',
                             n_estimators=20)
Пример #4
0
def reg_run():

    parent_data_folder = ''
    import getpass
    user = getpass.getuser()
    if user == 'igor':
        parent_data_folder = '/home/igor/ML/data_1/'
    elif user == 'pesici':
        parent_data_folder = os.environ['HOME'] + '/'
    else:
        os.path.realpath(__file__)

    simple = False
    train_data_path = ''
    test_data_path = ''
    X_file_search = ''
    X_nd_file_search = ''
    Y_file_search = ''
    vSize = 8
    if not simple:
        train_data_path = parent_data_folder + 'set_train_gray_matter_maps/'
        test_data_path = parent_data_folder + 'set_test_gray_matter_maps/'
        X_file_search = 'X_compact' + str(vSize) + '.mtx'
        X_nd_file_search = 'X_compact' + str(vSize) + '.npy'
        Y_file_search = 'y' + str(vSize) + '.mtx.npy'
    else:
        train_data_path = parent_data_folder + 'set_train_simple/'
        test_data_path = parent_data_folder + 'set_test_simple/'
        X_file_search = 'X_simple.mtx'
        X_nd_file_search = 'X_simple.npy'
        Y_file_search = 'y_simple.mtx.npy'

    idxSlice = 85
    targets_file = parent_data_folder + 'targets.csv'
    X_file = parent_data_folder + X_file_search
    X_nd_file = parent_data_folder + X_nd_file_search

    Y_file = parent_data_folder + Y_file_search
    targets = [0]
    with open(targets_file, 'rb') as csvfile:
        for line in csvfile:
            targets.append(int(line))

    iters = 300
    ys = []
    X = []
    method = 'SVR pipeline'
    if (method == 'SVR pipeline'
            or True) and (X_nd_file_search in os.listdir(parent_data_folder)
                          and Y_file_search in os.listdir(parent_data_folder)):
        print 'Existing X and Y ARRAY files were found!'
        ys = np.load(Y_file)
        X = np.load(X_nd_file)

    #elif method <> 'SVR pipeline' and (X_file_search in os.listdir(parent_data_folder) and Y_file_search in os.listdir(parent_data_folder)):
    #    print 'Existing X and Y files were found!'
    #    ys = np.load(Y_file)
    #    X = io.mmread(X_file)

    else:
        for f in os.listdir(train_data_path):
            if ('train_' in f and f.endswith('.mtx')
                    and not simple) or ('train_' in f and f.endswith('.npy')
                                        and simple):
                iters = iters - 1
                #print dirpath+f
                pic_id = int(f.split('.')[0].split('_')[1])
                #segmented_img = np.load(train_data_path+'/'+f)
                #segmented_img = None
                if not simple:
                    img = io.mmread(train_data_path + '/' + f)
                    img = img.toarray()
                    img = np.reshape(img, (176, 208, 176))
                    # sum up voxels of size 8x8x8 or 4x4x4 of img

                    x = smooth_img(img, vSize)
                    if X == []:
                        X = x
                    else:
                        X = np.vstack((X, x))
                    y = targets[pic_id]
                    ys.append(y)
                else:
                    segmented_img = np.load(train_data_path + '/' + f)
                    x = segmented_img
                    if X == []:
                        X = x
                    else:
                        X = np.vstack((X, x))
                    y = targets[pic_id]
                    ys.append(y)
                #new_size = segmented_img.shape[0] *segmented_img.shape[1]*segmented_img.shape[2]
                #x = np.reshape(segmented_img, (new_size,1))

                #x = x.astype(int)
                #x = sparse.coo_matrix(x)

                #nda_show(segmented_img[:,:,idxSlice], title=str(y))
                gc.collect()
                print 'iters = ', iters
                if iters < 0:
                    break
        print 'Saving X...'
        if not simple:
            np.save(X_nd_file, X)
        else:
            np.save(X_nd_file, X)
        #io.mmwrite(X_file, X)
        np.save(Y_file, ys)

    print 'X has size: ', X.shape
    try:
        print 'ys has size: ', ys.shape
    except:
        print 'ys has size: ', len(ys)

    # Dimension reduction
    from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression
    from sklearn import preprocessing, decomposition
    from sklearn.decomposition import PCA
    from sklearn.svm import SVR
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    from sklearn.ensemble import RandomForestRegressor

    # Regression model
    reg = ''
    # Remove features with too low between-subject variance e.g. nulls
    variance_threshold = VarianceThreshold(threshold=.01)
    # Normalize the data so it has 0 mean normal variance
    scaler = preprocessing.StandardScaler()
    min_max_scaler = preprocessing.MinMaxScaler()

    #if method == 'Lasso':
    #    print 'Running ', method
    #    reg = Lasso()
    #    reg.fit(X ,ys)
    #    print("Data fitted with Lasso Regression")
    #    w = reg.coef_
    #    np.save('W_Lasso', w)
    if method == 'LassoCV':
        print 'Running ', method
        #variance_threshold = VarianceThreshold(threshold=0.01)
        lasso = LassoCV()

        reg = Pipeline([('variance_threshold', variance_threshold),
                        ('lasso', lasso)])
        reg.fit(X, ys)
        print("Data fitted with Lasso CV Regression")
        #w = reg.coef_
        #np.save('W_LassoCV', w)

    if method == 'SVR pipeline':
        print 'Running ', method
        #X = X.toarray()
        #print 'Converted sparse to dense nd array'

        #variance_threshold = VarianceThreshold(threshold=.01)
        # Here we use a classical univariate feature selection: removes all but the k highest scoring features
        #feature_selection = SelectKBest(f_regression, k=2000)
        # ('feature_selection', feature_selection),

        # PCA
        #pca = PCA(n_components=1000)

        # SVM regression
        svrLinear = SVR(kernel='linear', C=1e-4)
        #svrPloy2 = SVR(kernel='poly', degree=2)
        #svrSigmoid = SVR(kernel='sigmoid')
        svrRBF = SVR()
        svr = svrLinear
        #rForest = RandomForestRegressor()

        regs = [svrLinear]
        Cs = [1e-3, 1e-2]
        #gammas = [1e-8, 1e-7, 1e-6]

        pipe = Pipeline([
            ('variance_threshold', variance_threshold),
            ('scaler', scaler),
            ('svr', svrLinear),
        ])

        params = dict(
            variance_threshold=[variance_threshold],
            scaler=[scaler],
            svr=regs,
            svr__C=Cs,
        )

        # does cross-validation with 3-fold for each combination of kernels and Cs
        reg = GridSearchCV(pipe, param_grid=params, n_jobs=4, cv=5)
        #reg = pipe

        reg.fit(X, ys)
        print 'Data fitted with ', method
        print "Best parameters set found on development set:"
        print
        print(reg.best_params_)
        print
        #w = reg.coef_
        #np.save('W_LassoCV', w)

    prediction = []

    iters = 0
    test_files = sorted(os.listdir(test_data_path))
    smoothed_y_folder = parent_data_folder + 'set_test_smooth' + str(
        vSize) + '/'
    for f in test_files:
        if ('test_' in f and f.endswith('.mtx')
                and not simple) or ('test_' in f and f.endswith('.npy')
                                    and simple):
            iters = iters + 1
            pic_id = int(f.split('.')[0].split('_')[1])

            segmented_img = None
            if not simple:
                output_file = smoothed_y_folder + f + '.npy'
                if os.path.isfile(output_file):
                    print 'Compressed file %s found!!' % output_file
                    segmented_img = np.load(output_file)
                else:
                    segmented_img = io.mmread(test_data_path + f)
                    if method == 'SVR pipeline' or method == 'LassoCV':
                        segmented_img = segmented_img.toarray()
                        segmented_img = np.reshape(segmented_img,
                                                   (176, 208, 176))
                        segmented_img = smooth_img(segmented_img, vSize)

                        #np.save(output_file, segmented_img)
            else:
                segmented_img = np.load(test_data_path + f)

            res = reg.predict(segmented_img)
            prediction.append([pic_id, res[0]])
            gc.collect()
            print "Age prediction for image %d completed " % (pic_id)
            #print ''
            #print 'iters = ', iters

    with open(parent_data_folder + "predictions.csv", "wb") as f:
        f.write(b'ID,Prediction\n')
        for pred in prediction:
            f.write(str(pred[0]) + ',' + str(pred[1]) + '\n')
    print 'Done.'
Пример #5
0
    'vect__stop_words': ('english', None),
    'vect__max_features': (2500, 5000, 10000, None),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'vect__norm': ('l1', 'l2'),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
}
'''
To use parallel-computing in a script, you must protect your main loop using "if __name__ == '__main__'"

'''
if __name__ == '__main__':
    # Load the messages and labels, then hold out a test split.
    df = pd.read_csv('sms.csv')
    X, y, = df['message'], df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Search the parameter grid with 3-fold CV on all available cores;
    # `pipeline` and `parameters` are defined above this block.
    grid_search = GridSearchCV(
        pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
    grid_search.fit(X_train, y_train)

    # Report the best CV score and the winning value of every tuned parameter.
    print('最佳效果:%0.3f' % grid_search.best_score_)
    print('最优参数组合:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    # Evaluate on the held-out split: accuracy, precision, recall.
    predictions = grid_search.predict(X_test)
    print('准确率:', accuracy_score(y_test, predictions))
    print('精确率:', precision_score(y_test, predictions))
    print('召回率:', recall_score(y_test, predictions))
Пример #6
0
y_cv_train = y_cv_train.values.flatten()
y_sep_test = y_val.values.flatten()
y_cv_train[y_cv_train == 2] = 0
y_sep_test[y_sep_test == 2] = 0

X_train = X_cv_train
X_test = X_sep_test
y_train = y_cv_train
y_test = y_sep_test

# Using GridSearchCV to find the best values for C and gamma
C_range = 10.0**np.arange(-4, 4)
gamma_range = 10.0**np.arange(-10, 1)
param_grid = dict(gamma=gamma_range, C=C_range)
skf = cv.StratifiedKFold(y=y_train, n_folds=3)
grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=skf)
grid.fit(X_train, y_train)

# Print out parameters
crossclf = svm.SVC(probability=True, **grid.best_params_)
y_pred = crossclf.fit(X_train, y_train).predict(X_test)
print crossclf
print 'y_pred: ', y_train
print 'y_pred: ', y_pred
print "Best parameter", grid.best_params_  # {'C': 10.0, 'gamma': 0.001}
print "Cross-Validation score", cv.cross_val_score(crossclf, X_train,
                                                   y_train).mean()
print "Independent accuracy score", accuracy_score(y_test, y_pred)
print "Independent precision score", precision_score(y_test, y_pred)
print "Independent recall score", recall_score(y_test, y_pred)
print "Independent f1 score", f1_score(y_test, y_pred)
# Candidate hyper-parameter values for the grid search
parameters = {
    'n_estimators': [10, 20, 30, 50, 100],
    'max_features': [0.6, 0.2, 0.3],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4, 6],
}

#parameters = {'penalty':['l1', 'l2'],'C': np.logspace(0, 4, 10)}

# Rank candidates by plain accuracy
acc_scorer = make_scorer(accuracy_score)

# 5-fold grid search over `clf` (defined outside this excerpt), fitted on the
# training data; fit() hands back the search object itself.
grid_obj = GridSearchCV(
    clf, parameters, scoring=acc_scorer, cv=5).fit(X_train, y_train)

# Take the best estimator found and fit it on the training data again
best_clf = grid_obj.best_estimator_
best_clf.fit(X_train, y_train)

# Predict the test set with the tuned model
best_predictions = best_clf.predict(X_test)

# Header for the report of the optimized model
print("\nOptimized Model\n------")
Пример #8
0
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

param_dist = {"base_estimator__max_depth": [1,2,3],
              "base_estimator__min_samples_split": [1,2],
              "base_estimator__min_samples_leaf": [1,2],
              "n_estimators": [2,3,5],
              "learning_rate":[0.4,0.6,0.8],
              "algorithm":["SAMME","SAMME.R"]
             }

cv = cross_validation.StratifiedShuffleSplit(y_train,n_iter = 4,random_state = 9)
f1score=make_scorer(f1_score, pos_label="yes") 

# build a classifier
dt_clf=DecisionTreeClassifier()
clf = AdaBoostClassifier(dt_clf)

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_dist,cv=cv,scoring=f1score)
gs_estimator=grid_search.fit(X_train,y_train)

print "Best model parameter:  " + str(gs_estimator.best_params_)
y_pred=grid_search.predict(X_test)
#print y_pred
gs_f1score=f1_score(y_test, y_pred,pos_label="yes")
print "f1 score: {:.5f}".format(gs_f1score)


# ###### 
Пример #9
0
# best_score_     : metric value of the best-performing candidate
# best_params_    : parameters that give the best performance
# best_estimator_ : the model built with the best-performing parameters
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# A pipeline takes the steps to chain together as a list of (name, step).
pipe_svc = Pipeline([
    ('scl', StandardScaler()),
    ('clf', SVC(random_state=1)),
])
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10., 100., 1000.]
param_grid = [
    {'clf__C': param_range, 'clf__kernel': ['linear']},
    {'clf__C': param_range, 'clf__gamma': param_range,
     'clf__kernel': ['rbf']},
]

# Normally param_grid can use the bare parameter names (C, gamma, kernel);
# because the estimator is a pipeline, the names carry the "clf__" prefix.
# %time
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=1,
).fit(X, y)

print(gs.best_score_)
print(gs.best_params_)
gs.grid_scores_



# ParameterGrid: builds the search grid by combining the given parameters and
# can be iterated over, so the combinations can be walked with a for loop.
from sklearn.grid_search import ParameterGrid
param_grid = {'a': [1, 2], 'b': [True, False]}
list(ParameterGrid(param_grid))
param_grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]
Пример #10
0
    # NOTE(review): this is the body of a loop over `i` whose header lies
    # outside the visible excerpt; C, DF, c1, Classifier, Features and Params
    # are defined there.
    # Column c1 of DF[i] is stored as the class labels for dataset i.
    C['Class']=DF[i].iloc[:,c1]
    Classifier[i]=C.values[:,0]
    m=len(Classifier[i])
    # Everything except column c1 is used as features.
    df=DF[i].drop(DF[i].columns[c1],axis=1)
    l=len(df.columns)
    
    #Create Features Array
    Features[i]=df.values[:,0:l]
    
    # Shuffled 5-fold split over the m samples, seeded for repeatability.
    fld=5
    state=12
    kf=KFold(m,n_folds=fld,shuffle=True,random_state=state)
    # Log-spaced grids: C in 1e-2..1e10 and gamma in 1e-9..1e3, 13 values each.
    C_range = np.logspace(-2, 10, 13)
    gamma_range = np.logspace(-9, 3, 13)
    param_grid = dict(gamma=gamma_range, C=C_range)
    grid = GridSearchCV(SVC(), param_grid=param_grid, cv=kf)
    grid.fit(Features[i], Classifier[i])
    
    print("The best parameters are %s with a score of %0.2f"
        % (grid.best_params_, grid.best_score_))
    # Remember the winning parameters for dataset i.
    Params[i]=grid.best_params_
print Params
SVMA=0
# Reuse the parameters found for entry 1 of Params.
SVMC=Params[1]['C']
SVMG=Params[1]['gamma']
start=time.time()
# m, fld and state still hold the values from the last loop iteration above.
kf=KFold(m,n_folds=fld,shuffle=True,random_state=state)
for train_index, test_index in kf:
    Clf_Train=[]
    Clf_Test=[]
    m_train=len(train_index)
Пример #11
0
print("Precision:", precision)
print("Recall:", recall)

#对最大特征数max_features、最小样本数min_samples_split、叶子节点最少样本数min_samples_leaf、决策树最大深度max_depth、内部节点再划分所需最小样本数min_samples_split做调参:
#param_test5= {'max_features':range(3,11,2),'min_samples_split':range(80,150,20), 'min_samples_leaf':range(10,60,10),'n_estimators':range(10,100,20),'max_depth':range(3,14,2)}

param_test5 = {
    'min_samples_leaf': range(10, 30, 10),
    'n_estimators': range(50, 200, 50)
}

gsearch5 = GridSearchCV(estimator=RandomForestClassifier(max_depth=None,
                                                         min_samples_split=2,
                                                         max_features="auto",
                                                         max_leaf_nodes=None,
                                                         bootstrap=True),
                        param_grid=param_test5,
                        scoring='roc_auc',
                        iid=False,
                        cv=5)
gsearch5.fit(X_train, y_train)

print gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

#0.990622704291  10棵树
#[[21721    84]
# [  133  1203]]
#('Precision:', 0.96432304616161124)
#('Recall:', 0.94829838717428705)
#('Precision:', 0.99049738866427095)
#('Recall:', 0.99062270429108512)
Пример #12
0
#print clf.predict(features_test)
print clf.score(features_test, labels_test)
print "training time:", round(time()-t0, 3), "s"
'''
#########################################################
import numpy
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

features_train = features_train[:len(features_train) / 100]
labels_train = labels_train[:len(labels_train) / 100]

t0 = time()
clf = SVC(kernel='rbf', C=10000)
clf.fit(features_train, labels_train)
predictions = clf.predict(features_test)
score = clf.score(features_test, labels_test)
print "training time:", round(time() - t0, 3), "s"

grid = GridSearchCV(clf, {
    'kernel': ['linear', 'rbf'],
    'C': [1, 10, 100, 1000, 10000]
}, 'accuracy')
grid.fit(features_train, labels_train)
best_params = grid.best_params_
model = grid.best_estimator_
score = grid.best_score_

print best_params, model, score

print numpy.count_nonzero(predictions)
Пример #13
0
import pandas as pd
# First 14 columns are the features, column 14 is the target.
df = pd.read_csv("book2.csv")
X = df.iloc[:, 0:14]
y = df.iloc[:, 14]

xgb_model = xgb.XGBClassifier()

# Tree-shape grid.
optimization_dict = {
    'max_depth': [1, 2, 3, 4, 5, 6],
    'n_estimators': [50, 100, 200],
}

# Sampling / step-size grid.
optimization_dict1 = {
    'subsample': [0.8, 0.9, 1],
    'max_delta_step': [0, 1, 2, 4],
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
}

# NOTE(review): this first search object is replaced by the next assignment
# before it is ever fitted, so `optimization_dict` is never searched.
model = GridSearchCV(
    xgb_model, optimization_dict, scoring='accuracy', verbose=1)

model = GridSearchCV(
    xgb_model, optimization_dict1, scoring='accuracy', verbose=1)

model.fit(X, y)
print(model.best_score_)
print(model.best_params_)
Пример #14
0
        # NOTE(review): fragment of an enclosing block that is outside the
        # visible excerpt; X, feature_linearSVC and feature_RFECV come from it.
        # mtry: rounded square root of the number of columns of X.
        mtry = np.sqrt(X.shape[1]).round()
        #    mtry=np.sqrt(n_components).round()
        rf = RandomForestClassifier(n_estimators=5000)
        gbm = GradientBoostingClassifier(n_estimators=10000,
                                         learning_rate=0.001)
        # Parameter Grids
        # max_features candidates: every second integer from
        # mtry - round(mtry / 2) up to (excluding) mtry + round(mtry / 2).
        param_grid_rf = dict(max_features=np.arange(
            int(mtry - round(mtry / 2)), int(mtry + round(mtry / 2)), 2))
        param_grid_gbm = dict(max_depth=range(1, 10))
        #    param_grid=dict(max_features=range(5,100,5))
        # Distribution sampled by the randomized search (40 draws).
        # presumably sp_randint is scipy.stats.randint -- verify the import.
        param_dist = {"max_features": sp_randint(5, 100)}
        random_search_rf = RandomizedSearchCV(rf,
                                              param_distributions=param_dist,
                                              n_iter=40)
        grid_search_rf = GridSearchCV(estimator=rf,
                                      param_grid=param_grid_rf,
                                      cv=10)
        # Built here but not used by the pipelines visible in this excerpt.
        grid_search_gbm = GridSearchCV(estimator=gbm,
                                       param_grid=param_grid_gbm,
                                       cv=10)

        # Each pipeline selects features first, then runs a parameter search
        # as its final "classification" step.
        pipe1 = Pipeline([('feature_selection', feature_linearSVC),
                          ('classification', grid_search_rf)])

        pipe2 = Pipeline([('feature_selection', feature_RFECV),
                          ('classification', random_search_rf)])

        #    pipe3 = Pipeline([('feature_selection', feature_PCA),
        #                      ('classification', grid_search_rf)])
        #%%
        #Nested cross-validation
Пример #15
0
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# Metrics to optimise; one independent grid search is run per metric.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    # Fixes relative to the original:
    #  * the spread is reported as +/- two standard deviations (`* 2`); the
    #    original divided by 2;
    #  * the per-fold scores no longer overwrite the outer `scores` list;
    #  * the report header is printed once per metric, after the loop, not
    #    once per candidate.
    for params, mean_score, cv_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, cv_scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
Пример #16
0
# Detach the target column from both frames; what remains are the features.
y_train = final_train.pop('wage_class')
y_test = final_test.pop('wage_class')

# Parameters tuned by cross-validation.
cv_params = {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]}

# Parameters held fixed for every candidate.
ind_params = {
    'objective': 'binary:logistic',
    'n_estimators': 1000,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 0,
}

# 5-fold grid search ranked by accuracy, run on all available cores.
optimized_GBM = GridSearchCV(
    xgb.XGBClassifier(**ind_params),
    cv_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
optimized_GBM.fit(final_train, y_train)
GridSearchCV(cv=5,
             error_score='raise',
             estimator=xgb.XGBClassifier(base_score=0.5,
                                         colsample_bylevel=1,
                                         colsample_bytree=0.8,
                                         gamma=0,
                                         learning_rate=0.1,
                                         max_delta_step=0,
                                         max_depth=3,
                                         min_child_weight=1,
                                         missing=None,
                                         n_estimators=1000,
Пример #17
0
# Mean squared error of each prediction vector against the test targets.
print("MSE for the test part2 is : ", np.mean((z2 - testY)**2))
print("MSE for the test part3 is : ", np.mean((z3 - testY)**2))
print("MSE for the test part4 is : ", np.mean((z4 - testY)**2))
print("MSE for the test part5 is : ", np.mean((z5 - testY)**2))

#%% Using the Linear Regression

#################################### Linear Regression
print("================== Linear Regression...")
clf0 = LinearRegression()
param = {
    "fit_intercept": [True, False],
    "normalize": [False],
    "copy_X": [True, False],
}
grid = GridSearchCV(clf0, param, n_jobs=1)

# The searched grid has exactly the keys fit_intercept, normalize and copy_X,
# so best_params_ can be passed straight to the constructor.
grid.fit(trainX1, trainY)
clf01 = LinearRegression(n_jobs=-1, **grid.best_params_)
print("================== LR1 Ends...")

# Same search repeated on the second feature set.
grid.fit(trainX2, trainY)
clf02 = LinearRegression(n_jobs=-1, **grid.best_params_)
print("================== LR2 Ends...")
Пример #18
0
from sklearn.pipeline import make_pipeline

# PCA down to 150 whitened components feeding an RBF-kernel SVC.
pca = RandomizedPCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)


from sklearn.cross_validation import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(
    faces.data, faces.target, random_state=42)


from sklearn.grid_search import GridSearchCV
# "svc__" addresses the SVC step of the pipeline.
param_grid = {
    'svc__C': [1, 5, 10, 50],
    'svc__gamma': [0.0001, 0.0005, 0.001, 0.005],
}
grid = GridSearchCV(model, param_grid)

# Timed fit via the IPython %time magic.
get_ipython().run_line_magic("time", " grid.fit(Xtrain, ytrain)")
print(grid.best_params_)


# Continue with the best pipeline found and predict the test images.
model = grid.best_estimator_
yfit = model.predict(Xtest)


# Show the first 24 test images in a 4x6 grid, each labelled with the last
# word of the predicted name; wrong predictions are labelled in red.
fig, ax = plt.subplots(4, 6)
for i, axi in enumerate(ax.flat):
    face = Xtest[i].reshape(62, 47)
    surname = faces.target_names[yfit[i]].split()[-1]
    label_color = 'black' if yfit[i] == ytest[i] else 'red'
    axi.imshow(face, cmap='bone')
    axi.set(xticks=[], yticks=[])
    axi.set_ylabel(surname, color=label_color)
Пример #19
0
    Decision Tree Regression --------------------------------------------------
"""

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.grid_search import GridSearchCV

# Tune hyper-parameters of DecisionTreeRegressor
parameters = {
    'criterion': ["mse"],
    'max_depth': [2, 3, 4, 5, 10],
    'max_leaf_nodes': [5, 7, 10, 12, 15],
    'min_samples_leaf': [1, 5],
    'min_samples_split': [2, 3, 5],
}
grid_search_tree = GridSearchCV(DecisionTreeRegressor(), parameters, n_jobs=4)

grid_search_tree.fit(regressors_train_pca, target_train)
print(grid_search_tree.best_score_, grid_search_tree.best_params_)

# Train the final tree with explicitly written-out settings.
# NOTE(review): these are hard-coded rather than read from best_params_.
regr_tree = DecisionTreeRegressor(
    criterion='mse',
    max_depth=2,
    max_leaf_nodes=10,
    min_samples_leaf=1,
    min_samples_split=2,
)
regr_tree.fit(regressors_train_pca, target_train)
predicted_tree = regr_tree.predict(regressors_test_pca)
# RMSE on the test set (the value is computed but not stored or printed)
math.sqrt(mean_squared_error(target_test, predicted_tree))
Пример #20
0
import xgboost as xgb
from sklearn.grid_search import GridSearchCV
import pandas as pd
import numpy as np

df_train = pd.read_csv(
    "C:/Users/Shanu/PycharmProjects/Crime-data/communities.csv"
)  #skiprows=20,index_col=21)
# Placeholder strings for missing values become 0.
df_train.replace('na', 0, inplace=True)
df_train.replace('?', 0, inplace=True)

# Column 0 is the target (kept as a single-column 2-D slice); columns 1..170
# are the features.
X_train = df_train.values[:, 1:171]
Y_train = df_train.values[:, :1]

# 20 evenly spaced reg_alpha candidates between 10**-4 and 10**1.
alpha_grid = np.linspace(np.float_power(10, -4), np.float_power(10, 1), 20)

# 5-fold search ranked by negated mean squared error.
optimized_GBM = GridSearchCV(
    estimator=xgb.XGBRegressor(),
    param_grid={'reg_alpha': alpha_grid},
    scoring='neg_mean_squared_error',
    cv=5,
    refit=True,
    verbose=1,
)
optimized_GBM.fit(X_train, Y_train)

print(optimized_GBM.grid_scores_)
Пример #21
0
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)


###############################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
  'C': [1e3, 5e3, 1e4, 5e4, 1e5],
  'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator_


###############################################################################
# Quantitative evaluation of the model quality on the test set

print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
print "done in %0.3fs" % (time() - t0)

print classification_report(y_test, y_pred, target_names=target_names)
Пример #22
0
 def gridsearchcv_train(self,alg,param_grid,train_predictor_set,train_target_set,cv,n_jobs):
     param_grid = {'max_depth':range(3,10,2),'min_child_weight':range(1,6,2)}
     gsearch = GridSearchCV(estimator=alg,param_grid = param_grid,scoring='roc_auc',n_jobs=24,iid=False, cv=10,verbose=1)
     gsearch.fit(train_predictor_set,train_target_set)
     print gsearch.grid_scores_, gsearch.best_params_,     gsearch.best_score_
Пример #23
0
                                 scoring='roc_auc')
print roc_scores_svm.mean()
# Worse than PCA

# try with PCA
roc_scores_svm_pca = cross_val_score(svm,
                                     pca_df_small,
                                     response_series,
                                     cv=10,
                                     scoring='roc_auc')
print roc_scores_svm_pca.mean()

# let's do a grid search
param_grid = dict(kernel=['linear', 'poly', 'rbf', 'sigmoid'])

svm_grid = GridSearchCV(svm, param_grid, cv=10, scoring='roc_auc')
svm_grid.fit(explanatory_df, response_series)
best_estimator = svm_grid.best_estimator_
print best_estimator.kernel

# Linear is the best estimator score won
print svm_grid.best_score_
# best estimator was 77% - just below RFs
# Note: SVMs are more accurate than RFs with trending data!

####################################################
############# Out of Sample Testing ################
####################################################

conn = sqlite3.connect('C:\Users\garauste\Documents\SQLite\lahman2013.sqlite')
# new query to pull data post 2000
def train(args):
    """Train a classifier on precomputed embeddings and pickle it.

    Reads ``labels.csv`` and ``reps.csv`` from ``args.workDir``.  The class
    label of each row is the name of the directory that contains the path
    stored in the second column of ``labels.csv``.  The classifier selected
    by ``args.classifier`` is fitted on the embeddings (behind an LDA
    projection when ``args.ldaDim > 0``) and the tuple
    ``(label_encoder, classifier)`` is written to
    ``<workDir>/classifier.pkl``.

    Args:
        args: Object exposing ``workDir``, ``classifier`` and ``ldaDim``.

    Raises:
        ValueError: If ``args.classifier`` is not one of the supported names.
    """
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    # ``.values`` replaces ``.as_matrix()``, which newer pandas has removed.
    label_paths = pd.read_csv(fname, header=None).values[:, 1]
    # Get the directory name.  Built as a list because the labels are
    # consumed twice below (fit, then transform) and a Python 3 ``map``
    # object would be exhausted after the first pass.
    labels = [os.path.split(os.path.dirname(path))[1]
              for path in label_paths]
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).values
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GridSearchSvm':
        print("""
        Warning: In our experiences, using a grid search over SVM hyper-parameters only
        gives marginally better performance than a linear SVM with C=1 and
        is not worth the extra computations of performing a grid search.
        """)
        param_grid = [{
            'C': [1, 10, 100, 1000],
            'kernel': ['linear']
        }, {
            'C': [1, 10, 100, 1000],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']
        }]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif args.classifier == 'GMM':  # Doesn't work best
        clf = GMM(n_components=nClasses)

    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif args.classifier == 'DecisionTree':  # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)
    elif args.classifier == 'GaussianNB':
        clf = GaussianNB()

    # ref: https://jessesw.com/Deep-Learning/
    elif args.classifier == 'DBN':
        from nolearn.dbn import DBN
        clf = DBN(
            # i/p nodes, hidden nodes, o/p nodes.  The output layer needs one
            # node per class; the previous "label of the last sample + 1"
            # only matched that when the last row had the highest label.
            [embeddings.shape[1], 500, nClasses],
            learn_rates=0.3,
            # Smaller steps mean a possibly more accurate result, but the
            # training will take longer
            learn_rate_decays=0.9,
            # a factor the initial learning rate will be multiplied by
            # after each iteration of the training
            epochs=300,  # number of iterations
            # dropouts = 0.25, # Express the percentage of nodes that
            # will be randomly dropped as a decimal.
            verbose=1)
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # ``clf`` further down.
        raise ValueError(
            "Unknown classifier: {!r}".format(args.classifier))

    if args.ldaDim > 0:
        # Reduce the embeddings with LDA before the chosen classifier.
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])

    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    # Pickle data is binary: text mode fails on Python 3 and can corrupt the
    # file on Windows.
    with open(fName, 'wb') as f:
        pickle.dump((le, clf), f)
Пример #25
0
# Project both splits with the already-fitted PCA.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("done in %0.3fs" % (time() - t0))

###############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
# If you are running sklearn version 0.17 or later, the expected argument is "balanced".
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

###############################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting the people names on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
Пример #26
0
        'NVC0905_22_002_Ecog_c015_f1', 'NVC0905_22_002_Ecog_c015_f2',
        'NVC0905_22_002_Ecog_c015_f8', 'NVC0905_22_002_Ecog_c015_f9',
        'NVC0905_22_002_Ecog_c016_f1', 'NVC0905_22_002_Ecog_c016_f4',
        'NVC0905_22_002_Ecog_c016_f13', 'NVC0905_22_002_Ecog_c016_f16',
        'NVC0905_22_002_Ecog_c016_f17', 'NVC0905_22_002_Ecog_c016_f21',
        'NVC0905_22_002_Ecog_c016_f23'
    ]
    # Hold out 30% of the rows for evaluation; the seed is fixed so the split
    # is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(df[X_cols],
                                                        df['ictal_ind'],
                                                        test_size=0.3,
                                                        random_state=1)
    # Baseline: a random forest with default settings apart from the seed.
    rf = RandomForestClassifier(random_state=1)
    rf.fit(X_train, y_train)
    # Keep the probability column at index 1 (presumably the positive
    # 'ictal_ind' class -- confirm against the label encoding).
    probs = rf.predict_proba(X_test)[:, 1]
    print metrics.roc_auc_score(y_test, probs)
    print probs

    # Candidate forest sizes: odd values 1..29, then 30..100 in steps of 10.
    list_estimators = list(xrange(1, 30, 2)) + list(xrange(30, 101, 10))
    param_grid = dict(n_estimators=list_estimators)
    # 5-fold grid search scored by ROC AUC.  Note it is fitted on the full
    # data frame, not only on the training split created above.
    grid = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc')
    grid.fit(df[X_cols], df['ictal_ind'])

    # Plot the results of the grid search
    # result[1] is used as the mean cross-validated score of each grid point.
    grid_mean_scores = [result[1] for result in grid.grid_scores_]
    plt.xlim([0, 100])
    plt.scatter(list_estimators, grid_mean_scores, s=40)
    plt.grid(True)
    plt.title('Tuning Random Forests for Dog 2')
    plt.ylabel('AUC for 5-fold CV')
    plt.xlabel('Number of Trees')
Пример #27
0
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
### Create cross-validation
sss = StratifiedShuffleSplit(labels, 50, random_state=42)

### Naive Bayes optimization
t0 = time()
# PCA
# NOTE: this standalone instance is not the one used by the pipeline below,
# which constructs its own PCA().
pca = PCA()
# Pipeline: scale -> select k best features -> PCA -> Gaussian Naive Bayes
pipeline = Pipeline([('scale', mm_scaler), ('SKB', SelectKBest()),
                     ('PCA', PCA()), ('NB', GaussianNB())])
# clf's parameters
parameters = {'SKB__k': [5, 6], 'PCA__n_components': [2, 3, 4]}
# GridSearchCV, scored by F1 over the stratified shuffle splits
gs = GridSearchCV(pipeline, parameters, cv=sss, scoring='f1')
gs.fit(features, labels)

clf_NB = gs.best_estimator_
tester.test_classifier(clf_NB, my_dataset, new_features_list)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("done in %0.3fs" % (time() - t0))

### Decision Tree optimization
t0 = time()
# PCA
# NOTE: as above, the pipeline below constructs its own PCA().
pca = PCA()
# Pipeline: scale -> select k best features -> PCA -> decision tree
pipeline = Pipeline([('scale', mm_scaler), ('SKB', SelectKBest()),
                     ('PCA', PCA()), ('DT', DecisionTreeClassifier())])
# clf's parameters
parameters = {
Пример #28
0
 def _get_SVM():
     """Return an unfitted grid search over ``C`` for a LinearSVC, scored by F1."""
     candidate_cs = [1, 5, 10, 100, 1000]
     search = GridSearchCV(LinearSVC(), [{"C": candidate_cs}], scoring="f1")
     return search
Пример #29
0
    #'union__summary__tfidf__max_df': (0.8, 1.0),
    #'union__summary__tfidf__max_features': (5000,50000),
    'union__summary__best__n_components': (100, 200, 300),

    #'union__authors__countvec__max_features': (10, 50),

    #'clf__alpha': ( 0.000001, 0.0000001),
    #'clf__penalty': ('l2', 'l1'),
    #'clf__n_iter': (3, 5),
    'clf__C': (1, 2),
    'clf__solver': ('newton-cg', 'lbfgs'),
    'clf__multi_class': ('ovr', 'multinomial'),
}

if __name__ == "__main__":
    # Search every combination in ``parameters`` with n_jobs=-1.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    step_names = [step[0] for step in pipeline.steps]
    print("pipeline:", step_names)
    print("parameters:")
    pprint(parameters)

    # Time the fit.
    t0 = time()
    grid_search.fit(data, y)
    elapsed = time() - t0
    print("done in %0.3fs" % elapsed)
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    # Report only the parameters that were searched, in sorted order.
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        chosen = best_parameters[param_name]
        print("\t%s: %r" % (param_name, chosen))
Пример #30
0
# Encode the ``correct`` column as integer codes; ``corr_lbl`` holds the
# original value behind each code.
data["corr"], corr_lbl = pd.factorize(data.correct)

# One-hot encode the categorical predictors and append the power column.
data_dv = pd.get_dummies(
    data[["ROI", "condition_side", "condition_type", "phase"]])
data_dv["pow"] = data.power

data_itc = pd.read_csv(data_path +
                       "alpha_mean_itc_data_extracted_phase_target.csv")
# ``axis`` is passed by keyword: newer pandas rejects it as a positional
# argument.
data_itc = data_itc.drop("mean", axis=1)

data_dv["itc"] = data_itc["itc"]

# ``.values`` replaces ``get_values()``, which newer pandas has removed.
y = data["corr"].values
X = data_dv.values

cv = StratifiedShuffleSplit(y, n_iter=10)
ada_params = {
    "adaboostclassifier__n_estimators": np.arange(1, 50, 1),
    "adaboostclassifier__learning_rate": np.arange(0.01, 1, 0.1)
}

scaler_pipe = make_pipeline(StandardScaler(), AdaBoostClassifier())
# The search used to be bound to ``grid`` while the lines below referred to
# ``ada_grid``, which raised a NameError.
ada_grid = GridSearchCV(scaler_pipe, param_grid=ada_params, cv=cv)

ada_grid.fit(X, y)

# Best pipeline (scaler + AdaBoost) found by the search.
ada = ada_grid.best_estimator_

scores = cross_val_score(ada, X, y, cv=cv, scoring="roc_auc")