import numpy as np
from sklearn.datasets import make_classification, make_moons, make_circles
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes"
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]
# import pdb; pdb.set_trace()  # debugging breakpoint (disabled)
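# The mesh step h defined above is otherwise unused in this excerpt; a minimal,
# hedged sketch of the usual decision-boundary comparison loop follows. The
# plotting details (colors, layout) are illustrative assumptions, not part of
# the original snippet.
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

for X_ds, y_ds in datasets:
    X_ds = StandardScaler().fit_transform(X_ds)
    X_tr, X_te, y_tr, y_te = train_test_split(X_ds, y_ds, test_size=.4, random_state=42)
    x_min, x_max = X_ds[:, 0].min() - .5, X_ds[:, 0].max() + .5
    y_min, y_max = X_ds[:, 1].min() - .5, X_ds[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    for name, clf in zip(names, classifiers):
        clf.fit(X_tr, y_tr)
        score = clf.score(X_te, y_te)
        # predict over the mesh and draw the decision regions
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
        plt.contourf(xx, yy, Z, cmap=ListedColormap(['#FFAAAA', '#AAAAFF']), alpha=.6)
        plt.scatter(X_tr[:, 0], X_tr[:, 1], c=y_tr, edgecolors='k')
        plt.title('%s (accuracy %.2f)' % (name, score))
        plt.show()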
# Example 2
if feat_select == 1:
    '''Three steps:
       1) Run Feature Selection
       2) Get lists of selected and non-selected features
       3) Filter columns from original dataset
       '''

    print('--FEATURE SELECTION ON--', '\n')

    ##1) Run Feature Selection #######
    if fs_type == 1:
        #Stepwise Recursive Backwards Feature removal
        if binning == 1:
            clf = RandomForestClassifier(n_estimators=200,
                                         max_depth=None,
                                         min_samples_split=3,
                                         criterion='entropy',
                                         random_state=rand_st)
            sel = RFE(clf, n_features_to_select=k_cnt, step=.1)
            print('Stepwise Recursive Backwards - Random Forest: ')
        if binning == 0:
            rgr = RandomForestRegressor(n_estimators=500,
                                        max_depth=None,
                                        min_samples_split=3,
                                        criterion='squared_error',  # 'mse' in scikit-learn releases before 1.0
                                        random_state=rand_st)
            sel = RFE(rgr, n_features_to_select=k_cnt, step=.1)
            print('Stepwise Recursive Backwards - Random Forest: ')

        fit_mod = sel.fit(data_np, target_np)
        print(sel.ranking_)
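    ##2) Get lists of selected and non-selected features #######
    # A hedged sketch of the remaining steps from the docstring above; feat_names
    # (the original column names) is an assumed variable, not defined in this excerpt.
    sel_idx = fit_mod.get_support()
    selected = [name for name, keep in zip(feat_names, sel_idx) if keep]
    dropped = [name for name, keep in zip(feat_names, sel_idx) if not keep]
    print('Selected features:', selected)
    print('Dropped features:', dropped)

    ##3) Filter columns from original dataset #######
    data_np = fit_mod.transform(data_np)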
# Example 3
def mcode(ite):
    R = 0.5

    e11 = []
    e12 = []
    e21 = []
    e22 = []
    e31 = []
    e32 = []
    e41 = []
    e42 = []
    e51 = []
    e52 = []
    e8 = []
    elaterf = []
    elaterfdis = []

    #data reading
    if ite == 0:
        ss = "lowGrade"
        url = '../lowGrade/text_lg_1.csv'
        dataframe = pandas.read_csv(url, header=None)
        array = dataframe.values
        X = array
        Y = pandas.read_csv('../lowGrade/label_lowGrade.csv', header=None)
        Y = Y.values
        Y = np.ravel(Y)
        print(Y.shape)

        for i in range(4):
            url = '../lowGrade/text_lg_' + str(i + 2) + '.csv'
            dataframe = pandas.read_csv(url, header=None)
            array = dataframe.values
            X1 = array
            print(X1.shape)
            X = np.concatenate((X, X1), axis=1)

        Xnew1 = X[:10, 0:1680]
        Xnew2 = X[:10, 1680:3360]
        Xnew3 = X[:10, 3360:5040]
        Xnew4 = X[:10, 5040:6720]
        Xnew5 = X[:10, 6720:6745]
    elif ite == 1:
        ss = "IDHCodel"
        url = '../IDHCodel/text_pr_1.csv'
        dataframe = pandas.read_csv(url, header=None)
        array = dataframe.values
        X = array
        Y = pandas.read_csv('../IDHCodel/label_IDHCodel.csv', header=None)
        Y = Y.values
        Y = np.ravel(Y)
        print(Y.shape)
        Y = Y[:10]
        for i in range(4):
            url = '../IDHCodel/text_pr_' + str(i + 2) + '.csv'
            dataframe = pandas.read_csv(url, header=None)
            array = dataframe.values
            X1 = array
            print(X1.shape)
            X = np.concatenate((X, X1), axis=1)

        Xnew1 = X[:, 0:1680]
        Xnew2 = X[:, 1680:3360]
        Xnew3 = X[:, 3360:5040]
        Xnew4 = X[:, 5040:6720]
        Xnew5 = X[:, 6720:6745]
    elif ite == 2:
        ss = "nonIDH1"
        url = '../nonIDH1/text_nonIDH1_1.csv'
        dataframe = pandas.read_csv(url, header=None)
        array = dataframe.values
        X = array
        Y = pandas.read_csv('../nonIDH1/label_nonIDH1.csv', header=None)
        Y = Y.values
        Y = np.ravel(Y)
        print(Y.shape)

        for i in range(4):
            url = '../nonIDH1/text_nonIDH1_' + str(i + 2) + '.csv'
            dataframe = pandas.read_csv(url, header=None)
            array = dataframe.values
            X1 = array
            print(X1.shape)
            X = np.concatenate((X, X1), axis=1)

        Xnew1 = X[:, 0:1680]
        Xnew2 = X[:, 1680:3360]
        Xnew3 = X[:, 3360:5040]
        Xnew4 = X[:, 5040:6720]
        Xnew5 = X[:, 6720:6745]
    else:
        ss = "progression"
        url = '../progression/text_pr_1.csv'
        dataframe = pandas.read_csv(url, header=None)
        array = dataframe.values
        X = array
        Y = pandas.read_csv('../progression/label_progression.csv',
                            header=None)
        Y = Y.values
        Y = np.ravel(Y)
        print(Y.shape)

        for i in range(4):
            url = '../progression/text_pr_' + str(i + 2) + '.csv'
            dataframe = pandas.read_csv(url, header=None)
            array = dataframe.values
            X1 = array
            print(X1.shape)
            X = np.concatenate((X, X1), axis=1)

        Xnew1 = X[:, 0:1680]
        Xnew2 = X[:, 1680:3360]
        Xnew3 = X[:, 3360:5040]
        Xnew4 = X[:, 5040:6720]
        Xnew5 = X[:, 6720:6745]
    testfile = open(("RR" + ss + "%f_%f.txt" % (R, ite)), 'w')
    erfsvm = []
    for ii in range(1):

        seed = 1000 + ii
        train_indices, test_indices = splitdata(X=X[:10, :],
                                                Y=Y,
                                                ratio=R,
                                                seed=seed)
        print("Start rest")
        # view1

        X_features_train1, X_features_test1, w1, pred1 = RR_rf_dis(
            n_trees=10,
            X=Xnew1,
            Y=Y,
            train_indices=train_indices,
            test_indices=test_indices,
            seed=seed)
        m12 = RandomForestClassifier(n_estimators=500,
                                     random_state=seed,
                                     oob_score=True,
                                     n_jobs=1).fit(X_features_train1,
                                                   Y[train_indices])
        pre1 = m12.predict(X_features_test1)
        print("finished view1")
        #e12.append(m12.score(X_features_test1, Y[test_indices]))
        #e11.append(w1)
        # view 2

        X_features_train2, X_features_test2, w2, pred2 = RR_rf_dis(
            n_trees=500,
            X=Xnew2,
            Y=Y,
            train_indices=train_indices,
            test_indices=test_indices,
            seed=seed)
        m22 = RandomForestClassifier(n_estimators=500,
                                     random_state=seed,
                                     oob_score=True,
                                     n_jobs=1).fit(X_features_train2,
                                                   Y[train_indices])
        pre2 = m22.predict(X_features_test2)
        #e22.append(m22.score(X_features_test2, Y[test_indices]))
        #e21.append(w2)

        # view 3

        X_features_train3, X_features_test3, w3, pred3 = RR_rf_dis(
            n_trees=500,
            X=Xnew3,
            Y=Y,
            train_indices=train_indices,
            test_indices=test_indices,
            seed=seed)
        m32 = RandomForestClassifier(n_estimators=500,
                                     random_state=seed,
                                     oob_score=True,
                                     n_jobs=1).fit(X_features_train3,
                                                   Y[train_indices])
        pre3 = m32.predict(X_features_test3)
        #e32.append(m32.score(X_features_test3, Y[test_indices]))
        #e31.append(w3)

        # view 4

        X_features_train4, X_features_test4, w4, pred4 = RR_rf_dis(
            n_trees=500,
            X=Xnew4,
            Y=Y,
            train_indices=train_indices,
            test_indices=test_indices,
            seed=seed)
        m42 = RandomForestClassifier(n_estimators=500,
                                     random_state=seed,
                                     oob_score=True,
                                     n_jobs=1).fit(X_features_train4,
                                                   Y[train_indices])
        pre4 = m42.predict(X_features_test4)
        #e42.append(m42.score(X_features_test4, Y[test_indices]))
        #e41.append(w4)

        # view 5

        X_features_train5, X_features_test5, w5, pred5 = RR_rf_dis(
            n_trees=500,
            X=Xnew5,
            Y=Y,
            train_indices=train_indices,
            test_indices=test_indices,
            seed=seed)
        m52 = RandomForestClassifier(n_estimators=500,
                                     random_state=seed,
                                     oob_score=True,
                                     n_jobs=1).fit(X_features_train5,
                                                   Y[train_indices])
        pre5 = m52.predict(X_features_test5)
        #e52.append(m52.score(X_features_test5, Y[test_indices]))
        #e51.append(w5)

        # Late RF
        resall1 = np.column_stack((pred1, pred2, pred3, pred4, pred5))
        Laterf = list(range(len(test_indices)))
        for i in range(len(test_indices)):
            Laterf[i], empty = Counter(resall1[i]).most_common()[0]
        LRF = accuracy_score(Y[test_indices], Laterf)
        elaterf.append(LRF)
        # Late RF dis
        resall = np.column_stack((pre1, pre2, pre3, pre4, pre5))
        LSVTres = list(range(len(test_indices)))
        for i in range(len(test_indices)):
            LSVTres[i], empty = Counter(resall[i]).most_common()[0]
        LSVTscore = accuracy_score(Y[test_indices], LSVTres)
        elaterfdis.append(LSVTscore)
        # multi view
        X_features_trainm = (X_features_train1 + X_features_train2 +
                             X_features_train3 + X_features_train4 +
                             X_features_train5) / 5
        X_features_testm = (X_features_test1 + X_features_test2 +
                            X_features_test3 + X_features_test4 +
                            X_features_test5) / 5
        mv = RandomForestClassifier(n_estimators=500,
                                    random_state=seed,
                                    oob_score=True,
                                    n_jobs=1).fit(X_features_trainm,
                                                  Y[train_indices])
        e8.append(mv.score(X_features_testm, Y[test_indices]))

        # RFSVM
        c = nLsvm_patatune(train_x=X_features_trainm,
                           train_y=Y[train_indices],
                           test_x=X_features_testm,
                           test_y=Y[test_indices])

        clf = SVC(C=c, kernel='precomputed')
        clf.fit(X_features_trainm, Y[train_indices])
        erfsvm.append(clf.score(X_features_testm, Y[test_indices]))
    testfile.write("RFSVM&%s pm%s & " %
                   (floored_percentage(np.mean(erfsvm), 2),
                    floored_percentage(np.std(erfsvm), 2)) + '\n')
    testfile.write("RFDIS &%s pm%s & " % (floored_percentage(np.mean(e8), 2),
                                          floored_percentage(np.std(e8), 2)) +
                   '\n')
    testfile.write(" LATERF&%s pm%s &" %
                   (floored_percentage(np.mean(elaterf), 2),
                    floored_percentage(np.std(elaterf), 2)) + '\n')
    testfile.write(" LATERFDIS&%s pm%s & " %
                   (floored_percentage(np.mean(elaterfdis), 2),
                    floored_percentage(np.std(elaterfdis), 2)) + '\n')
    print(ss)
    print("RFSVM&%s pm%s & " % (floored_percentage(np.mean(erfsvm), 2),
                                floored_percentage(np.std(erfsvm), 2)) + '\n')
    print("RFDIS &%s pm%s & " % (floored_percentage(np.mean(e8), 2),
                                 floored_percentage(np.std(e8), 2)) + '\n')
    print(" LATERF&%s pm%s &" % (floored_percentage(np.mean(elaterf), 2),
                                 floored_percentage(np.std(elaterf), 2)) +
          '\n')
    print(" LATERFDIS&%s pm%s & " %
          (floored_percentage(np.mean(elaterfdis), 2),
           floored_percentage(np.std(elaterfdis), 2)) + '\n')
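# floored_percentage() is used above but not defined in this excerpt; a minimal,
# hedged sketch of one plausible implementation (format a fraction as a percentage
# floored to `digits` decimal places) is given here purely as an assumption.
import math

def floored_percentage(val, digits):
    val *= 10 ** (digits + 2)
    return '{1:.{0}f}%'.format(digits, math.floor(val) / 10 ** digits)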
# Example 4
X = df.drop(['ticker'], axis=1).iloc[1:, :]
y = df.pct_chg[1:] > 0.01
#y = df.pct_chg.shift(-1)
y = df.pct_chg
X_train, X_test, y_train, y_test = train_test_split(X, y[:-1], test_size=0.02, random_state=42)

y = df.adjclose
X_train, X_test, y_train, y_test = train_test_split(X.drop(['close'], axis=1), y, test_size=0.02, random_state=42)

X_train = X.drop(['close'], axis=1).iloc[:-30,:]
X_test = X.drop(['close'], axis=1).iloc[-30:,:]
y_train = y[:-31]
y_test = y[-30:]

# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train)

pred_ = pd.Series(rf.predict(X_test))       # predict from the test features, not from y_test

pred_ = pred_.set_axis(y_test.index)        # align the predictions with the test index
errors_ = pd.DataFrame(deepcopy(y_test))
errors_['preds'] = pred_.values
confusion_matrix = pd.crosstab(errors_['pct_chg'], errors_['preds'], rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrix)

# Example 5
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix
from matplotlib import pyplot
from sklearn.model_selection import GridSearchCV

dados_dengue = pd.read_csv('dados/caso-dengue2018_C.csv', delimiter=';',  low_memory=False)

X = dados_dengue.drop(['tp_sexo','tp_classificacao_final','tp_criterio_confirmacao', 'resultado'], axis=1)
y = dados_dengue['resultado']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                             max_depth=10, max_features='auto', max_leaf_nodes=None,
                             min_impurity_decrease=0.0, min_impurity_split=None,
                             min_samples_leaf=1, min_samples_split=2,
                             min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
                             oob_score=False, random_state=0, verbose=0, warm_start=False)
rfc.fit(X_train, y_train)
rfc_predict = rfc.predict(X_test)



param_grid = [
{'n_estimators': [100, 250, 500], 'max_features': [5, 10, 'auto'],
 'max_depth': [10, 50, None], 'bootstrap': [True, False]}
]

grid_search_forest = GridSearchCV(rfc, param_grid, cv=10, scoring='roc_auc')
grid_search_forest.fit(X_train, y_train)
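# A hedged follow-up to the grid search: report the best parameters and score the
# tuned forest on the held-out split. roc_auc_score is imported above; treating
# 'resultado' as a binary label is an assumption of this sketch.
print(grid_search_forest.best_params_)
best_rfc = grid_search_forest.best_estimator_
probs = best_rfc.predict_proba(X_test)[:, 1]
print('Held-out ROC AUC: %.3f' % roc_auc_score(y_test, probs))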
# Example 6

'''
    logistic regression
'''
lr = LogisticRegression().fit(train, yTrain)
yhat = lr.predict(test)
result = gen_result(test_id, yhat)
result.to_csv('./data/submission2.csv', index=False)
print('logistic regression finished!')
    

'''
    random forest
'''
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
classifier.fit(train, yTrain)
y_pred = classifier.predict(test)
result = gen_result(test_id, y_pred)
result.to_csv('./data/submission3.csv', index=False)
print('random forest finished!')

'''
    Ada boost
'''
ada_params = {
    'n_estimators': 200,
    'learning_rate' : 0.75
}

clf = AdaBoostClassifier(**ada_params)
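# The AdaBoost block stops after constructing the classifier; a hedged sketch of
# finishing it in the same style as the models above (train, yTrain, test, test_id
# and gen_result come from the surrounding script; the submission filename is an
# assumption).
clf.fit(train, yTrain)
y_pred = clf.predict(test)
result = gen_result(test_id, y_pred)
result.to_csv('./data/submission4.csv', index=False)
print('Ada boost finished!')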
# Example 7
# Not sure which one is used by mljar
# Based on trial/error, chose 2016 for the constructor and cv_state for the train_test_split
# Documented in diary/_posts/2017-06-29....md
random_seed = [
    2016, clf_mlj.selected_algorithm.params['random_seed'],
    clf_mlj.selected_algorithm.params['train_params']['cv_state'], None
]

########################
print("Random forest with same params")
i = 0
j = 2
clf_skl = RandomForestClassifier(
    n_estimators=5,
    criterion=mljar_fit_params['criterion'],
    max_features=mljar_fit_params['max_features'],
    min_samples_split=mljar_fit_params['min_samples_split'],
    min_samples_leaf=mljar_fit_params['min_samples_leaf'],
    random_state=random_seed[i])

# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
# skf = 5
skf = StratifiedKFold(n_splits=validation_kfolds,
                      shuffle=True,
                      random_state=random_seed[j])

# http://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html
clf_skl_sig = CalibratedClassifierCV(clf_skl, cv=skf,
                                     method='isotonic')  #sigmoid')
clf_skl_sig.fit(X, y)
# Example 8
# scores = ['precision', 'recall']
# from sklearn.model_selection import GridSearchCV
# for score in scores:    
#     #model = GridSearchCV(SVC(), tuned_parameters, cv=5,scoring='%s_macro' % score)
#     model = GridSearchCV(RandomForestClassifier(), tuned_parameters,scoring='%s_macro' % score)
#     model.fit(xtrain,ytrain)
#     test_pred = model.predict(xtest)
#     train_pred = model.predict(xtrain)
#     from sklearn.metrics import confusion_matrix
#     cfmatrix1 = confusion_matrix(ytest,test_pred)
#     cfmatrix2 = confusion_matrix(ytrain,train_pred)
#     print cfmatrix1
#     print cfmatrix2
# print("Best parameters set found on development set:")
# print(model.best_params_)

#model = SVC(kernel= 'rbf', C= 100, gamma= 0.0001)
#model = SVC(kernel='linear', C=10)
model = RandomForestClassifier(n_estimators=10)
#model = RandomForestClassifier()
model.fit(xtrain,ytrain)
test_pred = model.predict(xtest)
train_pred = model.predict(xtrain)
from sklearn.metrics import confusion_matrix
cfmatrix1 = confusion_matrix(ytest,test_pred)
cfmatrix2 = confusion_matrix(ytrain,train_pred)
print(cfmatrix1)
print(cfmatrix2)

    
def random_forest(X,y):
    model_tree = RandomForestClassifier(random_state=100, n_estimators=50)
    sel_rfe_tree = RFE(estimator=model_tree, n_features_to_select=8, step=1)
    X_train_rfe_tree = sel_rfe_tree.fit_transform(X, y)
    return sel_rfe_tree.get_support()
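# A hedged usage sketch for random_forest(): apply the returned boolean support
# mask to keep only the RFE-selected columns. X (a pandas DataFrame of features)
# and y (a label Series) are assumed inputs here.
mask = random_forest(X, y)
X_selected = X.loc[:, mask]
print('Columns kept by RFE:', list(X_selected.columns))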
# Example 10
#Data for training
from sklearn.model_selection import train_test_split
y = df['Diabetes']
X = df.drop('Diabetes', axis=1)

from imblearn.over_sampling import BorderlineSMOTE
X_resampled, y_resampled = BorderlineSMOTE(kind='borderline-2').fit_resample(X, y)
YR = pd.Series(y_resampled)
XR = pd.DataFrame(X_resampled)

X_train, X_test, y_train, y_test = train_test_split(XR, YR, random_state=0)

#Model creation:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=15,
                             max_depth=None,
                             min_samples_split=4,
                             random_state=0)
#Training
clf = clf.fit(X_train, y_train)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_test, y_test)
from sklearn.metrics import log_loss, f1_score, precision_score, accuracy_score, confusion_matrix, roc_curve, auc
yrdn_pre = clf.predict(X_test)
fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)

Result = pd.DataFrame()
Result['Test'] = ["Logloss", "F1 Score", "Precision", "Accuracy", 'ROC AUC']
Result['Random F'] = [
    log_loss(y_test, yrdn_pre),
    f1_score(y_test, yrdn_pre),
# Example 11
def third_generation(X, y, size=200, seed=None):
    mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],\
                                            [0, 0.2, 0.5, 0.9],
                                            [0.1, 0.3, 0.6]))
    mlp_clf = [
        MLPClassifier(hidden_layer_sizes=(h, ),
                      momentum=m,
                      learning_rate_init=a) for (h, m, a) in mlp_parameters
    ]
    mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters]

    neigbhors_number = [int(i) for i in np.linspace(1, X.shape[0], 40)]
    weighting_methods = ['uniform', 'distance']
    knn_clf = [
        KNeighborsClassifier(n_neighbors=nn, weights=w)
        for (nn, w) in itertools.product(neigbhors_number, weighting_methods)
    ]
    knn_name = [
        'knn_{0}_{1}'.format(*param) for param in itertools.product(
            neigbhors_number, ['uniform', 'distance'])
    ]
    C = np.logspace(-3, 7, num=11)
    degree = [2, 3, 4]
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_clf_poly = [
        SVC(C=c, kernel='poly', degree=d)
        for (c, d) in itertools.product(C, degree)
    ]
    svm_clf_poly_name = [
        'svm_poly_{0}_{1}'.format(*param)
        for param in itertools.product(C, degree)
    ]
    svm_clf_rbf = [
        SVC(C=c, kernel='rbf', gamma=g)
        for (c, g) in itertools.product(C, gamma)
    ]
    svm_clf_rbf_name = [
        'svm_rbf_{0}_{1}'.format(*param)
        for param in itertools.product(C, gamma)
    ]

    dt_params = list(itertools.product(['gini', 'entropy'], \
                                       [1, 2, 3, 4, 5, None], \
                                       [None, 'sqrt', 'log2'], \
                                       ['best', 'random']))
    dt_clf = [
        DecisionTreeClassifier(criterion=c,
                               max_depth=d,
                               max_features=f,
                               splitter=s) for (c, d, f, s) in dt_params
    ]
    dt_name = ['dt_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    et_clf = [
        ExtraTreeClassifier(criterion=c,
                            max_depth=d,
                            max_features=f,
                            splitter=s) for (c, d, f, s) in dt_params
    ]
    et_name = ['et_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    ada_params = list(itertools.product([2**i for i in range(1, 14)], \
                                        [1, 2, 3]))
    ada_dt_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=DecisionTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_et_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=ExtraTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_dt_name = ['ada_dt_{0}_{1}'.format(*param) for param in ada_params]
    ada_et_name = ['ada_et_{0}_{1}'.format(*param) for param in ada_params]

    nb_bag_est = 50
    nb_bag_stumps = 200
    bag_dt = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=DecisionTreeClassifier())
    bag_et = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=ExtraTreeClassifier())
    bag_stumps = BaggingClassifier(
        n_estimators=nb_bag_stumps,
        base_estimator=DecisionTreeClassifier(max_depth=1))
    bag_dt.fit(X, y)
    bag_et.fit(X, y)
    bag_stumps.fit(X, y)
    dt_bag_clf = bag_dt.estimators_
    et_bag_clf = bag_et.estimators_
    stump_bag_clf = bag_stumps.estimators_
    dt_bag_name = ['dt_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    et_bag_name = ['et_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    stump_bag_name = [
        'stump_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_stumps)
    ]

    bag_dt_clf = [bag_dt]
    bag_et_clf = [bag_et]
    bag_stump_clf = [bag_stumps]
    bag_dt_name = ['bag_dt_{0}'.format(str(nb_bag_est))]
    bag_et_name = ['bag_et_{0}'.format(str(nb_bag_est))]
    bag_stump_name = ['bag_stump_{0}'.format(str(200))]

    nb_rf = 15
    rf = RandomForestClassifier(n_estimators=nb_rf)
    rf.fit(X, y)
    dt_rf_clf = rf.estimators_
    dt_rf_name = ['dt_rf_{0}'.format(nb_est) for nb_est in range(nb_rf)]

    log_parameters = list(itertools.product(['l1', 'l2'],\
                                            np.logspace(-5, 9, num=15),
                                            [True, False]))
    log_clf = [
        LogisticRegression(penalty=l, C=c, fit_intercept=f)
        for (l, c, f) in log_parameters
    ]
    log_name = ['log_{0}_{1}_{2}'.format(*param) for param in log_parameters]

    sgd_parameters = list(
        itertools.product([
            'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
            'squared_loss', 'huber', 'epsilon_insensitive',
            'squared_epsilon_insensitive'
        ], ['elasticnet'], [True, False], np.arange(0, 1.1, 0.1)))
    sgd_clf = [
        SGDClassifier(loss=l, penalty=p, fit_intercept=f, l1_ratio=l1)
        for (l, p, f, l1) in sgd_parameters
    ]
    sgd_name = [
        'sgd_{0}_{1}_{2}_{3}'.format(*param) for param in sgd_parameters
    ]

    pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_clf + et_clf + ada_dt_clf + ada_et_clf + \
                dt_bag_clf + et_bag_clf + stump_bag_clf + bag_dt_clf + bag_et_clf + bag_stump_clf + dt_rf_clf + \
                log_clf + sgd_clf
    pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + dt_name + et_name + ada_dt_name + \
                ada_et_name + dt_bag_name + et_bag_name + stump_bag_name + bag_dt_name + bag_et_name + \
                bag_stump_name + dt_rf_name + log_name + sgd_name

    for model in pool:
        if not check_model_is_fitted(model, X[0, :].reshape((1, -1))):
            model.fit(X, y)

    np.random.seed(seed)
    order = np.random.permutation(range(len(pool)))
    estimators = [pool[i] for i in order[:size]]

    return estimators, pool_name
def createModel(data, scoring='precision', drop_backers_count = False, drop_staff_pick = True):
    
    data = featureEngineering.prepDataFrameForPreprocessor(data, drop_backers_count = drop_backers_count, drop_staff_pick = drop_staff_pick)
    
    print(data.columns)
    
    preprocessor = featureEngineering.fitPreprocessor(data)
    
    X = data.drop("state", axis=1)
    y = data["state"] 
    
    print("before preprocessing")
    
    X = preprocessor.transform(X)
    
    print("features engineered")
    print("X",X.shape)
    print("y",y.shape)
    rf_model = RandomForestClassifier()

    param_rf = {
                "n_estimators":[1000],
                "criterion":['entropy'],
                "max_depth":[None],
                "min_samples_split":[2],
                "min_samples_leaf":[1],
                "min_weight_fraction_leaf":[0.0],
                "max_features":['auto'],
                "max_leaf_nodes":[None],
                "min_impurity_decrease":[0.0],
                "min_impurity_split":[None],
                "bootstrap":[True],
                "oob_score":[False],
                "n_jobs":[None],
                "random_state":[RSEED],
                "verbose":[0],
                "warm_start":[False],
                "class_weight":[None],
                "ccp_alpha":[0.0],
                "max_samples":[None],
                }

    grid_rf = GridSearchCV(rf_model,
                               param_grid=param_rf,
                               cv=5, 
                               scoring=scoring,
                               verbose=5, 
                               n_jobs=-1)

    grid_rf.fit(X,y)
    print("model trained")

    rf_model = grid_rf.best_estimator_
    #y_log_pred_test = lg_model.predict(X_test)

    filename = './models/modelBackerRF.sav'
    pickle.dump(rf_model, open(filename, 'wb'))
    print("model saved")
    
    filename = './models/preprocessorBackerRF.sav'
    pickle.dump(preprocessor, open(filename, 'wb'))
    print("preprocessor saved")
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", type=str, default="knn",
        help="type of python machine learning model to use")
args = vars(ap.parse_args())

# define the dictionary of models our script can use, where the key
# to the dictionary is the name of the model (supplied via command
# line argument) and the value is the model itself
models = {
        "knn": KNeighborsClassifier(n_neighbors=1),
        "naive_bayes": GaussianNB(),
        "logit": LogisticRegression(solver="lbfgs", multi_class="auto"),
        "svm": SVC(kernel="rbf", gamma="auto"),
        "decision_tree": DecisionTreeClassifier(),
        "random_forest": RandomForestClassifier(n_estimators=100),
        "mlp": MLPClassifier()
}

# load the Iris dataset and perform a training and testing split,
# using 75% of the data for training and 25% for evaluation
print("[INFO] loading data...")
dataset = load_iris()
(trainX, testX, trainY, testY) = train_test_split(dataset.data,
        dataset.target, random_state=3, test_size=0.25)

# train the model
print("[INFO] using '{}' model".format(args["model"]))
model = models[args["model"]]
model.fit(trainX, trainY)
dataset = pd.DataFrame(X)
dataset['Label'] = Y
print(dataset['Label'].unique())
print(dataset['Label'].value_counts())

##If we do not want to include pixels with value 0 
##e.g. Sometimes unlabeled pixels may be given a value 0.
dataset = dataset[dataset['Label'] != 0]

#Redefine X and Y for Random Forest
X_for_RF = dataset.drop(labels = ['Label'], axis=1)
Y_for_RF = dataset['Label']

#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 30, random_state = 42)

# Train the model on training data
model.fit(X_for_RF, Y_for_RF) 

#############################################

#Save model for future use
filename = 'RF_model.sav'
pickle.dump(model, open(filename, 'wb'))

#Load model.... 
loaded_model = pickle.load(open(filename, 'rb'))

#Test on a different image
#READ EXTERNAL IMAGE...
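# A hedged sketch of the "test on a different image" step hinted at above: read
# another image, flatten it to one row per pixel, and predict with the loaded
# model. cv2, the filename, and the assumption that the model was trained on raw
# per-pixel band values are all illustrative guesses, not part of the original.
import cv2
new_img = cv2.imread('new_image.tif')
new_features = new_img.reshape(-1, new_img.shape[-1])      # (n_pixels, n_bands)
new_pred = loaded_model.predict(new_features)
segmented = new_pred.reshape(new_img.shape[:2])             # back to image shape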
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Model Building #####
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state=0,
                                    n_estimators=100,
                                    criterion='entropy')
classifier.fit(X_train, y_train)

# Predicting Test Set
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

model_results = pd.DataFrame(
a[1].title.set_text('relative frequency')
sb.heatmap(m2, annot=True, ax=a[1])
plt.show()
print(m1)
print(m2)

# In[107]:

print(classification_report(yte, ypr))
print(accuracy_score(yte, ypr))

# # Random Forest

# In[111]:

rnf_cls = RandomForestClassifier(criterion='entropy', random_state=0)
rnf_cls.fit(Xtr.toarray(), ytr)

# In[112]:

ypr = rnf_cls.predict(Xte.toarray())

# In[113]:

f, a = plt.subplots(1, 2, figsize=(20, 8))
m1, m2 = confusion_matrix(yte, ypr), confusion_matrix(yte,
                                                      ypr,
                                                      normalize='true')
a[0].title.set_text('absolute frequency')
sb.heatmap(m1, annot=True, ax=a[0])
a[1].title.set_text('relative frequency')
# Example 17

knn.fit(X_train_std[:, k3], y_train)
print('Training accuracy:', knn.score(X_train_std[:, k3], y_train))
print('Test accuracy:', knn.score(X_test_std[:, k3], y_test))



# # Assessing feature importance with Random Forests




feat_labels = df_wine.columns[1:]

forest = RandomForestClassifier(n_estimators=500,
                                random_state=1)

forest.fit(X_train, y_train)
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, 
                            feat_labels[indices[f]], 
                            importances[indices[f]]))

plt.title('Feature Importance')
plt.bar(range(X_train.shape[1]), 
        importances[indices],
        align='center')
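# Hedged completion of the truncated bar chart: label the bars with the feature
# names and render the figure (a conventional finish for this plot, assumed here).
plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()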
# Example 18
labels = np.unique(roi[roi > 0])   # assumed from context: the class labels present in the ROI
print('The training data include {n} classes: {classes}'.format(
    n=labels.size, classes=labels))

#Print the matrix size of the image and the training data
print('Our img matrix is sized: {sz}'.format(sz=img.shape))
print('Our roi array is sized: {sz}'.format(sz=roi.shape))

#Initialize the feacture and the training data
X = img[:, :, :]
y = roi[roi > 0]

#Extra: cross validation with 5 splits

kf = KFold(n_splits=5, shuffle=True, random_state=2)

# Initialize our Random Forest model with 10 trees
rf = RandomForestClassifier(n_estimators=10, oob_score=True)

#Split the feacture and training sample into 5
for Train_index, Test_index in kf.split(X):
    X_Train, X_Test = X[Train_index], X[Test_index]
    y_Train, y_Test = y[Train_index], y[Test_index]
    print('Our X_Train is sized: {sz}'.format(sz=X_Train.shape))
    print('Our X_Test is sized: {sz}'.format(sz=X_Test.shape))

    print('Our y_Train is sized: {sz}'.format(sz=y_Train.shape))
    print('Our y_Test is sized: {sz}'.format(sz=y_Test.shape))

    nsamples, nx, ny = X_Train.shape
    d2_X_Train = X_Train.reshape((nsamples, nx * ny))
    print('Our d2_X_Train is sized: {sz}'.format(sz=d2_X_Train.shape))
    nsamples, nx, ny = X_Test.shape
# Example 19
def cross_validation(list_models, X_train, y_train, scoring, cv):

    fitted_models = {key: None for key in list_models}

    for eModel in list_models:

        if eModel == 'SGDClassifier':

            from sklearn.linear_model import SGDClassifier
            clf_sgd = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
            scores = cross_val_score(clf_sgd,
                                     X_train,
                                     y_train,
                                     scoring=scoring,
                                     cv=cv)

            fitted_models[eModel] = {}
            fitted_models[eModel]['model'] = clf_sgd
            fitted_models[eModel]['Scores'] = scores
            fitted_models[eModel]['Mean'] = scores.mean()
            fitted_models[eModel]['Standard deviation'] = scores.std()

            y_scores = cross_val_predict(clf_sgd,
                                         X_train,
                                         y_train,
                                         cv=cv,
                                         method='decision_function')

            classification_performance_measure(fitted_models, eModel, clf_sgd,
                                               X_train, y_train, cv, y_scores)

        if eModel == 'RandomForestClassifier':

            from sklearn.ensemble import RandomForestClassifier
            clf_forest = RandomForestClassifier(n_estimators=100,
                                                random_state=42)

            y_probas_forest = cross_val_predict(clf_forest,
                                                X_train,
                                                y_train,
                                                cv=cv,
                                                method="predict_proba")
            y_scores = y_probas_forest[:, 1]  # Positive class probabilities

            fitted_models[eModel] = {}
            fitted_models[eModel]['model'] = clf_forest

            classification_performance_measure(fitted_models, eModel,
                                               clf_forest, X_train, y_train,
                                               cv, y_scores)

        if eModel == 'LogisticRegression':

            from sklearn.linear_model import LogisticRegression
            train_samples = X_train.shape[0]
            #reg_log = LogisticRegression()
            #reg_log = LogisticRegression(C=50. / train_samples, penalty='l1', solver='saga', tol=0.1)
            reg_log = LogisticRegression(solver='newton-cg')

            y_probas_log = cross_val_predict(reg_log,
                                             X_train,
                                             y_train,
                                             cv=cv,
                                             method="predict_proba")
            y_scores = y_probas_log[:, 1]  # Positive class probabilities

            fitted_models[eModel] = {}
            fitted_models[eModel]['model'] = reg_log

            classification_performance_measure(fitted_models, eModel, reg_log,
                                               X_train, y_train, cv, y_scores)

        if eModel == 'SGDRegressor':

            from sklearn.linear_model import SGDRegressor
            reg_sgd = SGDRegressor(max_iter=1000,
                                   tol=1e-3,
                                   penalty=None,
                                   eta0=0.1)

            print('MISSING FITTING')

            fitted_models[eModel] = {}
            fitted_models[eModel]['model'] = reg_sgd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier
from vecstack import stacking
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn import svm
from drop_highlycorelated import clf,xtrain,ytrain,xtest,ytest,X_important_train,X_important_test

models = [
    svm.SVC(kernel='linear',C=1),
        
    RandomForestClassifier(random_state=42, n_jobs=-1, 
                          n_estimators=1000, max_depth=3),
    BaggingClassifier(svm.SVC(kernel='linear',C=1))
]

S_train, S_test = stacking(models,                      # list of models
                           X_important_train, ytrain,   # training data
                           X_important_test,            # test data
                           mode='oof_pred_bag',         # out-of-fold predictions for the train
                                                        #     set; predict the test set in each
                                                        #     fold and average the predictions
                           regression=True,             # regression task (set to False for
                                                        #     classification)
                           save_dir=None,               # do not save result and log (set to '.'
                                                        #     to save in the current dir)
                           metric=mean_absolute_error,  # metric: callable
                           n_folds=4,                   # number of folds
                           shuffle=True,                # shuffle the data
                           random_state=0,              # ensure reproducibility
                           verbose=2)
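# vecstack's usual next step is a second-level (meta) model trained on the
# out-of-fold predictions; a hedged sketch follows. The choice of a random forest
# as the meta-model is an assumption, not taken from the original.
meta_model = RandomForestClassifier(n_estimators=300, random_state=0, n_jobs=-1)
meta_model.fit(S_train, ytrain)
y_pred = meta_model.predict(S_test)
print('Stacked test accuracy:', accuracy_score(ytest, y_pred))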
# Example 21
def predefined_estimators(estimator, random_state, n_jobs, p):
    """
    Provides the classifiers and parameters used by the module

    Parameters
    -----------
    estimator : str
        Name of scikit learn estimator.

    random_state : Any number
        Seed to use in randomized components.

    n_jobs : int
        Number of processing cores to use.

    p : dict
        Classifier settings (keys) and values.

    Returns
    -------
    clf : object
        Scikit-learn classifier object

    mode : str
        Flag to indicate whether classifier performs classification or
        regression.
    """
    try:
        from sklearn.experimental import enable_hist_gradient_boosting
    except ImportError:
        pass

    from sklearn.linear_model import (
        LogisticRegression,
        LinearRegression,
        SGDRegressor,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.ensemble import (
        RandomForestClassifier,
        RandomForestRegressor,
        ExtraTreesClassifier,
        ExtraTreesRegressor,
    )
    from sklearn.ensemble import (GradientBoostingClassifier,
                                  GradientBoostingRegressor)
    from sklearn.svm import SVC, SVR
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.neural_network import MLPClassifier, MLPRegressor

    estimators = {
        "SVC":
        SVC(C=p["C"], probability=True, random_state=random_state),
        "SVR":
        SVR(C=p["C"], epsilon=p["epsilon"]),
        "LogisticRegression":
        LogisticRegression(
            C=p["C"],
            solver="liblinear",
            random_state=random_state,
            multi_class="auto",
            n_jobs=1,
            fit_intercept=True,
        ),
        "LinearRegression":
        LinearRegression(n_jobs=n_jobs, fit_intercept=True),
        "SGDClassifier":
        SGDClassifier(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            n_jobs=n_jobs,
            random_state=random_state,
        ),
        "SGDRegressor":
        SGDRegressor(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            random_state=random_state,
        ),
        "DecisionTreeClassifier":
        DecisionTreeClassifier(
            max_depth=p["max_depth"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "DecisionTreeRegressor":
        DecisionTreeRegressor(
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "RandomForestClassifier":
        RandomForestClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "RandomForestRegressor":
        RandomForestRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "ExtraTreesClassifier":
        ExtraTreesClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            bootstrap=True,
            oob_score=True,
        ),
        "ExtraTreesRegressor":
        ExtraTreesRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            bootstrap=True,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "GradientBoostingClassifier":
        GradientBoostingClassifier(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "GradientBoostingRegressor":
        GradientBoostingRegressor(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "HistGradientBoostingClassifier":
        GradientBoostingClassifier(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "HistGradientBoostingRegressor":
        GradientBoostingRegressor(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "MLPClassifier":
        MLPClassifier(
            hidden_layer_sizes=p["hidden_layer_sizes"],
            alpha=p["alpha"],
            random_state=random_state,
        ),
        "MLPRegressor":
        MLPRegressor(
            hidden_layer_sizes=p["hidden_layer_sizes"],
            alpha=p["alpha"],
            random_state=random_state,
        ),
        "GaussianNB":
        GaussianNB(),
        "LinearDiscriminantAnalysis":
        LinearDiscriminantAnalysis(),
        "QuadraticDiscriminantAnalysis":
        QuadraticDiscriminantAnalysis(),
        "KNeighborsClassifier":
        KNeighborsClassifier(n_neighbors=p["n_neighbors"],
                             weights=p["weights"],
                             n_jobs=n_jobs),
        "KNeighborsRegressor":
        KNeighborsRegressor(n_neighbors=p["n_neighbors"],
                            weights=p["weights"],
                            n_jobs=n_jobs),
    }

    # define classifier
    model = estimators[estimator]

    # classification or regression
    if (estimator == "LogisticRegression" or estimator == "SGDClassifier"
            or estimator == "MLPClassifier"
            or estimator == "DecisionTreeClassifier"
            or estimator == "RandomForestClassifier"
            or estimator == "ExtraTreesClassifier"
            or estimator == "GradientBoostingClassifier"
            or estimator == "HistGradientBoostingClassifier"
            or estimator == "GaussianNB"
            or estimator == "LinearDiscriminantAnalysis"
            or estimator == "QuadraticDiscriminantAnalysis"
            or estimator == "SVC" or estimator == "KNeighborsClassifier"):
        mode = "classification"
    else:
        mode = "regression"

    return (model, mode)
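# A hedged usage sketch for predefined_estimators(): the p dict must provide every
# key the chosen estimator reads; the values below are illustrative assumptions.
params = {"n_estimators": 200, "max_features": "sqrt", "min_samples_leaf": 1}
clf, mode = predefined_estimators("RandomForestClassifier", random_state=0,
                                  n_jobs=-1, p=params)
print(mode)  # -> "classification"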
data_train = pd.concat(frames)
data_train.info()

bb=data_train.iloc[:, 4:6749]

cc = bb.apply(lambda x: x.fillna(x.mean()), axis=0)
cc['tag'] = data_train.iloc[:, 3:4]
test = dfp.iloc[:, 2:6744].apply(lambda x: x.fillna(x.mean()), axis=0)
Xtest = adalist
x_data_output = dfp.iloc[:, 0:1].values

predictors = adalist


alg = RandomForestClassifier(random_state=1, n_estimators=62, min_samples_split=2, min_samples_leaf=1)

kf = model_selection.KFold(n_splits=33, shuffle=False)
scores = model_selection.cross_val_score(alg, cc[predictors], cc['tag'], cv=kf)
print("scores.mean=", scores.mean())

File = open("data/prob_radomforest_features.txt", "w",encoding=u'utf-8', errors='ignore')
File.write("id"+",")
File.write("prob" + "\n")
classifier = alg.fit(cc[predictors], cc['tag'])
predictiontest = classifier.predict_proba(test)
for step in range(len(test)):
    File.write(str(x_data_output[step])+",")
    File.write(str(predictiontest[step]) + "\n")

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier

# define a dictionary for different classifiers and their parameters
classifiers = {
    "Dummy"        : DummyClassifier(strategy='uniform', random_state=2),
    "KNN(3)"       : KNeighborsClassifier(3), 
    "RBF SVM"      : SVC(gamma=2, C=1), 
    "Decision Tree": DecisionTreeClassifier(max_depth=7), 
    "Random Forest": RandomForestClassifier(max_depth=7, n_estimators=10, max_features=4), 
    "xgboost"      : XGBClassifier(),
    "Neural Net"   : MLPClassifier(alpha=1), 
    "AdaBoost"     : AdaBoostClassifier(),
    "Naive Bayes"  : GaussianNB(), 
    "QDA"          : QuadraticDiscriminantAnalysis(),
    "Linear SVC"   : LinearSVC(),
    "Linear SVM"   : SVC(kernel="linear"), 
    "Gaussian Proc": GaussianProcessClassifier(1.0 * RBF(1.0)),
}
from time import time
nfast = 10      # Run only the first nfast learners; skip the very slow ones at the end
head = list(classifiers.items())[:nfast]

for name, classifier in head:
    start = time()                     # remember starting training time
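    # Hedged completion of the truncated timing loop: fit on the training split and
    # report the elapsed time. Xtr.toarray() and ytr are assumed from the surrounding
    # notebook and are not shown in this excerpt.
    classifier.fit(Xtr.toarray(), ytr)
    print('%s trained in %.1f s' % (name, time() - start))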
def RandomForest(Number_leaves, X_train, y_train, X_test):
    clf = RandomForestClassifier(n_estimators = 100, min_samples_leaf=Number_leaves,class_weight = "balanced") #define decision tree with selected variable
    clf = clf.fit(X_train,y_train) #train the decision tree on training set
    y_pred = clf.predict(X_test) #predict the values of X_test
    
    return(clf, y_pred)
# Example 25
test_data[test_data[0::,9] == '',9] = np.round(np.mean(test_data[test_data[0::,9]\
                                                   != '',9].astype(float)))
#All the missing prices assume the median of their respective class
for i in range(np.size(test_data[0::, 0])):
    if test_data[i, 7] == '':
        test_data[i,7] = np.median(test_data[(test_data[0::,7] != '') &\
                                             (test_data[0::,0] == test_data[i,0])\
            ,7].astype(float))

test_data = np.delete(test_data, [1, 6],
                      1)  #remove the name data, cabin and ticket

#The data is now ready to go. So lets train then test!

print('Training')
forest = RandomForestClassifier(n_estimators=100)

forest = forest.fit(train_data[0::,1::],\
                    train_data[0::,0])

print('Predicting')
output = forest.predict(test_data)

open_file_object = csv.writer(open("../csv/myfirstforest.csv", "w", newline=""))
test_file_object = csv.reader(open('../csv/test.csv',
                                   newline=''))  #Load in the csv file

next(test_file_object)   # skip the header row
i = 0
for row in test_file_object:
    row.insert(0, output[i].astype(np.uint8))
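    # Hedged completion of the truncated loop: write the row out and advance the
    # prediction index (assumed from context).
    open_file_object.writerow(row)
    i += 1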
# Example 26
def build_model():
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])
    return pipeline
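# A hedged usage sketch for build_model(): fit the pipeline on raw message text and
# multi-output targets, then predict. X_messages (an iterable of strings) and
# Y_labels (a 2-D array of category labels) are assumed names, not from the original.
model = build_model()
model.fit(X_messages, Y_labels)
Y_pred = model.predict(X_messages)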
# Example 27
y = lbl.fit_transform(y)
y
#male convert to 1
#female convert to 0

#split the dataset into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
acc_scores = []
roc_scores = []
clf = RandomForestClassifier(n_estimators=150)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)
y_pred = clf.predict(X_test)
acc_scores.append(accuracy_score(y_test, y_pred))
roc_scores.append(roc_auc_score(y_test, y_pred))
acc_scores[0], roc_scores[0]

import pickle
pickle.dump(clf, open('model.pkl', 'wb'))

# Loading model to compare the results
model = pickle.load(open('model.pkl', 'rb'))
print(
    model.predict([[
        0.077315503, 0.083829421, 0.036718459, 0.008701057, 0.131908017,


train_X = train[feature_list]
train_y = train['AdoptionSpeed']

random_grid = {'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 500, num = 5)],
               'max_features': ['auto', 'sqrt'],
               'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

#skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

classifier = RandomForestClassifier(oob_score=True)  # oob_score=True so .oob_score_ below is available

param_search = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)

param_search.fit(train_X, train_y)



fitted_classifier = classifier.fit(train_X, train_y)

fitted_classifier.oob_score_
cross_val_score(fitted_classifier, train_X, train_y, cv=5, scoring='f1_macro')



predictions = classifier.predict(train_X)
# Example 29
#filename='blood.csv'
#filename='2dplanes.csv'
filename = 'custom_satisfaction.csv'

ns = [1, 2, 4, 8, 16, 32]
for n in ns:
    data = pd.read_csv(filename)
    data = data.drop('ID', axis=1)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    #f=int(math.log(X.shape[1]+1,2))

    # In[24]:

    start = time.time()
    clf = RandomForestClassifier(n_estimators=n, random_state=0)
    clf.fit(X, y)
    taken = time.time() - start
    print(taken)
    pd.DataFrame([[filename, taken, n]]).to_csv('rf_sk.txt',
                                                mode='a',
                                                index=False,
                                                header=False)

    # In[25]:

    start = time.time()
    clf = BaggingClassifier(n_estimators=n, random_state=0)
    clf.fit(X, y)
    taken = time.time() - start
    print(taken)
# Example 30
#plot.target(X)

y = X['Type']  #(target we want to predict)

X.drop('Type', axis=1, inplace=True)
#print(X.head())

X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                      y,
                                                      train_size=0.8,
                                                      test_size=0.2,
                                                      random_state=0)

###first model
model = RandomForestClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_valid)
print(confusion_matrix(y_valid, predictions))
print("Standard Random Forrest: ", model.score(X_valid, y_valid))

###model with paramet optimization

#### n_estimators

ns = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
scores = []
for n in ns:
    model = RandomForestClassifier(n_estimators=n)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
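    # The excerpt stops mid-loop; a hedged completion that stores each model's
    # validation score in the `scores` list initialised above and reports the
    # best setting afterwards.
    scores.append(model.score(X_valid, y_valid))

print('best n_estimators:', ns[scores.index(max(scores))],
      'validation accuracy:', max(scores))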