Example #1
def test_predictproba_hardvoting():
    eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
                                        ('lr2', LogisticRegression())],
                            voting='hard')
    msg = "predict_proba is not available when voting='hard'"
    assert_raise_message(AttributeError, msg, eclf.predict_proba, X)
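By contrast, predict_proba does become available with voting='soft'; a minimal sketch, using a synthetic dataset in place of the test fixture's X:

from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

X_toy, y_toy = make_classification(n_samples=40, random_state=0)
eclf_soft = VotingClassifier(estimators=[('lr1', LogisticRegression()),
                                         ('lr2', LogisticRegression())],
                             voting='soft')
eclf_soft.fit(X_toy, y_toy)
print(eclf_soft.predict_proba(X_toy)[:3])  # class probabilities averaged over the estimators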
Example #2
    })).tolist()))
newArr = []

for row in s:
    t = row.split(',')
    t = np.array(t)
    t = t.astype(float)
    newArr.append(t)

accelArr = np.array(newArr)
actionArr = np.array(dfs['Action'])

# Create classifiers
knn = KNeighborsClassifier(n_neighbors=3)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
eclf1 = VotingClassifier(estimators=[('rf', clf2), ('knn', knn)],
                         voting='hard')
eclf1 = eclf1.fit(accelArr, actionArr)


def detection_callback(device, advertisement_data):
    """Asynch Callback

    Args:
        device : Bleak device object
        advertisement_data : Advertisement Data Read
    """

    global aidenBool
    global georgeBool
    try:
        if (device.address == aidenThingy or device.address
Example #3
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   cv=3,
                                   n_iter=10,
                                   verbose=10)
random_search.fit(X_to_file, y_to_file[0])
print(random_search.best_params_)

clf4 = SVC(**random_search.best_params_, probability=True)
scores = cross_val_score(clf4, X_to_file, y, cv=9)
print(scores)
print(np.mean(scores))

from sklearn.ensemble import VotingClassifier
eclf = VotingClassifier(estimators=[('rf', clf1), ('kn', clf2), ('nb', clf3),
                                    ('svm', clf4)],
                        voting='soft')

scores = cross_val_score(eclf, X_to_file, y, cv=9)
print(scores)
print(np.mean(scores))
'''
clf_list = [
        RandomForestClassifier(n_estimators = 100, min_samples_leaf=2, min_samples_split=6),
            
        #SVC(kernel='rbf', degree=2, gamma='auto'),

        KNeighborsClassifier(n_neighbors=10, p=4),

        #GaussianNB(),
        MultinomialNB(),
Example #4
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    Nystroem(gamma=10.0, kernel="polynomial", n_components=10),
    make_union(
        VotingClassifier([("est",
                           KNeighborsClassifier(n_neighbors=4,
                                                weights="distance"))]),
        FunctionTransformer(lambda X: X)),
    make_union(
        VotingClassifier([("est",
                           ExtraTreesClassifier(criterion="entropy",
                                                max_features=1.0,
                                                n_estimators=500))]),
        FunctionTransformer(lambda X: X)),
    FeatureAgglomeration(affinity="precomputed", linkage="average"),
    GaussianNB())

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Example #5
#         2. Soft voting: uses the predicted probabilities
# 2. bagging
#     multiple (resampled) datasets + the same ML model
# 3. boosting
#     repeated rounds of training and prediction
#     mistakes from the first round are re-weighted and corrected in the next round; repeat train/predict
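# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original example): the comments above describe
# bagging and boosting, but the code below only demonstrates voting, so here is
# a minimal illustration of the other two ideas on a synthetic dataset.
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
# bagging: many bootstrap datasets + the same estimator
bag = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, random_state=0).fit(X_demo, y_demo)
# boosting: sequential rounds, each re-weighting the previous round's mistakes
boost = AdaBoostClassifier(n_estimators=50, random_state=0).fit(X_demo, y_demo)
print("bagging train acc:", bag.score(X_demo, y_demo),
      "boosting train acc:", boost.score(X_demo, y_demo))
# ---------------------------------------------------------------------------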

cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
print(df)
# Individual models
lc_r = LogisticRegression(max_iter=10000)  # max_iter -> maximum number of iterations
knn_clf = KNeighborsClassifier(n_neighbors=4)

# Voting ensemble that combines the individual models
vo_clf = VotingClassifier(estimators=[("LR", lc_r), ("KNN", knn_clf)], voting="soft")

# Split into training and test data
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2, random_state=11)

# Train and predict
vo_clf.fit(x_train, y_train)
prediction = vo_clf.predict(x_test)

# Accuracy
print("VotingClassifier accuracy:", accuracy_score(y_test, prediction))

# Accuracy of each individual model
models = [lc_r, knn_clf, vo_clf]
for m in models:
    m.fit(x_train, y_train)
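    # Hedged continuation (the original snippet is cut off here): report each model's
    # test accuracy, assuming accuracy_score is imported as it is used above.
    pred_m = m.predict(x_test)
    print(m.__class__.__name__, "accuracy:", accuracy_score(y_test, pred_m))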
Example #6
    clf.fit(X_train, y_train)

    # Predict y_pred
    y_pred = clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_pred, y_test)

    # Evaluate clf's accuracy on the test set
    print('{:s} : {:.3f}'.format(clf_name, accuracy))

# Import VotingClassifier from sklearn.ensemble
from sklearn.ensemble import VotingClassifier

# Instantiate a VotingClassifier vc
vc = VotingClassifier(estimators=classifiers)

# Fit vc to the training set
vc.fit(X_train, y_train)

# Evaluate the test set predictions
y_pred = vc.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_pred, y_test)
print('Voting Classifier: {:.3f}'.format(
    accuracy))  # Better accuracy than the three other classifiers

# After the Voting Classifier, we take a look at bagging classifiers and regressors

# Import DecisionTreeClassifier
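The snippet stops right at the bagging section it announces; a hedged sketch of where it is heading, reusing the X_train/X_test split from above with purely illustrative hyperparameters:

from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Sketch only: hyperparameters are illustrative, not taken from the original exercise
bc = BaggingClassifier(DecisionTreeClassifier(random_state=1), n_estimators=300, n_jobs=-1)
bc.fit(X_train, y_train)
y_pred_bc = bc.predict(X_test)
print('Bagging Classifier: {:.3f}'.format(accuracy_score(y_test, y_pred_bc)))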
Example #7
    if acc_score > best_score:
        best_score = acc_score
        best_model = bag

best_model
pred = best_model.predict(X_test)
accuracy_score(y_test, pred)

# svc, random forest, and logistic regression in a voting classifier

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier(random_state=181)
svm_clf = SVC(kernel='linear', random_state=181)

voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf),
                                          ('svc', svm_clf)],
                              voting='hard')

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, pred))

# CV in the voting classifier (OBS takes approx. 20 min)

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier(random_state=181)
svm_clf = SVC(kernel='linear', probability=True, random_state=181)

voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf),
                                          ('svc', svm_clf)],
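The call above is cut off; a hedged sketch of the likely continuation (soft voting is suggested by probability=True, and the comment announces cross-validation):

from sklearn.model_selection import cross_val_score

voting_clf_soft = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf),
                                               ('svc', svm_clf)],
                                   voting='soft')
cv_scores = cross_val_score(voting_clf_soft, X_train, y_train, cv=5, scoring='accuracy')
print('Soft voting CV accuracy: %.3f +/- %.3f' % (cv_scores.mean(), cv_scores.std()))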
Example #8
    Features_10_folds.append(features)
    Labels_10_folds.append(labels)

#for last fold all remaining
features = data[data.columns[1:44]]
labels = data[data.columns[44]]
Features_10_folds.append(features)
Labels_10_folds.append(labels)

print(Features_10_folds[0].shape)

clf = RandomForestClassifier(max_depth=8, random_state=0)
mlp = MLPClassifier(hidden_layer_sizes=(50, 25))
gbt = GradientBoostingClassifier()
ovr = OneVsRestClassifier(RandomForestClassifier())
eclf = VotingClassifier(estimators=[('gbt', gbt), ('ovr', ovr)], voting='soft')
acc_RF = []
prec_RF = []
rec_RF = []
f1_RF = []
acc_MLP = []
prec_MLP = []
rec_MLP = []
f1_MLP = []
acc_GBT = []
prec_GBT = []
rec_GBT = []
f1_GBT = []
acc_ovr = []
prec_ovr = []
rec_ovr = []
def test_estimator_html_repr_pipeline():
    num_trans = Pipeline(
        steps=[("pass",
                "passthrough"), ("imputer", SimpleImputer(strategy="median"))])

    cat_trans = Pipeline(steps=[
        ("imputer",
         SimpleImputer(strategy="constant", missing_values="empty")),
        ("one-hot", OneHotEncoder(drop="first")),
    ])

    preprocess = ColumnTransformer([
        ("num", num_trans, ["a", "b", "c", "d", "e"]),
        ("cat", cat_trans, [0, 1, 2, 3]),
    ])

    feat_u = FeatureUnion([
        ("pca", PCA(n_components=1)),
        (
            "tsvd",
            Pipeline([
                ("first", TruncatedSVD(n_components=3)),
                ("select", SelectPercentile()),
            ]),
        ),
    ])

    clf = VotingClassifier([
        ("lr", LogisticRegression(solver="lbfgs", random_state=1)),
        ("mlp", MLPClassifier(alpha=0.001)),
    ])

    pipe = Pipeline([("preprocessor", preprocess), ("feat_u", feat_u),
                     ("classifier", clf)])
    html_output = estimator_html_repr(pipe)

    # top level estimators show estimator with changes
    assert html.escape(str(pipe)) in html_output
    for _, est in pipe.steps:
        assert ('<div class="sk-toggleable__content"><pre>' +
                html.escape(str(est))) in html_output

    # low level estimators do not show changes
    with config_context(print_changed_only=True):
        assert html.escape(str(num_trans["pass"])) in html_output
        assert "passthrough</label>" in html_output
        assert html.escape(str(num_trans["imputer"])) in html_output

        for _, _, cols in preprocess.transformers:
            assert f"<pre>{html.escape(str(cols))}</pre>" in html_output

        # feature union
        for name, _ in feat_u.transformer_list:
            assert f"<label>{html.escape(name)}</label>" in html_output

        pca = feat_u.transformer_list[0][1]
        assert f"<pre>{html.escape(str(pca))}</pre>" in html_output

        tsvd = feat_u.transformer_list[1][1]
        first = tsvd["first"]
        select = tsvd["select"]
        assert f"<pre>{html.escape(str(first))}</pre>" in html_output
        assert f"<pre>{html.escape(str(select))}</pre>" in html_output

        # voting classifier
        for name, est in clf.estimators:
            assert f"<label>{html.escape(name)}</label>" in html_output
            assert f"<pre>{html.escape(str(est))}</pre>" in html_output
Example #10
# Decision tree
dt_clf = DecisionTreeClassifier(random_state=666)
dt_clf.fit(X_train, y_train)
dt_score = dt_clf.score(X_test, y_test)
y_predict1 = log_clf.predict(X_test)
y_predict2 = svm_clf.predict(X_test)
y_predict3 = dt_clf.predict(X_test)
y_predict = np.array((y_predict1 + y_predict2 + y_predict3) >= 2, dtype='int')
score1 = accuracy_score(y_test, y_predict)
print(score1)
# Use a Voting Classifier: majority rule
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()), ('svm_clf', SVC()),
    ('dt_clf', DecisionTreeClassifier(random_state=666))
],
                              voting='hard')
voting_clf.fit(X_train, y_train)
score2 = voting_clf.score(X_test, y_test)
print(score2)
# A more reasonable vote should assign weights to the estimators
# Hard Voting Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()), ('svm_clf', SVC()),
    ('dt_clf', DecisionTreeClassifier(random_state=666))
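The listing is cut off here; a hedged sketch of the weighted, soft-voting variant the comments point toward (SVC needs probability=True before it can take part in soft voting; the weights are illustrative, not from the original):

voting_clf2 = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm_clf', SVC(probability=True)),
    ('dt_clf', DecisionTreeClassifier(random_state=666))
],
                               voting='soft',
                               weights=[1, 1, 2])  # illustrative weights
voting_clf2.fit(X_train, y_train)
print(voting_clf2.score(X_test, y_test))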
Example #11
# 1.0

# <h4> Voting Classifier </h4>


from sklearn.linear_model import LogisticRegression  # import logistic regression
from sklearn.svm import SVC  # import SVM

estimators = []
log_reg = LogisticRegression(solver='liblinear')
estimators.append(('Logistic', log_reg))

tree = DecisionTreeClassifier()
estimators.append(('Tree', tree))

svm_clf = SVC(gamma='scale')
estimators.append(('svm', svm_clf))

voting = VotingClassifier(estimators=estimators)
voting.fit(x_train, y_train)

voting.score(x_test,y_test)
# 0.8051948051948052


voting.score(x_train,y_train)
#0.8110749185667753
Example #12
df['carrier'] = pd.factorize(df['carrier'])[0]
df['dest'] = pd.factorize(df['dest'])[0]
test_x = enc.transform(df)

print(train_x.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                        voting='hard')

eclf.fit(train_x.toarray(), train_y)

# Evaluate on test set
pr = eclf.predict(test_x.toarray())

# print results
cm = confusion_matrix(test_y, pr)
print "<-------  VotingClassifier -------->"
print "Confusion matrix:"
print pd.DataFrame(cm)
report_svm = precision_recall_fscore_support(list(test_y),
                                             list(pr),
                                             average='binary')
print "\n[-] Precision = %0.2f\n[-] Recall = %0.2f\n[-] F1 score = %0.2f\n[-] Accuracy = %0.2f" % \
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# -------------------------------------------------------------------------------- #
print_dividing_line('VotingClassifier')

from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()
from sklearn.svm import SVC
svc_clf = SVC()
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()

from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier( estimators=[("log", log_clf), ("svc", svc_clf), ("rf", rf_clf)], voting="hard" ) # soft

from sklearn.metrics import accuracy_score
for clf in ( log_clf, svc_clf, rf_clf, voting_clf ):
  clf.fit( X_train, y_train )
  y_pred = clf.predict( X_test )
  print( clf.__class__.__name__, accuracy_score(y_test, y_pred) )

# -------------------------------------------------------------------------------- #
print_dividing_line('BaggingClassifier')

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
bag_clf = BaggingClassifier( DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1 )

from sklearn.metrics import accuracy_score
         tprRF,
         '',
         label="SuperStrength with Random Forest, auc= %0.2f" % aucRF)
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive')
plt.ylabel('True Positive')
plt.legend(loc=4)
plt.show()
'''
####################################### VOTING_CLASSIFIER  ########################################
'''
from sklearn.ensemble import VotingClassifier

votingClf = VotingClassifier(estimators=[('tr', classificadorTREE),
                                         ('rf', classificadorRF),
                                         ('nb', classificadorNB)],
                             voting='soft',
                             weights=[1.1, 2, 1])

for clf, label in zip(
    [classificadorTREE, classificadorRF, classificadorNB, votingClf],
    ['Decision Tree', 'Random Forest', 'Naive Bayes', 'Ensemble']):

    scores = cross_val_score(clf, previsores, classe, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f [%s]" % (scores.mean(), label))
'''
#################################################################################################
############################################ ENSEMBLE ###########################################
#################################################################################################
'''
Example #15
     "slotprice",
     "creative",
     "keypage",
     "advertiser",
     "usertag",
 ]
 """
 estimators = []
 num_estimators = 11
 models_already_created = False
 do_LR = True
 if models_already_created:
     for i in range(num_estimators):
         model = load("svm" + str(i))
         estimators.append(("svm" + str(i), model))
     v = VotingClassifier(estimators, n_jobs=-1)
     voting_bidder = Bidder(("voting", v), None)
     voting_bidder.train()
     voting_bidder.test()
 elif do_LR:
     """
     kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
     clf3 = GaussianProcessClassifier(
         kernel=kernel,
         max_iter_predict=100,
         multi_class="one_vs_one",
         n_jobs=-1,
         n_restarts_optimizer=5,
     )
     b5 = Bidder(("gp", clf3), None)
     model5 = b5.train()
        Xtr = scaler.fit_transform(Xtr)
        Xte = scaler.transform(Xte)
        for name, clf in Classifiers:
            try:
                clone_clf = clone(clf)
                clone_clf.fit(Xtr,ytr)
                y_pred = clone_clf.predict(Xte)
                df_sim[name] = [score(yte,y_pred)]
            except:
                print("Classifier %s failed to process dataset %s" % (name,Name))
        df = pd.concat([df,df_sim])
    df.to_csv("CSVs/%s.csv" % Name)
    return df

VotingSVC = VotingClassifier([("RBF SVC",SVC(gamma="scale")),
             ("Linear SVC",SVC(kernel="linear")),
             ("Poly SVC",SVC(kernel="poly"))])
BaggingSVC = BaggingClassifier(base_estimator=SVC(gamma="scale"),n_estimators=10, random_state=0)
Classifiers = [("Linear SVC",SVC(kernel="linear",gamma="scale")), 
               ("RBF SVC",SVC(gamma="scale")),
               ("Poly SVC",SVC(kernel="poly",gamma="scale")),
               ("SVC Ensemble",VotingSVC),
               ("Bagging SVC",BaggingSVC),
               ("DEP",DEP()),
               ("r-DEP (Ensemble)",make_pipeline(EnsembleTransform(VotingSVC),StandardScaler(),DEP())),
               ("r-DEP (Bagging)",make_pipeline(EnsembleTransform(BaggingSVC),StandardScaler(),DEP())),
              ]

AllDataSets = [
    ("Breast Cancer Wisconsin","wdbc",1),
    ("Diabetes","diabetes",1),
Example #17
# VotingClassifier requires unique estimator names, so each entry gets a distinct label
m3 = RandomForestClassifier(n_estimators=80)
models.append(('r_forest_80', m3))
m4 = RandomForestClassifier(n_estimators=90)
models.append(('r_forest_90', m4))
m5 = KNeighborsClassifier(n_neighbors=1)
models.append(('knn_1', m5))
m6 = KNeighborsClassifier(n_neighbors=2)
models.append(('knn_2', m6))
m7 = KNeighborsClassifier(n_neighbors=3)
models.append(('knn_3', m7))
m8 = KNeighborsClassifier(n_neighbors=4)
models.append(('knn_4', m8))
m9 = KNeighborsClassifier(n_neighbors=5)
models.append(('knn_5', m9))
# create voting ensemble
e = VotingClassifier(models, weights=[0.8, 0.9, 1, 1.1, 1.1, 1.1, 1, 0.9, 0.8])
e.fit(train_X.values, np.ravel(train_Y.values))
preds = e.predict(test_X.values)
print(accuracy_score(np.ravel(test_Y.values), preds))
'''
df_pred = pd.DataFrame(preds, columns=['coverType_1to7'])
df_pred.insert(loc=0, column='id', value=np.ravel(df_test_X_ids.values))
print(df_pred[:10])
df_pred.to_csv('voting_4forst_5knn_weights.csv', index=False)
'''
# get cv result
# result = model_selection.cross_val_score(e, df_X.values, np.ravel(df_Y.values)
#    , cv=k)
# print(result.mean())

# predictions = clf.predict(df_test_X.values[:20000])
print("After Standardization\nMean ", np.mean(X_train), "Standard Deviation ",
      np.std(X_train), "\n")

#Voting ensemble method. Combining all tree based algorithms.
models = []
models.append(("XGB", XGBClassifier()))
models.append(("RF", RandomForestClassifier()))
models.append(("DT", DecisionTreeClassifier()))
models.append(("ADB", AdaBoostClassifier()))
models.append(("GB", GradientBoostingClassifier()))

#############################################################################
# test and train the upsampled data against classifiers
# to find the optimum prediction
#############################################################################
ensemble = VotingClassifier(estimators=models)
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)
print(classification_report(y_test, y_pred))
print("Voting Ensemble:>", accuracy_score(y_test, y_pred))

SVM = SVC(kernel="linear", class_weight="balanced", probability=True)
SVM.fit(X_train, y_train)
y_pred = SVM.predict(X_test)
print(classification_report(y_test, y_pred))
print("SVM: ", accuracy_score(y_test, y_pred))

XGBC = XGBClassifier(learning_rate=0.1,
                     n_estimators=10000,
                     max_depth=4,
                     min_child_weight=6,
Example #19
def perform():
    try:
      filename = session['filename']
      data = pd.read_csv(filename, header=0)

      array = data.values
      X = array[:,0:8]
      y = array[:,8]
      X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.33,random_state=1)
 
 
# Spot Check Algorithms
      models = []
      models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
      #models.append(('LDA', LinearDiscriminantAnalysis()))
      models.append(('KNN', KNeighborsClassifier()))
      models.append(('CART', DecisionTreeClassifier(criterion="entropy")))
      models.append(('NB', GaussianNB()))
      #models.append(('SVM', SVC(gamma='auto')))
      #models.append(('ANN', MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500, random_state=42)))
 
      models.append(('RF', RandomForestClassifier()))
 
      models.append(('BG',BaggingClassifier(DecisionTreeClassifier(), max_samples= 0.5, max_features =1.0, n_estimators =20)))

      models.append(('ADA',AdaBoostClassifier(DecisionTreeClassifier(),n_estimators = 5, learning_rate = 1)))

      LR=LogisticRegression(solver='liblinear', multi_class='ovr')
      LDA=LinearDiscriminantAnalysis()
      KNN=KNeighborsClassifier()
      CART=DecisionTreeClassifier(criterion="entropy")
      RF=RandomForestClassifier()
      NB=GaussianNB()
      SVM=SVC(gamma='auto')
      ANN=MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500, random_state=42)
      BG=BaggingClassifier(DecisionTreeClassifier(), max_samples= 0.5, max_features = 1.0, n_estimators =20)
      ADA=AdaBoostClassifier(DecisionTreeClassifier(),n_estimators = 5, learning_rate = 1)
 
      models.append(('VOTE',VotingClassifier( estimators= [('RF',RF),('BG',BG)], voting = 'hard')))

# evaluate each model in turn
      results = []
      names = []
      for name, model in models:
        kfold = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
        cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
        results.append(cv_results)
        names.append(name)
        # print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

# Compare Algorithms
      plt.boxplot(results, labels=names)
      plt.title('Algorithm Comparison')
      plt.plot()
      session['strFile']="./static/images/perf.png"

      strFile = "./static/images/perf.png"

      if os.path.isfile(strFile):
        os.remove(strFile)
      plt.savefig(strFile)
      plt.close()

    except KeyError:
      flash('Dataset not uploaded!')

    #plt.savefig('/home/saurabh/Desktop/DPS/static/images/perf.png')
 
    
    return render_template('dashboard.html', name = 'Plot Showing Accuracy of Different Algorithms:', url ='/static/images/perf.png')
Example #20
for clf, label in zip([rfc, gbc, lgr], clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring="roc_auc")
    print("ROC AUC: %0.2F (+/- %0.2f) [%s]" %
          (scores.mean(), scores.std(), label))

print(
    "------------------------VotingClassifier集成模型-------------------------------"
)
ensemble_clf = VotingClassifier(estimators=[
    ('RandomForestClassifier', rfc), ('GradientBoostingClassifier', gbc),
    ('LogisticRegression', lgr)
],
                                voting='soft',
                                weights=[1, 1, 1],
                                flatten_transform=True)

ensemble_clf.fit(X_train, y_train)
preds = ensemble_clf.predict(X_test)
print("VotingClassifier ROC AUC:%.3f" %
      roc_auc_score(y_true=y_test, y_score=preds))
print("VotingClassifier accuracy_scorer:%.3f" %
      accuracy_score(y_true=y_test, y_pred=preds))

clf_labels = [
    "RandomForestClassifier", "GradientBoostingClassifier",
    "LogisticRegression", "VotingClassifier"
]
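Since the ensemble uses voting='soft', ROC AUC can also be computed from predicted probabilities rather than hard labels, which is the more usual convention; a short sketch, assuming a binary target as in the printout above:

proba = ensemble_clf.predict_proba(X_test)[:, 1]  # positive-class probability (assumes a binary target)
print("VotingClassifier ROC AUC (from probabilities): %.3f" %
      roc_auc_score(y_true=y_test, y_score=proba))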
Example #21
class HogFaceClassifier:

    svc_pipeline = Pipeline([
        # ('preprocess', FunctionTransformer(get_face_hog)),
        ('classifier',
         SVC(C=10,
             kernel='poly',
             gamma=1,
             shrinking=False,
             class_weight='balanced',
             probability=True,
             tol=0.001,
             cache_size=10000,
             max_iter=-1,
             verbose=0))
    ])

    et_pipeline = Pipeline([
        # ('preprocess', FunctionTransformer(get_face_hog)),
        ('classifier',
         ExtraTreesClassifier(n_estimators=10000,
                              criterion='entropy',
                              max_features=0.2,
                              verbose=0,
                              n_jobs=2))
    ])

    ens = VotingClassifier(estimators=[
        ('svc', svc_pipeline),
        ('et', et_pipeline),
    ],
                           voting='soft',
                           weights=[10, 1],
                           n_jobs=2)

    def __init__(self,
                 binary_classification: bool = False,
                 params: dict = None):
        self.binary_classification = binary_classification
        #self.ens = self.et_pipeline
        print(str(self.ens.get_params().keys()))
        if params is not None:
            self.ens.set_params(**params)

    def fit(self, x, y):
        self.ens.fit(x, y)

    def save(self, file):
        joblib.dump(self.ens, file)

    def load(self, file):
        self.ens = joblib.load(file)

    def cv_test(self, x, y):
        score = cross_val_score(self.ens, x, y, cv=3, verbose=3, n_jobs=3)
        return score.mean()

    def prediction(self, X):
        return self.ens.predict(X)

    def evaluate(self, x_test, y_test) -> ClassificationResults:
        preds = self.prediction(x_test)
        pred_probs = self.ens.predict_proba(x_test)
        acc = accuracy_score(from_hot_one(y_test), preds)
        results = ClassificationResults(labels=y_test,
                                        preds=preds,
                                        pred_probs=pred_probs,
                                        acc=acc,
                                        binary=self.binary_classification)
        return results
    def file_output(
        self,
        Y_optimization_pred: np.ndarray,
        Y_valid_pred: np.ndarray,
        Y_test_pred: np.ndarray,
    ) -> Tuple[Optional[float], Dict[str, Union[str, int, float, List, Dict,
                                                Tuple]]]:
        # Abort if self.Y_optimization is None
        # self.Y_optimization can be None if we use partial-cv, then,
        # obviously no output should be saved.
        if self.Y_optimization is None:
            return None, {}

        # Abort in case of shape misalignment
        if np.shape(self.Y_optimization)[0] != Y_optimization_pred.shape[0]:
            return (
                1.0,
                {
                    'error':
                    "Targets %s and prediction %s don't have "
                    "the same length. Probably training didn't "
                    "finish" %
                    (np.shape(self.Y_optimization), Y_optimization_pred.shape)
                },
            )

        # Abort if predictions contain NaNs
        for y, s in [
                # Y_train_pred deleted here. Fix unittest accordingly.
            [Y_optimization_pred, 'optimization'],
            [Y_valid_pred, 'validation'],
            [Y_test_pred, 'test']
        ]:
            if y is not None and not np.all(np.isfinite(y)):
                return (
                    1.0,
                    {
                        'error':
                        'Model predictions for %s set contains NaNs.' % s
                    },
                )

        # Abort if we don't want to output anything.
        # Since disable_file_output can also be a list, we have to explicitly
        # compare it with True.
        if self.disable_file_output is True:
            return None, {}

        # Notice that disable_file_output==False and disable_file_output==[]
        # means the same thing here.
        if self.disable_file_output is False:
            self.disable_file_output = []

        # Here onwards, the self.disable_file_output can be treated as a list
        self.disable_file_output = cast(List, self.disable_file_output)

        # This file can be written independently of the others down below
        if ('y_optimization' not in self.disable_file_output):
            if self.output_y_hat_optimization:
                self.backend.save_targets_ensemble(self.Y_optimization)

        models: Optional[BaseEstimator] = None
        if hasattr(self, 'models'):
            if len(self.models) > 0 and self.models[
                    0] is not None:  # type: ignore[attr-defined]
                if ('models' not in self.disable_file_output):

                    if self.task_type in CLASSIFICATION_TASKS:
                        models = VotingClassifier(
                            estimators=None,
                            voting='soft',
                        )
                    else:
                        models = VotingRegressor(estimators=None)
                    # Mypy cannot understand hasattr yet
                    models.estimators_ = self.models  # type: ignore[attr-defined]

        self.backend.save_numrun_to_dir(
            seed=self.seed,
            idx=self.num_run,
            budget=self.budget,
            model=self.model
            if 'model' not in self.disable_file_output else None,
            cv_model=models
            if 'cv_model' not in self.disable_file_output else None,
            ensemble_predictions=(Y_optimization_pred if 'y_optimization'
                                  not in self.disable_file_output else None),
            valid_predictions=(Y_valid_pred if 'y_valid'
                               not in self.disable_file_output else None),
            test_predictions=(Y_test_pred if 'y_test'
                              not in self.disable_file_output else None),
        )

        return None, {}
Example #23
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier


# In[42]:

clf1=LogisticRegression(random_state=101)
clf2=RandomForestClassifier(random_state=101)
clf3=GaussianNB()


# In[43]:

X=df.drop(['Observed Attendance'],axis=1)
y=df['Observed Attendance']
eclf1=VotingClassifier(estimators=[('lr',clf1),('rf',clf2),('gnb',clf3)],weights=(1,2,3),voting='hard')
eclf1=eclf1.fit(X_train,y_train)
eclf1=eclf1.fit(X,y)
print(eclf1.predict(X))


# In[44]:

eclf2=VotingClassifier(estimators=[('lr',clf1),('rf',clf2),('gnb',clf3)],weights=(1,2,3),voting='hard')
eclf2=eclf2.fit(X_train,y_train)
predict=eclf2.predict(X_test)
print(classification_report(y_test,predict))


# In[45]:
Example #24
    print('Sensitivity_std:%f' % r_s)
    print('Specificity_std:%f' % s_s)
    print('f1_std:%f' % f_s)


if __name__ == '__main__':
    # clf_lr = MLPClassifier(activation='relu',alpha=0.005)
    # clf_lr.fit(data.iloc[:, :-1], data.iloc[:, -1])
    clf_lr = LogisticRegression(C=0.07)
    ada_lr = AdaBoostClassifier(clf_lr, n_estimators=20)
    clf_svm = SVC(gamma=30, probability=True)
    ada_svm = AdaBoostClassifier(clf_svm, n_estimators=20, algorithm="SAMME")
    clf_nb = MultinomialNB(alpha=10)
    ada_nb = AdaBoostClassifier(clf_nb, n_estimators=20)
    #	clf_dt=DecisionTreeClassifier()
    #	ada_dt=AdaBoostClassifier(clf_dt,n_estimators=20, learning_rate=0.5)
    voting_clf = VotingClassifier(estimators=[("ada_lr", ada_lr),
                                              ("ada_svm", ada_svm),
                                              ("ada_nb", ada_nb)],
                                  voting='soft')  #,weights=[1.2,1,1]

    for clf in [("lr", clf_lr), ("svm", clf_svm), ("NB", clf_nb),
                ('ensemble', voting_clf)]:
        print(clf[0])
        result(data, clf[1])

    for clf in [("lr", clf_lr), ("svm", clf_svm), ("NB", clf_nb),
                ('ensemble', voting_clf)]:
        print(clf[0])
        std(data, clf[1])
Example #25
    print('************Stats of '+col+'****************')
    print(encoded_ds[col].describe())
    plt.hist(encoded_ds[col])
    plt.show()   
    print('************End of stats for '+col+'*************')
    print('\n')

 
    
    
#build X and y
X = encoded_ds.iloc[:,0:-1] #all columns except last column
y = integer_encoded_churn 
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)    
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_rf = RandomForestClassifier(random_state=1, n_estimators=100, max_depth=100, max_leaf_nodes=100)
clf_lr = LogisticRegression(class_weight="balanced")
estimators = [('knn', clf_knn), ('lr', clf_lr), ('dt', clf_rf)]
clf_avg = VotingClassifier(estimators,voting='soft')
clf_avg.fit(X_train,y_train)
print(accuracy_score(y_test, clf_avg.predict(X_test)))
y_pred = [0, 2, 1, 3]
y_true = [0, 1, 2, 3]
print(accuracy_score(y_true, y_pred))


plt.figure(figsize=(12,10))
cor = encoded_ds.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()
Example #26
# The predictions are quite similar for the 5 classifiers, except when AdaBoost is compared to the other classifiers.
# 
# The 5 classifiers give more or less the same prediction, but there are some differences. These differences between the five classifiers' predictions are enough to justify an ensembling vote.

# ### 6.2 Ensemble modeling
# #### 6.2.1 Combining models
# 
# I chose a voting classifier to combine the predictions coming from the 5 classifiers.
# 
# I preferred to pass the argument "soft" to the voting parameter so that the probability of each vote is taken into account.

# In[75]:


votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best),
                                       ('svc', SVMC_best), ('adac', ada_best),
                                       ('gbc', GBC_best)],
                           voting='soft', n_jobs=4)

votingC = votingC.fit(X_train, Y_train)


# ### 6.3 Prediction
# #### 6.3.1 Predict and Submit results

# In[76]:


test_Survived = pd.Series(votingC.predict(test), name="Survived")

results = pd.concat([IDtest,test_Survived],axis=1)

results.to_csv("ensemble_python_voting.csv",index=False)
Example #27
# k fold cross validation
kfolds = KFold(n_splits=10, shuffle=True, random_state=0)  # shuffle must be enabled when random_state is set

# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model3))

estimators

ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X_train, y_train, cv=kfolds)
print(results.mean())

modelfit = ensemble.fit(X_train, y_train)

y_pred = modelfit.predict(X_test)
y_pred

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

accuracy_score(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred)
cm
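classification_report is imported above but never used; a one-line sketch to close that loop:

print(classification_report(y_test, y_pred))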
Example #28
import numpy as np

from sklearn.model_selection import train_test_split  # sklearn.cross_validation has been removed; use model_selection
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(
        make_union(VotingClassifier([('branch',
            AdaBoostClassifier(learning_rate=1.0, n_estimators=500)
        )]), FunctionTransformer(lambda X: X)),
        FunctionTransformer(lambda X: X)
    ),
    Nystroem(gamma=8.0, kernel="polynomial", n_components=27),
    LogisticRegression(C=0.0001, dual=False, penalty="l1", solver="liblinear")  # liblinear supports the l1 penalty and was the original default solver
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Example #29
print("-------------------------------------------------")

#define a decision tree model using entropy based information gain

#decTreeModel2 = tree.DecisionTreeClassifier(criterion='entropy')
#decTreeModel2 = AdaBoostClassifier()
#decTreeModel2 = GaussianNB()
#decTreeModel2 = GradientBoostingClassifier()
#decTreeModel2 = BaggingClassifier()

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GradientBoostingClassifier()
clf4 = SVC()
decTreeModel2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                             ('gnb', clf3), ('bc', clf4)],
                                 voting='hard')

#decTreeModel2 = LogisticRegression(random_state=1)
#decTreeModel2 = LogisticRegression(random_state=1)

#train_dfs = preprocessing.normalize(train_dfs)
#Split the data: 60% training : 40% test set
instances_train, instances_test, target_train, target_test = cross_validation.train_test_split(
    train_dfs, targetLabels, test_size=0.4, random_state=0)
#fit the model using just the test set
decTreeModel2.fit(instances_train, target_train)
#Use the model to make predictions for the test set queries
predictions = decTreeModel2.predict(instances_test)
#Output the accuracy score of the model on the test set
print("Accuracy= " +
Example #30
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(eclf1.transform(X),
                              eclf2.transform(X))
    assert_array_almost_equal(
            eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
            eclf2.transform(X)
    )


@pytest.mark.filterwarnings('ignore: Default solver will be changed')  # 0.22
@pytest.mark.filterwarnings('ignore: Default multi_class will')  # 0.22
@pytest.mark.parametrize(
    "X, y, voter",
    [(X, y, VotingClassifier(
        [('lr', LogisticRegression()),
         ('rf', RandomForestClassifier(n_estimators=5))])),
     (X_r, y_r, VotingRegressor(
         [('lr', LinearRegression()),
          ('rf', RandomForestRegressor(n_estimators=5))]))]
)
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_none_estimator_with_weights(X, y, voter, drop):
    # check that an estimator can be set to None and passing some weight
    # regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/13777
    voter.fit(X, y, sample_weight=np.ones(y.shape))
    voter.set_params(lr=drop)
    voter.fit(X, y, sample_weight=np.ones(y.shape))
    y_pred = voter.predict(X)
    assert y_pred.shape == y.shape