Пример #1
0
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    """End-to-end check of StackingClassifier on pre-scaled iris data.

    Exercises fit/predict/predict_proba/score, the column count of
    ``transform`` with and without passthrough, and dropping a base
    estimator via ``set_params``.
    """
    # Scale up front instead of using a pipeline so the raw features can
    # be compared against the passthrough columns below (also avoids
    # convergence warnings).
    X_train, X_test, y_train, y_test = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42
    )
    base_learners = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    stacker = StackingClassifier(
        estimators=base_learners,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )
    stacker.fit(X_train, y_train)
    stacker.predict(X_test)
    stacker.predict_proba(X_test)
    assert stacker.score(X_test, y_test) > 0.8

    stacked = stacker.transform(X_test)
    # 2 estimators x 3 classes = 6 columns; +4 raw features if passthrough.
    n_expected = 10 if passthrough else 6
    assert stacked.shape[1] == n_expected
    if passthrough:
        assert_allclose(X_test, stacked[:, -4:])

    # Dropping 'lr' leaves a single 3-class base estimator.
    stacker.set_params(lr='drop')
    stacker.fit(X_train, y_train)
    stacker.predict(X_test)
    stacker.predict_proba(X_test)
    if final_estimator is None:
        # The default final estimator (LogisticRegression) exposes
        # decision_function.
        stacker.decision_function(X_test)

    stacked = stacker.transform(X_test)
    n_expected = 7 if passthrough else 3
    assert stacked.shape[1] == n_expected
    if passthrough:
        assert_allclose(X_test, stacked[:, -4:])
Пример #2
0
def test_stacking_classifier_iris(cv, final_estimator):
    """Smoke-test StackingClassifier on pre-scaled iris data.

    Data is scaled beforehand (rather than in a pipeline) so the later
    assertions run without convergence warnings.
    """
    split = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42
    )
    X_train, X_test, y_train, y_test = split

    stacker = StackingClassifier(
        estimators=[('lr', LogisticRegression()), ('svc', LinearSVC())],
        final_estimator=final_estimator,
        cv=cv,
    )
    stacker.fit(X_train, y_train)
    stacker.predict(X_test)
    stacker.predict_proba(X_test)
    assert stacker.score(X_test, y_test) > 0.8

    # Two 3-class base estimators -> 6 stacked probability columns.
    assert stacker.transform(X_test).shape[1] == 6

    # Dropping 'lr' leaves a single base estimator (3 columns).
    stacker.set_params(lr='drop')
    stacker.fit(X_train, y_train)
    stacker.predict(X_test)
    stacker.predict_proba(X_test)
    if final_estimator is None:
        # The default final estimator (LogisticRegression) exposes
        # decision_function.
        stacker.decision_function(X_test)

    assert stacker.transform(X_test).shape[1] == 3
Пример #3
0
def run():
    """Train an XGBoost+GBDT stacking classifier on the Australian credit
    dataset, print evaluation metrics, and plot the ROC curve.

    Reads ``./data/australian.csv`` (whitespace-separated, no header,
    binary target in column 14), shows the ROC plot, returns nothing.
    """
    import pandas as pd
    import matplotlib.pyplot as pyplot
    from sklearn.model_selection import train_test_split
    from xgboost import XGBClassifier
    from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import StackingClassifier

    # Raw string for the regex separator: '\s+' without r'' is an invalid
    # escape sequence (DeprecationWarning, SyntaxWarning in newer Python).
    df = pd.read_table("./data/australian.csv", sep=r'\s+', header=None)
    y = df[14]  # column 14 holds the binary target
    X = df.drop(columns=14)

    # Stratified 60/40 split preserves the class balance in both sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1, stratify=y, test_size=0.4)

    # Stack XGBoost and gradient boosting; a logistic regression combines
    # their out-of-fold predictions.
    estimators = [('xgb', XGBClassifier()),
                  ('gbdt', GradientBoostingClassifier(random_state=1))]
    clf = StackingClassifier(estimators=estimators,
                             final_estimator=LogisticRegression())
    clf.fit(X_train, y_train)

    # Evaluate on the held-out test set.
    y_pred_test = clf.predict(X_test)
    print(classification_report(y_test, y_pred_test))

    # Keep probabilities for the positive outcome only.
    clf_probs = clf.predict_proba(X_test)[:, 1]
    clf_auc = roc_auc_score(y_test, clf_probs)
    print('ensemble: ROC AUC=%.3f' % (clf_auc))
    print("accuracy_score is %.3f" % (accuracy_score(y_test, y_pred_test, normalize=True)))

    # Plot the ROC curve for the ensemble.
    clf_fpr, clf_tpr, _ = roc_curve(y_test, clf_probs)
    pyplot.plot(clf_fpr, clf_tpr, marker='.', label='Ensemble')
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.legend()
    pyplot.show()
Пример #4
0
def week10(C, random_state, criterion, min_samples_leaf, max_leaf_samples,
           n_estimators, solver, cv, clazz, images):
    """Train SVM, bagged-tree and random-forest base learners on the
    cats-vs-dogs training histograms, stack them with a logistic-regression
    final estimator, and score each image in `images` for class `clazz`.

    Returns a dict with the stack's training accuracy and a list of
    (image_name, probability) pairs.
    """
    trainData, Y = catsvsdogs.train
    # Flip the 0/1 labels — presumably so `clazz` indexes the intended
    # class; TODO confirm against the dataset's label order.
    Y = [(y + 1) % 2 for y in Y]

    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier

    tree = DecisionTreeClassifier(
        criterion=criterion,  # split criterion
        min_samples_leaf=min_samples_leaf,  # minimum number of samples per leaf
        max_leaf_nodes=max_leaf_samples,  # maximum number of leaf nodes
        random_state=random_state)
    bagging = BaggingClassifier(
        tree,  # base estimator
        n_estimators=n_estimators,  # number of trees
        random_state=random_state)
    bagging.fit(trainData, Y)

    from sklearn.svm import LinearSVC

    svm = LinearSVC(random_state=random_state, C=C)
    svm.fit(trainData, Y)

    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(
        n_estimators=n_estimators,  # number of trees
        criterion=criterion,  # split criterion
        min_samples_leaf=min_samples_leaf,  # minimum number of samples per leaf
        max_leaf_nodes=max_leaf_samples,  # maximum number of leaf nodes
        random_state=random_state)
    forest.fit(trainData, Y)

    from sklearn.linear_model import LogisticRegression

    # Meta-learner for the stack; left unfitted — StackingClassifier fits it.
    lr = LogisticRegression(solver=solver, random_state=random_state)

    from sklearn.ensemble import StackingClassifier

    base_estimators = [('SVM', svm), ('Bagging DT', bagging),
                       ('DecisionForest', forest)]
    sclf = StackingClassifier(estimators=base_estimators,
                              final_estimator=lr,
                              cv=cv)
    sclf.fit(trainData, Y)

    # NOTE(review): accuracy is measured on the training data, not a
    # held-out set.
    accuracy = sclf.score(trainData, Y)

    # Probability of class `clazz` for each requested test image.
    probas = []
    for img in images:
        histt = catsvsdogs.test[img].reshape(1, -1)
        probas += [(img, sclf.predict_proba(histt)[0][clazz])]

    return {'accuracy': accuracy, 'probas': probas}
Пример #5
0
def test_stacking_classifier_drop_estimator():
    """A 'drop' entry must behave exactly like omitting the estimator."""
    # Pre-scaled data avoids convergence warnings without a pipeline.
    X_train, X_test, y_train, _ = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42
    )
    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf_reference = StackingClassifier(
        estimators=[('svc', LinearSVC(random_state=0))],
        final_estimator=rf,
        cv=5,
    )
    clf_with_drop = StackingClassifier(
        estimators=[('lr', 'drop'), ('svc', LinearSVC(random_state=0))],
        final_estimator=rf,
        cv=5,
    )

    clf_reference.fit(X_train, y_train)
    clf_with_drop.fit(X_train, y_train)
    # All prediction surfaces must coincide.
    for method in ('predict', 'predict_proba', 'transform'):
        assert_allclose(
            getattr(clf_reference, method)(X_test),
            getattr(clf_with_drop, method)(X_test),
        )
def Model_1(train, test):
    ''' Trains the model and Saves the predictions in a CSV file
        train : Training set
        test : Test set
    '''
    # Preprocessing: one-hot encode each sequence character-wise.
    # BUG FIX: the encoder was previously fit separately on the test
    # sequences, which can produce a different (incompatible) column
    # layout than the training matrix. Fit once on train, reuse for test;
    # handle_unknown='ignore' zero-fills characters unseen in training.
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_train = encoder.fit_transform(
        [[x for x in s] for s in train['Sequence']])
    X_test = encoder.transform(
        [[x for x in s] for s in test['Sequence']])
    Y_train = train['label']

    # Balance the classes, then shuffle (both with a fixed seed).
    X_train, Y_train = RandomUnderSampler(random_state=100).fit_resample(
        X_train, Y_train)
    X_train, Y_train = shuffle(X_train, Y_train, random_state=100)

    # Training: stack a random forest, an MLP and a kNN, with a logistic
    # regression as the final estimator.
    estimators = [('rf',
                   RandomForestClassifier(n_estimators=300,
                                          max_depth=45,
                                          min_samples_leaf=7,
                                          random_state=100)),
                  ('mlp', MLPClassifier(max_iter=200, random_state=100)),
                  ('knn', KNeighborsClassifier(n_neighbors=4))]

    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(random_state=100),
        n_jobs=-1,
        verbose=1)
    clf.fit(X_train, Y_train)

    # Predicting: write positive-class probabilities and hard labels.
    Y_pred = clf.predict(X_test)
    Y_prob = [x[1] for x in clf.predict_proba(X_test)]
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    result["Label"] = Y_prob
    result.to_csv("Submission_1.csv", index=False)
    result["Label"] = Y_pred
    result.to_csv("Predictions_1.csv", index=False)
Пример #7
0
def ensemble_predictions(members, X_te, params):
    """Combine member probabilities for X_te.

    params["type"] selects the scheme: "weighted" averages the members'
    predict_proba outputs with params["weights"]; "stacked" fits a
    StackingClassifier (logistic-regression meta-learner) on
    params["X_tr"] / params["y_tr"] and returns its probabilities.
    """
    assert params["type"] in ("weighted", "stacked")

    if params["type"] == "weighted":
        member_probas = np.array(
            [member.predict_proba(X_te) for member in members])
        # Weighted mean across ensemble members.
        return np.average(member_probas, weights=params["weights"], axis=0)

    named_members = [(f'expert_{i}', member)
                     for i, member in enumerate(members)]
    clf = StackingClassifier(estimators=named_members,
                             final_estimator=LogisticRegression())
    X_tr = params["X_tr"]
    print(X_tr.columns.tolist())
    y_tr = params["y_tr"]

    clf.fit(X_tr, y_tr.values.ravel())

    return clf.predict_proba(X_te)
    # ('1_5', DecisionTreeClassifier(max_depth=9)),   
    # ('1_6', RandomForestClassifier(max_depth=12, n_estimators=13, max_features=11)),
    ]
# Stack the base learners with a strongly-regularised logistic regression
# as the meta-learner; cv=10 controls the internal out-of-fold predictions.
stack_clf = StackingClassifier(estimators=base_learners,
                          final_estimator=LogisticRegression(C=0.123456789, solver="lbfgs", max_iter=5000),  
                          cv=10)
# stack_clf.fit(xtrain, ytrain)
# stack_acc=stack_clf.score(xtest, ytest)
# print('stack_acc',stack_acc)
print('1')
# Mean cross-validated ROC AUC of the whole stack on the training set.
score=cross_validate(stack_clf, X_train, y_train, cv=3, scoring="roc_auc")["test_score"].mean()
print(f"{score:.6f}")

stack_clf.fit(X_train, y_train)

# Positive-class probabilities for the submission file.
pred=stack_clf.predict_proba(X_test)[:,1]

pd.DataFrame({"id": original_test["id"], "target": pred}).to_csv("stacking_submission.csv", index=False)
from sklearn.ensemble import VotingClassifier
# Base learners for a soft-voting ensemble; commented-out entries were
# tried and kept for reference.
base_learners  = [
    # ('1_1',AdaBoostClassifier()),
    # ('1_2',GaussianNB()),
    #('1_1',MLPClassifier(alpha=1, max_iter=1000)),
    ('1_2',LogisticRegression(C=0.123456789, solver="lbfgs", max_iter=5000)),
    ('1_4', KNeighborsClassifier(n_neighbors=7,weights='distance'))
    # ('1_5', DecisionTreeClassifier(max_depth=9)),   
    # ('1_6', RandomForestClassifier(max_depth=12, n_estimators=13, max_features=11)),
    ]
# voting='soft' averages predicted probabilities instead of hard labels.
v_clf = VotingClassifier(estimators=base_learners,voting='soft')
Пример #9
0
# Two-column feature matrix: the Yamnet prediction and the SVM prediction
# for each sample, produced earlier in the file.
X = pd.DataFrame({'Yamnet': y_predicted_yamnet, 'SVM': pd.Series(y_pred_svm)})

estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('svr',
               make_pipeline(StandardScaler(), LinearSVC(random_state=42)))]
clf = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression())

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)
clf.fit(X_train, y_train).score(X_test,
                                y_test)  #  y_test == y_real.iloc[X_test.index]
y_pred_combined = clf.predict_proba(
    X_test)[:, 1]  # The probability of getting the output as 1 (cough)
Confusion_Matrix(y_test, y_pred_combined, pred_prob=True)

# Same evaluation on the full dataset (train + test).
y_pred_combined = clf.predict_proba(X)[:, 1]
y_real, y_predicted_combined = Confusion_Matrix(y,
                                                y_pred_combined,
                                                pred_prob=True)

# Sanity check on a single hand-crafted sample.
X_new = pd.DataFrame({'Yamnet': [0], 'SVM': [0.95]})
clf.predict_proba(X_new)[:, 1]

# Import Joblib Module from Scikit Learn

import joblib

# Save RL_Model to file in the current working directory
Пример #10
0
def run(dataset, config):
    """Benchmark entry point: fit a sklearn stacking ensemble on `dataset`.

    Builds a StackingClassifier for classification tasks and a
    StackingRegressor otherwise, trains it on the encoded train split, and
    returns the framework `result` with predictions on the test split.
    """
    log.info(
        f"\n**** Stacking Ensemble [sklearn v{sklearn.__version__}] ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    # Framework params without a '_' prefix are forwarded verbatim to the
    # stacking estimator; '_'-prefixed ones configure this wrapper.
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config
    # Per-estimator overrides, e.g. framework_params['_rf_params'].
    estimators_params = {
        e: config.framework_params.get(f'_{e}_params', {})
        for e in ['rf', 'gbm', 'linear', 'svc', 'final']
    }

    log.info(
        "Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores."
        .format(config.max_runtime_seconds, n_jobs))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".
        format(config.metric))

    if is_classification:
        estimator = StackingClassifier(
            estimators=[
                ('rf',
                 RandomForestClassifier(n_jobs=n_jobs,
                                        random_state=config.seed,
                                        **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingClassifier(random_state=config.seed,
                                            **estimators_params['gbm'])),
                ('linear',
                 SGDClassifier(n_jobs=n_jobs,
                               random_state=config.seed,
                               **estimators_params['linear'])),
                # ('svc', LinearSVC(random_state=config.seed, **estimators_params['svc']))
            ],
            # final_estimator=SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']),
            final_estimator=LogisticRegression(n_jobs=n_jobs,
                                               random_state=config.seed,
                                               **estimators_params['final']),
            stack_method='predict_proba',
            n_jobs=n_jobs,
            **training_params)
    else:
        estimator = StackingRegressor(
            estimators=[
                ('rf',
                 RandomForestRegressor(n_jobs=n_jobs,
                                       random_state=config.seed,
                                       **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingRegressor(random_state=config.seed,
                                           **estimators_params['gbm'])),
                ('linear',
                 SGDRegressor(random_state=config.seed,
                              **estimators_params['linear'])),
                ('svc',
                 LinearSVR(random_state=config.seed,
                           **estimators_params['svc']))
            ],
            # final_estimator=SGDRegressor(random_state=config.seed, **estimators_params['final']),
            # BUG FIX: LinearRegression has no `random_state` parameter —
            # passing one raised TypeError on every regression task.
            final_estimator=LinearRegression(n_jobs=n_jobs,
                                             **estimators_params['final']),
            n_jobs=n_jobs,
            **training_params)

    with utils.Timer() as training:
        estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    # Probabilities only exist for classifiers.
    probabilities = estimator.predict_proba(
        X_test) if is_classification else None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(estimator.estimators_) + 1,
                  training_duration=training.duration)
Пример #11
0
# Stratified split of the features/labels defined earlier in the file.
X_train , X_test ,y_train, y_test = train_test_split(X , y , stratify = y)

# `basetree` is only used to initialise the gradient-boosting model below;
# AdaBoost here uses its default base estimator.
basetree = DecisionTreeClassifier( criterion="gini" , min_samples_split=0.4)
clf1 = AdaBoostClassifier(n_estimators=50 , learning_rate=0.5)



# 1
# basetree= DecisionTreeClassifier( criterion="entropy" , min_samples_split=0.4)
# AdaBoostClassifier(basetree,n_estimators=50 , learning_rate=0.5)
# 0.86


clf2 = GradientBoostingClassifier(init=basetree ,subsample=0.8 , max_features=0.8)


# Baseline: AdaBoost alone, scored by ROC AUC on the positive class.
clf1.fit(X_train , y_train)
roc_auc_score(y_test , clf1.predict_proba(X_test)[:,1])


from sklearn.ensemble import StackingClassifier

# Stack both boosters; the default final estimator (LogisticRegression)
# is used since none is specified.
clf = StackingClassifier(
        n_jobs=-1
        ,estimators=[ ('gbc' , clf2)
        , ('abc' , clf1) ])

clf.fit(X_train , y_train)
roc_auc_score(y_test , clf.predict_proba(X_test)[:,1])
Пример #12
0
# Despite the name, `reg` is a classifier: a stack of the earlier-defined
# `estimators` with a kNN meta-learner.
reg = StackingClassifier(estimators=estimators,
                         final_estimator=KNeighborsClassifier(n_neighbors=11))
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

acc = accuracy_score(y_test, y_pred)
print("accuracy score %0.2f%%" % (acc * 100))

#ROC and AUC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Keep probabilities of the positive class only.
clf_probs = reg.predict_proba(x_test)
clf_probs = clf_probs[:, 1]
print(clf_probs)
ras = roc_auc_score(y_test, clf_probs)
print("Logistic : ROC AUC = %.3f" % (ras))
from sklearn.preprocessing import label_binarize

# Binarise labels {1,2} -> {0,1} for the ROC curve.
y = label_binarize(y_test, classes=[1, 2])
n_classes = y.shape[1]
fpr, tpr, _ = roc_curve(y, clf_probs)
plt.figure()
lw = 2
plt.plot(fpr,
         tpr,
         color="orange",
         lw=lw,
from sklearn.linear_model import SGDClassifier

# NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1+;
# verify against the pinned sklearn version.
sgdClass = SGDClassifier(loss='log', random_state=2, verbose=2)

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=11)

# `logreg` and `dtc` are defined earlier in the file.
models_considered = [('Logistic Regression', logreg), ('Dtree', dtc),
                     ('sgd', sgdClass), ('knn', knn)]

from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=2000, n_estimators=500, verbosity=2)

from sklearn.ensemble import StackingClassifier

# Stack the four base models; passthrough=True appends the raw features
# to the base models' probabilities before the XGBoost meta-learner.
stack = StackingClassifier(estimators=models_considered,
                           final_estimator=xgb,
                           stack_method="predict_proba",
                           passthrough=True)

stack.fit(X_train, y_train)

# Positive-class probabilities, scored by ROC AUC.
y_pred_prob = stack.predict_proba(X_test)[:, 1]

from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_pred_prob)
Пример #14
0
# varimp.sort_values(by="values", ascending=True, inplace=True)
# sns.scatterplot(x=varimp["values"],y=varimp["features"])


#_____________________________________________________________________________________________________________________________
#Stacking Classifier



# NOTE(review): chained assignment — `est` and `estimators` are the same list.
est=estimators = [('rf', RandomForestClassifier(random_state=4, max_features="auto", min_samples_leaf=5,min_samples_split=4, n_estimators=1500)), 
                  ('lr', LogisticRegression(penalty="l2", max_iter=500))]
meta=GradientBoostingClassifier(learning_rate=0.01, n_estimators=500, max_features="sqrt", min_samples_leaf=5)
# passthrough=True feeds the raw features to the GBM meta-learner as well.
ms=StackingClassifier(estimators=est, final_estimator=meta, stack_method="predict_proba", passthrough=True).fit(TrainX, TrainY)


# Threshold the validation probabilities at 0.5 and score.
Validpred_prob=ms.predict_proba(ValidX)[:,1]
Validpred=np.where(Validpred_prob>0.50,1,0)
confusion_matrix(Validpred, ValidY)
f1_score(Validpred, ValidY)
# Percentage accuracy from the confusion-matrix diagonal.
RFS=sum(np.diagonal(confusion_matrix(Validpred, ValidY)))/ValidY.shape[0]*100
#_______________________________________________________________________________


#_____________________________________________________________________________________________________________________________

#finaloutput
Test_pred_prob=ms.predict_proba(TestX)[:,1]
Survived=np.where(Test_pred_prob>0.50,1,0)
# NOTE(review): `id` is the Python builtin unless it was rebound earlier
# in the file — confirm the PassengerId source.
submission=pd.DataFrame({"PassengerId":id, "Survived":Survived})
submission.to_csv("titanic01_RFLRGB_GB.csv", index=False)
Пример #15
0
# `sclf` (the stacking classifier) and the data splits are defined
# earlier in the file.
sclf.fit(train_data, train_label)
tra_label = sclf.predict(train_data)  # predicted labels for the training set
tes_label = sclf.predict(test_data)  # predicted labels for the test set
print("训练集:", accuracy_score(train_label, tra_label))
print("测试集:", accuracy_score(test_label, tes_label))

# Sensitivity / specificity from the 2x2 confusion matrix.
matrix = confusion_matrix(test_label, tes_label, labels=[0, 1])
TP = matrix[1, 1]
TN = matrix[0, 0]
FP = matrix[0, 1]
FN = matrix[1, 0]
sn = TP / (TP + FN)  # sensitivity (recall of the positive class)
sp = TN / (TN + FP)  # specificity

# ROC curve from positive-class probabilities.
decision_score = sclf.predict_proba(test_data)
fprs, tprs, thresholds = roc_curve(test_label, decision_score[:, 1])

# plt.plot(fprs, tprs)
# plt.show()
roc_auc = auc(fprs, tprs)
plt.figure()
lw = 2
plt.plot(fprs,
         tprs,
         color='darkorange',
         lw=lw,
         label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
from sklearn.utils.estimator_checks import parametrize_with_checks


# Run scikit-learn's estimator-compliance check suite against each listed
# estimator; pytest generates one test per (estimator, check) pair.
@parametrize_with_checks([LogisticRegression(), DecisionTreeRegressor()])
def test_sklearn_compatible_estimator(estimator, check):
    check(estimator)


# %%
# ROC AUC now supports multiclass classification
# ----------------------------------------------
# The :func:`roc_auc_score` function can also be used in multi-class
# classification. Two averaging strategies are currently supported: the
# one-vs-one algorithm computes the average of the pairwise ROC AUC scores, and
# the one-vs-rest algorithm computes the average of the ROC AUC scores for each
# class against all other classes. In both cases, the multiclass ROC AUC scores
# are computed from the probability estimates that a sample belongs to a
# particular class according to the model. The OvO and OvR algorithms support
# weighting uniformly (``average='macro'``) and weighting by the prevalence
# (``average='weighted'``).
#
# Read more in the :ref:`User Guide <roc_metrics>`.

from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

# 4-class synthetic problem; probability=True enables predict_proba on SVC.
X, y = make_classification(n_classes=4, n_informative=16)
clf = SVC(decision_function_shape='ovo', probability=True).fit(X, y)
# Multiclass AUC via the one-vs-one averaging strategy.
print(roc_auc_score(y, clf.predict_proba(X), multi_class='ovo'))
Пример #17
0
]
#('rf', grid_search_rf.best_estimator_)]

# In[257]:

# Stack the earlier-defined base learners; out-of-fold class probabilities
# feed the logistic-regression meta-learner.
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(
        random_state=20202020),  # logreg is better than gbm
    stack_method='predict_proba')

clf.fit(X_tr, y_tr)

# In[258]:

# Positive-class probabilities on the validation set, scored by ROC AUC.
results = clf.predict_proba(X_val)[:, 1]
act = y_val.array

roc_auc_score(act, results)

# #### 71.246 best on validation
# with lgbm and gbm as base learners

# ## CatBoost (left out of Stack model - takes forever to train)

# In[111]:

from catboost import CatBoostClassifier

# In[116]:
Пример #18
0
 allmodmeanaucs.append((RFmean_auc,'RF'))
 print("RFmodels done.")
 for k in Stackmodels.keys():
     modname = k
     modelresults = Stackmodels[k]
     Stack = modelresults['Stack']
     X_test = modelresults['X_test']
     Y_test = modelresults['Y_test']
     X_train = modelresults['X_train']
     Y_train = modelresults['Y_train']
     try:
         Y_pred = Stack.predict_proba(X_test)[:, 1]
     except Exception as e:
         params = Stack.get_params()
         Stack = StackingClassifier(estimators=params['estimators'], final_estimator=params['final_estimator'], cv=params['cv'], stack_method=params['stack_method'], n_jobs=-1).fit(X_train, Y_train)
         Y_pred = Stack.predict_proba(X_test)[:,1]
     fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
     Stacktprs.append(np.interp(Stackmean_fpr, fpr, tpr))
     Stacktprs[-1][0] = 0.0
     roc_auc = auc(fpr,tpr)
     Stackaucs.append(roc_auc)
     Stackrocs.append((fpr,tpr,roc_auc,modname))
 Stackrocs_sorted = sorted(Stackrocs,key=lambda x:x[2],reverse=True)
 Stackmean_tpr = np.mean(Stacktprs, axis=0)
 Stackmean_tpr[-1] = 1.0
 Stackmean_auc = auc(Stackmean_fpr, Stackmean_tpr)
 Stackstd_auc = np.std(Stackaucs,ddof=1)
 Stackstd_tpr = np.std(Stacktprs, axis=0)
 Stacktprs_upper = np.minimum(Stackmean_tpr + Stackstd_tpr, 1)
 Stacktprs_lower = np.maximum(Stackmean_tpr - Stackstd_tpr, 0)
 Stackrocresults = {'Stackmodels':Stackmodels,'Stackrocs':Stackrocs_sorted, 'Stackmean_tpr':Stackmean_tpr, 'Stackmean_fpr':Stackmean_fpr, 'Stacktprs_upper':Stacktprs_upper, 'Stacktprs_lower':Stacktprs_lower}
Пример #19
0
# K-fold loop: fit `clf` (defined earlier) on each fold and record the
# validation and training scores.
train_scores2 = []
clfs = []
for train_index, validate_index in kf.split(X):
    X_train, X_validate = X[train_index], X[validate_index]
    y_train, y_validate = t[train_index], t[validate_index]
    # NOTE(review): fit() returns self, so every entry of `clfs` is the
    # SAME estimator object (holding only the last fold's fit); picking
    # the max-score index below does not recover the best fold's model.
    clf = clf.fit(X_train, y_train)
    clfs.append(clf)
    score = clf.score(X_validate, y_validate)
    scores.append(score)
    print(score)
    score = clf.score(X_train, y_train)
    train_scores2.append(score)
    index.append([train_index, validate_index])
    print(score)

max_index = scores.index(max(scores))
clf = clfs[max_index]

# Predict class probabilities for the held-out test matrix and prepend
# the listing_id column for the submission file.
result = clf.predict_proba(values_test1)
p = features2.index('listing_id')

list_id = values2[:, p].reshape((values2.shape[0], 1))
result = np.append(list_id, result, axis=1)
print(result)
data = pd.DataFrame(result, columns=['listing_id', 'low', 'medium', 'high'])
# data = pd.DataFrame(result, columns=['listing_id', 'high', 'low', 'medium'])
# cols = list(data)
# cols.insert(2, cols.pop(cols.index('medium')))
# data = data.loc[:, cols]
data.to_csv('submission.csv', index=None)
Пример #20
0
# Baseline: random forest alone, evaluated with a multiclass ROC helper.
rf_only.fit(X_train, y_train)
pred_prob_rf = rf_only.predict_proba(X_validation)

# %%
plot, random_forest_roc = roc_multiclass(pred_prob_rf, y_validation, 0)

# %%
random_forest_roc

# %% [markdown]
# Evaluamos un modelo tipo stack:

# %%
# The stacking model is only trained when `long_run` is enabled (slow).
if long_run:
    stacking_model.fit(X_train, y_train)
    pred_prob_stack = stacking_model.predict_proba(X_validation)
    plot, stacking_roc = roc_multiclass(pred_prob_stack, y_validation, 0)
    print(stacking_roc)
    print(
        "Accurary \nKnn solo: {:.2f} \nRandom forest solo: {:.2f} \nStacking: {:.3f}"
        .format(knn_roc, random_forest_roc, stacking_roc))

# %%

# %%

# %%

# %%

# %% [markdown]
# Audible notification that the run finished (Windows-only winsound).
duration = 1000  # milliseconds
freq = 440  # Hz
winsound.Beep(freq, duration)
#%%
stacking_clf = StackingClassifier(estimators=estimators,
                                   final_estimator=LogisticRegression(),
                                   passthrough=False,
                                   stack_method='auto', 
                                   n_jobs=-1)

# Restrict both splits to the previously-selected feature indices.
X_train=features_train_scaled.iloc[:,feature_imp_union]
X_test=features_test_scaled.iloc[:,feature_imp_union]
y_train=response_train

stacking_fit=stacking_clf.fit(X_train, y_train)
stacking_proba=stacking_clf.predict_proba(X_test)
pd.DataFrame(stacking_proba).to_csv('stacking_proba.csv')

#%%
# NOTE(review): the stray one-space indentation below is a scrape artifact
# and is an IndentationError at module level — confirm against the
# original notebook.
 import winsound
 duration = 1000  # milliseconds
 freq = 440  # Hz
 winsound.Beep(freq, duration)
# =============================================================================
# =============================================================================






        n_jobs = -1,
    )

    #--------------------
    # モデルの学習処理
    #--------------------
    model.fit(X_train, y_train)

    #--------------------
    # モデルの推論処理
    #--------------------
    if( args.output_type == "fixed" ):
        y_preds_train = model.predict(X_train)
        y_preds_test = model.predict(X_test)
    else:
        y_preds_train = model.predict_proba(X_train)
        y_preds_test = model.predict_proba(X_test)

    print( "y_preds_train.shape: ", y_preds_train.shape )
    print( "y_preds_test.shape: ", y_preds_test.shape )

    accuracy = (y_train == y_preds_train).sum()/len(y_preds_train)
    print( "accuracy [k-fold CV train-valid] : {:0.5f}".format(accuracy) )

    #================================
    # 可視化処理
    #================================
    # 分類対象の分布図
    fig = plt.figure()
    axis = fig.add_subplot(111)
    sns.distplot(df_train['Survived'], label='correct' )
Пример #23
0
#Base estimator XGBoost and LGBM Classifier, final estimator Logistic Regression  
estimators = [('xgb',clf_xgb),('lgb',clf_lgb)]
clf_stacking1 = StackingClassifier(estimators=estimators,final_estimator=LogisticRegression())

#Base estimator XGBoost, LGBM Classifier and CatBoost, final estimator Logistic Regression
estimators = [('xgb',clf_xgb),('lgb',clf_lgb),('catboost',clf_cat)]
clf_stacking2 = StackingClassifier(estimators=estimators,final_estimator=LogisticRegression())

#Base estimator Catboost and LGBM Classifier, final estimatorXgboost
estimators = [('catboost',clf_cat),('lgb',clf_lgb)]
clf_stacking3 = StackingClassifier(estimators=estimators,final_estimator=clf_xgb)

#for all the above three stacking models trained the data individually
clf_stacking1.fit(xtrain,ytrain)

# Normalised Gini (helper defined elsewhere) on the validation split.
predictions_probas = clf_stacking1.predict_proba(xvalid)
score=gini_normalized(yvalid,predictions_probas)
print(score)

# NOTE(review): `predictions` is not defined in this snippet — probably
# clf_stacking1.predict(xvalid) was intended.
print('Confusion matrix\n',confusion_matrix(yvalid,predictions))

#After analysing gini score fit the model on complete train data
clf_stacking1.fit(X_train1,Y_train1)

# NOTE(review): `clf_stacking` is undefined here — likely clf_stacking1.
result = clf_stacking.predict_proba(X_out)
result

id=test1['id']
# NOTE(review): `result1` is undefined — likely `result` was meant.
submit=pd.DataFrame({'id':id,'target':result1[:,1]})
submit=submit[['id','target']]
Пример #24
0
    max_iter     = 1500,
    random_state = 0
)
# Stack a single SVM base learner with the `glm` meta-learner defined above.
clf = StackingClassifier(
    estimators      = [('svm', svm)], 
    final_estimator = glm, 
    cv              = 5, 
    n_jobs          = 5
)
clf.fit(X_train, y_train)
# Persist the fitted stack for later reuse.
dump(clf, 'output/svm_model.joblib')

## Test fitted model
logging.info("Predicting test set...")
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)
logging.info("Overall Accuracy: {:.2f}%".format(
    100 * metrics.accuracy_score(y_test, y_pred)
))
logging.info("Balanced Accuracy: {:.2f}%".format(
    100 * metrics.balanced_accuracy_score(y_test, y_pred)
))
logging.info("Micro F1-score: {:.2f}%".format(
    100 * metrics.f1_score(y_test, y_pred, average = "micro")
))
logging.info("Macro F1-score: {:.2f}%".format(
    100 * metrics.f1_score(y_test, y_pred, average = "macro")
))
logging.info("Log-loss: {:.5f}".format(
    metrics.log_loss(y_test, y_prob)
))
Пример #25
0
def find_steady_coalition():
    """Fit a stacked classifier on the election data and search the
    candidate coalitions for the best one by V-measure.

    A coalition qualifies when its mean predicted vote probability is at
    least 0.51; among qualifying coalitions the one with the highest
    V-measure (coalition membership vs. predicted-majority voters) wins.
    Prints the chosen coalition and test-set performance reports.
    """
    x_train, y_train, x_val, y_val, x_test, y_test = load_data()

    # trying to implement LDA with Least Squares solver
    #clf = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto', store_covariance=True)
    #clf = RandomForestClassifier(criterion='gini', max_depth=50, min_samples_split=5, n_estimators=50)
    estimators = [('Random Forest',
                   RandomForestClassifier(criterion='gini',
                                          max_depth=50,
                                          min_samples_split=5,
                                          n_estimators=50)),
                  ('SVC', SVC(kernel='poly', degree=4, probability=True)),
                  ('Percepton',
                   MLPClassifier(activation="relu",
                                 alpha=0.1,
                                 hidden_layer_sizes=(10, 10, 10),
                                 learning_rate="constant",
                                 max_iter=2000))]
    clf = StackingClassifier(estimators)
    clf.fit(x_train, y_train)
    parties_list = np.unique(y_train.values)
    # Map each class label to its column in predict_proba output.
    feature_to_index_map = {
        clf.classes_[i]: i
        for i in range(len(clf.classes_))
    }

    probabilities_per_voter = clf.predict_proba(x_val)

    best_coalition = []
    best_coalition_v_score = float(-np.inf)
    best_coalition_homo = float(-np.inf)

    for possible_coalition in get_possible_coalitions(parties_list):
        y_coalition = np.isin(y_val.values.ravel(), possible_coalition)
        # Probability each voter supports some party in the coalition.
        probabilities_coalition = np.sum(probabilities_per_voter[:, [
            feature_to_index_map[feat] for feat in possible_coalition
        ]],
                                         axis=1)
        coalition_score = np.mean(probabilities_coalition)

        # Require a (predicted) majority before considering the coalition.
        if (coalition_score < 0.51):
            continue

        voters_likely_to_vote = [
            voter > 0.5 for voter in probabilities_coalition
        ]
        v_score = v_measure_score(y_coalition, voters_likely_to_vote)
        homo_score = homogeneity_score(y_coalition, voters_likely_to_vote)
        #print('Homogeneity score: {} \nV-Measure score: {} '.format(homo_score, v_score))
        #print('Predicition mean {} and std {}'.format(val_predict_score, standart_deviation))

        if v_score > best_coalition_v_score:
            best_coalition = possible_coalition
            # BUG FIX: the running v-score threshold was previously
            # overwritten with homo_score, corrupting later comparisons;
            # track the two metrics separately.
            best_coalition_v_score = v_score
            best_coalition_homo = homo_score

    plot_coalition(x_train, y_train, best_coalition)
    print(best_coalition)
    print('Coalition: {}'.format(best_coalition))
    # NOTE(review): `voters_likely_to_vote` holds the LAST evaluated
    # coalition's voters, not necessarily the best one — confirm intent.
    print('{} coalition votes vs {} '.format(
        np.sum(voters_likely_to_vote),
        len(y_val) - np.sum(voters_likely_to_vote)))

    #lda_tst = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto', store_covariance=True)
    prediction = clf.predict(x_test)

    performance_data = [
        ('Random Forest + SVC + Neueron Network stacking method', prediction,
         y_test)
    ]
    print_accuracy_scores(performance_data)
    print_f1_score(performance_data)
Пример #26
0

# Third
forest = RandomForestClassifier(n_estimators=13,
                                criterion='entropy',
                                min_samples_leaf=10,
                                max_leaf_nodes=20,
                                random_state=80)
forest.fit(trainData, Y)


# Logistic regression
# `svm` and `bagging` are fitted earlier in the file; lr is the
# meta-learner fitted by the StackingClassifier itself.
lr = LogisticRegression(solver='lbfgs', random_state=80)
base_estimators = [('SVM', svm), ('Bagging DT', bagging), ('DecisionForest', forest)]
sclf = StackingClassifier(estimators=base_estimators, final_estimator=lr, cv=2)
sclf.fit(trainData, Y)

# NOTE(review): score is computed on the training data, not a held-out set.
score1 = sclf.score(trainData, Y)
print('Logistic regression score:', score1)


# Predictions
names = ['dog.1049.jpg', 'dog.1028.jpg', 'dog.1011.jpg', 'cat.1016.jpg']

# Score a few individual test images via their colour histograms
# (`extract_histogram` is defined elsewhere in the file).
for name in names:
    singleImage = cv2.imread('data/test/' + name)
    histt = extract_histogram(singleImage)
    histt2 = histt.reshape(1, -1)
    prediction = sclf.predict(histt2)
    proba = sclf.predict_proba(histt2)
    print(f'Predictions for {name}:', proba)