Example No. 1
def test_stacking_classifier_iris(cv, final_estimator):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, y_test = train_test_split(scale(X_iris),
                                                        y_iris,
                                                        stratify=y_iris,
                                                        random_state=42)
    estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    clf = StackingClassifier(estimators=estimators,
                             final_estimator=final_estimator,
                             cv=cv)
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    assert clf.score(X_test, y_test) > 0.8

    X_trans = clf.transform(X_test)
    assert X_trans.shape[1] == 6

    clf.set_params(lr='drop')
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    if final_estimator is None:
        # LogisticRegression has decision_function method
        clf.decision_function(X_test)

    X_trans = clf.transform(X_test)
    assert X_trans.shape[1] == 3
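A note on the widths asserted above: with the default stack_method='auto', LogisticRegression contributes predict_proba columns and LinearSVC contributes decision_function columns, one per class each, so two base estimators on the 3-class iris data yield 2 x 3 = 6 stacked features, and dropping 'lr' leaves 1 x 3 = 3. A minimal sketch of that bookkeeping, assuming a fresh iris setup:

from sklearn.datasets import load_iris
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
stack = StackingClassifier(
    estimators=[('lr', LogisticRegression()), ('svc', LinearSVC())])
stack.fit(scale(X), y)
# one column per class per estimator: 2 * 3 = 6
print(stack.transform(scale(X)).shape)  # (150, 6)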
Example No. 2
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, y_test = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42
    )
    estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    clf = StackingClassifier(
        estimators=estimators, final_estimator=final_estimator, cv=cv,
        passthrough=passthrough
    )
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    assert clf.score(X_test, y_test) > 0.8

    X_trans = clf.transform(X_test)
    expected_column_count = 10 if passthrough else 6
    assert X_trans.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, X_trans[:, -4:])

    clf.set_params(lr='drop')
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    if final_estimator is None:
        # LogisticRegression has decision_function method
        clf.decision_function(X_test)

    X_trans = clf.transform(X_test)
    expected_column_count_drop = 7 if passthrough else 3
    assert X_trans.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, X_trans[:, -4:])
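With passthrough=True, transform appends the raw input features after the stacked predictions, which is where the widths above come from: iris has 4 features, so 6 + 4 = 10 with both estimators and 3 + 4 = 7 after dropping 'lr', and the trailing four columns are exactly X_test. The arithmetic, spelled out under those assumed shapes:

# assumed iris shapes: 2 estimators, 3 classes, 4 raw features
n_estimators, n_classes, n_features = 2, 3, 4
assert n_estimators * n_classes + n_features == 10
assert (n_estimators - 1) * n_classes + n_features == 7  # after dropping 'lr'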
Example No. 3
def run_ensemble(X_train, X_val, y_train, y_val, df_test):

    ### ENSEMBLE LEARNING with (naive) classification models

    from sklearn.ensemble import StackingClassifier, RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score

    import xgboost as xgb

    final_layer = StackingClassifier(
        estimators=[('knn', KNeighborsClassifier(n_neighbors=6))],
        final_estimator=xgb.XGBClassifier(objective="binary:logistic",
                                          random_state=42))
    model = StackingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(C=1, gamma=1e-6, kernel='rbf')),
    ],
                               final_estimator=final_layer)

    # fit returns the fitted estimator itself, not a training history
    model.fit(X_train, y_train)

    print(accuracy_score(y_val, model.predict(X_val)))

    rank_results = test_results(df_test, alg="ensemble", model=model)
    return rank_results
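Because final_estimator here is itself a StackingClassifier, this builds a two-level stack (rf and svc feed a knn-plus-XGBoost layer). After fitting, each layer is reachable through sklearn's standard fitted attributes; a sketch, assuming model was fitted as above:

# inspect the fitted two-level stack (assumes `model` from run_ensemble)
print(model.named_estimators_['rf'])   # fitted RandomForestClassifier
inner = model.final_estimator_         # the fitted inner StackingClassifier
print(inner.named_estimators_['knn'])  # fitted KNeighborsClassifier
print(inner.final_estimator_)          # fitted XGBClassifier meta-learner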
Example No. 4
def Model_Ensemble(best_score_param_estimator, xtrain, xtest, ytrain, ytest):
    from sklearn.ensemble import StackingClassifier
    from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                                 f1_score, classification_report,
                                 confusion_matrix)
    # the base estimators
    best_score_1, best_param_1, estimator_1 = best_score_param_estimator[0]
    best_score_2, best_param_2, estimator_2 = best_score_param_estimator[1]
    best_score_3, best_param_3, estimator_3 = best_score_param_estimator[2]
    best_score_4, best_param_4, estimator_4 = best_score_param_estimator[3]
    best_score_5, best_param_5, estimator_5 = best_score_param_estimator[4]

    estimators = [('estimator_5', estimator_5.get_params()['model']),
                  ('estimator_4', estimator_4.get_params()['model']),
                  ('estimator_3', estimator_3.get_params()['model']),
                  ('estimator_2', estimator_2.get_params()['model'])]
    # the stacking classifier
    sc = StackingClassifier(estimators=estimators,
                            final_estimator=estimator_1.get_params()['model'])

    # train the stacking classifier on the training data
    sc.fit(xtrain, ytrain)
    y_test_pred = sc.predict(xtest)

    print(
        "--------------Model Ensemble----------------------------------------------------------------"
    )
    print("Accuracy:", '{:1.4f}'.format(accuracy_score(ytest, y_test_pred)))
    print("")
    print("Precision:", round(precision_score(ytest, y_test_pred), 4))
    print("")
    print("Recall:", round(recall_score(ytest, y_test_pred), 4))
    print("")
    print("f1-score:", round(f1_score(ytest, y_test_pred), 4))
    print("")
    print(classification_report(ytest, y_test_pred))
    print("")
    print(confusion_matrix(ytest, y_test_pred))
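Model_Ensemble assumes each element of best_score_param_estimator is a (score, params, estimator) triple whose estimator exposes the underlying model under the 'model' key of get_params(), e.g. a Pipeline with a step named 'model'. A hypothetical sketch of an input that satisfies that contract:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# hypothetical triples: get_params()['model'] resolves to the 'model' step
best_score_param_estimator = [
    (0.95, {'model__C': 1.0}, Pipeline([('model', LogisticRegression())])),
    (0.93, {'model__max_depth': 3}, Pipeline([('model', DecisionTreeClassifier())])),
    # ...plus three more triples of the same shape
]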
Example No. 5
def main():

    np.random.seed(0)
    train_X, train_y, test_X, test_y = load_data()

    # Stacking models:
    # Create your stacked model using StackingClassifier
    level0 = list()
    level0.append(('rf', RandomForestClassifier(n_estimators=150,
                                                max_depth=5)))
    level0.append(('svm', SVC(C=1, kernel='rbf')))
    dtc = DecisionTreeClassifier(max_depth=3)
    level0.append(('ADA',
                   AdaBoostClassifier(n_estimators=100,
                                      base_estimator=dtc,
                                      learning_rate=0.1)))
    level0.append(('lr', LogisticRegression(solver='liblinear')))
    level0.append(('bayes', GaussianNB()))

    # define meta learner model
    level1 = LogisticRegression()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0,
                               final_estimator=level1,
                               cv=10)
    # fit the model on the training data
    model.fit(train_X, train_y)
    # Get and print f1-score on test data
    y_pred = model.predict(test_X)
    # f1_score expects (y_true, y_pred)
    F1_score = metrics.f1_score(test_y, y_pred, average='weighted')
    print("ANS 3.1 - F1_score of model with stacking different models is: " +
          str(F1_score))
Example No. 6
def model_stacking_clf(X_train, y_train, X_test, y_test):
    """
    @param: X_train - a numpy matrix containing features for training data (e.g. TF-IDF matrix)
    @param: y_train - a numpy array containing labels for each training sample
    @param: X_test - a numpy matrix containing features for test data (e.g. TF-IDF matrix)
    @param: y_test - a numpy array containing labels for each test sample
    """
    estimators = [('pac',
                   PassiveAggressiveClassifier(n_jobs=-1,
                                               C=0.001,
                                               loss='squared_hinge',
                                               max_iter=1000,
                                               tol=1e-06)),
                  ('rf',
                   make_pipeline(
                       RandomForestClassifier(n_estimators=1000,
                                              random_state=42,
                                              criterion='gini',
                                              bootstrap=True,
                                              max_features='sqrt')))]  # 'auto' was removed in newer sklearn

    clf = StackingClassifier(estimators=estimators,
                             final_estimator=LogisticRegression(penalty='l1',
                                                                solver='saga',
                                                                max_iter=500))
    clf.fit(X_train, y_train)

    y_predicted = clf.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_predicted)
    rf_f1 = f1_score(y_test, y_predicted, average="weighted")

    return rf_accuracy, rf_f1
Example No. 7
def stacking_classifier(best_logistic_regression, best_knn_classifier,
                        best_gaussian_nb, best_decision_tree_classifier,
                        best_random_forest_classifier, x_train, x_test,
                        y_train, y_test):
    from sklearn.ensemble import StackingClassifier

    estimators = [
        # ('random_forest_cv', best_random_forest_classifier),
        ('knn_classifier_cv', best_knn_classifier),
        ('dct_cv', best_decision_tree_classifier),
        ('gaussian_nb_cv', best_gaussian_nb)
    ]

    # stack on class probabilities; sklearn's StackingClassifier has no
    # 'shuffle'/'use_probas' keywords (those belong to mlxtend)
    final_stacking_classifier = StackingClassifier(
        estimators=estimators,
        final_estimator=best_logistic_regression,
        stack_method='predict_proba')

    final_stacking_classifier.fit(x_train, y_train)

    print("Stacking Classifier Training Score {}".format(
        final_stacking_classifier.score(x_train, y_train)))
    print("Stacking Classifier Testing Score {}\n".format(
        final_stacking_classifier.score(x_test, y_test)))

    y_predict = final_stacking_classifier.predict(x_test)
    classification_model = 'Stacking Classifier'

    confusion_matrix_graph(y_test, y_predict, classification_model)
    roc_curve_graph(y_test, y_predict, classification_model)
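sklearn's StackingClassifier accepts neither shuffle nor use_probas; those keywords come from mlxtend's stacking classes, which this snippet appears adapted from. If the mlxtend behaviour was intended, the rough equivalent looks like this (a sketch, assuming mlxtend is installed, with illustrative base models):

from mlxtend.classifier import StackingClassifier as MlxStackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# mlxtend spelling of probability-based stacking
mlx_clf = MlxStackingClassifier(
    classifiers=[KNeighborsClassifier(), GaussianNB()],
    meta_classifier=LogisticRegression(),
    use_probas=True)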
Example No. 8
    def stacking(self):
        from sklearn.ensemble import StackingClassifier
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.svm import LinearSVC
        from sklearn.linear_model import LogisticRegression
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import make_pipeline
        from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

        estimators = [('rf',
                       RandomForestClassifier(n_estimators=500,
                                              max_leaf_nodes=16,
                                              n_jobs=-1)),
                      ('svr',
                       make_pipeline(StandardScaler(),
                                     LinearSVC(random_state=42)))]
        clf = StackingClassifier(estimators=estimators,
                                 final_estimator=LogisticRegression())

        clf.fit(self.X_train, self.y_train)
        y_pred = clf.predict(self.X_test)
        cf = confusion_matrix(self.y_test, y_pred)
        print(cf)
        acc = accuracy_score(self.y_test, y_pred)
        report = classification_report(self.y_test, y_pred)
        print(acc)
        print(report)
Example No. 9
def stacking_classifier(train_x, train_y, test_x):
    import lightgbm as lgb
    from rgf.sklearn import RGFClassifier
    from sklearn.ensemble import StackingClassifier
    from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression

    lgb_params = {
        # 'boosting': 'gbdt',
        # 'learning_rate': 0.05,
        # 'min_data_in_leaf': 20,
        # 'feature_fraction': 0.7,
        # 'num_leaves': 41,
        # 'drop_rate': 0.15
        # LGBMClassifier infers its objective from the target;
        # 'application': 'classifier' is not a valid LightGBM objective
        'metric': 'auc'
    }

    et_params = {
        'n_estimators': 20,
        'max_features': 0.5,
        'max_depth': 18,
        'min_samples_leaf': 4,
        'n_jobs': -1
    }

    rf_params = {
        'n_estimators': 20,
        'max_features': 0.2,
        'max_depth': 25,
        'min_samples_leaf': 4,
        'n_jobs': -1
    }

    rgf_params = {'algorithm': 'RGF_Sib', 'loss': 'Log'}

    kn_params = {'leaf_size': 10}

    estimators = [
        ('lgb', lgb.LGBMClassifier(**lgb_params)),
        # ('rgf', RGFClassifier(**rgf_params)),
        ('et', ExtraTreesClassifier(**et_params)),
        ('rf', RandomForestClassifier(**rf_params)),
        ('lr', LogisticRegression())
        # ('knn', KNeighborsClassifier(**kn_params))
    ]

    model_stack = StackingClassifier(estimators=estimators,
                                     final_estimator=LogisticRegression(),
                                     verbose=1)
    model_stack.fit(train_x, train_y)

    pred = model_stack.predict(test_x)

    return pred
Example No. 10
def stacking(X_train, y_train, X_test, y_test, model1, model2):
    print('Training the Stacking algorithm...\n')
    estimators = [('kmeans', model1), ('svc', model2)]
    clf = StackingClassifier(estimators=estimators)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print('Classification report for the Stacking method: \n',
          classification_report(y_test, predictions))
    return clf
Example No. 11
def run():
        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as pyplot
        import seaborn as sns
        from sklearn.model_selection import train_test_split
        from xgboost import XGBClassifier
        from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
        from sklearn.ensemble import GradientBoostingClassifier
        from sklearn.linear_model import LogisticRegression
        from sklearn.ensemble import StackingClassifier

        df = pd.read_table("./data/australian.csv", sep=r'\s+', header=None)
        y = df[14]
        X = df.drop(columns = 14)
        y.value_counts()
        # Split features and target into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y, test_size = 0.4)
        
        # Instantiate the Classifiers
        
        estimators = [('xgb', XGBClassifier()), ('gbdt', GradientBoostingClassifier(random_state=1))]
        
        clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
        
        clf.fit(X_train, y_train)
        # Make predictions for the test set
        y_pred_test = clf.predict(X_test)


        # View accuracy score
        
        print(classification_report(y_test, y_pred_test))

        clf_probs = clf.predict_proba(X_test)
        # keep probabilities for the positive outcome only
        clf_probs = clf_probs[:, 1]
        # calculate scores
        clf_auc = roc_auc_score(y_test, clf_probs)
        # summarize scores
        print('ensemble: ROC AUC=%.3f' % (clf_auc))
        print("accuracy_score is %.3f" % (accuracy_score(y_test, y_pred_test, normalize=True)))
        # calculate roc curves
        clf_fpr, clf_tpr, _ = roc_curve(y_test, clf_probs)
        # plot the roc curve for the model
        pyplot.plot(clf_fpr, clf_tpr, marker='.', label='Ensemble')
        # axis labels
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        # show the legend
        pyplot.legend()
        # show the plot
        pyplot.show()
Example No. 12
def model_stack(X_train, y_train, X_test, y_test):
    estimators = [('xgb', XGBClassifier()), ('lgb', lgb.LGBMClassifier())]
    model = StackingClassifier(estimators=estimators,
                               final_estimator=LogisticRegression())

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    stack_accuracy = accuracy_score(y_test, y_pred)
    stack_f1 = f1_score(y_test, y_pred, average='weighted')

    return stack_accuracy, stack_f1
Example No. 13
def test_stacking_classifier_drop_estimator():
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_iris),
                                                   y_iris,
                                                   stratify=y_iris,
                                                   random_state=42)
    estimators = [('lr', 'drop'), ('svc', LinearSVC(random_state=0))]
    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf = StackingClassifier(estimators=[('svc', LinearSVC(random_state=0))],
                             final_estimator=rf,
                             cv=5)
    clf_drop = StackingClassifier(estimators=estimators,
                                  final_estimator=rf,
                                  cv=5)

    clf.fit(X_train, y_train)
    clf_drop.fit(X_train, y_train)
    assert_allclose(clf.predict(X_test), clf_drop.predict(X_test))
    assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test))
    assert_allclose(clf.transform(X_test), clf_drop.transform(X_test))
Example No. 14
def stacking_predictor(row):
    """
    Training stacking model with our data
    Define what our base layer will be composed of and then build
    a stacking classifier base
    on these models.
    set our final estimator as "logistic regression"

    """
    our_trained_data = pd.read_csv("data/data.csv")
    our_trained_data = clean_data(our_trained_data)

    x = our_trained_data[[
        'radius_mean', 'texture_mean', 'area_mean', 'concavity_mean',
        'concave points_mean', 'symmetry_mean', 'smoothness_mean'
    ]]
    y = our_trained_data[['diagnosis']]
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    x_train = x_train.values.tolist()
    y_train = y_train.values.tolist()
    flattened_y_train = []
    for sub_list in y_train:
        for val in sub_list:
            flattened_y_train.append(val)

    X, y = x_train, flattened_y_train

    estimators = [('random_forest',
                   RandomForestClassifier(n_estimators=5, random_state=42)),
                  ('logistic_regr',
                   LogisticRegression(solver="lbfgs", max_iter=1460)),
                  ('knn', KNeighborsClassifier(n_neighbors=5)),
                  ('svm_rbf', SVC(kernel='rbf', gamma=4, C=10000))]

    Stacking_classifier = StackingClassifier(
        estimators=estimators, final_estimator=LogisticRegression(), cv=5)

    # Fit the stacking model with our own data and with selected 7 features.
    Stacking_classifier.fit(X, y)

    # Now predict for one patient; predict expects 2-D input, hence [row]
    single_predicted_result = Stacking_classifier.predict([row])[0]

    return ('%s %d' % ("patient", single_predicted_result))
Example No. 15
    def predict(self):
        X_train, y_train = self.train_data.iloc[:, :-1], self.train_data.iloc[:, -1]

        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

        level_0 = list()
        level_0.append(('RF', RandomForestClassifier(n_estimators=700)))
        level_0.append(('LR', LogisticRegression(max_iter=6000)))

        level_1 = SVC(C=1.2)
        model = StackingClassifier(estimators=level_0, final_estimator=level_1, cv=4)

        model.fit(X_train, y_train)
        test = scaler.transform(self.test_data)
        submission = model.predict(test)
        submission = pd.DataFrame(submission)
        submission.to_csv('submission.csv', header=['quality'], index=False)
Example No. 16
def main():
    args = parse_arguments()
    # params
    DATA_DIR = args.data_path
    num_folds = args.fold
    seed = 1234

    # setup data
    with open(DATA_DIR + '/features.txt') as f:
        features_txt = f.readlines()
    features_name = [x.strip() for x in features_txt]
    features_name = [
        "".join(c if c.isalnum() else "_" for c in str(x))
        for x in features_name
    ]
    X_train = pd.read_csv(DATA_DIR + '/X_train.csv', names=features_name)
    X_test = pd.read_csv(DATA_DIR + '/X_test.csv', names=features_name)
    y_train = pd.read_csv(DATA_DIR + '/y_train.csv', names=['activity_label'])
    subject_train = pd.read_csv(DATA_DIR + '/subject_train.csv',
                                names=['subject_id'])

    # make the activity labels zero-based
    y_train['activity_label'] = y_train['activity_label'] - 1

    # set up models
    estimators = [('rf',
                   RandomForestClassifier(n_estimators=300,
                                          random_state=seed)),
                  ('svr', SVC(probability=True, random_state=seed)),
                  ('knn', KNeighborsClassifier())]
    final_estimator = LogisticRegression(random_state=seed)
    kf = GroupKFold(n_splits=num_folds)
    cv_idx = kf.split(X=subject_train, groups=subject_train)
    clf = StackingClassifier(estimators=estimators,
                             final_estimator=final_estimator,
                             cv=cv_idx)

    # train
    clf.fit(X_train, y_train)

    # make submission
    test_preds = clf.predict(X_test)
    submit = test_preds + 1
    np.savetxt('baseline.txt', submit)
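One caveat in the snippet above: kf.split(...) returns a one-shot generator, so clf can only be fitted once with that cv value. Materializing the splits first avoids the surprise; a sketch reusing the same names:

# safer: materialize the group-aware splits so they can be reused or inspected
groups = subject_train['subject_id'].values
cv_idx = list(GroupKFold(n_splits=num_folds).split(X_train, y_train, groups=groups))
clf = StackingClassifier(estimators=estimators,
                         final_estimator=final_estimator,
                         cv=cv_idx)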
Example No. 17
def Stacking(x, y, time_split_sample=time_split_sample, split=0.2):
    X_train, X_test, y_train, y_test = Train_Test_Split(x, y, time=time_split_sample)    

    estimators=[
#                 ('Logist', LogisticRegression(multi_class='multinomial',max_iter=1000)),
                 ('DecisionTree',tree.DecisionTreeClassifier(class_weight='balanced',max_depth=3)),
                 ('SVC', SVC()),
                 ('NB', GaussianNB())
                 ]
    lv2 = [
            ('DecisionTree',tree.DecisionTreeClassifier(class_weight='balanced',max_depth=5),
             [{'criterion': ['gini'], 'splitter': ['best', 'random'], 'max_depth': [2,6,8,12], 'min_samples_split': [3,5]},]
             ),
            ('NB',GaussianNB(),
             [{ 'var_smoothing':[1e-9,1e-11]}]
            ),
            ('Logist', LogisticRegression(multi_class='multinomial',
                                          solver='saga'),  # saga supports both l1 and l2
             [{'penalty': ['l1', 'l2'], 'C': [0.001, .009, 0.01]}]
             )
            ]

    stacking_rst = []
    aum_rst = []
    for i in lv2:
        est = i[1]
        para = i[2]
        gs_clf = model_selection.GridSearchCV(est, para, scoring = scorer['f0.5_macro'],
                                           cv = model_selection.StratifiedKFold(n_splits = 10, shuffle = True, random_state = 2020))
        clf = StackingClassifier(estimators=estimators, final_estimator=gs_clf).fit(X_train, y_train)
        y_pred = pd.Series(clf.predict(X_test), index=X_test.index)
        scores = ScoreFunc(y_test, y_pred)
        scores.name = i[0]
        print(scores)
        aum = PredictedReturn(y_pred, method=plot_method,title=i[0])
        plt.show()
        aum_rst.append(aum)
        stacking_rst.append(scores)
    score_rst = pd.concat(stacking_rst, axis=1)
    aum_rst = pd.concat(aum_rst, axis=1)
    aum_rst.columns = [i[0] for i in lv2]
#    aum_rst['Benchmark'] = (PredictedReturn(y_test, method=plot_method))
    aum_rst.plot(title='Stacking Final_estimator GridSearch')
    return score_rst, aum_rst
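Using a GridSearchCV as final_estimator means the meta-learner is re-tuned on the out-of-fold base predictions at every fit. The tuned result is exposed on the fitted stack; a sketch, assuming a clf fitted as in the loop above:

# the fitted final estimator is the GridSearchCV itself
print(clf.final_estimator_.best_params_)
print(clf.final_estimator_.best_score_)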
Example No. 18
def stackingClassifier(Feature_train, y_train, Feature_test):
    layer_one_estimators = [('dt_1',
                             DecisionTreeClassifier(max_depth=6,
                                                    max_features=15)),
                            ('knn_1', KNeighborsClassifier(n_neighbors=35))]

    layer_two_estimators = [('dt_2',
                             DecisionTreeClassifier(max_depth=6,
                                                    max_features=15)),
                            ('svc_2', svm.SVC())]

    layer_two = StackingClassifier(estimators=layer_two_estimators,
                                   final_estimator=LogisticRegression())

    clf = StackingClassifier(estimators=layer_one_estimators,
                             final_estimator=layer_two)
    clf = clf.fit(Feature_train, y_train)
    y_pred = clf.predict(Feature_test)
    return y_pred
Example No. 19
class stacked_model(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, base_models = None, meta_model = None, n_folds = None):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        # delegate to sklearn's StackingClassifier
        self.get_stacking_ = StackingClassifier(estimators=self.base_models,
                                                final_estimator=self.meta_model,
                                                cv=self.n_folds)
        self.get_stacking_.fit(X, y)

        return self
    
    def predict(self, X):
        y_pred = self.get_stacking_.predict(X)
        return y_pred
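A usage sketch for the wrapper above (data and models are illustrative):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=0)
sm = stacked_model(
    base_models=[('dt', DecisionTreeClassifier()), ('lr', LogisticRegression())],
    meta_model=LogisticRegression(),
    n_folds=5)
sm.fit(X, y)
print(sm.predict(X[:5]))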
Example No. 20
def Model_1(train, test):
    ''' Trains the model and Saves the predictions in a CSV file
        train : Training set
        test : Test set
    '''
    # Preprocessing: fit one encoder on train and reuse it for test so both
    # sets share a single encoding (handle_unknown guards unseen symbols)
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_train = encoder.fit_transform([[x for x in s] for s in train['Sequence']])
    X_test = encoder.transform([[x for x in s] for s in test['Sequence']])
    Y_train = train['label']

    X_train, Y_train = RandomUnderSampler(random_state=100).fit_resample(
        X_train, Y_train)
    X_train, Y_train = shuffle(X_train, Y_train, random_state=100)

    # Training
    estimators = [('rf',
                   RandomForestClassifier(n_estimators=300,
                                          max_depth=45,
                                          min_samples_leaf=7,
                                          random_state=100)),
                  ('mlp', MLPClassifier(max_iter=200, random_state=100)),
                  ('knn', KNeighborsClassifier(n_neighbors=4))]

    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(random_state=100),
        n_jobs=-1,
        verbose=1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_pred = clf.predict(X_test)
    Y_prob = [x[1] for x in clf.predict_proba(X_test)]
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    result["Label"] = Y_prob
    result.to_csv("Submission_1.csv", index=False)
    result["Label"] = Y_pred
    result.to_csv("Predictions_1.csv", index=False)
Example No. 21
def stack_ensemble():
    '''
    Create StackingClassifier model
    Parameters:
        N/A
    Returns:
        N/A
    Outputs:
        confusion_matrix,
        classification_report,
        scoring
    '''

    WOE_encoder = WOEEncoder()
    X_train_enc = WOE_encoder.fit_transform(X_train, y_train)
    X_test_enc = WOE_encoder.transform(X_test)

    scaler = MinMaxScaler()
    X_train_enc_scaled = pd.DataFrame(
        scaler.fit_transform(X_train_enc, y_train))
    X_test_enc_scaled = pd.DataFrame(scaler.transform(X_test_enc))

    clfs = list()
    clfs.append(('linSVC', LinearSVC()))
    clfs.append(('bayes', GaussianNB()))
    clfs.append(('knn', KNeighborsClassifier()))
    clfs.append(('rfc', RandomForestClassifier()))
    # define meta learner model
    meta_clf = LogisticRegression()
    # define the stacking ensemble
    stk_model = StackingClassifier(estimators=clfs,
                                   final_estimator=meta_clf,
                                   cv=3)

    # fit the model on training data
    stk_model.fit(X_train_enc_scaled, y_train)
    stk_pred = stk_model.predict(X_test_enc_scaled)
    print('Stack Accuracy :', accuracy_score(y_test, stk_pred))
    print('stack F1 :', f1_score(y_test, stk_pred))
    print(confusion_matrix(y_test, stk_pred))
    print(classification_report(y_test, stk_pred))
Example No. 22
def main():
    np.random.seed(0)
    train_X, train_y, test_X, test_y = load_data()

    # Stacking models:
    # Create your stacked model using StackingClassifier
    base_models = [('rfc', RandomForestClassifier()), ('svm', SVC()),
                   ('gnb', GaussianNB()), ('knc', KNeighborsClassifier()),
                   ('dtc', DecisionTreeClassifier())]

    # The default final_estimator is LogisticRegression
    sc = StackingClassifier(estimators=base_models)

    # fit the model on the training data
    sc.fit(train_X, train_y)

    # predict
    y_pred = sc.predict(test_X)

    # Get and print f1-score on test data
    print(f"f1 score = {f1_score(y_pred, test_y , average = 'weighted')}")
Example No. 23
vectorizer = TfidfVectorizer()

print([[" ".join(i) for i in p] for p in pos_filtered_data][0])

data = vectorizer.fit_transform(
    [" ".join([" ".join(i) for i in p]) for p in pos_filtered_data])

X_train, X_test, y_train, y_test = train_test_split(data,
                                                    labels,
                                                    test_size=0.33,
                                                    random_state=42)

clf.fit(X_train, y_train)

print(classification_report(y_test, clf.predict(X_test)))

# #### doc2vec with KNN

# print(pos_filtered_data[0])

# glued_data = []
# for item in pos_filtered_data:
#     new_item = []
#     for sent in item:
#         new_item.append(" ".join(sent))
#     glued_data.append(". ".join(new_item))

# print(glued_data[0])

# documents = [TaggedDocument(doc[1], [i]) for i, doc in enumerate(glued_data)]
Example No. 24
clf_hgbc = HistGradientBoostingClassifier()
clf_hgbc.fit(x_train, y_train)
hgbc_pred = clf_hgbc.predict(x_test)
hgb_matrices = evaluate_preds(clf_hgbc, x_test, y_test, hgbc_pred)
# ############################################################
# ############################################################ LogisticRegression
clf_lr = LogisticRegression()
clf_lr.fit(x_train, y_train)
clf_pred = clf_lr.predict(x_test)
lr_matrices = evaluate_preds(clf_lr, x_test, y_test, clf_pred)
# ############################################################
# ############################################################ StackingClassifier
clf_sc = StackingClassifier(estimators=estimators,
                            final_estimator=LogisticRegression())
clf_sc.fit(x_train, y_train)
clf_pred = clf_sc.predict(x_test)
sc_matrices = evaluate_preds(clf_sc, x_test, y_test, clf_pred)
# ############################################################
# ############################################################   VotingClassifier
clf_vc = VotingClassifier(estimators=[("knn", clf_knn), ('adab', clf_adab),
                                      ('rfc', clf_rfc), ('gnc', clf_gbc),
                                      ("bc", clf_bc), ("etc", clf_etc),
                                      ("hgbc", clf_hgbc), ('xgb', clf_xgb),
                                      ("lr", clf_lr)],
                          voting='soft')

clf_vc.fit(x_train, y_train)
clf_pred = clf_vc.predict(x_test)
vc_matrices = evaluate_preds(clf_vc, x_test, y_test, clf_pred)

# ############################################################
Example No. 25
result = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions.astype(np.int32)
})
result.to_csv('result_xgb.csv', index=False)

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
alg2 = SVC(probability=True, random_state=29, C=11, gamma=0.05)
rf_clf = RandomForestClassifier()
mv_clf = VotingClassifier(estimators=[('lr', clf), ('xgb', model),
                                      ('svc', alg2), ('rf', rf_clf)],
                          voting='hard')
mv_clf.fit(X, y)
predictions = mv_clf.predict(test_feature)
result = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions.astype(np.int32)
})
result.to_csv("result_voting.csv", index=False)

from sklearn.ensemble import StackingClassifier
stacking_clf = StackingClassifier(estimators=[('xgb', model), ('svc', alg2),
                                              ('rf', rf_clf)],
                                  final_estimator=clf)
stacking_clf.fit(X, y)
predictions = stacking_clf.predict(test_feature)
result = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions.astype(np.int32)
})
result.to_csv('result_stacking.csv', index=False)
Example No. 26
class ModelFactory(object):
    def __init__(self):

        self.model = None

        self.dataset = pd.read_csv(
            "./heart_failure_clinical_records_dataset.csv")
        self.X_ori = self.dataset.drop(
            columns=['DEATH_EVENT'])[selectedFeatures]
        self.y = self.dataset['DEATH_EVENT']
        col_names = list(self.X_ori.columns)
        self.stdScaler = preprocessing.StandardScaler()
        self.stdScaler.fit(self.X_ori)
        self.X = self.stdScaler.transform(self.X_ori)
        self.X = pd.DataFrame(self.X, columns=col_names)
        self.X_val = None
        self.y_val = None
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None

    def getModel(self):
        self.__genDecisionTree()
        self.__genBoostTree()
        self.__genLR()
        self.__genSVM()
        self.__genKNN()
        self.__genRF()
        #estimators=[('KNN', self.KNN), ('SVC', self.SVM)]
        self.model = StackingClassifier(estimators=[('LR', self.LR),
                                                    ('KNN', self.KNN)],
                                        final_estimator=self.SVM)
        self.model.fit(self.X_train, self.y_train)
        #self.model = make_pipeline(self.stdScaler, self.vote)
        path = "./temp/model.joblib"
        modelDump = open(path, "wb")
        dump(self.model, modelDump)
        modelDump.close()

    def printValidationSet(self):
        print(self.X_val)
        print(self.y_val)

    def genDataSet(self, train=0.8, test=0.1, val=0.1):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=1 - train,
            random_state=2)  #, stratify=self.y)
        self.X_val, self.X_test, self.y_val, self.y_test = train_test_split(
            self.X_test,
            self.y_test,
            test_size=test / (test + val),
            random_state=2)  #stratify=self.y_test)

    def __genBoostTree(self):
        rng = np.random.RandomState(42)
        self.BT = AdaBoostClassifier(DecisionTreeClassifier(max_depth=None),
                                     n_estimators=10,
                                     random_state=rng)
        #self.BT.fit(self.X_train, self.y_train)

    def __genDecisionTree(self):
        self.DT = DecisionTreeClassifier()
        #self.DT.fit(self.X_train, self.y_train)

    def __genKNN(self):
        self.KNN = KNeighborsClassifier(n_neighbors=18)
        #self.KNN.fit(self.X_train, self.y_train)

    def __genRF(self):
        self.RF = RandomForestClassifier(n_estimators=10)
        #self.RF.fit(self.X_train, self.y_train)

    def __genGNB(self):
        self.GNB = GaussianNB()
        #self.GNB.fit(self.X_train, self.y_train)

    def __genLR(self):
        self.LR = LogisticRegression(solver='liblinear',
                                     max_iter=1000,
                                     penalty='l1',
                                     C=0.01)

    # self.LR.fit(self.X_train, self.y_train)

    def __genSVM(self):
        self.SVM = SVC(kernel='linear', C=1e2, gamma=1e-04, probability=True)
        #self.SVM.fit(self.X_train, self.y_train)

    def getModelTestRes(self):
        y_pred = self.model.predict(self.X_test)
        acc = metrics.accuracy_score(self.y_test, y_pred)
        return ('Test accuracy: {}'.format(acc))

    def getModelValRes(self):
        y_pred = self.model.predict(self.X_val)
        acc = metrics.accuracy_score(self.y_val, y_pred)
        return ('Validation accuracy: {}'.format(acc))

    def predict(self, feature):
        # avoid shadowing the built-in `input`
        features = feature[selectedFeatures]
        return self.model.predict(features)
Example No. 27
# %% Build pipeline
scaler = StandardScaler().fit(X_train)
encoder = LabelEncoder().fit(y_train)

X_train, y_train = scaler.transform(X_train), encoder.transform(y_train)
X_dev, y_dev = scaler.transform(X_dev), encoder.transform(y_dev)

# %%
estimators = [
    ('svm', LinearSVC(C=0.0001)),
    ('log', LogisticRegression(penalty='l2', C=0.001, max_iter=1000))
]
clf = StackingClassifier(
    estimators=estimators, final_estimator=GradientBoostingClassifier()
)

clf.fit(X_train, y_train)

pred_train, pred_dev = clf.predict(X_train), clf.predict(X_dev)
train_acc = clf.score(X_train, y_train)
dev_acc = clf.score(X_dev, y_dev)
train_uar = recall_score(y_train, pred_train, average='macro')
dev_uar = recall_score(y_dev, pred_dev, average='macro')

print(f"train_acc = {train_acc:.2f}, dev_acc = {dev_acc:.2f}")
print(f"train_uar = {train_uar:.2f}, dev_uar = {dev_uar:.2f}")

"""
train_acc = 0.83, dev_acc = 0.47
train_uar = 0.83, dev_uar = 0.47
"""
Example No. 28
import os

import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import ExtraTreesClassifier

TRAIN_DATA_PATH = os.getenv("TRAIN_DATA_PATH")
TEST_DATA_PATH = os.getenv("TEST_DATA_PATH")

train_data = pd.read_csv(TRAIN_DATA_PATH)
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]

sc = StandardScaler()
X_tr = sc.fit_transform(X_train)

level_0 = list()
level_0.append(('RF', ExtraTreesClassifier(n_estimators=1000)))
level_0.append(('LR', LogisticRegression(max_iter=7000)))
level_1 = LinearDiscriminantAnalysis()
model = StackingClassifier(estimators=level_0, final_estimator=level_1, cv=4)
model.fit(X_tr, y_train)

test_data = pd.read_csv(TEST_DATA_PATH)
X_te = sc.transform(test_data)
submission = model.predict(X_te)
submission = pd.DataFrame(submission)

submission.to_csv('submission.csv', header=['class'], index=False)
Example No. 29
def run(dataset, config):
    log.info(
        f"\n**** Stacking Ensemble [sklearn v{sklearn.__version__}] ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config
    estimators_params = {
        e: config.framework_params.get(f'_{e}_params', {})
        for e in ['rf', 'gbm', 'linear', 'svc', 'final']
    }

    log.info(
        "Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores."
        .format(config.max_runtime_seconds, n_jobs))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".
        format(config.metric))

    if is_classification:
        estimator = StackingClassifier(
            estimators=[
                ('rf',
                 RandomForestClassifier(n_jobs=n_jobs,
                                        random_state=config.seed,
                                        **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingClassifier(random_state=config.seed,
                                            **estimators_params['gbm'])),
                ('linear',
                 SGDClassifier(n_jobs=n_jobs,
                               random_state=config.seed,
                               **estimators_params['linear'])),
                # ('svc', LinearSVC(random_state=config.seed, **estimators_params['svc']))
            ],
            # final_estimator=SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']),
            final_estimator=LogisticRegression(n_jobs=n_jobs,
                                               random_state=config.seed,
                                               **estimators_params['final']),
            stack_method='predict_proba',
            n_jobs=n_jobs,
            **training_params)
    else:
        estimator = StackingRegressor(
            estimators=[
                ('rf',
                 RandomForestRegressor(n_jobs=n_jobs,
                                       random_state=config.seed,
                                       **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingRegressor(random_state=config.seed,
                                           **estimators_params['gbm'])),
                ('linear',
                 SGDRegressor(random_state=config.seed,
                              **estimators_params['linear'])),
                ('svc',
                 LinearSVR(random_state=config.seed,
                           **estimators_params['svc']))
            ],
            # final_estimator=SGDRegressor(random_state=config.seed, **estimators_params['final']),
            # LinearRegression accepts no random_state argument
            final_estimator=LinearRegression(n_jobs=n_jobs,
                                             **estimators_params['final']),
            n_jobs=n_jobs,
            **training_params)

    with utils.Timer() as training:
        estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    probabilities = estimator.predict_proba(
        X_test) if is_classification else None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(estimator.estimators_) + 1,
                  training_duration=training.duration)
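stack_method='predict_proba' above forces every base classifier to feed class probabilities to the meta-learner; the default 'auto' tries predict_proba, then decision_function, then predict, per estimator. That is also why the commented-out LinearSVC entry would break this configuration: LinearSVC has no predict_proba. A sketch of the distinction:

from sklearn.ensemble import StackingClassifier
from sklearn.svm import LinearSVC

# 'auto' lets LinearSVC fall back to decision_function...
ok = StackingClassifier(estimators=[('svc', LinearSVC())], stack_method='auto')
# ...while forcing predict_proba raises at fit time for LinearSVC
broken = StackingClassifier(estimators=[('svc', LinearSVC())],
                            stack_method='predict_proba')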
Example No. 30
class Classifier(object):
    def __init__(self,
                 in_model_code,
                 db,
                 y_col="party",
                 label_col="county_fips",
                 where_clauses=None,
                 data_view="master_data",
                 year_col="year",
                 year_test=2020):
        self.db = db
        self.mc = in_model_code
        self.drop_cols = db.query(ModelDropCol).filter_by(
            model_code_id=self.mc.id).all()

        where = self.db.query(ModelWhereClause).filter_by(
            model_code=self.mc).all()
        if where:
            self.where = " where " + (" and ".join([wc.sql for wc in where]))
        else:
            self.where = ""

        self.engine_string = database_string
        self.query = f"select * from {data_view}{self.where}"
        self.df = pandas.read_sql_query(
            self.query,
            database_string).drop(columns=[dc.column for dc in self.drop_cols])

        self.y = self.df[y_col].to_numpy()
        self.x = self.df.drop(columns=y_col).to_numpy()

        self.model_obj = self.db.query(Model).filter_by(
            model_code=self.mc).first()
        if not self.model_obj:

            rf = RandomForestClassifier(n_estimators=10, random_state=42)
            svr = make_pipeline(
                StandardScaler(),
                LinearSVC(random_state=42, dual=False, max_iter=1000))
            knn = KNeighborsClassifier(n_neighbors=3)
            nb = GaussianNB()
            classifiers = [("rf", rf), ("svr", svr), ("knn", knn), ("nb", nb)]
            self.model = StackingClassifier(
                estimators=classifiers, final_estimator=LogisticRegression())
            self.accuracy = None
            self.model_obj = Model(model_code=self.mc, accuracy=self.accuracy)
            self.db.add(self.model_obj)
            self.train()
            self.save()
        else:
            self.model = pickle.loads(self.model_obj.model_object)
            self.accuracy = self.model_obj.accuracy

    def train(self):
        x_train, x_test, y_train, y_test = train_test_split(self.x,
                                                            self.y,
                                                            test_size=0.33)
        self.model.fit(x_train, y_train)
        self.accuracy = self.model.score(x_test, y_test)

    def save(self):
        self.model_obj.model_object = pickle.dumps(self.model)
        self.model_obj.accuracy = self.accuracy
        self.db.commit()

    def predict(self, fips, in_file_path=None):
        """
        Currently hard coded to predict for 2020, or the latest election in which all data
        as available, but not trained on.
        """
        if "2020" in self.mc.id:
            raise IOError(
                "Must be a non-2020 model code to predict 2020 results.")
        year = 2020
        logging.info(f"Selecting {self.mc.id} model ({self.mc.description})")
        if fips in ["ALL", "*"]:
            and_clause = ""
            logging.info("Predicting all counties...")
            all_counties = True
        else:
            and_clause = f" and county_fips = {fips}"
            all_counties = False
        max_year = self.db.execute(
            f"select max(year) from ({self.query})").scalar()
        search_year = max_year - 4

        data = pandas.read_sql_query(
            f"select * from ({self.query}) where year = '{search_year}'{and_clause}",
            self.engine_string).drop(
                columns=[dc.column for dc in self.drop_cols])

        fields = list(data.columns)
        county_fips_idx = None
        for i, f in enumerate(fields):
            if f == "county_fips":
                county_fips_idx = i - 1
                break

        y = data["party"].to_numpy()
        x = data.drop(columns=["party"]).to_numpy()

        predictions = self.model.predict(x)
        out_predictions = []
        fips_to_county = {}
        logging.info("Predictions:")
        i = 0

        for val in x:
            pred = predictions[i]
            county_id = str(int(val[county_fips_idx])).zfill(6)
            if county_id in fips_to_county:
                county = fips_to_county[county_id]
            else:
                county = self.db.query(County).filter_by(id=county_id).first()
                fips_to_county[county_id] = county

            logging.info(f"{county.name} ({county.id}): {pred}")
            out_predictions.append({
                "party_prediction": pred,
                "county_fips": county_id,
                "county_name": county.name,
                "state_fips": county.state.id,
                "state_code": county.state.code
            })
            i += 1

        if in_file_path:
            logging.info(f"Writing output to {in_file_path}")
            out_cols = [
                "party_prediction", "county_fips", "county_name", "state_fips",
                "state_code"
            ]
            with open(in_file_path, "w") as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=out_cols)
                writer.writeheader()
                writer.writerows(out_predictions)
        return out_predictions