Example #1
def test_StackingEstimator_2():
    """Assert that the StackingEstimator returns transformed X with a synthetic feature in regression."""
    reg = RandomForestRegressor(random_state=42)
    stack_reg = StackingEstimator(estimator=RandomForestRegressor(random_state=42))
    # fit
    reg.fit(training_features_r, training_target_r)
    stack_reg.fit(training_features_r, training_target_r)
    # get transformed X
    X_reg_transformed = stack_reg.transform(training_features_r)

    assert np.allclose(reg.predict(training_features_r), X_reg_transformed[:, 0])
Example #2
def test_StackingEstimator_1():
    """Assert that the StackingEstimator returns transformed X with synthetic features in classification."""
    clf = RandomForestClassifier(random_state=42)
    stack_clf = StackingEstimator(estimator=RandomForestClassifier(random_state=42))
    # fit
    clf.fit(training_features, training_target)
    stack_clf.fit(training_features, training_target)
    # get transformed X
    X_clf_transformed = stack_clf.transform(training_features)

    assert np.allclose(clf.predict(training_features), X_clf_transformed[:, 0])
    assert np.allclose(clf.predict_proba(training_features), X_clf_transformed[:, 1:1 + len(np.unique(training_target))])
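Taken together, these two tests pin down the transform contract: the fitted estimator's predictions become the first synthetic column, classifiers additionally contribute their class probabilities, and the original features follow. A minimal sketch of an equivalent transform, assuming a fitted scikit-learn estimator est (this mirrors the assertions above, not TPOT's exact source):

import numpy as np

def stacking_transform(est, X):
    # synthetic column 0: the estimator's predictions
    pred = est.predict(X).reshape(-1, 1)
    if hasattr(est, 'predict_proba'):
        # classifiers: columns 1..n_classes hold the class probabilities
        return np.hstack((pred, est.predict_proba(X), X))
    return np.hstack((pred, X))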
Example #3
def test_StackingEstimator_4():
    """Assert that the StackingEstimator worked as expected in scikit-learn pipeline in regression."""
    stack_reg = StackingEstimator(estimator=RandomForestRegressor(random_state=42))
    meta_reg = Lasso(random_state=42)
    sklearn_pipeline = make_pipeline(stack_reg, meta_reg)
    # fit in pipeline
    sklearn_pipeline.fit(training_features_r, training_target_r)
    # fit step by step
    stack_reg.fit(training_features_r, training_target_r)
    X_reg_transformed = stack_reg.transform(training_features_r)
    meta_reg.fit(X_reg_transformed, training_target_r)
    # scoring
    score = meta_reg.score(X_reg_transformed, training_target_r)
    pipeline_score = sklearn_pipeline.score(training_features_r, training_target_r)
    assert np.allclose(score, pipeline_score)

    # test cv score
    cv_score = np.mean(cross_val_score(sklearn_pipeline, training_features_r, training_target_r, cv=3, scoring='r2'))
    known_cv_score = 0.795877470354

    assert np.allclose(known_cv_score, cv_score)
Example #4
def test_StackingEstimator_3():
    """Assert that the StackingEstimator worked as expected in scikit-learn pipeline in classification."""
    stack_clf = StackingEstimator(estimator=RandomForestClassifier(random_state=42))
    meta_clf = LogisticRegression()
    sklearn_pipeline = make_pipeline(stack_clf, meta_clf)
    # fit in pipeline
    sklearn_pipeline.fit(training_features, training_target)
    # fit step by step
    stack_clf.fit(training_features, training_target)
    X_clf_transformed = stack_clf.transform(training_features)
    meta_clf.fit(X_clf_transformed, training_target)
    # scoring
    score = meta_clf.score(X_clf_transformed, training_target)
    pipeline_score = sklearn_pipeline.score(training_features, training_target)
    assert np.allclose(score, pipeline_score)

    # test cv score
    cv_score = np.mean(cross_val_score(sklearn_pipeline, training_features, training_target, cv=3, scoring='accuracy'))

    known_cv_score = 0.947282375315

    assert np.allclose(known_cv_score, cv_score)
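The equivalence check in Examples 3 and 4 follows one pattern and can be factored into a small helper (a sketch, not part of the TPOT test suite; it assumes deterministic estimators with a fixed random_state):

import numpy as np
from sklearn.pipeline import make_pipeline

def assert_pipeline_equivalent(stacker, meta, X, y):
    # fitting the pipeline must give the same score as fitting the two steps by hand
    pipeline = make_pipeline(stacker, meta).fit(X, y)
    Xt = stacker.fit(X, y).transform(X)
    meta.fit(Xt, y)
    assert np.allclose(meta.score(Xt, y), pipeline.score(X, y))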
Example #5
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-15.707779240894274
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.75,
                                                          learning_rate=0.001,
                                                          loss="quantile",
                                                          max_depth=2,
                                                          max_features=0.2,
                                                          min_samples_leaf=14,
                                                          min_samples_split=12,
                                                          n_estimators=100,
                                                          subsample=0.5)),
    XGBRegressor(learning_rate=0.01,
                 max_depth=2,
                 min_child_weight=9,
                 n_estimators=100,
                 nthread=1,
                 subsample=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
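To sanity-check the exported pipeline on the held-out split, standard scikit-learn metrics can be applied to results (this evaluation step is not part of the TPOT export):

from sklearn.metrics import mean_squared_error

# lower is better; compare against the training-set score reported in the comment above
print(mean_squared_error(testing_target, results))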
Example #6
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: X and y are assumed to be defined earlier (feature matrix and target); this snippet starts mid-script
X = X[:, 1:]
pd.DataFrame(X).to_csv("./Datasets/X.csv")
pd.DataFrame(y).to_csv("./Datasets/y.csv")
Y = pd.read_csv('./Datasets/y.csv')
Y.drop('Unnamed: 0', axis=1, inplace=True)

tpot_data = pd.read_csv('./Datasets/X.csv', sep=',', dtype=np.float64)
#features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(X, Y, random_state=42)

# Average CV score on the training set was: -0.00021627142164234252
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LinearSVR(C=0.5,
                                          dual=False,
                                          epsilon=0.1,
                                          loss="squared_epsilon_insensitive",
                                          tol=0.1)), StandardScaler(),
    StackingEstimator(
        estimator=RandomForestRegressor(bootstrap=False,
                                        max_features=0.6500000000000001,
                                        min_samples_leaf=1,
                                        min_samples_split=2,
                                        n_estimators=100)),
    RandomForestRegressor(bootstrap=False,
                          max_features=0.8,
                          min_samples_leaf=2,
                          min_samples_split=10,
                          n_estimators=410))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)
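set_param_recursive walks the exported pipeline and sets the named parameter on every step that exposes it. A rough equivalent for a flat pipeline (a sketch; the real helper also recurses into nested estimators such as the model wrapped by a StackingEstimator):

for _, step in exported_pipeline.steps:
    if 'random_state' in step.get_params():
        step.set_params(random_state=42)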
Example #7
import numpy as np
import pandas as pd
from copy import copy
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:0.7648445932689355
exported_pipeline = make_pipeline(
    make_union(
        make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
        StackingEstimator(estimator=make_pipeline(
            make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
            LGBMClassifier(learning_rate=0.01193776641714437,
                           max_depth=4,
                           n_estimators=1122,
                           random_state=42)))),
    LGBMClassifier(learning_rate=0.026797460873256924,
                   max_depth=3,
                   n_estimators=216,
                   random_state=42))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
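In these exports, make_union(FunctionTransformer(copy), FunctionTransformer(copy)) simply lays two copies of the incoming features side by side, so the downstream LGBMClassifier sees every original column twice. A quick self-contained check:

import numpy as np
from copy import copy
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer

X = np.arange(6).reshape(3, 2)
doubled = make_union(FunctionTransformer(copy), FunctionTransformer(copy)).fit_transform(X)
assert doubled.shape == (3, 4)  # the two original columns appear twice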
Example #8
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-2245.3301019020714
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=RandomForestRegressor(bootstrap=False,
                                        max_features=0.7500000000000001,
                                        min_samples_leaf=3,
                                        min_samples_split=3,
                                        n_estimators=100)),
    ExtraTreesRegressor(bootstrap=False,
                        max_features=0.6000000000000001,
                        min_samples_leaf=2,
                        min_samples_split=4,
                        n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #9
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=2019)

# Average CV score on the training set was:0.9077543142597638
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=GradientBoostingClassifier(learning_rate=0.1,
                                             max_depth=3,
                                             max_features=0.15000000000000002,
                                             min_samples_leaf=5,
                                             min_samples_split=15,
                                             n_estimators=100,
                                             subsample=0.5)),
    LinearSVC(C=20.0, dual=False, loss="squared_hinge", penalty="l1",
              tol=0.01))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #10
import numpy as np
import pandas as pd
from copy import copy
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-17.58292391242326
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=make_pipeline(
            StackingEstimator(
                estimator=ElasticNetCV(l1_ratio=0.55, tol=0.001)),
            StackingEstimator(estimator=GradientBoostingRegressor(
                alpha=0.85,
                learning_rate=1.0,
                loss="lad",
                max_depth=4,
                max_features=0.6000000000000001,
                min_samples_leaf=11,
                min_samples_split=20,
                n_estimators=100,
                subsample=0.1)), LassoLarsCV(normalize=True))),
        FunctionTransformer(copy)),
    LinearSVR(C=5.0,
              dual=True,
              epsilon=1.0,
              loss="epsilon_insensitive",
              tol=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #11
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.9932996110655544
exported_pipeline = make_pipeline(
    RFE(estimator=ExtraTreesClassifier(criterion="gini",
                                       max_features=0.55,
                                       n_estimators=100),
        step=0.15000000000000002),
    StackingEstimator(
        estimator=LogisticRegression(C=1.0, dual=False, penalty="l2")),
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=False,
                                                       criterion="gini",
                                                       max_features=0.8,
                                                       min_samples_leaf=6,
                                                       min_samples_split=14,
                                                       n_estimators=100)),
    KNeighborsClassifier(n_neighbors=2, p=2, weights="distance"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #12
from sklearn.compose import TransformedTargetRegressor
from sklearn.decomposition import FastICA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, PolynomialFeatures, QuantileTransformer
from tpot.builtins import StackingEstimator, ZeroCount

# NOTE: Make sure that the class is labeled 'target' in the data file
import competitions
d = competitions.get_data()
tpot_data = d.data
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-747046.8597394783
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=1.0, tol=0.001)),
    FastICA(tol=0.8),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True,
                                                    max_features=0.5,
                                                    min_samples_leaf=14,
                                                    min_samples_split=11,
                                                    n_estimators=100)),
    ZeroCount(), MaxAbsScaler(), LassoLarsCV(normalize=False))

# exported_pipeline = TransformedTargetRegressor(regressor=exported_pipeline, transformer=QuantileTransformer(output_distribution='normal'))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
train_results = exported_pipeline.predict(training_features)
from pylab import *
Example #13
def train_model(X_train, y_train):
    #model = LogisticRegression()
    #model = RandomForestClassifier()
    #model = ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.9000000000000001, min_samples_leaf=4, min_samples_split=15)
    #model = GradientBoostingClassifier(max_depth=2, max_features=0.25, min_samples_leaf=13, min_samples_split=15, n_estimators=100, subsample=0.4)
    #model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=16, min_samples_split=10)
    #model = RandomForestClassifier(bootstrap=False, max_features=0.6500000000000001, min_samples_leaf=9, min_samples_split=19)

    model = VotingClassifier(
        estimators=
        [('lr', LogisticRegression()),
         ('et',
          ExtraTreesClassifier(bootstrap=False,
                               criterion="gini",
                               max_features=0.9000000000000001,
                               min_samples_leaf=4,
                               min_samples_split=15)),
         ('gb',
          GradientBoostingClassifier(max_depth=2,
                                     max_features=0.25,
                                     min_samples_leaf=13,
                                     min_samples_split=15,
                                     n_estimators=100,
                                     subsample=0.4)),
         ('dt',
          DecisionTreeClassifier(max_depth=7,
                                 min_samples_leaf=16,
                                 min_samples_split=10)),
         ('rf',
          RandomForestClassifier(bootstrap=False,
                                 max_features=0.6500000000000001,
                                 min_samples_leaf=9,
                                 min_samples_split=19)),
         ('gb2',
          GradientBoostingClassifier(learning_rate=0.01,
                                     max_depth=2,
                                     max_features=0.8,
                                     min_samples_leaf=11,
                                     min_samples_split=10,
                                     subsample=0.7000000000000001)),
         ('gb3',
          GradientBoostingClassifier(max_depth=7,
                                     max_features=0.15000000000000002,
                                     min_samples_leaf=5,
                                     min_samples_split=17,
                                     n_estimators=100,
                                     subsample=0.6500000000000001)),
         ('pip1',
          make_pipeline(
              StackingEstimator(
                  estimator=LinearSVC(
                      dual=False, loss="squared_hinge", tol=1e-05)),
              StandardScaler(),
              DecisionTreeClassifier(criterion="entropy",
                                     max_depth=6,
                                     min_samples_leaf=7,
                                     min_samples_split=9)))
         #('rf2', RandomForestClassifier(criterion="entropy", max_features=0.25, min_samples_split=8, n_estimators=100))
         ],
        voting='hard')

    model.fit(X_train, y_train)
    return model
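A minimal usage sketch for train_model, assuming feature and target arrays X_train, y_train plus a held-out X_valid, y_valid (hypothetical names, not defined in the source):

model = train_model(X_train, y_train)      # fit the hard-voting ensemble
print(model.score(X_valid, y_valid))       # majority-vote accuracy on the held-out split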
Example #14
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 1.0
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=MLPClassifier(alpha=0.001, learning_rate_init=0.001)),
    GradientBoostingClassifier(learning_rate=0.1,
                               max_depth=7,
                               max_features=1.0,
                               min_samples_leaf=1,
                               min_samples_split=9,
                               n_estimators=100,
                               subsample=0.9500000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #15
# Imports implied by the snippet (data loading, training_features/training_target,
# test_data and export_test_to_csv are defined elsewhere in the original project):
from copy import copy
from sklearn.cluster import FeatureAgglomeration
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFwe, SelectPercentile, f_regression
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# Average CV score on the training set was: -6.981145679172217
exported_pipeline = make_pipeline(
    make_union(
        make_union(
            FunctionTransformer(copy),
            make_pipeline(
                make_union(FunctionTransformer(copy),
                           FunctionTransformer(copy)),
                FeatureAgglomeration(affinity="euclidean", linkage="average"),
                SelectPercentile(score_func=f_regression, percentile=15),
                RBFSampler(gamma=0.8500000000000001))),
        SelectFwe(score_func=f_regression, alpha=0.005)),
    StackingEstimator(
        estimator=ExtraTreesRegressor(bootstrap=False,
                                      max_features=0.7000000000000001,
                                      min_samples_leaf=7,
                                      min_samples_split=11,
                                      n_estimators=100)),
    ExtraTreesRegressor(bootstrap=False,
                        max_features=0.9000000000000001,
                        min_samples_leaf=2,
                        min_samples_split=5,
                        n_estimators=100))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(test_data)  # test_data is assumed to be loaded elsewhere
export_test_to_csv(predictions=results)  # local helper, not shown in the source

print(len(results))
Example #16
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9104161420758785
exported_pipeline = make_pipeline(
    RobustScaler(),
    StackingEstimator(estimator=SGDClassifier(alpha=0.0,
                                              eta0=0.1,
                                              fit_intercept=False,
                                              l1_ratio=0.75,
                                              learning_rate="invscaling",
                                              loss="hinge",
                                              penalty="elasticnet",
                                              power_t=100.0)),
    XGBClassifier(learning_rate=0.1,
                  max_depth=7,
                  min_child_weight=8,
                  n_estimators=100,
                  nthread=1,
                  subsample=0.6000000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #17
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=7)

# Average CV score on the training set was: 0.8093557422969188
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False,
                                                     criterion="gini",
                                                     max_features=0.05,
                                                     min_samples_leaf=8,
                                                     min_samples_split=12,
                                                     n_estimators=100)),
    ExtraTreesClassifier(bootstrap=True,
                         criterion="entropy",
                         max_features=0.6000000000000001,
                         min_samples_leaf=20,
                         min_samples_split=13,
                         n_estimators=100))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 7)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #18
# Imports implied by the function body (the original module header is not shown;
# modif_value is a project-local helper that is also not shown):
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import LinearSVR
from sklearn.linear_model import RidgeCV
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

def main():
    # read the data
    train = pd.read_csv("../data/processed/train.csv")
    test = pd.read_csv("../data/processed/test.csv")
    train.pop("id")
    test.pop("id")
    target = train.pop("血糖")  # target column: blood glucose

    train_x = train.values  # .as_matrix() was removed in pandas 1.0
    train_y = target.values
    test_x = test.values

    high_labels = np.zeros((train_y.shape[0], ))
    for i in range(train_y.shape[0]):
        if train_y[i] < 11.2:  # flag high values in the training set (label -1 marks blood glucose >= 11.2)
            high_labels[i] = 1
        else:
            high_labels[i] = -1
    # average the predictions over 5 folds
    N = 5
    kf = KFold(n_splits=N, shuffle=True, random_state=42)  # shuffle is required for random_state to take effect
    i = 0
    result_mean = 0.0
    test_preds = np.zeros((test_x.shape[0], N))
    # dict for collecting outliers, keyed by id -> list of values, e.g. {938: [14, 14, 14], 314: [13]}
    outlier = {}
    for train_index, test_index in kf.split(train_x):
        training_features, training_target = train_x[train_index], train_y[
            train_index]
        testing_features, testing_target = train_x[test_index], train_y[
            test_index]
        # build the model and predict blood glucose
        exported_pipeline = Pipeline([
            ("scaler", MaxAbsScaler()),
            ("SVR",
             StackingEstimator(
                 estimator=LinearSVR(C=0.01,
                                     dual=False,
                                     epsilon=1.0,
                                     loss="squared_epsilon_insensitive",
                                     tol=0.001))),
            ("RidgeCV", StackingEstimator(estimator=RidgeCV())),
            # ("LGB", StackingEstimator(estimator=lgb.LGBMRegressor(objective='regression',
            #                           boosting_type="GBDT",
            #                           num_leaves=31,
            #                           learning_rate=0.01,
            #                           feature_fraction=0.5,
            #                           bagging_fraction=0.5,
            #                           bagging_freq=5,
            #                           n_estimators=400))),
            ("XGB",
             XGBRegressor(max_depth=8,
                          n_estimators=200,
                          colsample_bytree=0.8,
                          subsample=0.8,
                          tweedie_variance_power=1.4,
                          eta=0.01,
                          booster="gbtree",
                          random_state=1015,
                          gamma=1,
                          silent=1,
                          min_child_weight=5,
                          objective="reg:tweedie",
                          n_jobs=-1))
        ])

        exported_pipeline.fit(training_features, training_target)

        test_pred = exported_pipeline.predict(test_x)

        # predict outliers (modif_value is a local helper, not shown in the source)
        high_results, pred_high_list = modif_value(
            training_features, high_labels[train_index], test_x,
            train_x[np.where(high_labels == -1)[0]],
            train_y[np.where(high_labels == -1)[0]])
        # store the outliers
        if len(high_results) != 0 and len(pred_high_list) != 0:
            for ii, jj in enumerate(pred_high_list):
                if jj not in outlier:
                    outlier[jj] = []
                outlier[jj].append(high_results[ii])
        for index, value in zip(high_results, pred_high_list):
            print(index, value)

        # offline CV
        testing_results = exported_pipeline.predict(testing_features)
        # overwrite predictions for detected outliers
        cv_high_results, cv_pred_high_list = modif_value(
            training_features, high_labels[train_index], testing_features,
            train_x[np.where(high_labels == -1)[0]],
            train_y[np.where(high_labels == -1)[0]])

        if len(cv_high_results) != 0 and len(cv_pred_high_list) != 0:
            for ii, jj in enumerate(cv_pred_high_list):
                testing_results[jj] = cv_high_results[ii]

        result_mean += np.round(
            mean_squared_error(testing_target, testing_results), 5)
        print(
            'CV_ROUND (', i, ') mse -> ',
            np.round(mean_squared_error(testing_target, testing_results), 5) /
            2)

        test_preds[:, i] = test_pred
        i += 1
    results = test_preds.mean(axis=1)

    # overwrite the outlier predictions
    for index in outlier:
        print(index, outlier[index])
        results[index] = max(outlier[index])

    # offline CV
    result_mean /= N
    print("offline CV Mean squared error: %.5f" % (result_mean / 2))

    output = pd.DataFrame()
    output[0] = results
    #output.to_csv("../result/1.25-WQX-PolyFeatures.csv", header=None, index=False, encoding="utf-8")
    # output.to_csv(r'../result/test{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),
    #            header=None,index=False, float_format='%.4f')
    # save(output, 'xgb_class')
    print(output.describe())
    print(output.loc[output[0] > 8])
Example #19
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8291955789781877
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
            StackingEstimator(
                estimator=LogisticRegression(C=0.1, dual=True, penalty="l2")),
            SelectPercentile(score_func=f_classif, percentile=78)),
        make_pipeline(
            make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
            SelectPercentile(score_func=f_classif, percentile=46),
            Binarizer(threshold=0.1))), StandardScaler(),
    LinearSVC(C=0.001, dual=True, loss="hinge", penalty="l2", tol=0.01))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #20
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8666666666666668
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=XGBClassifier(learning_rate=0.1,
                                              max_depth=3,
                                              min_child_weight=4,
                                              n_estimators=100,
                                              nthread=1,
                                              subsample=0.1)),
    Normalizer(norm="l1"), ZeroCount(),
    RandomForestClassifier(bootstrap=False,
                           criterion="entropy",
                           max_features=0.1,
                           min_samples_leaf=11,
                           min_samples_split=20,
                           n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #21
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=1234)

# Average CV score on the training set was:0.7821728129678194
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=False)),
    VarianceThreshold(threshold=0.0005),
    GradientBoostingRegressor(alpha=0.8,
                              learning_rate=0.1,
                              loss="lad",
                              max_depth=4,
                              max_features=0.25,
                              min_samples_leaf=12,
                              min_samples_split=10,
                              n_estimators=100,
                              subsample=0.9000000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #22

# Imports implied by the snippet (the original omits them):
import pathlib
from copy import copy
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error as MSE, max_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from xgboost import XGBRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
data = pd.read_csv(pathlib.Path(__file__).parent.absolute().__str__() + '/student-mat.csv', sep=';')
data['target'] = data['G3']
data.drop(columns='G3', inplace=True)
features = data.drop('target', axis=1).select_dtypes([np.number])
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, data['target'], random_state=25)
features = features.dtypes.to_dict()

# Instantiate model
model = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        StackingEstimator(estimator=RidgeCV())
    ),
    XGBRegressor(learning_rate=0.1, max_depth=2, min_child_weight=9, n_estimators=1000, nthread=1,
                 objective="reg:squarederror", subsample=0.35000000000000003)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(model.steps, 'random_state', 25)
Model = model.fit(training_features, training_target)

# Building Metrics
testing_pred = Model.predict(testing_features)
score = model.score(testing_features, testing_target)
mse = MSE(testing_target, testing_pred)
rmse = mse**(1/2)
max_err = max_error(testing_target, testing_pred)  # renamed so the call does not shadow sklearn's max_error
eval_metrics_dict = {'r2': score, 'mse': mse, 'rmse': rmse, 'max_error': max_err}
Example #23
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: 0.9619047619047618
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=MultinomialNB(alpha=0.01, fit_prior=False)),
    PCA(iterated_power=10, svd_solver="randomized"), GaussianNB())
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #24
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -4.3217232470775455
exported_pipeline = make_pipeline(
    MinMaxScaler(),
    StackingEstimator(estimator=LinearSVR(C=0.01, dual=True, epsilon=0.1, loss="squared_epsilon_insensitive", tol=0.0001)),
    StackingEstimator(estimator=LinearSVR(C=20.0, dual=True, epsilon=0.01, loss="epsilon_insensitive", tol=0.1)),
    RidgeCV()
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #25
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeRegressor
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-24608.83594657029
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=DecisionTreeRegressor(
        max_depth=3, min_samples_leaf=20, min_samples_split=3)),
    DecisionTreeRegressor(max_depth=10,
                          min_samples_leaf=9,
                          min_samples_split=4))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #26
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:-0.002941894277857136
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.85, learning_rate=0.01, loss="lad", max_depth=2, max_features=0.15000000000000002, min_samples_leaf=7, min_samples_split=7, n_estimators=100, subsample=0.4)),
    MinMaxScaler(),
    StackingEstimator(estimator=LinearSVR(C=1.0, dual=True, epsilon=0.001, loss="epsilon_insensitive", tol=1e-05)),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    RandomForestRegressor(bootstrap=False, max_features=0.45, min_samples_leaf=6, min_samples_split=3, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #27
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.991096262627164
exported_pipeline = make_pipeline(
    RFE(estimator=ExtraTreesClassifier(criterion="gini",
                                       max_features=0.6000000000000001,
                                       n_estimators=100),
        step=0.15000000000000002),
    StackingEstimator(estimator=DecisionTreeClassifier(criterion="entropy",
                                                       max_depth=8,
                                                       min_samples_leaf=4,
                                                       min_samples_split=8)),
    KNeighborsClassifier(n_neighbors=1, p=2, weights="uniform"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #28
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: meta and shared are assumed to be DataFrames loaded earlier (not shown)
# merge the two datasets on 'sample'
data = pd.merge(meta, shared, on=['sample'])
# remove adenoma samples
data = data[data.dx.str.contains("adenoma") == False]
data.rename(columns={'dx': 'class'}, inplace=True)
x = data.drop(["sample", "class", "numOtus", "label"], axis=1)
diagnosis = {"cancer": 1, "normal": 0}
y = data["class"].replace(diagnosis)
# note: dropna() returns a copy, so without reassignment these two calls are no-ops
y.dropna()
x.dropna()

# Score on the training set was:0.8492612704601008
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=True,
                                                       criterion="gini",
                                                       max_features=0.8,
                                                       min_samples_leaf=2,
                                                       min_samples_split=2,
                                                       n_estimators=100)),
    StackingEstimator(
        estimator=RandomForestClassifier(bootstrap=True,
                                         criterion="entropy",
                                         max_features=0.6000000000000001,
                                         min_samples_leaf=8,
                                         min_samples_split=7,
                                         n_estimators=100)),
    StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini",
                                                       max_depth=3,
                                                       min_samples_leaf=13,
                                                       min_samples_split=10)),
    RFE(estimator=ExtraTreesClassifier(criterion="entropy",
                                       max_features=0.6500000000000001,
Example #29
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import PolynomialFeatures
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.9800000000000001
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    StackingEstimator(estimator=GaussianNB()),
    MultinomialNB(alpha=10.0, fit_prior=True)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #30
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.2532849505281343
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=89),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001,
                                             max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=50,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)), MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                             eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
                                             loss="huber",
                                             penalty="elasticnet",
Example #31
# Imports implied by the function body (the original module header is not shown;
# Ensemble is a project-local out-of-fold stacking helper, sketched at the end of this example):
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNetCV, LinearRegression, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

def main():
    train = pd.read_csv("../data/processed/train.csv")
    train.pop("id")
    target = train.pop("血糖")  # target column: blood glucose
    train_x = train.values  # .as_matrix() was removed in pandas 1.0
    train_y = target.values

    N = 5
    kf = KFold(n_splits=N, shuffle=True, random_state=42)  # shuffle is required for random_state to take effect
    result_mean = 0.0
    for train_index, test_index in kf.split(train_x):
        training_features, training_target = train_x[train_index], train_y[
            train_index]
        testing_features, testing_target = train_x[test_index], train_y[
            test_index]

        scaler = MaxAbsScaler()
        scaler.fit(training_features)

        training_features = scaler.transform(training_features)
        testing_features = scaler.transform(testing_features)

        knn = KNeighborsRegressor(n_neighbors=9, p=1, weights="distance")
        linear_svr = LinearSVR(C=0.01,
                               dual=False,
                               epsilon=1.0,
                               loss="squared_epsilon_insensitive",
                               tol=0.001)
        ridge = RidgeCV()
        gbm = lgb.LGBMRegressor(objective='regression',
                                boosting_type="GBDT",
                                num_leaves=17,
                                learning_rate=0.01,
                                feature_fraction=0.5,
                                bagging_fraction=0.5,
                                bagging_freq=5,
                                reg_alpha=0.1,
                                reg_lambda=0.5,
                                n_estimators=400)
        lr = LinearRegression()
        en = ElasticNetCV(l1_ratio=0.1, tol=0.01)
        xgb = XGBRegressor(learning_rate=0.01,
                           max_depth=8,
                           min_child_weight=8,
                           n_estimators=100,
                           nthread=1,
                           subsample=0.15000000000000002)
        et = ExtraTreesRegressor(bootstrap=True,
                                 max_features=0.35000000000000003,
                                 min_samples_leaf=3,
                                 min_samples_split=12,
                                 n_estimators=100)

        rf = RandomForestRegressor(bootstrap=True,
                                   max_features=0.9500000000000001,
                                   min_samples_leaf=15,
                                   min_samples_split=6,
                                   n_estimators=100)

        exported_pipeline0 = make_pipeline(
            StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.1, tol=0.01)),
            ExtraTreesRegressor(bootstrap=False,
                                max_features=0.2,
                                min_samples_leaf=3,
                                min_samples_split=16,
                                n_estimators=100))

        exported_pipeline1 = Pipeline([
            ("SVR",
             StackingEstimator(
                 estimator=LinearSVR(C=0.01,
                                     dual=False,
                                     epsilon=1.0,
                                     loss="squared_epsilon_insensitive",
                                     tol=0.001))),
            ("RidgeCV", StackingEstimator(estimator=RidgeCV())),
            ("LGB",
             lgb.LGBMRegressor(objective='regression',
                               boosting_type="GBDT",
                               num_leaves=17,
                               learning_rate=0.01,
                               feature_fraction=0.5,
                               bagging_fraction=0.5,
                               bagging_freq=5,
                               reg_alpha=0.1,
                               reg_lambda=0.5,
                               n_estimators=400))
        ])

        exported_pipeline2 = make_pipeline(
            StackingEstimator(estimator=RidgeCV()),
            StackingEstimator(
                estimator=XGBRegressor(learning_rate=0.01,
                                       max_depth=8,
                                       min_child_weight=8,
                                       n_estimators=100,
                                       nthread=1,
                                       subsample=0.15000000000000002)),
            ExtraTreesRegressor(bootstrap=True,
                                max_features=0.35000000000000003,
                                min_samples_leaf=3,
                                min_samples_split=12,
                                n_estimators=100))

        stack = Ensemble(n_splits=10,
                         stacker=LinearRegression(),
                         base_models=(rf, knn, lr, linear_svr, ridge, en, gbm,
                                      xgb, et, exported_pipeline0,
                                      exported_pipeline1, exported_pipeline2))

        results = stack.fit_predict(X=training_features,
                                    y=training_target,
                                    T=testing_features)
        result_mean += np.round(mean_squared_error(testing_target, results), 5)

    result_mean /= (N)
    print("Mean squared error: %.5f" % (result_mean / 2))
Example #32
import numpy as np

from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)), StandardScaler(),
    MinMaxScaler(), KNeighborsRegressor(n_neighbors=52,
                                        p=1,
                                        weights="distance"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
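This export uses the older np.recfromcsv loading template; the pandas equivalent used by the newer exports on this page would be (a sketch with the same placeholder path and separator):

import numpy as np
import pandas as pd

tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('class', axis=1).values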
Example #33
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.982083588281725
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.1,
                                             max_depth=2,
                                             min_child_weight=8,
                                             n_estimators=300,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.1,
                                             verbosity=0)), MinMaxScaler(),
    LinearSVR(C=20.0,
              dual=True,
              epsilon=1.0,
              loss="epsilon_insensitive",
              tol=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #34
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.6111975314359399
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=BernoulliNB(alpha=0.01, fit_prior=True)),
    LinearSVC(C=20.0,
              dual=False,
              loss="squared_hinge",
              penalty="l1",
              tol=0.0001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)