Example #1
import numpy as np
from tpot.builtins import ZeroCount

def test_ZeroCount():
    """Assert that ZeroCount operator returns correct transformed X."""
    # X is a module-level fixture in TPOT's test suite; a consistent
    # example array is sketched after this test.
    op = ZeroCount()
    X_transformed = op.transform(X)
    zero_col = np.array([3, 2, 1, 4])
    non_zero = np.array([2, 3, 4, 1])

    assert np.allclose(zero_col, X_transformed[:, 0])
    assert np.allclose(non_zero, X_transformed[:, 1])
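For context, ZeroCount prepends two engineered columns to X: column 0 holds the per-row count of zero entries and column 1 the per-row count of non-zero entries, followed by the original features. A minimal sketch with an assumed 4x5 fixture, chosen so that it reproduces the expected values asserted in the test above:

import numpy as np
from tpot.builtins import ZeroCount

# Assumed fixture: any 4x5 matrix with 3, 2, 1 and 4 zeros per row works.
X = np.array([[0, 1, 7, 0, 0],
              [3, 0, 0, 2, 19],
              [0, 1, 3, 4, 5],
              [5, 0, 0, 0, 0]])

X_transformed = ZeroCount().transform(X)
print(X_transformed[:, 0])  # zero counts per row: [3. 2. 1. 4.]
print(X_transformed[:, 1])  # non-zero counts per row: [2. 3. 4. 1.]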
Example #3
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.pipeline import make_pipeline
from tpot.builtins import StackingEstimator, ZeroCount
from tpot.export_utils import set_param_recursive
from xgboost import XGBClassifier

def get_model_v4():
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=XGBClassifier(learning_rate=0.001, max_depth=2, min_child_weight=17, n_estimators=100, nthread=1, subsample=0.8)),
        ZeroCount(),
        VarianceThreshold(threshold=0.2),
        RFE(estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.15000000000000002, n_estimators=100), step=0.2),
        GradientBoostingClassifier(learning_rate=0.5, max_depth=7, max_features=0.15000000000000002, min_samples_leaf=2, min_samples_split=3, n_estimators=100, subsample=1.0)
    )
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 37)
    return exported_pipeline
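A hypothetical usage sketch for the factory above (X_train, y_train and X_test are assumed names supplied by the caller):

model = get_model_v4()
model.fit(X_train, y_train)
predictions = model.predict(X_test)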
Example #4

def pipeline_suggested_by_tpot(self):
    # Copied from the optimal pipeline suggested by TPOT in "optimal_pipeline.py"
    exported_pipeline = make_pipeline(
        PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
        VarianceThreshold(threshold=0.2),
        ZeroCount(),
        GradientBoostingClassifier(learning_rate=1.0, max_depth=10, max_features=0.9000000000000001, min_samples_leaf=16, min_samples_split=3, n_estimators=100, subsample=0.7000000000000001)
    )
    # Fit on the training split held by this object
    exported_pipeline.fit(self.x_train, self.y_train)

    print(f"Train acc: {exported_pipeline.score(self.x_train, self.y_train)}")
    print(f"Test acc: {exported_pipeline.score(self.x_test, self.y_test)}")
Example #5

def clf(in_put, out_put):
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LassoLarsCV
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.svm import LinearSVR
    from tpot.builtins import StackingEstimator, ZeroCount

    
    exported_pipeline = make_pipeline(
            PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
            StackingEstimator(estimator=LinearSVR(C=15.0, dual=True, epsilon=1.0, loss="epsilon_insensitive", tol=1e-05)),
            PCA(iterated_power=7, svd_solver="randomized"),
            ZeroCount(),
            LassoLarsCV(normalize=True)
            )

    exported_pipeline.fit(in_put, out_put)
    results = exported_pipeline.predict(in_put)
    
    return results
Example #6
    def __init__(self):
        # Read the CSV file with pandas; it holds the same data as the database
        tpot_data = pd.read_csv('cryptodata.csv', sep=',')
        # For X, keep only the DASH average price, volume, Google Trends score
        # and Twitter sentiment
        X = tpot_data[tpot_data["symbol"] == "DASH"][[
            "price_ave", "volume", "google_trend", "twitter_sent"
        ]].values
        # For y, keep only the price column
        y = tpot_data[tpot_data["symbol"] == "DASH"][["price"]].values
        training_features, testing_features, training_target, testing_target = \
                                            train_test_split(X, y, random_state=42)
        self.__std = stdev([item[0] for item in y])

        # Score on the training set was: -6.2249531865813035
        self.exported_pipeline = make_pipeline(
            VarianceThreshold(threshold=0.05), ZeroCount(),
            PCA(iterated_power=1, svd_solver="randomized"),
            StackingEstimator(estimator=ElasticNetCV(
                l1_ratio=0.8500000000000001, tol=0.001)),
            StackingEstimator(
                estimator=LinearSVR(C=20.0,
                                    dual=False,
                                    epsilon=0.01,
                                    loss="squared_epsilon_insensitive",
                                    tol=0.01)),
            ExtraTreesRegressor(bootstrap=False,
                                max_features=0.8,
                                min_samples_leaf=1,
                                min_samples_split=2,
                                n_estimators=100))

        self.exported_pipeline.fit(training_features, training_target.ravel())
        self.y_predict = self.exported_pipeline.predict(testing_features)
        self.y_real = testing_target.ravel()
        self.score = self.exported_pipeline.score(testing_features,
                                                  testing_target)
Example #7
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from tpot.builtins import ZeroCount
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=1)

# Average CV score on the training set was: 0.9347254053136407
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    VarianceThreshold(threshold=0.2), ZeroCount(),
    GradientBoostingClassifier(learning_rate=1.0,
                               max_depth=10,
                               max_features=0.9000000000000001,
                               min_samples_leaf=16,
                               min_samples_split=3,
                               n_estimators=100,
                               subsample=0.7000000000000001))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
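The set_param_recursive call above walks the exported pipeline and pins random_state on every step that accepts it, which makes the whole pipeline reproducible. A rough sketch of the idea (a simplified stand-in, not TPOT's actual implementation, which also recurses into nested estimators):

def pin_random_state(steps, value):
    # Simplified illustration: set random_state on each top-level step
    # that exposes the parameter.
    for _, estimator in steps:
        if 'random_state' in estimator.get_params():
            estimator.set_params(random_state=value)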
Example #8
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector, ZeroCount

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=39)

# Average CV score on the training set was:0.6838260869565218
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=4, subset_list="module23.csv"), ZeroCount(),
    ExtraTreesClassifier(bootstrap=True,
                         criterion="entropy",
                         max_features=0.6500000000000001,
                         min_samples_leaf=14,
                         min_samples_split=18,
                         n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #9
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8686359265648864
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=BernoulliNB(alpha=0.1, fit_prior=True)),
    StackingEstimator(estimator=GaussianNB()), ZeroCount(),
    RandomForestClassifier(bootstrap=True,
                           criterion="gini",
                           max_features=0.4,
                           min_samples_leaf=12,
                           min_samples_split=3,
                           n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
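StackingEstimator, chained twice above, fits its wrapped estimator and prepends that estimator's predictions (and class probabilities, when the estimator provides them) to the feature matrix, so the final classifier sees both the raw features and the meta-features. A hand-rolled sketch of the idea (not TPOT's implementation):

import numpy as np
from sklearn.naive_bayes import GaussianNB

def stacked_features(estimator, X, y):
    # Fit the base estimator, then prepend its predictions as a new column.
    estimator.fit(X, y)
    preds = np.asarray(estimator.predict(X)).reshape(-1, 1)
    return np.hstack((preds, X))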
Example #10

import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import RobustScaler
from tpot.builtins import StackingEstimator, ZeroCount
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=123)

# Average CV score on the training set was: 0.9837855826340283
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    RobustScaler(), ZeroCount(),
    GaussianProcessRegressor(kernel=Matern(length_scale=4.3999999999999995,
                                           nu=2.5),
                             n_restarts_optimizer=60,
                             normalize_y=False))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 123)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #11

import numpy as np
import pandas as pd
from copy import copy
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler, Normalizer
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator, ZeroCount

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8336787977583366
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            make_union(
                make_union(Normalizer(norm="l2"), MaxAbsScaler()),
                StackingEstimator(
                    estimator=DecisionTreeClassifier(criterion="entropy",
                                                     max_depth=2,
                                                     min_samples_leaf=11,
                                                     min_samples_split=17))),
            StackingEstimator(
                estimator=MLPClassifier(alpha=0.01, learning_rate_init=0.001)),
            SelectPercentile(score_func=f_classif, percentile=54)),
        FunctionTransformer(copy)), ZeroCount(),
    StackingEstimator(estimator=BernoulliNB(alpha=1.0, fit_prior=True)),
    MultinomialNB(alpha=0.01, fit_prior=True))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #12

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from tpot.builtins import StackingEstimator, ZeroCount
# parse_data_config, parse_representation_config, parse_task_config and
# utils_section are project-specific helpers from the metstab_shap package
from metstab_shap.data import load_data

# load data (and change to classification if needed)
data_cfg = parse_data_config('configs/data/rat.cfg')
repr_cfg = parse_representation_config('configs/repr/maccs.cfg')
task_cfg = parse_task_config('configs/task/regression.cfg')
x, y, _, test_x, test_y, smiles, test_smiles = load_data(
    data_cfg, **repr_cfg[utils_section])

training_features = x
training_target = y
testing_features = test_x

# Average CV score on the training set was: -0.15289999993179348
exported_pipeline = make_pipeline(
    ZeroCount(), MinMaxScaler(),
    StackingEstimator(estimator=DecisionTreeRegressor(max_depth=5,
                                                      max_features=0.25,
                                                      min_samples_leaf=3,
                                                      min_samples_split=14,
                                                      splitter="best")),
    StackingEstimator(
        estimator=ExtraTreesRegressor(bootstrap=False,
                                      max_depth=4,
                                      max_features=0.7500000000000001,
                                      max_samples=None,
                                      min_samples_leaf=1,
                                      min_samples_split=10,
                                      n_estimators=1000)),
    Binarizer(threshold=0.9),
    ExtraTreesRegressor(bootstrap=False,
Example #13

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8666666666666668
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=XGBClassifier(learning_rate=0.1,
                                              max_depth=3,
                                              min_child_weight=4,
                                              n_estimators=100,
                                              nthread=1,
                                              subsample=0.1)),
    Normalizer(norm="l1"), ZeroCount(),
    RandomForestClassifier(bootstrap=False,
                           criterion="entropy",
                           max_features=0.1,
                           min_samples_leaf=11,
                           min_samples_split=20,
                           n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #14
import numpy as np
import pandas as pd

from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('../../input/train.csv',
                          delimiter=',',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(ZeroCount(), LassoLarsCV(normalize=True))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
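A portability note for this and the other snippets that use LassoLarsCV(normalize=True): scikit-learn deprecated the normalize parameter in 1.0 and removed it in 1.2, so these exports only run as-is on older versions. A hedged sketch of an explicit-scaler substitute (not numerically identical to the old behaviour, which scaled each feature by its l2 norm after centering):

from sklearn.linear_model import LassoLarsCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tpot.builtins import ZeroCount

exported_pipeline = make_pipeline(ZeroCount(), StandardScaler(), LassoLarsCV())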
Example #15

                                             max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=100,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)), MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                             eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
                                             loss="huber",
                                             penalty="elasticnet",
                                             power_t=0.0)),
    StackingEstimator(estimator=LinearSVR(
        C=25.0, dual=True, epsilon=0.1, loss="epsilon_insensitive",
        tol=0.0001)),
    FeatureAgglomeration(affinity="manhattan", linkage="complete"),
    SelectPercentile(score_func=f_regression, percentile=6),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False,
                                                    max_features=0.8,
                                                    min_samples_leaf=19,
                                                    min_samples_split=10,
                                                    n_estimators=400)),
    ZeroCount(), FeatureAgglomeration(affinity="l1", linkage="complete"),
    RidgeCV())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #16

import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8453186610518303
exported_pipeline = make_pipeline(
    make_union(make_pipeline(ZeroCount(), FastICA(tol=0.2)),
               FunctionTransformer(copy)),
    ExtraTreesClassifier(bootstrap=False,
                         criterion="entropy",
                         max_features=0.2,
                         min_samples_leaf=1,
                         min_samples_split=4,
                         n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #17
knr_params3 = {'n_neighbors' : 15}

knr_params4 = {'n_neighbors' : 25}

SEED = 0

level_1_models = [XgbWrapper(seed=SEED, params=xgb_params, cv_fold=4)]

# level_1_models = level_1_models + [SklearnWrapper(clf=KNeighborsRegressor,  params=knr_params1),
#                  SklearnWrapper(clf=KNeighborsRegressor,  params=knr_params2),
#                  SklearnWrapper(clf=KNeighborsRegressor,  params=knr_params3),
#                  SklearnWrapper(clf=KNeighborsRegressor,  params=knr_params4)]

level_1_models = level_1_models + [
    SklearnWrapper(make_pipeline(ZeroCount(), LassoLarsCV(normalize=True))),  # LB 0.55797
    SklearnWrapper(make_pipeline(
        StackingEstimator(estimator=LassoLarsCV(normalize=True)),
        StackingEstimator(estimator=GradientBoostingRegressor(
            learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55,
            min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
        LassoLarsCV())),
]

params_list = [rf_params1, rf_params2, et_params1, et_params2, gb_params1, #gb_params2, 
               rd_params, ls_params, 
               eln_params, 
               lcv_params,
               llcv_params
               ]
   

func_list = [RandomForestRegressor, RandomForestRegressor, ExtraTreesRegressor, ExtraTreesRegressor, 
Example #18
def test_ZeroCount_fit():
    """Assert that fit() in ZeroCount does nothing."""
    op = ZeroCount()
    ret_op = op.fit(X)

    assert ret_op == op
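ZeroCount keeps no fitted state, which is why fit() can simply return self: transform() only counts zeros row by row. A minimal sketch assuming a small integer matrix:

import numpy as np
from tpot.builtins import ZeroCount

X = np.array([[0, 2],
              [1, 0]])
out = ZeroCount().transform(X)  # no fit required
print(out)  # [[1. 1. 0. 2.]
            #  [1. 1. 1. 0.]]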
Example #19

import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, PolynomialFeatures
from tpot.builtins import StackingEstimator, ZeroCount

# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-747046.8597394783
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=1.0, tol=0.001)),
    FastICA(tol=0.8),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True,
                                                    max_features=0.5,
                                                    min_samples_leaf=14,
                                                    min_samples_split=11,
                                                    n_estimators=100)),
    ZeroCount(), MaxAbsScaler(), LassoLarsCV(normalize=False))

# exported_pipeline = TransformedTargetRegressor(regressor=exported_pipeline, transformer=QuantileTransformer(output_distribution='normal'))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
train_results = exported_pipeline.predict(training_features)
from pylab import *
# figure(1)
# clf()
# ion()
# plot(training_target, train_results, 'bo')
# plot(testing_target, results, 'ro')
# show()

competitions.plot_predict(exported_pipeline)  # project-specific plotting helper
Example #20

    RobustScaler(), MinMaxScaler(),
    StackingEstimator(estimator=LinearSVR(C=25.0,
                                          dual=True,
                                          epsilon=0.01,
                                          loss="epsilon_insensitive",
                                          tol=0.0001)),
    StackingEstimator(estimator=DecisionTreeRegressor(
        max_depth=8, min_samples_leaf=17, min_samples_split=9)),
    FeatureAgglomeration(affinity="l2", linkage="average"),
    RBFSampler(gamma=0.75),
    StackingEstimator(estimator=LinearSVR(C=1.0,
                                          dual=True,
                                          epsilon=1.0,
                                          loss="squared_epsilon_insensitive",
                                          tol=0.1)),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=9, p=1, weights="uniform")),
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    SelectPercentile(score_func=f_regression, percentile=26), StandardScaler(),
    PCA(iterated_power=7, svd_solver="randomized"),
    StackingEstimator(estimator=LinearSVR(C=10.0,
                                          dual=True,
                                          epsilon=0.01,
                                          loss="squared_epsilon_insensitive",
                                          tol=1e-05)), ZeroCount(),
    SelectFwe(score_func=f_regression, alpha=0.039),
    PCA(iterated_power=5, svd_solver="randomized"), RidgeCV())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #21
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector, ZeroCount

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=96)

# Average CV score on the training set was:0.7070745272525027
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=0, subset_list="subsets.csv"), ZeroCount(),
    ExtraTreesClassifier(bootstrap=False,
                         criterion="gini",
                         max_features=0.8,
                         min_samples_leaf=1,
                         min_samples_split=8,
                         n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #23

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBClassifier

tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.6933333333333334
exported_pipeline = make_pipeline(
    VarianceThreshold(threshold=0.0005),
    StackingEstimator(estimator=XGBClassifier(learning_rate=0.1,
                                              max_depth=1,
                                              min_child_weight=17,
                                              n_estimators=100,
                                              nthread=1,
                                              subsample=0.6000000000000001)),
    StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini",
                                                       max_depth=2,
                                                       min_samples_leaf=19,
                                                       min_samples_split=14)),
    StackingEstimator(estimator=BernoulliNB(alpha=1.0, fit_prior=True)), ZeroCount(),
    GradientBoostingClassifier(learning_rate=0.01,
                               max_depth=4,
                               max_features=0.8500000000000001,
                               min_samples_leaf=6,
                               min_samples_split=15,
                               n_estimators=100,
                               subsample=0.6500000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #24

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.504247990815155
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), ZeroCount()),
    XGBClassifier(learning_rate=0.001,
                  max_depth=3,
                  min_child_weight=3,
                  n_estimators=100,
                  nthread=1,
                  subsample=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #25

import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount
from tpot.export_utils import set_param_recursive

# NOTE: the outcome column in this data file is labeled 'PSL_Won'
tpot_data = pd.read_csv("data.csv")
features = tpot_data.drop('PSL_Won', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['PSL_Won'], random_state=42)

# Average CV score on the training set was: 0.8671594508975712
exported_pipeline = make_pipeline(
    SelectFwe(score_func=f_classif, alpha=0.011), ZeroCount(),
    ExtraTreesClassifier(bootstrap=False,
                         criterion="entropy",
                         max_features=1.0,
                         min_samples_leaf=7,
                         min_samples_split=20,
                         n_estimators=100))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

## Plot
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, mean_squared_error
Example #26

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, StandardScaler
from tpot.builtins import ZeroCount

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.8591858482523443
exported_pipeline = make_pipeline(
    StandardScaler(), ZeroCount(),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    RobustScaler(),
    GradientBoostingClassifier(learning_rate=1.0,
                               max_depth=6,
                               max_features=0.45,
                               min_samples_leaf=10,
                               min_samples_split=15,
                               n_estimators=100,
                               subsample=0.7000000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #27

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:0.8342718814237802
exported_pipeline = make_pipeline(
    make_union(ZeroCount(), FunctionTransformer(copy)),
    RFE(estimator=ExtraTreesClassifier(criterion="gini",
                                       max_features=0.6000000000000001,
                                       n_estimators=100),
        step=0.7000000000000001),
    ExtraTreesClassifier(bootstrap=False,
                         criterion="gini",
                         max_features=0.9000000000000001,
                         min_samples_leaf=4,
                         min_samples_split=15,
                         n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #28

import numpy as np
import pandas as pd
from copy import copy
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, Normalizer
from tpot.builtins import OneHotEncoder, StackingEstimator, ZeroCount
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:0.84550605863897
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            OneHotEncoder(minimum_fraction=0.25, sparse=False, threshold=10),
            RFE(estimator=ExtraTreesClassifier(criterion="gini",
                                               max_features=0.5,
                                               n_estimators=100),
                step=0.2), ZeroCount(), MinMaxScaler()),
        FunctionTransformer(copy)), Normalizer(norm="max"),
    XGBClassifier(learning_rate=0.01,
                  max_depth=6,
                  min_child_weight=7,
                  n_estimators=600,
                  nthread=1,
                  subsample=0.9500000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #29

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.7291721934005595
exported_pipeline = make_pipeline(
    ZeroCount(), SelectPercentile(score_func=f_classif, percentile=66),
    LogisticRegression(C=0.0001, dual=False, penalty="l2"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #30
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler
from tpot.builtins import StackingEstimator, ZeroCount

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.47500297900297905
exported_pipeline = make_pipeline(
    make_union(MinMaxScaler(), make_pipeline(ZeroCount(), MinMaxScaler())),
    ZeroCount(),
    StackingEstimator(estimator=BernoulliNB(alpha=100.0, fit_prior=False)),
    LogisticRegression(C=0.01, dual=True, penalty="l2"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #31
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount
from xgboost import XGBRegressor

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-0.0004955828805812525
exported_pipeline = make_pipeline(
    ZeroCount(),
    XGBRegressor(learning_rate=0.1,
                 max_depth=8,
                 min_child_weight=16,
                 n_estimators=100,
                 nthread=1,
                 subsample=1.0))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #32

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import PolynomialFeatures
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBRegressor

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-15.336456888232188
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=89), ZeroCount(),
    StackingEstimator(
        estimator=GradientBoostingRegressor(alpha=0.75,
                                            learning_rate=0.01,
                                            loss="quantile",
                                            max_depth=1,
                                            max_features=0.35000000000000003,
                                            min_samples_leaf=4,
                                            min_samples_split=17,
                                            n_estimators=100,
                                            subsample=0.9000000000000001)),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    XGBRegressor(learning_rate=0.01,
                 max_depth=6,
                 min_child_weight=9,
                 n_estimators=100,