Пример #1
0
# Copy the 'Model' column from the source frame into the feature frame.
sample_df['Model'] = df['Model']

# One-hot encode the 'Cylinders' column (one indicator column per distinct value).
origin_dummies = pd.get_dummies(df['Cylinders'])

# Append every indicator column to the feature frame; iterating list(DataFrame)
# yields its column labels.
for origin_column in list(origin_dummies):
    sample_df[origin_column] = origin_dummies[origin_column]

# 70% of the rows go to training; no random_state is given, so the split
# differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(sample_df,
                                                    labels,
                                                    train_size=0.7)

# NOTE(review): this encoder is created but not used anywhere in the visible
# code — confirm whether it is still needed.
le = preprocessing.LabelEncoder()

# Run the TPOT search, print the score on the held-out split and export the
# best pipeline as a Python script.
tpot = TPOTClassifier(generations=7, population_size=15, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_cars_pipeline.py')

#tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
#Best pipeline: GradientBoostingClassifier(RobustScaler(input_matrix), GradientBoostingClassifier__learning_rate=1.0, GradientBoostingClassifier__max_depth=5, GradientBoostingClassifier__max_features=0.25, GradientBoostingClassifier__min_samples_leaf=DEFAULT, GradientBoostingClassifier__min_samples_split=17, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=0.7)
# 0.770491803279

#tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)
#Best pipeline: ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=False, ExtraTreesClassifier__criterion=DEFAULT, ExtraTreesClassifier__max_features=0.45, ExtraTreesClassifier__min_samples_leaf=1, ExtraTreesClassifier__min_samples_split=7, ExtraTreesClassifier__n_estimators=DEFAULT)
#0.762295081967

#Sin MPG
#tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
#Best pipeline: ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=DEFAULT, ExtraTreesClassifier__criterion=gini, ExtraTreesClassifier__max_features=0.45, ExtraTreesClassifier__min_samples_leaf=1, ExtraTreesClassifier__min_samples_split=6, ExtraTreesClassifier__n_estimators=DEFAULT)
Пример #2
0
X_train.shape, X_test.shape, y_train.shape, y_test.shape


# In[3]:


# Project-local TPOT operator configuration.
from config.classifier_models_only import classifier_config_dict
# Passed to max_time_mins below, i.e. the search budget in minutes.
time_allocated = 60


# In[4]:


# Time-boxed TPOT search over the custom operator set, scored with
# negative log loss and run with 8 parallel jobs.
tpot = TPOTClassifier(
    max_time_mins=time_allocated,
    config_dict=classifier_config_dict,
    verbosity=3,
    scoring="neg_log_loss",
    n_jobs=8)
tpot.fit(X_train, y_train)
# Print the score of the fitted search on the held-out split.
print(tpot.score(X_test, y_test))


# In[4]:


# tpot.export('tpot_mnist_pipeline.py')


# In[ ]:

Пример #3
0
        'sklearn.feature_selection.SelectFromModel': {
            'threshold': np.arange(0, 1.01, 0.05),
            'estimator': {
                'sklearn.ensemble.ExtraTreesClassifier': {
                    'n_estimators': [100],
                    'criterion': ['gini', 'entropy'],
                    'max_features': np.arange(0.05, 1.01, 0.05)
                }
            }
        }
    }
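    # generations: number of generations (iterations) of offspring to evolve
    # population_size=10: number of individuals created for the initial population
    # offspring_size: number of individuals to create in each generation
    # crossover_rate: fraction of individuals used to produce offspring
    # mutation_rate: probability that an attribute value is randomly changed

    # TPOT is built on a genetic algorithm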

    tpot = TPOTClassifier(generations=1,
                          population_size=10,
                          verbosity=2,
                          config_dict=tpot_config)
    tpot.fit(X_train, y_train)
    tpot.score(X_test, y_test)

    tpot.export('/Users/sheng/PycharmProjects/untitled/guowei/chishi.py')

    #tpot.score()
    # tpot.export(result.py)    exports standard scikit-learn code
Пример #4
0
"""**7. Selecting model using TPOT**


"""

# Bring in the AutoML search class and the ROC-AUC metric.
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score

# Seeded search using the built-in 'TPOT light' configuration, ranked by ROC AUC.
tpot = TPOTClassifier(
    config_dict='TPOT light',
    scoring='roc_auc',
    random_state=42,
    generations=5,
    population_size=20,
    verbosity=2,
    disable_update_check=True,
)
tpot.fit(X_train, y_train)

# AUC on the held-out split, computed from the second column of predict_proba.
tpot_auc_score = roc_auc_score(y_test, tpot.predict_proba(X_test)[:, 1])
print(f'\nAUC score: {tpot_auc_score:.4f}')

# List the steps of the winning pipeline, numbered from 1.
print('\nbest pipeline steps:')
for idx, (name, transform) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    print(f'{idx}. {transform}')

"""**8. Checking the variance**
Пример #5
0
# Load the spreadsheet: the second row holds the column names and the first
# column is used as the index.
data = pd.read_excel(path, header=1, index_col=0)
# Shorten the name of the 'default payment next month' column.
data = data.rename(columns={'default payment next month': "default"})

print2(data.head())
# Features are every column but the last; the last column is the label and
# is also used to stratify the 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1],
                                                    data.iloc[:, -1],
                                                    stratify=data.iloc[:, -1],
                                                    test_size=0.3)
print2(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Search settings passed to TPOTClassifier below
number_generations = 3
population_size = 5
offspring_size = 10
scoring_function = "accuracy"

# Create the TPOT classifier (2-fold cross-validation, fixed random seed)
tpot_clf = TPOTClassifier(generations=number_generations,
                          population_size=population_size,
                          offspring_size=offspring_size,
                          scoring=scoring_function,
                          verbosity=2,
                          random_state=2,
                          cv=2)

# Fit the classifier to the training data
tpot_clf.fit(X_train, y_train)

# Score on the test set
print(tpot_clf.score(X_test, y_test))
Пример #6
0
import numpy as np

# Load the data
telescope = pd.read_csv('MAGIC Gamma Telescope Data.csv')

# Shuffle the rows with a random permutation, then renumber the index from 0
telescope_shuffle = telescope.iloc[np.random.permutation(len(telescope))]
tele = telescope_shuffle.reset_index(drop=True)

# Encode the two classes as integers: 'g' -> 0, 'h' -> 1
tele['Class'] = tele['Class'].map({'g': 0, 'h': 1})
tele_class = tele['Class'].values

# Stratified 75/25 split of the row index. The chained assignment binds the
# same pair twice, so validation_indices and testing_indices are the same
# 25% hold-out.
training_indices, validation_indices = training_indices, testing_indices = train_test_split(
    tele.index, stratify=tele_class, train_size=0.75, test_size=0.25)

# Let genetic programming search for a pipeline and its hyperparameters
tpot = TPOTClassifier(generations=5, verbosity=2)
tpot.fit(
    tele.drop('Class', axis=1).loc[training_indices].values,
    tele.loc[training_indices, 'Class'].values)

# Score on the hold-out rows.
# NOTE(review): the returned score is not printed or stored here.
tpot.score(
    tele.drop('Class', axis=1).loc[validation_indices].values,
    tele.loc[validation_indices, 'Class'].values)

# Export the generated pipeline code
tpot.export('pipeline.py')
y_test = Label(y_test_string)


def _report_not_animals(labels):
    """Print the count of labels equal to 1, the total count, and the share that is not 1.

    The earlier inline expression subtracted the total from the count of
    ones, so it always printed a value <= 0; the operands are now in the
    order that yields the non-negative share of labels different from 1.
    NOTE(review): this assumes label 1 means "animal" — confirm against Label().
    """
    n_ones = sum(labels == 1)
    n_total = len(labels)
    print(n_ones)
    print(n_total)
    print('percentage of not animals=', (n_total - n_ones) / n_total)


_report_not_animals(y_test)
"""
-------------- TPOT does is magic-------------------------------------
"""


_report_not_animals(y_test)

from tpot import TPOTClassifier
clf=TPOTClassifier(verbosity=2,n_jobs=-1)
clf.fit(X_train,y_train)

# The stray quote that used to follow print(len(y_test)) made the whole
# script a SyntaxError; the report is now produced by the helper above.
_report_not_animals(y_test)


# Score and confusion matrix on the held-out split.
print('test score=',clf.score(X_test,y_test))
predictions = clf.predict(X_test)
print(confusion_matrix(y_test,predictions))


#digits=load_digits()
#X=digits['data']
Пример #8
0
 def tpot(self):
     """Run a TPOT search on the training data and print its score on the prediction data.

     Reads ``self.train_data`` / ``self.train_label`` for fitting and
     ``self.predi_data`` / ``self.predi_label`` for scoring. Returns None.
     """
     from tpot import TPOTClassifier
     searcher = TPOTClassifier(verbosity=2, population_size=20, generations=5)
     searcher.fit(self.train_data, self.train_label)
     holdout_score = searcher.score(self.predi_data, self.predi_label)
     print(holdout_score)
Пример #9
0
# Target column and feature matrix.
y = df['Result_of_Treatment']
y.head()
X = df.drop('Result_of_Treatment', axis=1)
X.shape, y.shape
# 80/20 train/test split (unseeded, so it varies between runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape
X_test.shape
y_train.shape

# NOTE(review): X_scaled is computed but the models below are trained on the
# unscaled split — confirm whether scaling was meant to be applied.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled.shape
clf = LogisticRegression()
svmclf = SVC(kernel='rbf')
rfclf = RandomForestClassifier()
tpotclf = TPOTClassifier()
model1 = clf.fit(X_train, y_train)
model2 = svmclf.fit(X_train, y_train)
model3 = rfclf.fit(X_train, y_train)
# fit() needs the target as well; the earlier call passed only X_train and
# raised a TypeError before anything below could run.
model_auto_clf = tpotclf.fit(X_train, y_train)
score = cross_val_score(clf, X_train, y_train)
score2 = cross_val_score(svmclf, X_train, y_train)
score3 = cross_val_score(rfclf, X_train, y_train)
# cross_val_score returns one score per fold; report their mean and actually
# apply the format string (it used to be printed verbatim next to the array).
print("score is: %.2f" % score3.mean())

##Tpot classifier
tpotclf = TPOTClassifier(generations=5, cv=5)
model_tpot_clf = tpotclf.fit(X_train, y_train)
score = tpotclf.score(X_test, y_test)
print(score)
tpotclf.export('classifier-pipeline.py')
label_type = parse_dataset.label_type(y, stated_input_column)

# Classification defaults; the "Ordinal" branch switches to regression with r2.
score_type = "accuracy"
tpot = None
if label_type == "Ordinal":
    models = model_info.fast_ordinal_models()
    score_type = "r2"
    tpot = TPOTRegressor(scoring='r2',
                         generations=5,
                         population_size=20,
                         max_eval_time_mins=40,
                         verbosity=2)
else:
    tpot = TPOTClassifier(generations=5,
                          population_size=20,
                          max_eval_time_mins=40,
                          verbosity=2)


@timeout(7200)
def train_tpot_model(x, y, tpot):
    """Fit ``tpot`` on a training split of the first 10000 rows and return it.

    Only ``x[0:10000]`` / ``y[0:10000]`` are considered. They are split
    80/20 and the 80% part is converted to numpy arrays for fitting; the
    20% part is not used here.

    :param x: sliceable feature collection
    :param y: sliceable target collection aligned with ``x``
    :param tpot: estimator exposing ``fit``; it is fitted in place
    :return: the same ``tpot`` object after fitting
    """
    head_x = x[0:10000]
    head_y = y[0:10000]
    parts = train_test_split(head_x, head_y, train_size=0.8, test_size=0.2)
    # train_test_split returns [X_train, X_test, y_train, y_test]
    train_features, train_targets = parts[0], parts[2]
    tpot.fit(np.array(train_features), np.array(train_targets))
    return tpot


error = None
try:
Пример #11
0
     tr_Y = deepcopy(train_Y).astype(np.int32)
 else:
     tr_X = deepcopy(train_X)
     tr_Y = deepcopy(train_Y).astype(np.int32)
 print(tr_X.shape, tr_Y.shape)
 for a in range(int(args.augs)):
     current_aug = train_X * np.random.normal(
         size=train_X.shape, loc=1, scale=0.1)
     tr_X = np.concatenate([tr_X, current_aug]).astype(np.float32)
     tr_Y = np.concatenate([tr_Y, train_Y]).astype(np.int32)
 print(tr_X.shape, tr_Y.shape)
 if "1hot" not in args.dataset:
     tpot = TPOTClassifier(generations=20,
                           population_size=5,
                           verbosity=2,
                           scoring="balanced_accuracy",
                           cv=10,
                           config_dict="TPOT light",
                           random_state=int(args.seed))
     tpot.fit(tr_X, tr_Y)
     tr_Yhat = tpot.fitted_pipeline_.predict(tr_X)
     train_Yhat = tpot.fitted_pipeline_.predict(train_X)
     test_Yhat = tpot.fitted_pipeline_.predict(test_X)
     tpot.export(args.output + ".py")
     joblib.dump(tpot.fitted_pipeline_, args.output + ".joblib")
 else:
     auto = AutoNetImageClassification("medium_cs",
                                       log_level='info',
                                       max_runtime=100,
                                       min_budget=15,
                                       max_budget=45)
Пример #12
0
    print("Data X contains NaN values")
    
# DataFrame.dropna() returns a new frame and leaves `df` untouched, so the
# result must be rebound for the NaN rows to actually be removed.
df = df.dropna()


# In[18]:


n = 50
B = np.array_split(df,n)


# In[ ]:


pipeline_optimizer = TPOTClassifier(generations = 100, warm_start = True, verbosity=2, max_time_mins=60, early_stop = 5)
#f= open("Test_scores.txt","a+")

#Initialization

X = B[0].iloc[:,0:-1]
y = B[0].iloc[:,-1]

start = time.time()
pipeline_optimizer.fit(X, y)


for i in range(1,n):
    X = B[i].iloc[:,0:-1]
    y = B[i].iloc[:,-1]
    accuracy = pipeline_optimizer.score(X, y)
Пример #13
0
    Xtest = scaler.transform(Xtest)

    n_neigh = 27
    print('n adasyn', n_neigh)
    ada = ADASYN(random_state=91,
                 n_neighbors=n_neigh,
                 sampling_strategy=1,
                 n_jobs=6)
    Xtrain, ytrain = ada.fit_resample(Xtrain, ytrain)
    '''Optimización RF'''

    tpot_classifier = TPOTClassifier(
        generations=5,
        population_size=10,
        offspring_size=5,
        verbosity=2,
        early_stop=3,
        config_dict={'sklearn.ensemble.RandomForestClassifier': parameters},
        cv=4,
        scoring='roc_auc',
        n_jobs=12)
    '''Ajuste del modelo'''
    tpot_classifier.fit(Xtrain, ytrain)
    '''Predicción'''
    rf_y_pred = tpot_classifier.predict(Xtest)
    rf_y_prob = [probs[1] for probs in tpot_classifier.predict_proba(Xtest)]

    ypred_df = pd.DataFrame(rf_y_pred, columns=['Label pred'])

    pathy_pred = 'C:/Users/jkgv1/OneDrive/Escritorio/' + 'ypred' + 'fold' + str(
        fold) + '.xlsx'
Пример #14
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import f1_score
import lightgbm as lgb
from tpot import TPOTClassifier
from sklearn import preprocessing



if __name__ == "__main__":
    # Training table; the first CSV column is used as the index.
    train_df = pd.read_csv('./new_train.csv', index_col=0)
    X_train = train_df.drop(columns=['type']).values
    y_train = train_df['type'].values

    # Replace the raw 'type' labels with integer codes.
    le = preprocessing.LabelEncoder()
    le.fit(y_train)
    y_train = le.transform(y_train)

    # Seeded 100x100 TPOT search with 5-fold CV, ranked by macro F1.
    pipeline_optimizer = TPOTClassifier(
        scoring='f1_macro',
        cv=5,
        generations=100,
        population_size=100,
        n_jobs=8,
        verbosity=2,
        random_state=42,
    )
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')
# Features are every column except 'event'; 'event' is the target.
X = df.drop('event', axis=1)
y = df.event

# Encode y as integer category codes.
# NOTE(review): the np.sort(...) line below discards its result; it only has
# a visible effect when run interactively.
np.sort(y.unique())
y = y.astype('category').cat.codes

# %%
# Seeded 75/25 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.75,
                                                    random_state=42)

# %%
# PCA
# Scale data first: the scaler is fitted on the training split only and then
# applied to the test split.
scale = StandardScaler()
X_train_scaled = scale.fit_transform(X_train)
X_test_scaled = scale.transform(X_test)

# Project onto 30 principal components, again fitted on the training split.
pca = PCA(n_components=30)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# %%
# TPOT search on the PCA features; print the test score and export the pipeline.
tpot = TPOTClassifier(verbosity=2, random_state=42)
tpot.fit(X_train_pca, y_train)
print(tpot.score(X_test_pca, y_test))
tpot.export('tpot_project_pipeline.py')
    def prediction(self):
        """Fit three classifiers on the training features and print their accuracies.

        Trains a logistic regression, a bagged decision-tree ensemble and a
        TPOT search on the columns of ``self.train_data`` selected by the
        regex below. For each model it prints the accuracy on the training
        set and the mean cross-validated accuracy. The logistic-regression
        predictions for ``self.test_data`` are also previewed.

        Results recorded by the original author:
            LR model: accuracy 80.92%, cross-validated accuracy 80.59%;
                parameters: penalty='l1', tol=1e-6
            bagging_cart model: accuracy 96.52%, cross-validated accuracy 80.25%;
                parameters: n_estimators=20, max_samples=0.8, max_features=1.0,
                bootstrap=True, bootstrap_features=False, n_jobs=-1
            TPOT model: accuracy 90.24%; best selected parameters (81.48%):
                RandomForestClassifier(input_matrix, bootstrap=True,
                criterion=entropy, max_features=0.8500000000000001,
                min_samples_leaf=1, min_samples_split=16, n_estimators=100)

        :return: None; all results are reported through ``print``.
        """
        print('-' * 30 + '乘 客 幸 存 情 况 预 测' + '-' * 30)
        train_data = self.train_data.filter(
            regex=
            'Survived|Age_scaled|SibSp|Parch|Fare_scaled|Embarked_.*|Sex_.*|Pclass_.*'
        )
        # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
        # same ndarray on both old and new pandas.
        train_data = train_data.values
        test_data = self.test_data.filter(
            regex=
            'Age_scaled|SibSp|Parch|Fare_scaled|Embarked_.*|Sex_.*|Pclass_.*')
        test_data = test_data.values

        # Column 0 is used as the label, the rest as features.
        # NOTE(review): this assumes 'Survived' is the first selected column.
        train_x = train_data[:, 1:]
        train_y = train_data[:, 0]

        # Logistic regression. The solver is named explicitly because the
        # default solver in current scikit-learn rejects the L1 penalty;
        # liblinear was the default when this code was written.
        model_lr = LogisticRegression(penalty='l1', tol=1e-6,
                                      solver='liblinear')
        model_lr.fit(train_x, train_y)
        predictions = model_lr.predict(test_data)
        result = pd.DataFrame({
            # Key spelling fixed (was 'PassenderId') to match the source column.
            'PassengerId':
            self.test_data['PassengerId'].values,
            'Survived':
            predictions.astype(np.int32)
        })
        print(result[:5])
        # Accuracy measured on the training set
        predictions = model_lr.predict(train_x)
        print('lr模型准确率为%.4lf' % accuracy_score(train_y, predictions))
        print('lr模型使用交叉验证的准确率为%.4lf\n\n' %
              np.mean(cross_val_score(model_lr, train_x, train_y, cv=10)))

        # Bagging ensemble built on decision trees
        model_cart = DecisionTreeClassifier()
        bagging_cart = BaggingClassifier(model_cart,
                                         n_estimators=20,
                                         max_samples=0.8,
                                         max_features=1.0,
                                         bootstrap=True,
                                         bootstrap_features=False,
                                         n_jobs=-1)
        bagging_cart.fit(train_x, train_y)
        # Accuracy measured on the training set
        predictions = bagging_cart.predict(train_x)
        print('bagging_cart模型准确率为%.4lf' % accuracy_score(train_y, predictions))
        print('bagging_cart模型使用交叉验证的准确率为%.4lf\n\n' %
              np.mean(cross_val_score(bagging_cart, train_x, train_y, cv=3)))

        # TPOT search
        model_tpot = TPOTClassifier(generations=5,
                                    population_size=20,
                                    verbosity=2)
        model_tpot.fit(train_x, train_y)
        # Accuracy measured on the training set
        predictions = model_tpot.predict(train_x)
        print('TPOT模型准确率为%.4lf' % accuracy_score(train_y, predictions))
        # cross_val_score refits the estimator it is given for each of the 3 folds.
        print('TPOT模型使用交叉验证的准确率为%.4lf\n\n' %
              np.mean(cross_val_score(model_tpot, train_x, train_y, cv=3)))
Пример #17
0
def genetic_algorithm(X_train, X_test, y_train, y_test):
    """Run a TPOT search on the training split and print its score on the test split.

    :param X_train: training features passed to ``fit``
    :param X_test: test features passed to ``score``
    :param y_train: training targets passed to ``fit``
    :param y_test: test targets passed to ``score``
    :return: None; the score is printed
    """
    from tpot import TPOTClassifier

    searcher = TPOTClassifier(verbosity=2, population_size=20, generations=100)
    searcher.fit(X_train, y_train)
    test_score = searcher.score(X_test, y_test)
    print(test_score)
Пример #18
0
from tpot import TPOTClassifier
from tools import prepare_dataset

# prepare_dataset() returns the targets first, then the features.
y, x = prepare_dataset()

# First 614 rows are used for training; targets are flattened to 1-D.
x_train = x[:614]
y_train = y[:614].reshape(-1, )

# Everything after row 614 is held out for validation.
x_valid = x[614:]
y_valid = y[614:].reshape(-1, )

pipeline_optimizer = TPOTClassifier(generations=50,
                                    population_size=20,
                                    cv=5,
                                    random_state=42,
                                    verbosity=2)

pipeline_optimizer.fit(x_train, y_train)
# Score on the data the search was fitted on (kept for continuity).
print(pipeline_optimizer.score(x_train, y_train))
# The validation split was built but never evaluated; report it too so there
# is a held-out score alongside the training score. Skipped when the dataset
# has no rows beyond the training slice.
if len(x_valid):
    print(pipeline_optimizer.score(x_valid, y_valid))

pipeline_optimizer.export('tpot_exported_pipeline.py')
Пример #19
0
    """

    # TAKE SAMPLE DATA AND CAPTURE SIZE ------------------------------------------
    x_train, x_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        train_size=0.00025)

    # capture the number of rows and features in the train dataset to be added in our results
    xt_nrows = int(x_train.shape[0])
    xt_numb_feats = int(x_train.shape[1])

    # BUILD AND RUN THE TPOT -------------------------------------------------------
    my_tpot = TPOTClassifier(generations=25,
                             population_size=50,
                             n_jobs=2,
                             verbosity=2,
                             scoring=score_type,
                             cv=5,
                             random_state=1776,
                             warm_start=True)

    my_tpot.fit(x_train, y_train)

    # DETERMINE BEST CV SCORE PIPELINE ----------------------------------------------
    best_pipes = my_tpot.pareto_front_fitted_pipelines_
    len_best_pipes = len(best_pipes)
    best_pipe_key = list(
        best_pipes.keys())[(len_best_pipes -
                            1)]  # key is entire pipeline as string

    best_cv = abs(my_tpot.evaluated_individuals_[best_pipe_key][1])
Пример #20
0
    df = pd.read_csv('data/datalab_persona_run1_with_scale_cat.csv')

    target = df['FKSmoker'].values

    df.drop(['FKSmoker'], inplace=True, axis=1)

    cols = [
        x for x in df.columns.values if x not in
        ['Age Next at DOC', 'Height', 'Weight', 'Annual Salary', 'Travel %']
    ]

    df = pd.get_dummies(df, columns=cols)

    data = df.values

    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        train_size=0.75,
                                                        test_size=0.25)

    tpot = TPOTClassifier(generations=100,
                          population_size=100,
                          verbosity=2,
                          n_jobs=2,
                          config_dict='TPOT sparse',
                          scoring='balanced_accuracy')
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_fmi_pipeline_sparse_100_100_cat.py')
Пример #21
0
    def train(self):
        """Fit a TPOTClassifier on the stored arrays and record the outcome on this object.

        Loads the arrays named by ``self.training_data_filename`` and
        ``self.training_labels_filename`` from ``AUTO_ML_DATA_PATH``, and runs
        ``reformat_data`` on the features when the preprocessing object
        reports 'png' input. It then fits the classifier with the settings
        held on ``self`` and pickles ``fitted_pipeline_`` to a timestamped
        ``.dump`` file under ``AUTO_ML_MODELS_PATH``. Timing, path, status
        and remarks are stored through ``self.save()``.

        Any exception is caught: ``status`` becomes 'fail', the exception is
        stored in ``additional_remarks`` and, if fitting had already started,
        the elapsed time is recorded.
        """
        print('in tpot training')
        start = None  # stays None until the fit timer has been started
        try:
            # Timestamped save location for the pickled pipeline
            stamp = datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')
            dump_file = os.path.join(AUTO_ML_MODELS_PATH,
                                     'tpot_' + stamp + '.dump')

            data_path = os.path.join(AUTO_ML_DATA_PATH,
                                     self.training_data_filename)
            x = numpy.load(data_path)
            labels_path = os.path.join(AUTO_ML_DATA_PATH,
                                       self.training_labels_filename)
            y = numpy.load(labels_path)

            if self.preprocessing_object.input_data_type == 'png':
                x = reformat_data(x)

            # Collect the search settings, then build the model
            print('about to train')
            settings = {
                'generations': self.generations,
                'population_size': self.population_size,
                'offspring_size': self.offspring_size,
                'mutation_rate': self.mutation_rate,
                'crossover_rate': self.crossover_rate,
                'scoring': self.scoring,
                'cv': self.cv,
                'subsample': self.subsample,
                'n_jobs': self.n_jobs,
                # TPOT takes its time limits in minutes, while most other
                # frameworks take seconds.
                'max_time_mins': self.max_time_mins,
                'max_eval_time_mins': self.max_eval_time_mins,
                'random_state': self.random_state,
                'config_dict': self.config_dict,
                'warm_start': self.warm_start,
                'memory': self.memory,
                'use_dask': self.use_dask,
                'early_stop': self.early_stop,
                'verbosity': self.verbosity,
                'disable_update_check': self.disable_update_check,
            }
            model = TPOTClassifier(**settings)
            print('before training start')
            start = time.time()
            model.fit(x, y)
            end = time.time()
            print('training finnished')

            # Persist only the fitted scikit-learn pipeline
            with open(dump_file, 'wb') as f:
                print('about to save!')
                pickle.dump(model.fitted_pipeline_, f)
                print('model saved')

            self.training_time = round(end - start, 2)
            self.model_path = dump_file
            self.status = 'success'
            self.save()
            # Remarks are stored in a second save, after the success record
            self.additional_remarks = str(model.fitted_pipeline_)
            self.save()

        except Exception as e:
            end = time.time()
            if start is not None:
                self.training_time = round(end - start, 2)

            self.status = 'fail'
            self.additional_remarks = e
            self.save()
Пример #22
0
                    features[key] = value

            # append dictionaries of each line to a meta list to be transormed
            # to a DataFrame
            meta_list.append(features)

    return pd.DataFrame(meta_list)


# Build the training and test frames with dense2sparse (is_test selects the
# test variant).
sparse_train_df = dense2sparse(train, test)
sparse_test_df = dense2sparse(train, test, is_test=True)

######################################################################
# feature engineering

#######################################################################
# TPOT: fit a 10-generation search and predict on the test frame.
# NOTE(review): tpot_output is not written anywhere in the visible code.
my_tpot = TPOTClassifier(generations=10)
my_tpot = my_tpot.fit(sparse_train_df, label)
tpot_output = my_tpot.predict(sparse_test_df)

# RandomForestClassifier with 100 trees

rf = RandomForestClassifier(n_estimators=100)

forest = rf.fit(sparse_train_df, label)

output = forest.predict(sparse_test_df)

# Only the random-forest predictions are saved, one value per line.
np.savetxt(r'submissions/submission1.txt', output, fmt='%s')
Пример #23
0
]],
                 dtype=int)

# Load the pickled list of datasets.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# it on files from a trusted source.
with open("./classification_tables.pkl", 'rb') as fp:
    datasets = pickle.load(fp)

# Only the first dataset is used below.
data = datasets[0]['data']

# Every column but the last is a feature; the last column is the target.
# Both are cast to int.
X = data.iloc[:, :-1].to_numpy(dtype=int)
y = data.iloc[:, -1].to_numpy(dtype=int)

# 70/30 split (unseeded).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

tpot = TPOTClassifier(generations=20,
                      population_size=20,
                      n_jobs=8,
                      verbosity=2,
                      scoring='balanced_accuracy')
tpot.fit(X_train, y_train)

print(tpot.score(X_test, y_test))

# Export the best pipeline to a file named after the dataset's 'assay' entry.
output_fname = f"pipeline_{datasets[0]['assay']}.py"
tpot.export(output_fname)

# Echo the exported script between two marker lines.
print("## EXPORTED FILE: ##")
with open(output_fname, 'r') as fp:
    print(fp.read())
print("## END EXPORTED FILE ##")

print("## PFHXS PREDICTED PROBABILITY:")
Пример #24
0
                                                    train_size=0.75,
                                                    test_size=0.25,
                                                    random_state=seed)

# Restrict the TPOT search to XGBClassifier with a small hyperparameter grid.
tpot_config = {
    'xgboost.sklearn.XGBClassifier': {
        'max_depth': [2, 3, 4],
        'learning_rate': [1.0],
        'silent': [1.0],
        'n_estimators': [5, 10, 15]
    }
}

# Seeded search with 10-fold CV and checkpoints written to 'checkpoints'.
pipeline_optimizer = TPOTClassifier(generations=5,
                                    population_size=20,
                                    cv=10,
                                    random_state=seed,
                                    verbosity=3,
                                    periodic_checkpoint_folder='checkpoints',
                                    config_dict=tpot_config)

pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))
pipeline_optimizer.export('tpot_exported_pipeline.py')

# Record of every pipeline evaluated during the search.
ei = pipeline_optimizer.evaluated_individuals_

# Round-trip the record through joblib to check that it can be read back.
joblib.dump(ei, 'evaluated_individuals.pkl')
tmp = joblib.load('evaluated_individuals.pkl')
# This was a Python 2 print statement, a SyntaxError on Python 3. The
# single formatted argument prints the same text under either version.
print("read back from joblib: %s" % (tmp,))
Пример #25
0
alg = dict_to_obj(algorithm)
if not hasattr(alg, 'automl'):
    alg.automl = True
if not hasattr(alg, 'sampling'):
    alg.sampling = False

model = None
if alg.is_supervised:
    # -------------------------------------------------------------
    # Classification algorithms
    #
    if alg.name == 'TPOT_Classifier':
        from tpot import TPOTClassifier
        model = TPOTClassifier(
            generations=alg.generations,
            cv=alg.cv,
            scoring=alg.scoring,
            verbosity=alg.verbosity
        )
    elif alg.name == 'AutoSklearn_Classifier':
        from autosklearn import classification
        if alg.sampling:
            model = classification.AutoSklearnClassifier(
                time_left_for_this_task=alg.task_time,
                per_run_time_limit=alg.run_time,
                resampling_strategy=alg.sampling_strategy,
                resampling_strategy_arguments={'folds': alg.folds}
            )
        else:
            model = classification.AutoSklearnClassifier(
                time_left_for_this_task=alg.task_time,
                per_run_time_limit=alg.run_time
Пример #26
0
"""

# dataset preparation
from tpot import TPOTClassifier
import pandas as pd
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn import decomposition, ensemble
from sklearn.naive_bayes import MultinomialNB
from sklearn import pipeline

# Load the articles table; the 'text' and 'category' columns are used below.
data = pd.read_csv('../dataset/bbc_articles_labels_all.csv')

# Split the dataset into training and validation sets (default proportions,
# unseeded)
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    data['text'], data['category'])

# Label-encode the target: the encoder is fitted on the training labels and
# then applied to the validation labels
from sklearn.preprocessing import LabelEncoder
categoryLableEncoder = LabelEncoder()
train_y = categoryLableEncoder.fit_transform(train_y)
valid_y = categoryLableEncoder.transform(valid_y)

# TF-IDF features fitted on the training text.
# NOTE(review): valid_x is not transformed anywhere in the visible code.
tfidf_transformer = TfidfVectorizer()
X_train_tfidf = tfidf_transformer.fit_transform(train_x)

# Convert the TF-IDF matrix to a dense array wrapped in a DataFrame.
X_train_df = pd.DataFrame(X_train_tfidf.toarray())

# 10-generation TPOT search on the dense TF-IDF features.
tpot_clf = TPOTClassifier(generations=10)
tpot_clf.fit(X_train_df, train_y)
Пример #27
0
from tpot import TPOTClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np

# Iris data with both features and targets cast to float64, split 75/25.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data.astype(np.float64),
    iris.target.astype(np.float64),
    test_size=0.25,
    train_size=0.75,
)

# Show the container type of each of the four splits, one per line.
print(*(type(part) for part in (X_train, X_test, y_train, y_test)), sep='\n')

# Search, then print predictions, class probabilities and the test score.
tpot = TPOTClassifier(verbosity=2, population_size=50, generations=5)
tpot.fit(X_train, y_train)
print(tpot.predict(X_test))
print(tpot.predict_proba(X_test))
print(tpot.score(X_test, y_test))
Пример #28
0
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import argparse

# Command line: a required integer -njobs, forwarded to TPOT's n_jobs below.
parser = argparse.ArgumentParser()
parser.add_argument('-njobs', dest='njobs', type=int, required=True)
args = parser.parse_args()

digits = load_digits()

# One seed shared by the data split and the TPOT search.
random_seed = 0

X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target,
                                                    train_size=0.75,
                                                    test_size=0.25,
                                                    random_state=random_seed)

# 5 generations of 40 pipelines with 5-fold CV; dask is explicitly disabled.
tpot = TPOTClassifier(generations=5,
                      population_size=40,
                      cv=5,
                      n_jobs=args.njobs,
                      random_state=random_seed,
                      verbosity=2,
                      use_dask=False)

# Fit the search and print the score on the held-out split.
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
Пример #29
0
# Build either a classifier or a regressor depending on the target, then fit
# and export it once for both cases.
if target_label != "linear_label":  # label or granular label
    # The crisis-focused metric wins; otherwise the metric follows the label kind.
    if tryCrisisFocus:
        score_function = "macroF1MinusCrisis"
    elif target_label == "label":
        score_function = "macroF1MinusGreen"
    else:
        score_function = "macroF1FromGranular"
    tpot = TPOTClassifier(population_size=population_size,
                          verbosity=2,
                          scoring=score_function,
                          random_state=random_state,
                          cv=kf,
                          n_jobs=n_jobs,
                          max_time_mins=max_time_mins,
                          max_eval_time_mins=max_eval_time_mins,
                          config_dict=config_dict,
                          memory='auto',
                          periodic_checkpoint_folder=checkpoint_folder)
else:
    print("Running regression")
    tpot = TPOTRegressor(population_size=population_size,
                         verbosity=2,
                         random_state=random_state,
                         cv=kf,
                         n_jobs=n_jobs,
                         max_time_mins=max_time_mins,
                         max_eval_time_mins=max_eval_time_mins,
                         config_dict=config_dict)

tpot.fit(X_train, y_train)
tpot.export(full_tpot_out_filename)
Пример #30
0
def test_get_by_name():
    """Check that looking up "SelectKBest" among a TPOTClassifier's operators matches TPOTSelectKBest by ``__class__``."""
    operators = TPOTClassifier().operators
    found = get_by_name("SelectKBest", operators)
    # NOTE(review): both sides are compared through ``__class__`` rather than
    # directly — confirm this is as strict as the test is meant to be.
    assert found.__class__ == TPOTSelectKBest.__class__