Example #1
0
File: tests.py  Project: rhiever/tpot
def test_score():
    """Assert that the TPOT score function raises a ValueError when no optimized pipeline exists"""

    tpot_obj = TPOTClassifier()

    try:
        tpot_obj.score(testing_features, testing_classes)
        assert False  # Should be unreachable
    except ValueError:
        pass
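As an aside: with pytest, the try/except-plus-assert pattern above is usually written with the raises context manager; a minimal equivalent sketch, assuming pytest is available:

import pytest

def test_score():
    """Assert that the TPOT score function raises a ValueError when no optimized pipeline exists"""
    tpot_obj = TPOTClassifier()
    with pytest.raises(ValueError):
        tpot_obj.score(testing_features, testing_classes)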
Example #2
0
def test_score_2():
    """Assert that the TPOTClassifier score function outputs a known score for a fix pipeline"""

    tpot_obj = TPOTClassifier()
    known_score = 0.977777777778  # Assumes use of the TPOT balanced_accuracy function

    # Reify pipeline with known score
    pipeline_string = ('KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
                       'KNeighborsClassifier__p=1, KNeighborsClassifier__weights=uniform)')
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)
    # Get score from TPOT
    score = tpot_obj.score(testing_features, testing_classes)

    # http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

    assert isclose(known_score, score)
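Note that on Python 3.5 and later, the standard library's math.isclose implements exactly the rel_tol/abs_tol check defined above, so the Stack Overflow helper can be dropped:

import math

assert math.isclose(known_score, score, rel_tol=1e-09, abs_tol=0.0)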
Example #3
0
File: tests.py  Project: rhiever/tpot
def test_score_2():
    """Assert that the TPOTClassifier score function outputs a known score for a fixed pipeline"""

    tpot_obj = TPOTClassifier()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    known_score = 0.986318199045  # Assumes use of the TPOT balanced_accuracy function

    # Reify pipeline with known score
    tpot_obj._optimized_pipeline = creator.Individual.\
        from_string('RandomForestClassifier(input_matrix)', tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    # Get score from TPOT
    score = tpot_obj.score(testing_features, testing_classes)

    # http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

    assert isclose(known_score, score)
Example #4
0
    def tpot(self):
        from tpot import TPOTClassifier
        tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
        tpot.fit(self.train_data, self.train_label)
        print(tpot.score(self.predi_data, self.predi_label))
Example #5
0
train.Cabin = train.Cabin[train.Cabin!='T']



# Dropping unused columns
train = train.drop(['PassengerId','Name','Ticket','Cabin','Embarked'],axis='columns')
test = test.drop(['PassengerId','Name','Ticket','Cabin','Embarked'],axis='columns')


print('train_shape={},test_shape={}'.format(train.shape,test.shape))



# Doing a train/validation split
y = train.pop('Survived')
X = train

train_X,validation_X,train_y,validation_y = train_test_split(X,y,test_size=0.3,random_state=42)


# Fitting a TPOT classification model
# Change max_time_mins to set how long you want to train
tpot = TPOTClassifier(verbosity = 2,max_time_mins=1)
tpot.fit(train_X,train_y)
print(tpot.score(validation_X,validation_y))


print(tpot.fitted_pipeline_)

tpot.export('tpot_titanic.py')
Example #6
0
import numpy as np 
import pandas as pd 
from sklearn import preprocessing 
from sklearn.metrics import mean_squared_error 
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import xgboost as xgb 

dfee0 = pd.read_csv("dfee0new.csv")
df = dfee0
dfee0['status'] = dfee0['status'].astype(int)

target = 'status'
IDcol = 'uid'
predictors = [x for x in df.columns if x not in [target, IDcol]]
X_train, X_test, y_train, y_test = train_test_split(dfee0[predictors], dfee0[target], train_size = 0.75, test_size = 0.25)

tpot = TPOTClassifier(generations = 100, population_size = 100, verbosity = 2, n_jobs = 8)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_report_pipeline.py')
Example #7
0
    # get train and test data
    X_train, X_test, y_train, y_test = train_test_split(alldata,
                                                        labels,
                                                        train_size=0.750,
                                                        test_size=0.250)
    if mtype in ['classification', 'c']:
        tpot = TPOTClassifier(generations=5,
                              population_size=50,
                              verbosity=2,
                              n_jobs=-1)
        tpotname = '%s_tpotclassifier.py' % (jsonfile[0:-5])
    elif mtype in ['regression', 'r']:
        tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2)
        tpotname = '%s_tpotregression.py' % (jsonfile[0:-5])
    tpot.fit(X_train, y_train)
    accuracy = tpot.score(X_test, y_test)
    tpot.export(tpotname)

    # export data to .json format
    data = {
        'data': alldata.tolist(),
        'labels': labels.tolist(),
    }

    jsonfilename = '%s_.json' % (tpotname[0:-3])
    jsonfile = open(jsonfilename, 'w')
    json.dump(data, jsonfile)
    jsonfile.close()

    # now edit the file and run it
    g = open(tpotname).read()
Example #8
0
y_train = train['response']
y_test = test['response']

X_train = train.drop('response', axis=1).copy()
X_test = test.drop('response', axis=1).copy()

tpot = TPOTClassifier(verbosity=3,
                      scoring="roc_auc",
                      random_state=23,
                      n_jobs=-1,
                      generations=5,
                      population_size=10)

times = []
scores = []
winning_pipes = []

# run three iterations and time them
for x in range(3):
    start_time = timeit.default_timer()
    tpot.fit(X_train, y_train)
    elapsed = timeit.default_timer() - start_time
    times.append(elapsed)
    winning_pipes.append(tpot.fitted_pipeline_)
    scores.append(tpot.score(X_test, y_test))
    tpot.export('tpot_h2odata_pipeline.py')
times = [time / 60 for time in times]
print('Times:', times)
print('Scores:', scores)
print('Winning pipelines:', winning_pipes)
Example #9
0
def cyc():
    n = 5
    s = 0
    cl = "class"
    ans = ""
    df = pd.DataFrame(pd.read_csv('colData.csv'))
    varArr = [
        "protons", "nuetrons", "electrons", "ProtElectConfigNumb",
        "NueConfigNumb", "periodic x", "periodic y"
    ]
    for i in range(s, 7):
        ans += '[' + str(i) + '],'

        print('col' + str(i) + '.csv')
        print("data framed")

        if i != 7:
            df.drop([varArr[i]], 1, inplace=True)

        df.drop(['element'], 1, inplace=True)
        df.drop(['ProtElectConfig'], 1, inplace=True)
        df.drop(['NueConfig'], 1, inplace=True)

        if i == 7:
            df.drop(['ProtElectConfigNumb'], 1, inplace=True)
            df.drop(['NueConfigNumb'], 1, inplace=True)

        if i == 8:
            df.drop(['magicNue'], 1, inplace=True)
            df.drop(['magicPro'], 1, inplace=True)

#itertools.combinations(iterable, r)?

#df.drop(['half'],1, inplace=True)
#df.drop(['magicNue'],1, inplace=True)
#df.drop(['magicPro'],1, inplace=True)

#df.replace(NaN)
#print(df)

        X = np.array(df.drop([cl], 1))
        y = np.array(df[cl])
        print("here")

        #print(csvInate)

        #X =[[1,0,1,1,1,1,1,1],[2,2,2,2,2,2,18,1],[3,4,3,11,12,11,1,2],[4,5,4,12,121,12,2,2],[5,6,5,121,122,121,13,2],[6,6,6,222,222,222,14,2],[7,7,7,223,223,223,15,2],[8,8,8,224,224,224,16,2],[9,10,9,225,226,225,17,2],[37,48,37,41,4210,41,1,5],[115,174,115,6214103,6214103218141060,6214103,15,7],[99,153,99,6211,621410321811,6211,14,7],[73,108,73,62143,62146,62143,5,6],[25,30,25,325,3210,325,7,4],[117,175,117,6214105,6214103218141060,6214105,17,7],[86,136,86,6214106,6214103212,6214106,18,6],[36,48,36,42106,4210,42106,18,3],[30,35,30,4210,42105,4210,12,4],[112,173,112,621410,6214103218141060,621410,12,7],[110,171,110,62148,62141031814105,62148,10,7],[6,6,6,122,122,122,14,2],[14,14,14,222,222,222,14,3],[83,126,83,214103,52146,214103,15,6],[75,111,75,52145,62149,52145,7,6],[11,22,11,21,322,21,1,3],[118,176,118,6214106,6214103218141060,6214106,18,7],[24,28,24,324,328,324,6,4],[48,66,48,5210,5210,5210,12,5],[1,0,1,1,1,1,1,1],[2,2,2,2,2,2,18,1],[3,4,3,11,12,11,1,2],[4,5,4,12,121,12,2,2],[5,6,5,121,122,121,13,2],[6,6,6,222,222,222,14,2],[7,7,7,223,223,223,15,2],[8,8,8,224,224,224,16,2],[9,10,9,225,226,225,17,2],[37,48,37,41,4210,41,1,5],[115,174,115,6214103,6214103218141060,6214103,15,7],[99,153,99,6211,621410321811,6211,14,7],[73,108,73,62143,62146,62143,5,6],[25,30,25,325,3210,325,7,4],[117,175,117,6214105,6214103218141060,6214105,17,7],[86,136,86,6214106,6214103212,6214106,18,6],[36,48,36,42106,4210,42106,18,3],[30,35,30,4210,42105,4210,12,4],[112,173,112,621410,6214103218141060,621410,12,7],[110,171,110,62148,62141031814105,62148,10,7],[6,6,6,122,122,122,14,2],[14,14,14,222,222,222,14,3],[83,126,83,214103,52146,214103,15,6],[75,111,75,52145,62149,52145,7,6],[11,22,11,21,322,21,1,3],[118,176,118,6214106,6214103218141060,6214106,18,7],[24,28,24,324,328,324,6,4],[48,66,48,5210,5210,5210,12,5]]

        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            X, y, test_size=0.3)
        #clf = neighbors.KNeighborsClassifier()
        #clf=ensemble.RandomForestClassifier()
        clf = TPOTClassifier(generations=2,
                             population_size=100,
                             verbosity=2,
                             scoring="accuracy")
        #clf = ElasticNet()
        #clf = MLPClassifier()
        clf.fit(X_train, y_train)

        accuracy = clf.score(X_test, y_test)
        print("acc"+i\
              )
        print(accuracy)
        #clf.export('col2'+str(i)+'.py')
        print("EXPORTED")
        ans += str(accuracy) + ','
    print(ans)
Example #10
0
# Date  : 2020-03-01

# Use the TPOT automated machine learning tool to classify MNIST
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from action import *
import numpy as np

# Load the data
train_data, test_data = load_data()
train_data, test_data = data_fillna(train_data, test_data)
train_data, test_data = data_process(train_data, test_data)

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'My']
train_data['My'] = train_data['Age'] + train_data['Sex']
test_data['My'] = test_data['Age'] + test_data['Sex']

train_labels = train_data['Survived']
train_features = train_data[features]
train_x, val_x, train_y, val_y = train_test_split(train_features, train_labels, test_size=0.3, random_state=1)
test_features = test_data[features]

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(train_x, train_y)
print(tpot.score(val_x, val_y))
tpot.export('tpot_mnist_pipeline.py')

# output
# Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.7000000000000001, min_samples_leaf=6, min_samples_split=9, n_estimators=100)
# 0.7761194029850746
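For reference, the exported tpot_mnist_pipeline.py is a standalone scikit-learn script. Based on TPOT's export template, the file for the best pipeline above would look roughly like the sketch below (the data path, separator, and 'target' column are placeholder values that TPOT emits, not values from this run):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

exported_pipeline = RandomForestClassifier(bootstrap=True, criterion="entropy",
                                           max_features=0.7000000000000001,
                                           min_samples_leaf=6, min_samples_split=9,
                                           n_estimators=100)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)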
Example #11
0
qb = df[(df.Position == 'QB') & (df.AvgPts >= 12)]
rb = df[(df.Position == 'RB') & (df.AvgPts >= 8)]
wr = df[(df.Position == 'WR') & (df.AvgPts >= 8)]
te = df[(df.Position == 'TE') & (df.AvgPts >= 6)]

# need to remove nulls; x3 itself is never null, so keep it while dropping
# rows with nulls, then split it off as the target
X = qb[qb_features + ['x3']]
X = X[~X.isnull().any(axis=1)]
y = X['x3']
X = X.drop('x3', axis=1)

print('start qb model\n\n')
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print('qb model: {}'.format(tpot.score(X_test, y_test)))
print('\n\n')

X = rb[rb_features]
y = rb['x3']

print('start rb model\n\n')
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print('rb model: {}'.format(tpot.score(X_test, y_test)))

X = wr[wr_features]
y = wr['x3']

print('start wr model\n\n')
Example #12
0
# Note: After instantiating the Client you can open
# http://localhost:8787/status to see the dashboard of workers.
# To see the dashboard, bokeh needs to be installed in your environment.
from sklearn.externals import joblib
import distributed.joblib
from dask.distributed import Client
client = Client(diagnostics_port=8788, processes=False)
client

# Create Data
digits = load_digits()

# To ensure the example runs quickly, we'll make the training dataset relatively
# small.
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target,
                                                    train_size=0.75,
                                                    test_size=0.25)
# Using Dask
# Scale up: increase TPOT parameters like population_size and generations.
# Note: when use_dask=True, TPOT will use as many cores as are available on the cluster, regardless of the n_jobs specified
tp = TPOTClassifier(generations=5,
                    population_size=40,
                    cv=5,
                    random_state=0,
                    verbosity=2,
                    use_dask=True)
with joblib.parallel_backend('dask'):
    tp.fit(X_train, y_train)
print(tp.score(X_test, y_test))
Example #13
0
pipeline_optimizer = TPOTClassifier(generations = 100, warm_start = True, verbosity=2, max_time_mins=60, early_stop = 5)
#f= open("Test_scores.txt","a+")

#Initialization

X = B[0].iloc[:,0:-1]
y = B[0].iloc[:,-1]

start = time.time()
pipeline_optimizer.fit(X, y)


for i in range(1,n):
    X = B[i].iloc[:,0:-1]
    y = B[i].iloc[:,-1]
    accuracy = pipeline_optimizer.score(X, y)
    end = time.time()
#    f.write("Test batch %d - Test score %f - Duration %f\n" % (i, accuracy, end-start))
    print("Test batch %d - Test score %f - Duration %f\n" % (i, accuracy, end-start))
    #file  = 'tpot_exported_pipelinefor' + str(i) + '.py'
    #pipeline_optimizer.export(file)  
    start = time.time()
    pipeline_optimizer.fit(X, y)
    

# f.close()  # f is never opened above (the open() call is commented out)
    


Example #14
0
# In[ ]:
import numpy as np
import pandas as pd

from tpot import TPOTClassifier
from sklearn import preprocessing, model_selection, svm, neighbors, linear_model, discriminant_analysis, naive_bayes, \
    tree

df = pd.read_csv('forestfires.csv')
df = df.replace({'month': {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9,
                           'oct': 10, 'nov': 11, 'dec': 12},
                 'day': {'sun': 1, 'mon': 2, 'tue': 3, 'wed': 4, 'thu': 5, 'fri': 6, 'sat': 7}})

X = np.array(df.drop(['area', 'month', 'day', 'X', 'Y'], 1))
X = preprocessing.scale(X)
y = np.array(df['area'])
y = np.heaviside(y, 0)

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)


pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2)


pipeline_optimizer.fit(X_train, y_train)

accuracy = pipeline_optimizer.score(X_test, y_test)

print('tpot:', accuracy)
Example #15
0
    y_train, uniques = pd.factorize(y_train, sort=False)

    starttime = datetime.now()

    # Configure
    tpotConf = f"# TPOTClassifier(verbosity=2,generations={generations},config_dict={config_dict},max_time_mins={max_time_mins},random_state={random_state},early_stop={early_stop})"
    print(tpotConf)
    Tpot = TPOTClassifier(verbosity=2,
                          generations=generations,
                          config_dict=config_dict,
                          max_time_mins=max_time_mins,
                          random_state=random_state,
                          early_stop=early_stop)
    Tpot_file = ifile.replace(
        ".csv", "_" + str(generations) + "g_" + model + "_Tpot.py")
    print(f"Fitting to file {Tpot_file}")

    # Start
    Tpot.fit(X_train, y_train)
    Tpot_score = Tpot.score(X_test, y_test)
    Tpot.export(Tpot_file)
    print(f"Score see {Tpot_file}"
          )  # The correct score is in the tpot export file
    endtime = datetime.now()
    duration = mytreat.dur(starttime, endtime)

    mycomments = tpotConf
    mycomments = mycomments + f'# ifile {ifile}  model={argv.model} Tpot_file={Tpot_file}, Tpot_score={Tpot_score}, starttime{starttime}, endtime={endtime} duration={duration}\n'
    myutils = mmutils.mmutils()
    myutils.prependComments(Tpot_file, mycomments)
Example #16
0
mlb = MultiLabelBinarizer()

job_Trans = mlb.fit_transform([{str(val)} for val in data['job'].values])
education_Trans = mlb.fit_transform([{str(val)}
                                     for val in data['education'].values])
month_Trans = mlb.fit_transform([{str(val)} for val in data['month'].values])

data_new = data.drop([
    'marital', 'default', 'housing', 'loan', 'contact', 'poutcome', 'class',
    'job', 'education', 'month'
],
                     axis=1)
data_new = np.hstack(
    (data_new.values, job_Trans, education_Trans, month_Trans))

data_class = data['class'].values

training_indices, validation_indices = training_indices, testing_indices = train_test_split(
    data.index, stratify=data_class, train_size=0.75, test_size=0.25)

tpot = TPOTClassifier(population_size=15,
                      max_eval_time_mins=0.04,
                      max_time_mins=2,
                      verbosity=3,
                      n_jobs=-1)
tpot.fit(data_new[training_indices], data_class[training_indices])

score = tpot.score(data_new[validation_indices], data.loc[validation_indices,
                                                          'class'].values)
print(score)
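The three MultiLabelBinarizer calls above each one-hot encode a single-label column. For plain single-label columns, pandas' get_dummies is a more direct equivalent; a sketch under that assumption, using the column names from this snippet:

import pandas as pd

# One-hot encode the same categorical columns in one call
encoded = pd.get_dummies(data, columns=['job', 'education', 'month'])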
Example #17
0
merged = merged.iloc[:, 2:]

yy = np.array(ydata)
datac = np.mat(merged)
#datac = np.concatenate((merged, yy),axis =1)
random.shuffle(datac)

xc_t = datac[:, :-1][:1200, ]
xc_v = datac[:, :-1][1201:, ]
yt = [x[0] for x in datac[:, -1][:1200, ].astype(np.int32).tolist()]
yv = [x[0] for x in datac[:, -1][1201:, ].astype(np.int32).tolist()]

pipeline_optimizer.fit(xc_t, yt)

print(pipeline_optimizer.score(xc_v, yv))

pipeline_optimizer.export('./tpot_exported_pipeline.py')

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    GradientBoostingClassifier(learning_rate=0.5,
                               max_depth=6,
                               max_features=0.9000000000000001,
Example #18
0
#data,label = data[idx_row,:],label[idx_row]
#features = data
#tpot_data=pd.DataFrame({'class':label},columns=['class'])
#training_features, testing_features, training_classes, testing_classes = \
#    train_test_split(features, tpot_data['class'], random_state=42)
data,label,idx_row = np.concatenate(samples),np.concatenate(label),np.arange(0,len(label),1)
print('shuffle')
for ii in range(100):
    shuffle(idx_row)
data,label = data[idx_row,:],label[idx_row]
X_train, X_test, y_train, y_test = train_test_split(data,label,train_size=0.80)
print('model selection')
tpot = TPOTClassifier(generations=10, population_size=25,
                      verbosity=2,random_state=373849,num_cv_folds=5,scoring='roc_auc' )
tpot.fit(X_train,y_train)
tpot.score(X_test,y_test)
tpot.export('%s%s_tpot_exported_pipeline.py'%(folder,type_) )  
print('finished model selection')
"""
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import KFold
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(lambda X: X),
        FunctionTransformer(lambda X: X)
    ),
Example #19
0
#    if y_test_string[i]=='train':
#        y_test[i]=5
#    if y_test_string[i]=='zebra':
#        y_test[i]=6
#
#
#
"""
-------------- TPOT does its magic -------------------------------------
"""

from tpot import TPOTClassifier
clf = TPOTClassifier(verbosity=2, n_jobs=1)
clf.fit(X_train, y_train)

print('test score=', clf.score(X_test, y_test))
predictions = clf.predict(X_test)
print(confusion_matrix(y_test, predictions))

os.chdir(
    'C:/Users/Bruger/Documents/Uni/Advanche machine learning/Projekt/Code/thor_final_scripts_for_report'
)
clf.export('TPOT_EXPORT_autoML_newData_CHANNELSREMOVED_95PCA_Binary.py')

error_rate = clf.score(X_test, y_test)
number_observations = len(X_test)
# binomial standard error of the accuracy estimate: sqrt(p * (1 - p) / n)
print('uncertainty=',
      np.sqrt((error_rate * (1 - error_rate)) / number_observations))

#def f(error_rate,number_observations):
#    return np.sqrt((error_rate*(1-error_rate))/(number_observations))
Example #20
0
def main():

  # set up the path to the data sets and the data we are going to experiment
  # with
  base_path = '/scratch/ditzler/Git/ClassificationDatasets/csv/'
  data_setz = [#'bank',
    'blood',
    'breast-cancer-wisc-diag',
    'breast-cancer-wisc-prog',
    'breast-cancer-wisc',
    'breast-cancer',
    'congressional-voting',
    'conn-bench-sonar-mines-rocks',
    'credit-approval',
    'cylinder-bands',
    'echocardiogram',
    #'fertility',
    'haberman-survival',
    'heart-hungarian',
    'hepatitis',
    'ionosphere',
    'mammographic',
    'molec-biol-promoter',
    'musk-1',
    'oocytes_merluccius_nucleus_4d',
    'oocytes_trisopterus_nucleus_2f',
    'ozone',
    'parkinsons',
    'pima',
    #'pittsburg-bridges-T-OR-D';
    'planning',
    'ringnorm',
    #'spambase',
    'spectf_train',
    'statlog-australian-credit',
    'statlog-german-credit',
    'statlog-heart',
    'titanic',
    #'twonorm',
    'vertebral-column-2clases']

  # n_splitz is the number of CV splits (bootstraps here); then set up some variables
  # to save the results to.
  n_splitz = 10
  errors = np.zeros((len(data_setz),))
  fms = np.zeros((len(data_setz),))
  times = np.zeros((len(data_setz),))
  m = 0

  for n in range(n_splitz):
    print('Split ' + str(n) + ' of ' + str(n_splitz))
    for i in range(len(data_setz)):
      print('    ' + data_setz[i])
      df = pd.read_csv(base_path + data_setz[i] + '.csv', sep=',')
      data = df.values  # .as_matrix() was removed in newer pandas
      X = data[:, :-1]
      y = data[:, -1]
      X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=m)
      m += 1
      
      ts = time.time()
      tpot = TPOTClassifier(generations=10, population_size=25, verbosity=1)
      tpot.fit(X_train, y_train)
      times[i] += (time.time() - ts)

      errors[i] += (1-tpot.score(X_test, y_test))
      yhat = tpot.predict(X_test)
      fms[i] += f1_score(y_test, yhat, average='macro')
  
  errors /= n_splitz
  fms /= n_splitz
  times /= n_splitz

  df = pd.DataFrame({'errors': errors, 'fms': fms, 'times': times})
  df.to_csv(path_or_buf='tpot-results2.csv', sep=',')

  return None 
Example #21
0
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled.shape
clf = LogisticRegression()
svmclf = SVC(kernel='rbf')
rfclf = RandomForestClassifier()
tpotclf = TPOTClassifier()
model1 = clf.fit(X_train, y_train)
model2 = svmclf.fit(X_train, y_train)
model3 = rfclf.fit(X_train, y_train)
model_auto_clf = tpotclf.fit(X_train, y_train)
score = cross_val_score(clf, X_train, y_train)
score2 = cross_val_score(svmclf, X_train, y_train)
score3 = cross_val_score(rfclf, X_train, y_train)
print("score is:%.2f\n,", score3)

##Tpot classifier
tpotclf = TPOTClassifier(generations=5, cv=5)
model_tpot_clf = tpotclf.fit(X_train, y_train)
score = tpotclf.score(X_test, y_test)
print(score)
tpotclf.export('classifier-pipeline.py')
#predict for test
y_pred = clf.predict(X_test)
y1_pred = svmclf.predict(X_test)
y2_pred = rfclf.predict(X_test)
score_1 = accuracy_score(y_test, y_pred)
score_2 = accuracy_score(y_test, y1_pred)
score_3 = accuracy_score(y_test, y2_pred)
print("score is:", score_1, score_2, score_3)
Example #22
0
    # Run auto-sklearn framework on the dataset
    auto_score = -1
    try:
        automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=timeout_in_sec, n_jobs=n_jobs)
        automl.fit(X_train, y_train, feat_type=feat_type)
        #automl.fit_ensemble(y_train, ensemble_size=50)
        y_hat = automl.predict(X_test.values)
        auto_score = accuracy_score(y_test, y_hat)
    except:
        print(sys.exc_info()[0])

    # Run TPOT framework on the dataset
    tpot_score = -1
    try:
        tpot = TPOTClassifier(verbosity=0, n_jobs=n_jobs, random_state=1, max_time_mins=timeout, max_eval_time_mins=0.04, population_size=15)
        tpot.fit(X_train, y_train)
        tpot_score = tpot.score(X_test, y_test)
    except:
        print(sys.exc_info()[0])

    # Run Lite-AutoML framework on the dataset
    (best, atts, cl) = liteautoml.compute_score(X_train, y_train, X_test, y_test, cat_indicator, n_jobs, timeout_in_sec)
    
    # Run hyperopt-sklearn on the dataset 
    hp_best = evaluate_hyperopt.compute_score(X_train, y_train, X_test, y_test, cat_indicator, n_jobs, timeout_in_sec)
    
    # Write results and dataset details to file
    outfile.write(dataset.name + "," + str(dummy) + "," + cl + "," + str(id) + "," + str(rows) + "," + str(classes) + "," + str(auto_score) + "," + str(tpot_score) + "," + str(hp_best) + "," + str(best) +  "," + str(len(X.columns)) + "," + str(atts) +'\n')
outfile.close()
Example #23
0
from tpot import TPOTClassifier
from tools import prepare_dataset

y, x = prepare_dataset()

x_train = x[:614]
y_train = y[:614].reshape(-1, )

x_valid = x[614:]
y_valid = y[614:].reshape(-1, )

pipeline_optimizer = TPOTClassifier(generations=50,
                                    population_size=20,
                                    cv=5,
                                    random_state=42,
                                    verbosity=2)

pipeline_optimizer.fit(x_train, y_train)
print(pipeline_optimizer.score(x_train, y_train))

pipeline_optimizer.export('tpot_exported_pipeline.py')
Example #24
0
# Train the decision tree
clf.fit(train_features, train_labels)

test_features = dvec.transform(test_features.to_dict(orient='records'))
# Decision tree prediction
pred_labels = clf.predict(test_features)

# Get the decision tree accuracy
# Computing accuracy on the training set is not really sound given overfitting,
# but this just walks through the workflow, so don't read too much into it
# (a held-out variant is sketched at the end of this example)
acc_decision_tree = round(clf.score(train_features, train_labels), 6)
print("ID3 score accuracy: %.4lf%%" % (acc_decision_tree * 100))

cls = DecisionTreeClassifier()
cls.fit(train_features, train_labels)
pred_labels = cls.predict(test_features)
acc_cart_tree = round(cls.score(train_features, train_labels), 6)
print("CART score 准确率为 %.4lf%%" % (acc_cart_tree * 100))

xg = XGBClassifier()
xg.fit(train_features, train_labels)
pred_labels = xg.predict(test_features)
acc_xgboost = round(xg.score(train_features, train_labels), 6)
print("XGboost score 准确率为 %.4lf%%" % (acc_xgboost * 100))

tpotcls = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpotcls.fit(train_features, train_labels)
pred_labels = tpotcls.predict(test_features)
acc_tpot = round(tpotcls.score(train_features, train_labels), 6)
print("TPOT score 准确率为 %.4lf%%" % (acc_tpot * 100))
tpotcls.export('tpot_titanic_pipeline.py')
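As noted in the comment above, scoring on the training data overstates accuracy. A minimal sketch of scoring the TPOT model on a held-out split instead (train_test_split from sklearn.model_selection is assumed importable here):

from sklearn.model_selection import train_test_split

tr_X, val_X, tr_y, val_y = train_test_split(train_features, train_labels, random_state=42)
tpotcls.fit(tr_X, tr_y)
print("TPOT held-out accuracy: %.4f" % tpotcls.score(val_X, val_y))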
Example #25
0
X = df.drop('event', axis=1)
y = df.event

# Encode y like this
np.sort(y.unique())
y = y.astype('category').cat.codes

# %%
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.75,
                                                    random_state=42)

# %%
# PCA
# Scale data first
scale = StandardScaler()
X_train_scaled = scale.fit_transform(X_train)
X_test_scaled = scale.transform(X_test)

pca = PCA(n_components=30)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# %%
tpot = TPOTClassifier(verbosity=2, random_state=42)
tpot.fit(X_train_pca, y_train)
print(tpot.score(X_test_pca, y_test))
tpot.export('tpot_project_pipeline.py')
Example #26
0
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split as split

digits = load_digits()
X_train, X_test, y_train, y_test = split(digits.data, digits.target)

pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5)
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))
pipeline_optimizer.export('../models/tpot_exported_pipeline.py')
Example #27
0
def genetic_algorithm(X_train, X_test, y_train, y_test):
    from tpot import TPOTClassifier

    tpot = TPOTClassifier(generations=100, population_size=20, verbosity=2)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
Example #28
0
#Numpy for converting the column vector into an array
import numpy as np

#Pandas and TPOT imports the snippet uses but did not include
import pandas as pd
from tpot import TPOTClassifier

#used in jupyter notebook for getting the data from directory
import os


#reading the data in csv format
train = pd.read_csv(r"california_housing_train.csv")
test = pd.read_csv(r"california_housing_test.csv")

#Splitting the dataset into X,Y train and test

Y_train = train[["median_house_value"]]
X_train = train.drop("median_house_value",axis=1,inplace=False)

Y_test = test[["median_house_value"]]
X_test = test.drop("median_house_value",axis=1,inplace=False)

#Creating an object of TPOTClassifier, which runs the genetic algorithm for
#5 generations and then stops

tpot = TPOTClassifier(generations=5,verbosity=2)

#Creating the model and printing the score

tpot.fit(X_train,np.ravel(Y_train))

print(tpot.score(X_test,np.ravel(Y_test)))

Example #29
0
                             random_state=1776,
                             warm_start=True)

    my_tpot.fit(x_train, y_train)

    # DETERMINE BEST CV SCORE PIPELINE ----------------------------------------------
    best_pipes = my_tpot.pareto_front_fitted_pipelines_
    len_best_pipes = len(best_pipes)
    # key is the entire pipeline as a string
    best_pipe_key = list(best_pipes.keys())[len_best_pipes - 1]

    best_cv = abs(my_tpot.evaluated_individuals_[best_pipe_key][1])

    # HOLDOUT SCORE --------------------------------------------------------------
    holdout_score = my_tpot.score(x_test, y_test)
    print(holdout_score)

    print(ite)  # row_id
    print(best_pipe_key)  # best_pipe
    print(best_cv)  # best_cv
    print(holdout_score)  # holdout_score
    print(this_scoring_method)  # scoring_method
    print(xt_nrows)  # xt_rows
    print(xt_numb_feats)  # xt_numb_feats

    # replace commas in best pipeline with dashes (this is the only field at risk of containing commas)
    best_pipe_key_no_comma = best_pipe_key.replace(",", "-")

    # generate content line regardless of if the file exists already or not
    content_line = str.format("{0}, {1}, {2}, {3}, {4}, {5}, {6}\n", ite,
Example #30
0
import numpy as np

# Load the data
telescope = pd.read_csv('../data/magic04.data.csv')

# Clean the data
telescope_shuffle = telescope.iloc[np.random.permutation(len(telescope))]
tele = telescope_shuffle.reset_index(drop=True)

# Store 2 classes
tele['Class'] = tele['Class'].map({'g': 0, 'h': 1})
tele_class = tele['Class'].values

# Split training, testing, and validation data
training_indices, validation_indices = training_indices, testing_indices = train_test_split(
    tele.index, stratify=tele_class, train_size=0.75, test_size=0.25)

# Let Genetic Programming find best ML model and hyperparameters
tpot = TPOTClassifier(generations=5, verbosity=2)
tpot.fit(
    tele.drop('Class', axis=1).loc[training_indices].values,
    tele.loc[training_indices, 'Class'].values)

# Score the accuracy
tpot.score(
    tele.drop('Class', axis=1).loc[validation_indices].values,
    tele.loc[validation_indices, 'Class'].values)

# Export the generated code
tpot.export('pipeline.py')
Example #31
0
print(sum(y_test==1))
print(len(y_test))
print('percentage of not animals=',(sum(y_test==1)-len(y_test))/len(y_test))

from tpot import TPOTClassifier
clf=TPOTClassifier(verbosity=2,n_jobs=-1)
clf.fit(X_train,y_train)

print(sum(y_test==1))
print(len(y_test))

print('percentage of not animals=',(sum(y_test==1)-len(y_test))/len(y_test))


print('test score=',clf.score(X_test,y_test))
predictions = clf.predict(X_test)
print(confusion_matrix(y_test,predictions))


#digits=load_digits()
#X=digits['data']
#y=digits['target']
#
#X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y)
#
#from sklearn.linear_model import LogisticRegression
#clf = LogisticRegression()
#clf.fit(X_train,y_train)
#
#result=clf.score(X_test,y_test)
Example #32
0
data = pd.read_excel(path, header=1, index_col=0)
data = data.rename(columns={'default payment next month': "default"})

print2(data.head())
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1],
                                                    data.iloc[:, -1],
                                                    stratify=data.iloc[:, -1],
                                                    test_size=0.3)
print2(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Assign the values outlined to the inputs
number_generations = 3
population_size = 5
offspring_size = 10
scoring_function = "accuracy"

# Create the tpot classifier
tpot_clf = TPOTClassifier(generations=number_generations,
                          population_size=population_size,
                          offspring_size=offspring_size,
                          scoring=scoring_function,
                          verbosity=2,
                          random_state=2,
                          cv=2)

# Fit the classifier to the training data
tpot_clf.fit(X_train, y_train)

# Score on the test set
print(tpot_clf.score(X_test, y_test))
Example #33
0
        'sklearn.feature_selection.SelectFromModel': {
            'threshold': np.arange(0, 1.01, 0.05),
            'estimator': {
                'sklearn.ensemble.ExtraTreesClassifier': {
                    'n_estimators': [100],
                    'criterion': ['gini', 'entropy'],
                    'max_features': np.arange(0.05, 1.01, 0.05)
                }
            }
        }
    }
    # generations sets the number of iterations over which offspring are produced
    # population_size=10 is the initial number of individuals created
    # offspring_size is the number of individuals to create in each generation
    # crossover_rate is the fraction of individuals used to create offspring
    # mutation_rate is the probability that an attribute value is randomly changed

    # TPOT's search is based on a genetic algorithm

    tpot = TPOTClassifier(generations=1,
                          population_size=10,
                          verbosity=2,
                          config_dict=tpot_config)
    tpot.fit(X_train, y_train)
    tpot.score(X_test, y_test)

    tpot.export('/Users/sheng/PycharmProjects/untitled/guowei/chishi.py')

    #tpot.score()
    # tpot.export('result.py')    exports standard scikit-learn code
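To make the config_dict fragment above self-contained: a minimal sketch of restricting TPOT's search space to a custom operator dictionary (the nested operator/parameter format is the one TPOT documents; the digits dataset is only illustrative):

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

# Search only ExtraTreesClassifier, over these hyperparameter ranges
tpot_config = {
    'sklearn.ensemble.ExtraTreesClassifier': {
        'n_estimators': [100],
        'criterion': ['gini', 'entropy'],
        'max_features': np.arange(0.05, 1.01, 0.05),
    },
}

X_train, X_test, y_train, y_test = train_test_split(*load_digits(return_X_y=True))

tpot = TPOTClassifier(generations=1, population_size=10, verbosity=2,
                      config_dict=tpot_config)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))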
Example #34
0
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()

X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                    train_size=0.75, test_size=0.25)

tpot = TPOTClassifier(generations = 5, population_size = 20, verbosity = 2)

tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_mnist_pipeline.py')
Example #35
0
File: w4e2.py  Project: PavlosD/myRepo
            Xtesttokeep = Xtest
            ytesttokeep = ytest
            Xtraintokeep = Xtrain
            ytraintokeep = ytrain

        acc[i - 1] = success_rate
        n_of_est[i - 1] = i * 20

print("\n\n BEST ACCURACY SIZE ACCORDING TO VALIDATOR ", est,
      "AND IDEAL MAX_DEPTH=", bestdepth)

clf = RandomForestClassifier(n_estimators=est, max_depth=bestdepth)
clf.fit(Xtraintokeep, ytraintokeep)
y_pred = clf.predict(Xtesttokeep)
ytesttokeep = ytesttokeep.values
s = 0
for j in range(0, len(y_pred)):
    if (ytesttokeep[j] == y_pred[j]):
        s = s + 1
    success_rate2 = s / len(y_pred) * 100
print("\n\n ACCURACY (test data) WITH ", est, " estimators and max_depth=",
      bestdepth, " (chosen from validator) is ", success_rate2, "%")

#plt.plot(acc,  n_of_est)
#plt.suptitle('No of estimators VS %accuracy (validator data)')
#plt.show()

tpot = TPOTClassifier(generations=5, population_size=10, verbosity=2)
tpot.fit(Xtraintokeep, ytraintokeep)
print(tpot.score(Xtesttokeep, ytesttokeep))