# ===== Example 1 =====
def test_fit():
    """Assert that TPOT.fit() produces an optimized pipeline.

    Runs the smallest possible GP search (one individual, one generation)
    so the full fit path is exercised while the test stays fast.
    """
    # Fixed seed keeps the genetic search deterministic across runs.
    optimizer = TPOT(random_state=42, population_size=1, generations=1, verbosity=0)
    optimizer.fit(training_features, training_classes)

    # A successful fit leaves behind a DEAP individual and a zeroed
    # generation counter.
    assert isinstance(optimizer._optimized_pipeline, creator.Individual)
    assert optimizer.gp_generation == 0
# ===== Example 2 =====
def test_fit():
    """Verify that calling fit() yields an optimized pipeline."""
    # Minimal GP configuration with a fixed seed for reproducibility.
    settings = dict(random_state=42,
                    population_size=1,
                    generations=1,
                    verbosity=0)
    clf = TPOT(**settings)
    clf.fit(training_features, training_classes)

    # fit() must store a DEAP individual and reset the generation count.
    assert isinstance(clf._optimized_pipeline, creator.Individual)
    assert clf.gp_generation == 0
# Separate the outcome column from the predictor columns.
phenotype = load_gametes['Class']
individuals = load_gametes.drop('Class', axis=1)

# Hold out a quarter of the samples for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    individuals, phenotype, train_size=0.75, test_size=0.25)

# Expert Knowledge Filter & MDR
tpot = TPOT(generations=200, population_size=200, verbosity=2,
            expert_source=load_ekf)
t1 = time.time()
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
t2 = time.time()
print("Time lapsed: ", t2 - t1)

# MDR Only (alternative run kept for reference)
#tpot = TPOT(generations=500, population_size=350, verbosity=2, expert_source=None)
#t1 = time.time()
#tpot.fit(X_train, y_train)
#print(tpot.score(X_test, y_test))
#t2 = time.time()
#print("Time lapsed: ", t2 - t1)

# Random Forest (baseline kept for reference)
#clf = RandomForestClassifier(max_depth=5, max_features=len(X_train.columns),
#                             n_estimators=1000)
from tpot import TPOT
from sklearn.cross_validation import train_test_split
import pandas as pd
# FIX: was "import numpy as numpy" while the code below uses the "np" alias,
# which raised NameError at np.random.permutation(...).
import numpy as np

# Load the MAGIC gamma telescope dataset from the working directory.
telescope = pd.read_csv("MAGIC Gamma Telescope Data.csv")

# Clean the data: shuffle the rows, then rebuild a contiguous index.
telescope_shuffle = telescope.iloc[np.random.permutation(len(telescope))]
tele = telescope_shuffle.reset_index(drop=True)

# Encode the class labels numerically ('g' -> 0, 'h' -> 1).
tele['Class'] = tele['Class'].map({'g':0, 'h':1})
tele_class = tele['Class'].values

# Stratified 75/25 split of the row indices.
# (The original bound the same split to two name pairs at once; the alias
# below preserves all three module-level names.)
training_indices, validation_indices = train_test_split(
    tele.index, stratify=tele_class, train_size=0.75, test_size=0.25)
testing_indices = validation_indices

# Let TPOT's genetic programming search for the best model and
# hyperparameters over five generations.
tpot = TPOT(generations=5, verbosity=2)
tpot.fit(tele.drop('Class', axis=1).loc[training_indices].values,
         tele.loc[training_indices, "Class"].values)

# Score accuracy on the held-out rows.
tpot.score(tele.drop('Class', axis=1).loc[validation_indices].values,
           tele.loc[validation_indices, 'Class'].values)

# Export standalone Python code for the winning pipeline.
tpot.export('pipeline.py')
    'z': 2,
    'rx': 3,
    'ry': 4,
    'rz': 5
})
# Pull the target labels out of the frame as a plain array.
data_class = data['Class'].values

# Stratified 75/25 split of the row indices into training and held-out
# sets.  The alias below preserves the extra name the original script
# bound via its chained assignment.
split = train_test_split(data.index, stratify=data_class,
                         train_size=0.75, test_size=0.25)
training_indices, validation_indices = split
testing_indices = validation_indices

# Let genetic programming search for the best model and hyperparameters.
# Verbosity 2 shows a loading bar.
tpot = TPOT(generations=5, verbosity=2)
tpot.fit(data.drop('Class', axis=1).loc[training_indices].values,
         data.loc[training_indices, 'Class'].values)

# Accuracy on the held-out validation rows.
tpot.score(data.drop('Class', axis=1).loc[validation_indices].values,
           data.loc[validation_indices, 'Class'].values)

# Write the best pipeline out as standalone Python code.
tpot.export('pipeline.py')

## compute sigmoid nonlinearity
## normalizes numbers given
#def sigmoid(x):
#    output = 1/(1+np.exp(-x))
#    return output
# ===== Example 6 =====
from tpot import TPOT
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split

# Classic iris dataset: 150 samples, 4 features, 3 classes.
iris = load_iris()

# Hold out a quarter of the samples for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size=0.75, test_size=0.25)

# Five GP generations; verbosity=2 shows a progress bar.
tpot = TPOT(generations=5, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

# Export the winning pipeline as runnable Python code.
tpot.export('tpot_iris_pipeline.py')
train = train.drop(drop_list, axis=1)
# Subsample every 300th row of the first 3M to keep memory manageable.
train = train[0:3000000:300]
train.info(memory_usage='deep')

# Split into feature matrix and target vector.
X = train.drop("hotel_cluster", axis=1).values
y = train.loc[:, "hotel_cluster"].values

# The DataFrame is no longer needed; release it before the next big
# allocation to keep peak memory down.
del train
import gc
gc.collect()

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

print("got here!")

my_tpot = TPOT(generations=20, verbosity=2, population_size=5)  # seems to have a problem with pop <5
# gen 1 -> really means two generations!

# FIX: time.clock() was deprecated in Python 3.3 and removed in 3.8, and it
# measured CPU time (not wall time) on Unix, so the reported duration was
# wrong for a long fit that mostly waits.  perf_counter() is the monotonic
# wall-clock timer intended for measuring elapsed durations.
start = time.perf_counter()
print(start)
my_tpot.fit(X_train, y_train)
my_tpot.export('tpot_expedia_pipeline.py')
end = time.perf_counter()
duration = end - start
score = my_tpot.score(X_test, y_test)
print(duration, score)