def test_fit2():
    """Assert that the TPOT fit function provides an optimized pipeline when config_dict is 'TPOT light'."""
    tpot_obj = TPOTClassifier(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0,
        config_dict='TPOT light'
    )
    tpot_obj.fit(training_features, training_classes)

    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._start_datetime is not None
class TpotEstimator(BaseEstimator):
    def __init__(self, task, **kwargs):
        # BaseEstimator.__init__ takes no arguments; passing task to it raises a TypeError.
        super(TpotEstimator, self).__init__()
        self.task = task
        if task == 'regression':
            self.tpot = TPOTRegressor(**kwargs)
        else:
            self.tpot = TPOTClassifier(**kwargs)
        self.name = 'tpot'
        self.label_encoder = None
        self.obj_cols = None

    def train(self, X, y, X_test):
        self.obj_cols = column_object_category_bool(X)
        self.label_encoder = SafeOrdinalEncoder()
        X[self.obj_cols] = self.label_encoder.fit_transform(X[self.obj_cols])
        self.tpot.fit(X, y)

    def predict_proba(self, X):
        X[self.obj_cols] = self.label_encoder.transform(X[self.obj_cols])
        proba = self.tpot.predict_proba(X)
        print(f'proba.shape: {proba.shape}')
        return proba

    def predict(self, X):
        X[self.obj_cols] = self.label_encoder.transform(X[self.obj_cols])
        return self.tpot.predict(X)
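# A minimal usage sketch for TpotEstimator. It assumes the helper
# column_object_category_bool and the SafeOrdinalEncoder class (referenced
# above but defined elsewhere) are importable; the toy frame is purely
# illustrative.
import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue', 'red', 'blue'],
                   'size': [1.0, 2.0, 3.0, 4.0]})
target = pd.Series([0, 1, 0, 1])

est = TpotEstimator(task='classification', generations=1, population_size=5, verbosity=0)
est.train(df, target, X_test=None)  # X_test is accepted but unused by train()
print(est.predict(df))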
class AutomlInstance:
    def __init__(self, openML_id, scoring_function, memory_path=None, max_time=None):
        self.y_class_dict = None
        self.X_train, self.X_test, self.y_train, self.y_test = self.get_dataset(openML_id)
        if memory_path is not None:
            if Path(memory_path).is_file():
                self.tpot = TPOTClassifier(memory=memory_path, warm_start=True,
                                           scoring=scoring_function, verbosity=3)
            else:
                self.tpot = TPOTClassifier(memory=memory_path, max_time_mins=max_time,
                                           scoring=scoring_function, verbosity=3)
        else:
            self.tpot = TPOTClassifier(max_time_mins=max_time,
                                       scoring=scoring_function, verbosity=3)
        self.tpot.fit(self.X_train, self.y_train)

    def predict(self, X):
        return self.tpot.predict(X)

    def get_segments(self) -> List[Segment]:
        segments = []
        for model in self.tpot.evaluated_individuals_:
            try:
                classifier = self.tpot._toolbox.compile(
                    creator.Individual.from_string(model, self.tpot._pset))
                classifier.fit(self.X_train, self.y_train)
                y_pred = classifier.predict(self.X_test)
                segments.append(Segment(y_ground=self.y_test, y_pred=y_pred))
            except (ValueError, RuntimeError):
                print("One classifier could not be evaluated.")
        return segments

    def get_dataset(self, openMl_id, test_size=0.2):
        X, y = openml.fetch_openml(data_id=openMl_id, return_X_y=True)
        # Fetch the categories and feature names of the requested dataset.
        openml_data = openml.fetch_openml(data_id=openMl_id, return_X_y=False)
        self.dataset_categories = openml_data.categories
        self.feature_names_X = openml_data.feature_names
        imp = Imputer()
        self.target_categories = numpy.unique(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        # Fit the imputer on the training split only and reuse it on the test
        # split, so no information leaks from the test set.
        imp.fit(X_train)
        X_train = imp.transform(X_train)
        X_test = imp.transform(X_test)
        y_train = self._y_string_2_int(y_train)
        y_test = self._y_string_2_int(y_test)
        return X_train, X_test, y_train, y_test

    def _y_string_2_int(self, y: numpy.ndarray):
        if self.y_class_dict is None:
            self._create_class_dict(y)
        transdict = {label: code for code, label in self.y_class_dict.items()}
        return numpy.array([transdict[val] for val in y])

    def _create_class_dict(self, y: numpy.ndarray):
        self.y_class_dict = {i: label for i, label in enumerate(numpy.unique(y).tolist())}
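# A brief usage sketch for AutomlInstance, assuming the Segment class and the
# imports used above are available. OpenML data_id 31 is the "credit-g"
# dataset; the scoring string is any scorer TPOT accepts.
instance = AutomlInstance(openML_id=31, scoring_function='accuracy', max_time=5)
predictions = instance.predict(instance.X_test)
segments = instance.get_segments()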
def test_dask_matches(self):
    with dask.config.set(scheduler='single-threaded'):
        for n_jobs in [-1]:
            X, y = make_classification(random_state=0)
            a = TPOTClassifier(
                generations=2,
                population_size=5,
                cv=3,
                random_state=0,
                n_jobs=n_jobs,
                use_dask=False,
            )
            b = TPOTClassifier(
                generations=2,
                population_size=5,
                cv=3,
                random_state=0,
                n_jobs=n_jobs,
                use_dask=True,
            )
            b.fit(X, y)
            a.fit(X, y)

            self.assertEqual(a.score(X, y), b.score(X, y))
            self.assertEqual(a.pareto_front_fitted_pipelines_.keys(),
                             b.pareto_front_fitted_pipelines_.keys())
            self.assertEqual(a.evaluated_individuals_, b.evaluated_individuals_)
def build_classifier(data, name):
    X, y = data
    categories = pandas.unique(y)
    config = make_tpot_pmml_config(classifier_config_dict)
    del config["sklearn.neighbors.KNeighborsClassifier"]
    classifier = TPOTClassifier(generations=1, population_size=3, random_state=13,
                                config_dict=config, verbosity=2)
    classifier.fit(X, y)
    pipeline = make_pmml_pipeline(classifier.fitted_pipeline_,
                                  active_fields=X.columns.values,
                                  target_fields=[y.name])
    print(repr(pipeline))
    store_pkl(pipeline, name + ".pkl")
    result = DataFrame(classifier.predict(X), columns=[y.name])
    if len(categories) > 0:
        probabilities = DataFrame(classifier.predict_proba(X), columns=[
            "probability(" + str(category) + ")" for category in categories
        ])
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name + ".csv")
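# The PMML pipeline stored above can also be converted into an actual PMML
# document with the sklearn2pmml converter; a minimal sketch (assuming the
# sklearn2pmml package and a Java runtime are available) that could follow
# the store_pkl call:
from sklearn2pmml import sklearn2pmml

sklearn2pmml(pipeline, name + ".pmml")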
def run_tpot(zeros, ones):
    all_data, y = make_all_data(zeros, ones)
    X_train, X_test, y_train, y_test = train_test_split(all_data, y, test_size=.1)

    pca = PCA(n_components=15)
    X_train = pca.fit_transform(X_train)
    # Apply the PCA fitted on the training data; refitting on the test set
    # would project the two splits into different spaces.
    X_test = pca.transform(X_test)

    # if not os.path.exists('tpot_checkpoint'):
    #     os.mkdir('tpot_checkpoint')

    tpot = TPOTClassifier(
        n_jobs=-1,
        generations=50,
        verbosity=3,
        scoring='f1',
        # subsample=.5,
        # periodic_checkpoint_folder='tpot_checkpoint',
        max_eval_time_mins=30,
        memory='auto')

    tpot.fit(X_train, y_train)
    tpot.export('tpot_ecog_pipeline.py')
    results = tpot.predict(X_test)
    with open('tpot_metrics.txt', 'w') as out_file:
        out_file.write(sklearn.metrics.classification_report(y_test, results))
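# make_all_data is not defined in this snippet; a hypothetical stand-in,
# assuming `zeros` and `ones` are iterables of fixed-length feature vectors
# for the two classes:
import numpy as np

def make_all_data(zeros, ones):
    # Stack both classes into one feature matrix and build matching 0/1 labels.
    all_data = np.vstack([np.asarray(zeros), np.asarray(ones)])
    y = np.concatenate([np.zeros(len(zeros)), np.ones(len(ones))])
    return all_data, y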
def main(**kwargs) -> None:
    # Divide kwargs between `Featurizer` and `TPOTClassifier` kwargs.
    # (inspect.getargspec was removed in Python 3.11; getfullargspec is its replacement.)
    tpot_kwargs = {}
    keys = list(kwargs.keys())
    for k in keys:
        if k in inspect.getfullargspec(TPOTClassifier).args:
            tpot_kwargs[k] = kwargs.pop(k)

    # Load all data into memory.
    paths = [os.path.join(LABELS_PATH, fname) for fname in os.listdir(LABELS_PATH)]
    X_raw, y = load_data(paths)
    X_raw.title.fillna('', inplace=True)
    X_raw.channel_title.fillna('', inplace=True)

    # Split data into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X_raw, y, random_state=SEED, train_size=TRAIN_SIZE,
        test_size=1 - TRAIN_SIZE, shuffle=True)

    # KLUDGE: preprocess text deterministically (i.e. NOT as part of the TPOT
    # hyperparameter optimization pipeline).
    featurizer = Featurizer(**kwargs)
    featurizer.fit(X_train)
    X_train = featurizer.transform(X_train)

    if 'verbosity' in tpot_kwargs and tpot_kwargs['verbosity'] > 0:
        print(f'Beginning hyper-parameter search with training data shape: {X_train.shape}.')
    tpot = TPOTClassifier(**tpot_kwargs)
    tpot.fit(X_train, y_train)
    if 'periodic_checkpoint_folder' in tpot_kwargs:
        tpot.export(os.path.join(tpot_kwargs['periodic_checkpoint_folder'], 'best_pipeline.py'))
    if 'verbosity' in tpot_kwargs and tpot_kwargs['verbosity'] > 0:
        X_test = featurizer.transform(X_test)
        print(f'Train set score: {tpot.score(X_train, y_train).round(4)}')
        print(f'Test set score: {tpot.score(X_test, y_test).round(4)}')
    return None
def main():
    df_train = pd.read_csv('data/train_data.csv')
    df_valid = pd.read_csv('data/valid_data.csv')

    feature_cols = list(df_train.columns[:-1])
    target_col = df_train.columns[-1]

    X_train = df_train[feature_cols].values
    y_train = df_train[target_col].values
    X_valid = df_valid[feature_cols].values
    y_valid = df_valid[target_col].values

    tsne_data = np.load('data/tsne_2d_5p.npz')
    tsne_train = tsne_data['X_train']
    tsne_valid = tsne_data['X_valid']

    # Concatenate the raw features with the precomputed t-SNE embeddings.
    X_train_concat = np.concatenate([X_train, tsne_train], axis=1)
    X_valid_concat = np.concatenate([X_valid, tsne_valid], axis=1)

    # 'neg_log_loss' and cv are the current names for the old 'log_loss'
    # scoring string and num_cv_folds parameter.
    tpot = TPOTClassifier(max_time_mins=60 * 24, population_size=100,
                          scoring='neg_log_loss', cv=3,
                          verbosity=2, random_state=67)
    tpot.fit(X_train_concat, y_train)
    print(tpot.score(X_valid_concat, y_valid))

    tpot.export('tpot_pipeline.py')
def test_fit():
    """Assert that the TPOT fit function provides an optimized pipeline."""
    tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2,
                              generations=1, verbosity=0)
    tpot_obj.fit(training_features, training_classes)

    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._start_datetime is not None
def classificar():
    """
    Extracts a dataframe from the table, converts text variables to numbers,
    and fills missing values with -1.
    """
    dataframe = pd.read_excel('data/data.xlsx')
    # dataframe.rename({'CODIFICAÇÃO': 'class'}, axis='columns', inplace=True)

    # Encode the multi-level target variable as integer labels.
    encoder = LabelEncoder()
    classe_label = encoder.fit_transform(dataframe.iloc[:, 0])
    print(classe_label)

    # Binarize the two-level variable.
    dest_autopecas = {'N': 0, 'S': 1}
    dataframe['DEST AUTOPECAS'] = [dest_autopecas[item] for item in dataframe['DEST AUTOPECAS']]

    # Fill missing values with a default value.
    dataframe.fillna(-1, inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(
        dataframe[LABELS].values, np.array(classe_label), test_size=0.3)

    tpot = TPOTClassifier(generations=5, population_size=50, verbosity=3)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_classif_pipeline.py')
def get_tpot_word_pipeline(train_sequences, dev_sequences, train_targets, dev_targets,
                           time_constraint=1, num_cpu=1, max_features=1000):
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=max_features)
    features = [('word', vectorizer)]
    clf = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42)
    auml_pip = pipeline.Pipeline([('union', FeatureUnion(transformer_list=features)),
                                  ('scale', Normalizer())])
    sequence_space = train_sequences.tolist() + dev_sequences.tolist()
    X_train = auml_pip.fit_transform(sequence_space)
    Y_train = np.array(train_targets.tolist() + dev_targets.tolist())
    clf.fit(X_train.todense(), Y_train)
    return (auml_pip, clf)
class TPot(Model):
    def __init__(self):
        print("Starting TPOT!")

    def fit(self, X, y, title=None):
        # In this case, X and y are the complete datasets!
        self.pipeline_optimizer = TPOTClassifier(
            generations=5,
            cv=5,
            random_state=42,
            verbosity=3,
            n_jobs=8,
            max_eval_time_mins=1,
            scoring='f1',
            subsample=0.5
        )
        self.pipeline_optimizer.fit(X, y)
        if not os.path.exists("./automl"):
            os.makedirs("./automl")
        self.pipeline_optimizer.export('./automl/tpot_exported_pipeline_' + str(title) + '_.py')

    def predict(self, X):
        # Delegate to the fitted optimizer (the original left this unimplemented).
        return self.pipeline_optimizer.predict(X)
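# A minimal usage sketch for the TPot wrapper, assuming a Model base class
# with this fit/predict interface; the toy data is purely illustrative:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, random_state=42)
model = TPot()
model.fit(X, y, title='demo')
print(model.predict(X)[:10])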
def run_AutoTpot(self):
    # Run the AutoTpot pipeline.
    automl = TPOTClassifier(generations=1, verbosity=2, config_dict='TPOT sparse')
    automl.fit(self.train, self.y_train)

    # TPOT produces ready-to-run, standalone Python code for the best-performing
    # model, in the form of a scikit-learn pipeline.
    # Export the best model.
    automl.export(os.path.join(self.args.save_dir, 'tpot-sportswear.py'))

    print('The best pipeline discovered through auto-tpot is {}'.format(automl.fitted_pipeline_))
    print('Saving the best model discovered through TPOT.')

    # Dump the ensemble of models.
    joblib.dump(automl, os.path.join(self.args.checkpoint_dir, 'auto-tpot.pickle'))

    # Calculate time per prediction.
    # Start time ******************************************************************************
    start = timeit.default_timer()

    # Predict label and confidence probability on the test data set.
    predictions = automl.predict(self.test)
    predictions_prob = automl.predict_proba(self.test)

    # Binary class values: round them to 0 or 1.
    predictions = [round(value) for value in predictions]

    end = timeit.default_timer()
    # End time ********************************************************************************

    print('Time per prediction : {}'.format((end - start) / self.test.shape[0]))

    self.visualize(predictions, automl)
def build_classifier(data, feature_pipeline, generations, population_size, name):
    X, y = data
    Xt = feature_pipeline.fit_transform(X)
    Xt = Xt.astype(float)
    categories = pandas.unique(y)
    config = make_tpot_pmml_config(classifier_config_dict)
    config = filter_config(config)
    # Does not support nesting - see http://mantis.dmg.org/view.php?id=208
    del config["sklearn.naive_bayes.GaussianNB"]
    del config["sklearn.neighbors.KNeighborsClassifier"]
    # Does not support classifier.predict_proba(Xt)
    del config["sklearn.svm.LinearSVC"]
    del config["sklearn.tree.DecisionTreeClassifier"]
    classifier = TPOTClassifier(generations=generations, population_size=population_size,
                                random_state=13, config_dict=config, verbosity=2)
    classifier.fit(Xt, y)
    pipeline = make_pmml_pipeline(
        Pipeline(steps=feature_pipeline.steps + classifier.fitted_pipeline_.steps),
        active_fields=X.columns.values, target_fields=[y.name])
    print(repr(pipeline))
    store_pkl(pipeline, name)
    result = DataFrame(classifier.predict(Xt), columns=[y.name])
    if len(categories) > 0:
        probabilities = DataFrame(classifier.predict_proba(Xt), columns=[
            "probability(" + str(category) + ")" for category in categories
        ])
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name)
def cli(erv_data):
    # Import the ERV expression data as a pandas dataframe.
    df = pd.read_csv(erv_data)
    class_codes = dict(enumerate(df['class'].astype("category").cat.categories))
    df["class"] = df["class"].astype("category").cat.codes

    # Create the test and training data.
    X_train, X_test, y_train, y_test = train_test_split(
        df.values[:, 2:], df.values[:, 1], train_size=0.75, test_size=0.25)

    # Convert everything to floats.
    X_train, X_test = X_train.astype(float), X_test.astype(float)
    y_train, y_test = y_train.astype(float), y_test.astype(float)

    # Create and fit a pipeline.
    pipeline_optimizer = TPOTClassifier(cv=2, verbosity=2, n_jobs=-1)
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')
    print(f"Validation Accuracy: {pipeline_optimizer.score(X_test, y_test)}")

    cm = ConfusionMatrix(
        [class_codes[y] for y in y_test],
        [class_codes[y] for y in
         [pipeline_optimizer.predict(x.reshape(1, -1))[0] for x in X_test]])
    cm.save_html("report")
def runTPOT(X, y, metric, algo):
    aml_config_dict = aml_config()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)
    if algo == "Classifier":
        pipeline_optimizer = TPOTClassifier(generations=1, population_size=5,
                                            verbosity=2, warm_start=True)
        pipeline_optimizer.fit(X_train, y_train)
        print(pipeline_optimizer.score(X_test, y_test))
    elif algo == 'Regressor':
        def aml_reg_scorer(y_test, y_pred):
            # Root mean squared error; make_scorer passes (y_true, y_pred).
            rmse = sqrt(mean_squared_error(y_test, y_pred))
            return rmse

        aml_custom_scorer = make_scorer(aml_reg_scorer, greater_is_better=False)
        pipeline_optimizer = TPOTRegressor(generations=1, population_size=5, verbosity=2,
                                           warm_start=True, scoring=aml_custom_scorer)
        pipeline_optimizer.fit(X_train, y_train)
        print(pipeline_optimizer.score(X_test, y_test))
    else:
        raise Exception('Incorrect Problem Type')
    return (pipeline_optimizer, pipeline_optimizer.score(X_test, y_test),
            len(pipeline_optimizer.evaluated_individuals_))
def run_tpot(X, y, target_ft, time_budget=30, include_preprocessors=None, n_jobs=1):
    print(n_jobs)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=1)
    if include_preprocessors:
        pipeline_optimizer = TPOTClassifier(
            max_time_mins=time_budget // 60,
            generations=None,
            use_dask=False,
            # template="Selector-Transformer-Classifier",
            n_jobs=n_jobs,
        )
    else:
        pipeline_optimizer = TPOTClassifier(
            max_time_mins=time_budget // 60,
            generations=None,
            use_dask=False,
            template='Classifier',
            n_jobs=n_jobs,
        )
    pipeline_optimizer.fit(X_train, y_train)
    y_hat = pipeline_optimizer.predict(X_test)
    acc = sklearn.metrics.accuracy_score(y_test, y_hat)
    f1_s = sklearn.metrics.f1_score(y_test, y_hat, average='weighted')
    metrs = []
    metrs.append("Accuracy score - " + str(acc))
    metrs.append("F1 score - " + str(f1_s))
    res = ["", "", "", "", f1_s, acc, "", pipeline_optimizer.export()]
    return str(metrs), res
def tpotClassifier(train_data, target_value):
    classifier = TPOTClassifier()
    # Drop the target column from the features so the label does not leak
    # into the training data.
    X = train_data.drop(columns=[target_value])
    X_train, X_test, y_train, y_test = train_test_split(
        X, train_data[target_value], train_size=0.75, test_size=0.25)
    classifier.fit(X_train, y_train)
    score: float = classifier.score(X_test, y_test)
    classifier.export('my_pipeline.py')
    return classifier, score
def main(X_train, y_train):
    tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42)
    tpot.fit(X_train, y_train)
    return tpot.fitted_pipeline_
def test_fit():
    """Assert that the TPOT fit function provides an optimized pipeline."""
    tpot_obj = TPOTClassifier(random_state=42, population_size=1, generations=1, verbosity=0)
    tpot_obj.fit(training_features, training_classes)

    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._gp_generation == 0
    assert tpot_obj._start_datetime is not None
def tpot_evaluation(total_runtime, train_features, train_labels):
    clf = TPOTClassifier(
        max_time_mins=total_runtime / 60,
        scoring='balanced_accuracy',
        config_dict=classifier_config_dict_custom,
    )
    clf.fit(train_features, train_labels)
    return clf
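# classifier_config_dict_custom is defined elsewhere. TPOT config dicts map an
# estimator's full import path to a grid of hyperparameter values, so it might
# look something like this illustrative example restricted to two models:
classifier_config_dict_custom = {
    'sklearn.linear_model.LogisticRegression': {
        'C': [0.01, 0.1, 1.0, 10.0],
        'penalty': ['l2'],
    },
    'sklearn.ensemble.RandomForestClassifier': {
        'n_estimators': [100],
        'max_depth': [3, 5, None],
    },
}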
def tpot_generation(X_train, y_train, X_test, y_test):
    tpot = TPOTClassifier(generations=10, population_size=20, verbosity=2, n_jobs=4)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_quiniela_pipeline.py')
def clfWithTpot(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
    my_tpot = TPOTClassifier(generations=10, verbosity=2)
    my_tpot.fit(np.array(X_train), np.array(y_train))
    print(my_tpot.score(np.array(X_test), np.array(y_test)))
    my_tpot.export('exported_pipeline.py')
    predictions = my_tpot.predict(np.array(X_test))
    print(confusion_matrix(y_test, predictions))
def geneticModel(a, b):
    X_train, X_test, y_train, y_test = train_test_split(a, b, train_size=0.75, test_size=0.25)
    tpot = TPOTClassifier(generations=12, population_size=100, verbosity=2)
    tpot.fit(X_train, y_train)
    # The original called classifier.score here, but no `classifier` exists in
    # this scope; the fitted TPOT object is what should be scored.
    print(tpot.score(X_test, y_test))
    return tpot
def main():
    """
    Uses TPOT (Tree-based Pipeline Optimization Tool), an automated machine
    learning tool, to find and output the best machine learning model for the
    given dataset. See https://github.com/EpistasisLab/tpot

    Outputs the results to automodel.py
    """
    titanic = pd.read_csv('../data/titanic.csv')
    titanic.rename(columns={'Survived': 'class'}, inplace=True)

    for category in ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']:
        print("Number of levels in category '{0}': \b {1:2.2f} ".format(
            category, titanic[category].unique().size))

    # Encode values
    titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
    titanic['Embarked'] = titanic['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # Fill NA
    titanic = titanic.fillna(-999)
    pd.isnull(titanic).any()

    # Encode values
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    CabinTrans = mlb.fit_transform([{str(val)} for val in titanic['Cabin'].values])

    # Drop unused columns
    titanic_new = titanic.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'class'], axis=1)

    # Create numpy arrays
    titanic_new = np.hstack((titanic_new.values, CabinTrans))
    titanic_class = titanic['class'].values

    # Train/test split
    # https://www.kdnuggets.com/2020/07/easy-guide-data-preprocessing-python.html
    # https://stackoverflow.com/questions/55525195/do-i-have-to-do-one-hot-encoding-separately-for-train-and-test-dataset
    training_indices, validation_indices = train_test_split(
        titanic.index, stratify=titanic_class, train_size=0.75, test_size=0.25)
    print(training_indices.size, validation_indices.size)

    # Train model
    tpot = TPOTClassifier(verbosity=2, max_time_mins=2, max_eval_time_mins=0.04,
                          population_size=40)
    tpot.fit(titanic_new[training_indices], titanic_class[training_indices])

    # Score
    tpot.score(titanic_new[validation_indices],
               titanic.loc[validation_indices, 'class'].values)

    # Export
    tpot.export('automodel.py')
def test_invalid_dataset_warning():
    """Assert that the TPOT fit function raises a ValueError when the dataset is not in the right format."""
    tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2,
                              generations=1, verbosity=0)
    # A common mistake: classes reshaped into a row vector.
    bad_training_classes = training_classes.reshape((1, len(training_classes)))
    try:
        tpot_obj.fit(training_features, bad_training_classes)
        assert False
    except ValueError:
        pass
def _optimizeModel(self, X, y, model_path, config):
    if not os.path.exists(model_path):
        optimizer = TPOTClassifier(verbosity=2, config_dict=config)
        optimizer.fit(X, y)
        pipeline = optimizer.fitted_pipeline_
        # Cache the fitted pipeline so later runs can skip the search.
        with open(model_path, 'wb') as f:
            pickle.dump(pipeline, f)
    else:
        with open(model_path, 'rb') as f:
            pipeline = pickle.load(f)
    return pipeline
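# A brief usage sketch of the caching pattern above; `model` is a hypothetical
# instance of the class that defines _optimizeModel, and the toy data is
# purely illustrative:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, random_state=0)
pipeline = model._optimizeModel(X, y, model_path='cached_pipeline.pkl',
                                config='TPOT light')
print(pipeline.predict(X[:5]))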
def generate_tpot_model(self, train_df, test_df, generations, population, use_dask=True):
    client = Client(n_workers=6, threads_per_worker=2)
    # train_test_split returns (X_train, X_test, y_train, y_test) in that order.
    X_train, X_test, y_train, y_test = train_test_split(train_df, test_df,
                                                        random_state=42, test_size=0.25)
    tp = TPOTClassifier(generations=generations, population_size=population,
                        use_dask=use_dask, verbosity=2, n_jobs=-1)
    tp.fit(X_train, y_train)
def tpot_train(project,
               X,
               y,
               export_file,
               prediction_type,
               train_size=0.75,
               max_time_mins=1,
               max_eval_time_mins=0.04,
               population_size=40,
               scoring_func=None,
               n_jobs=1):
    print("==========train / test split for training size {}".format(train_size))
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size)
    print(X_train.shape, y_train.shape)

    print("==========Start training the model...")
    print("==========max_time_mins: {}".format(max_time_mins))
    print("==========max_eval_time_mins: {}".format(max_eval_time_mins))
    print("==========population_size: {}".format(population_size))
    print("==========n_jobs: {}".format(n_jobs))

    # Prediction type:
    # - regression
    # - classification
    if prediction_type == "classification":
        tpot = TPOTClassifier(verbosity=2,
                              max_time_mins=max_time_mins,
                              max_eval_time_mins=max_eval_time_mins,
                              population_size=population_size,
                              scoring=scoring_func,
                              n_jobs=n_jobs)
    else:
        tpot = TPOTRegressor(verbosity=2,
                             max_time_mins=max_time_mins,
                             max_eval_time_mins=max_eval_time_mins,
                             population_size=population_size,
                             scoring=scoring_func,
                             n_jobs=n_jobs,
                             warm_start=True)
    tpot.fit(X_train, y_train)

    try:
        holdout_score = tpot.score(X_test, y_test)
        print("==========holdout set score is {}".format(holdout_score))
    except Exception:
        print("==========Unexpected error when scoring holdout set")

    print("==========export tpot to {}".format(export_file))
    tpot.export(export_file)
    return tpot
def run_automl_tpot(x_train, y_train, gener=3, verb=2, popul=8, cv=None):
    clf = TPOTClassifier(generations=gener, verbosity=verb, population_size=popul,
                         scoring='roc_auc', cv=cv, random_state=23)
    clf.fit(x_train.values, y_train)
    return clf
def do_tpot(generations=5, population_size=10, X='', y=''):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, test_size=0.20)
    tpot = TPOTClassifier(generations=generations, population_size=population_size,
                          verbosity=2, cv=3)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_pipeline.py')
    return tpot
def test_log_file_verbose_3():
    """Set verbosity to 3 and assert that the log_file parameter generates a log file."""
    file_name = "progress_verbosity_3.log"
    with open(file_name, "w") as tracking_progress_file:
        tpot_obj = TPOTClassifier(population_size=10, generations=10, verbosity=3,
                                  log_file=tracking_progress_file)
        tpot_obj.fit(X, y)
    assert_equal(os.path.getsize(file_name) > 0, True)
def test_imputer_in_export():
    """Assert that TPOT exports a pipeline with an imputation step if imputation was used in fit()."""
    tpot_obj = TPOTClassifier(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0,
        config_dict='TPOT light'
    )
    features_with_nan = np.copy(training_features)
    features_with_nan[0][0] = float('nan')

    tpot_obj.fit(features_with_nan, training_target)
    # Use a fixed pipeline, since random.seed() behaves differently in Python 2.* and 3.*.
    pipeline_string = (
        'KNeighborsClassifier('
        'input_matrix, '
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

    export_code = export_pipeline(tpot_obj._optimized_pipeline, tpot_obj.operators,
                                  tpot_obj._pset, tpot_obj._imputed)

    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Imputer

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'].values, random_state=None)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

exported_pipeline = KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert_equal(export_code, expected_code)
def test_warm_start():
    """Assert that the TPOT warm_start flag stores the pop and pareto_front from the first run."""
    tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2,
                              generations=1, verbosity=0, warm_start=True)
    tpot_obj.fit(training_features, training_classes)

    assert tpot_obj._pop is not None
    assert tpot_obj._pareto_front is not None

    first_pop = tpot_obj._pop
    first_pareto_front = tpot_obj._pareto_front

    tpot_obj.random_state = 21
    tpot_obj.fit(training_features, training_classes)

    assert tpot_obj._pop == first_pop
def generate_model(generations, train_X, train_y):
    tpot_generator = TPOTClassifier(generations=generations, verbosity=2)
    tpot_generator.fit(train_X, train_y)
    # str() is required: concatenating an int to a string raises a TypeError.
    tpot_generator.export('tpot_model' + str(generations) + '.py')
def main():
    # Set up the path to the data sets and the data we are going to experiment with.
    base_path = '/scratch/ditzler/Git/ClassificationDatasets/csv/'
    data_setz = [#'bank',
                 'blood',
                 'breast-cancer-wisc-diag',
                 'breast-cancer-wisc-prog',
                 'breast-cancer-wisc',
                 'breast-cancer',
                 'congressional-voting',
                 'conn-bench-sonar-mines-rocks',
                 'credit-approval',
                 'cylinder-bands',
                 'echocardiogram',
                 #'fertility',
                 'haberman-survival',
                 'heart-hungarian',
                 'hepatitis',
                 'ionosphere',
                 'mammographic',
                 'molec-biol-promoter',
                 'musk-1',
                 'oocytes_merluccius_nucleus_4d',
                 'oocytes_trisopterus_nucleus_2f',
                 'ozone',
                 'parkinsons',
                 'pima',
                 #'pittsburg-bridges-T-OR-D',
                 'planning',
                 'ringnorm',
                 #'spambase',
                 'spectf_train',
                 'statlog-australian-credit',
                 'statlog-german-credit',
                 'statlog-heart',
                 'titanic',
                 #'twonorm',
                 'vertebral-column-2clases']

    # n_splitz is like the number of CV folds (bootstraps here); then set up
    # some variables to save the results to.
    n_splitz = 10
    errors = np.zeros((len(data_setz),))
    fms = np.zeros((len(data_setz),))
    times = np.zeros((len(data_setz),))
    m = 0

    for n in range(n_splitz):
        print('Split ' + str(n) + ' of ' + str(n_splitz))
        for i in range(len(data_setz)):
            print(' ' + data_setz[i])
            df = pd.read_csv(base_path + data_setz[i] + '.csv', sep=',')
            # df.as_matrix() was removed from pandas; .values is the replacement.
            data = df.values
            X = data[:, :-1]
            y = data[:, -1]
            X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,
                                                                test_size=0.25,
                                                                random_state=m)
            m += 1
            ts = time.time()
            tpot = TPOTClassifier(generations=10, population_size=25, verbosity=1)
            tpot.fit(X_train, y_train)
            times[i] += (time.time() - ts)
            errors[i] += (1 - tpot.score(X_test, y_test))
            yhat = tpot.predict(X_test)
            fms[i] += f1_score(y_test, yhat, average='macro')

    errors /= n_splitz
    fms /= n_splitz
    times /= n_splitz

    df = pd.DataFrame({'errors': errors, 'fms': fms, 'times': times})
    df.to_csv(path_or_buf='tpot-results2.csv', sep=',')
    return None
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                    train_size=0.75, test_size=0.25)

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_mnist_pipeline.py')